From f4d2af1376b986433436a4f9d89bb58c738590c6 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Mon, 6 Mar 2023 18:43:03 +0100
Subject: [PATCH 01/12] bbr2

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 include/linux/tcp.h                |    3 +-
 include/net/inet_connection_sock.h |    3 +-
 include/net/tcp.h                  |   41 +-
 include/uapi/linux/inet_diag.h     |   33 +
 net/ipv4/Kconfig                   |   22 +
 net/ipv4/Makefile                  |    1 +
 net/ipv4/tcp.c                     |    1 +
 net/ipv4/tcp_bbr.c                 |   38 +-
 net/ipv4/tcp_bbr2.c                | 2674 ++++++++++++++++++++++++++++
 net/ipv4/tcp_cong.c                |    1 +
 net/ipv4/tcp_input.c               |   27 +-
 net/ipv4/tcp_output.c              |   26 +-
 net/ipv4/tcp_rate.c                |   30 +-
 net/ipv4/tcp_timer.c               |    1 +
 14 files changed, 2867 insertions(+), 34 deletions(-)
 create mode 100644 net/ipv4/tcp_bbr2.c

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index ca7f05a130d2..09dbcd67ee8e 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -255,7 +255,8 @@ struct tcp_sock {
 	u8	compressed_ack;
 	u8	dup_ack_counter:2,
 		tlp_retrans:1,	/* TLP is a retransmission */
-		unused:5;
+		fast_ack_mode:2, /* which fast ack mode ? */
+		unused:3;
 	u32	chrono_start;	/* Start time in jiffies of a TCP chrono */
 	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
 	u8	chrono_type:2,	/* current chronograph type */
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index c2b15f7e5516..d85858efa571 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -135,7 +135,8 @@ struct inet_connection_sock {
 	u32			  icsk_probes_tstamp;
 	u32			  icsk_user_timeout;
 
-	u64			  icsk_ca_priv[104 / sizeof(u64)];
+/* XXX inflated by temporary internal debugging info */
+	u64			  icsk_ca_priv[216 / sizeof(u64)];
 #define ICSK_CA_PRIV_SIZE	  sizeof_field(struct inet_connection_sock, icsk_ca_priv)
 };
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 99c74fc30083..bff771487a7e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -375,6 +375,7 @@ static inline void tcp_dec_quickack_mode(struct sock *sk,
 #define	TCP_ECN_QUEUE_CWR	2
 #define	TCP_ECN_DEMAND_CWR	4
 #define	TCP_ECN_SEEN		8
+#define	TCP_ECN_ECT_PERMANENT	16
 
 enum tcp_tw_status {
 	TCP_TW_SUCCESS = 0,
@@ -824,6 +825,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
 	return max_t(s64, t1 - t0, 0);
 }
 
+static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0)
+{
+	return max_t(s32, t1 - t0, 0);
+}
+
 static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
 {
 	return tcp_ns_to_ts(skb->skb_mstamp_ns);
@@ -899,9 +905,14 @@ struct tcp_skb_cb {
 			/* pkts S/ACKed so far upon tx of skb, incl retrans: */
 			__u32 delivered;
 			/* start of send pipeline phase */
-			u64 first_tx_mstamp;
+			u32 first_tx_mstamp;
 			/* when we reached the "delivered" count */
-			u64 delivered_mstamp;
+			u32 delivered_mstamp;
+#define TCPCB_IN_FLIGHT_BITS 20
+#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1)
+			u32 in_flight:20,   /* packets in flight at transmit */
+			    unused2:12;
+			u32 lost;	/* packets lost so far upon tx of skb */
 		} tx;   /* only used for outgoing skbs */
 		union {
 			struct inet_skb_parm	h4;
@@ -1027,7 +1038,11 @@ enum tcp_ca_ack_event_flags {
 #define TCP_CONG_NON_RESTRICTED 0x1
 /* Requires ECN/ECT set on all packets */
 #define TCP_CONG_NEEDS_ECN	0x2
-#define TCP_CONG_MASK	(TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN)
+/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */
+#define TCP_CONG_WANTS_CE_EVENTS	0x4
+#define TCP_CONG_MASK	(TCP_CONG_NON_RESTRICTED | \
+			 TCP_CONG_NEEDS_ECN | \
+			 TCP_CONG_WANTS_CE_EVENTS)
 
 union tcp_cc_info;
 
@@ -1047,8 +1062,11 @@ struct ack_sample {
  */
 struct rate_sample {
 	u64  prior_mstamp; /* starting timestamp for interval */
+	u32  prior_lost;	/* tp->lost at "prior_mstamp" */
 	u32  prior_delivered;	/* tp->delivered at "prior_mstamp" */
 	u32  prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */
+	u32 tx_in_flight;	/* packets in flight at starting timestamp */
+	s32  lost;		/* number of packets lost over interval */
 	s32  delivered;		/* number of packets delivered over interval */
 	s32  delivered_ce;	/* number of packets delivered w/ CE marks*/
 	long interval_us;	/* time for tp->delivered to incr "delivered" */
@@ -1062,6 +1080,7 @@ struct rate_sample {
 	bool is_app_limited;	/* is sample from packet with bubble in pipe? */
 	bool is_retrans;	/* is sample from retransmission? */
 	bool is_ack_delayed;	/* is this (likely) a delayed ACK? */
+	bool is_ece;		/* did this ACK have ECN marked? */
 };
 
 struct tcp_congestion_ops {
@@ -1085,8 +1104,11 @@ struct tcp_congestion_ops {
 	/* hook for packet ack accounting (optional) */
 	void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
 
-	/* override sysctl_tcp_min_tso_segs */
-	u32 (*min_tso_segs)(struct sock *sk);
+	/* pick target number of segments per TSO/GSO skb (optional): */
+	u32 (*tso_segs)(struct sock *sk, unsigned int mss_now);
+
+	/* react to a specific lost skb (optional) */
+	void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb);
 
 	/* call when packets are delivered to update cwnd and pacing rate,
 	 * after all the ca_state processing. (optional)
@@ -1149,6 +1171,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer)
 }
 #endif
 
+static inline bool tcp_ca_wants_ce_events(const struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN |
+					   TCP_CONG_WANTS_CE_EVENTS);
+}
+
 static inline bool tcp_ca_needs_ecn(const struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1168,6 +1198,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
 void tcp_set_ca_state(struct sock *sk, const u8 ca_state);
 
 /* From tcp_rate.c */
+void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb);
 void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
 void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
 			    struct rate_sample *rs);
diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index 50655de04c9b..0e24f11627d5 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -231,9 +231,42 @@ struct tcp_bbr_info {
 	__u32	bbr_cwnd_gain;		/* cwnd gain shifted left 8 bits */
 };
 
+/* Phase as reported in netlink/ss stats. */
+enum tcp_bbr2_phase {
+	BBR2_PHASE_INVALID		= 0,
+	BBR2_PHASE_STARTUP		= 1,
+	BBR2_PHASE_DRAIN		= 2,
+	BBR2_PHASE_PROBE_RTT		= 3,
+	BBR2_PHASE_PROBE_BW_UP		= 4,
+	BBR2_PHASE_PROBE_BW_DOWN	= 5,
+	BBR2_PHASE_PROBE_BW_CRUISE	= 6,
+	BBR2_PHASE_PROBE_BW_REFILL	= 7
+};
+
+struct tcp_bbr2_info {
+	/* u64 bw: bandwidth (app throughput) estimate in Byte per sec: */
+	__u32	bbr_bw_lsb;		/* lower 32 bits of bw */
+	__u32	bbr_bw_msb;		/* upper 32 bits of bw */
+	__u32	bbr_min_rtt;		/* min-filtered RTT in uSec */
+	__u32	bbr_pacing_gain;	/* pacing gain shifted left 8 bits */
+	__u32	bbr_cwnd_gain;		/* cwnd gain shifted left 8 bits */
+	__u32	bbr_bw_hi_lsb;		/* lower 32 bits of bw_hi */
+	__u32	bbr_bw_hi_msb;		/* upper 32 bits of bw_hi */
+	__u32	bbr_bw_lo_lsb;		/* lower 32 bits of bw_lo */
+	__u32	bbr_bw_lo_msb;		/* upper 32 bits of bw_lo */
+	__u8	bbr_mode;		/* current bbr_mode in state machine */
+	__u8	bbr_phase;		/* current state machine phase */
+	__u8	unused1;		/* alignment padding; not used yet */
+	__u8	bbr_version;		/* MUST be at this offset in struct */
+	__u32	bbr_inflight_lo;	/* lower/short-term data volume bound */
+	__u32	bbr_inflight_hi;	/* higher/long-term data volume bound */
+	__u32	bbr_extra_acked;	/* max excess packets ACKed in epoch */
+};
+
 union tcp_cc_info {
 	struct tcpvegas_info	vegas;
 	struct tcp_dctcp_info	dctcp;
 	struct tcp_bbr_info	bbr;
+	struct tcp_bbr2_info	bbr2;
 };
 #endif /* _UAPI_INET_DIAG_H_ */
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 2dfb12230f08..b6bec331a82e 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -678,6 +678,24 @@ config TCP_CONG_BBR
 	  AQM schemes that do not provide a delay signal. It requires the fq
 	  ("Fair Queue") pacing packet scheduler.
 
+config TCP_CONG_BBR2
+	tristate "BBR2 TCP"
+	default n
+	help
+
+	BBR2 TCP congestion control is a model-based congestion control
+	algorithm that aims to maximize network utilization, keep queues and
+	retransmit rates low, and to be able to coexist with Reno/CUBIC in
+	common scenarios. It builds an explicit model of the network path.  It
+	tolerates a targeted degree of random packet loss and delay that are
+	unrelated to congestion. It can operate over LAN, WAN, cellular, wifi,
+	or cable modem links, and can use DCTCP-L4S-style ECN signals.  It can
+	coexist with flows that use loss-based congestion control, and can
+	operate with shallow buffers, deep buffers, bufferbloat, policers, or
+	AQM schemes that do not provide a delay signal. It requires pacing,
+	using either TCP internal pacing or the fq ("Fair Queue") pacing packet
+	scheduler.
+
 choice
 	prompt "Default TCP congestion control"
 	default DEFAULT_CUBIC
@@ -715,6 +733,9 @@ choice
 	config DEFAULT_BBR
 		bool "BBR" if TCP_CONG_BBR=y
 
+	config DEFAULT_BBR2
+		bool "BBR2" if TCP_CONG_BBR2=y
+
 	config DEFAULT_RENO
 		bool "Reno"
 endchoice
@@ -739,6 +760,7 @@ config DEFAULT_TCP_CONG
 	default "dctcp" if DEFAULT_DCTCP
 	default "cdg" if DEFAULT_CDG
 	default "bbr" if DEFAULT_BBR
+	default "bbr2" if DEFAULT_BBR2
 	default "cubic"
 
 config TCP_MD5SIG
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 880277c9fd07..ef1da49d20a6 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
 obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
 obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
+obj-$(CONFIG_TCP_CONG_BBR2) += tcp_bbr2.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
 obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6bb8eb803105..eb9da0796161 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3191,6 +3191,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tp->rx_opt.dsack = 0;
 	tp->rx_opt.num_sacks = 0;
 	tp->rcv_ooopack = 0;
+	tp->fast_ack_mode = 0;
 
 
 	/* Clean up fastopen related fields */
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 146792cd26fe..16038f6ee52a 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -294,26 +294,40 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
 		sk->sk_pacing_rate = rate;
 }
 
-/* override sysctl_tcp_min_tso_segs */
 __bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk)
 {
 	return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
 }
 
+/* Return the number of segments BBR would like in a TSO/GSO skb, given
+ * a particular max gso size as a constraint.
+ */
+static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now,
+				u32 gso_max_size)
+{
+	u32 segs;
+	u64 bytes;
+
+	/* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */
+	bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift;
+
+	bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER);
+	segs = max_t(u32, div_u64(bytes, mss_now), bbr_min_tso_segs(sk));
+	return segs;
+}
+
+/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */
+static u32  bbr_tso_segs(struct sock *sk, unsigned int mss_now)
+{
+	return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size);
+}
+
+/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */
 static u32 bbr_tso_segs_goal(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 segs, bytes;
-
-	/* Sort of tcp_tso_autosize() but ignoring
-	 * driver provided sk_gso_max_size.
-	 */
-	bytes = min_t(unsigned long,
-		      sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
-		      GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER);
-	segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
 
-	return min(segs, 0x7FU);
+	return  bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE);
 }
 
 /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
@@ -1149,7 +1163,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
 	.undo_cwnd	= bbr_undo_cwnd,
 	.cwnd_event	= bbr_cwnd_event,
 	.ssthresh	= bbr_ssthresh,
-	.min_tso_segs	= bbr_min_tso_segs,
+	.tso_segs	= bbr_tso_segs,
 	.get_info	= bbr_get_info,
 	.set_state	= bbr_set_state,
 };
diff --git a/net/ipv4/tcp_bbr2.c b/net/ipv4/tcp_bbr2.c
new file mode 100644
index 000000000000..85f8052144d1
--- /dev/null
+++ b/net/ipv4/tcp_bbr2.c
@@ -0,0 +1,2674 @@
+/* BBR (Bottleneck Bandwidth and RTT) congestion control, v2
+ *
+ * BBRv2 is a model-based congestion control algorithm that aims for low
+ * queues, low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model
+ * of the network path, it uses measurements of bandwidth and RTT, as well as
+ * (if they occur) packet loss and/or DCTCP/L4S-style ECN signals.  Note that
+ * although it can use ECN or loss signals explicitly, it does not require
+ * either; it can bound its in-flight data based on its estimate of the BDP.
+ *
+ * The model has both higher and lower bounds for the operating range:
+ *   lo: bw_lo, inflight_lo: conservative short-term lower bound
+ *   hi: bw_hi, inflight_hi: robust long-term upper bound
+ * The bandwidth-probing time scale is (a) extended dynamically based on
+ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by
+ * an interactive wall-clock time-scale to be more scalable and responsive
+ * than Reno and CUBIC.
+ *
+ * Here is a state transition diagram for BBR:
+ *
+ *             |
+ *             V
+ *    +---> STARTUP  ----+
+ *    |        |         |
+ *    |        V         |
+ *    |      DRAIN   ----+
+ *    |        |         |
+ *    |        V         |
+ *    +---> PROBE_BW ----+
+ *    |      ^    |      |
+ *    |      |    |      |
+ *    |      +----+      |
+ *    |                  |
+ *    +---- PROBE_RTT <--+
+ *
+ * A BBR flow starts in STARTUP, and ramps up its sending rate quickly.
+ * When it estimates the pipe is full, it enters DRAIN to drain the queue.
+ * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT.
+ * A long-lived BBR flow spends the vast majority of its time remaining
+ * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth
+ * in a fair manner, with a small, bounded queue. *If* a flow has been
+ * continuously sending for the entire min_rtt window, and hasn't seen an RTT
+ * sample that matches or decreases its min_rtt estimate for 10 seconds, then
+ * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe
+ * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if
+ * we estimated that we reached the full bw of the pipe then we enter PROBE_BW;
+ * otherwise we enter STARTUP to try to fill the pipe.
+ *
+ * BBR is described in detail in:
+ *   "BBR: Congestion-Based Congestion Control",
+ *   Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh,
+ *   Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016.
+ *
+ * There is a public e-mail list for discussing BBR development and testing:
+ *   https://groups.google.com/forum/#!forum/bbr-dev
+ *
+ * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled,
+ * otherwise TCP stack falls back to an internal pacing using one high
+ * resolution timer per TCP socket and may use more resources.
+ */
+#include <linux/module.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+#include <linux/inet.h>
+#include <linux/random.h>
+
+#include "tcp_dctcp.h"
+
+/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth
+ * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps.
+ * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32.
+ * Since the minimum window is >=4 packets, the lower bound isn't
+ * an issue. The upper bound isn't an issue with existing technologies.
+ */
+#define BW_SCALE 24
+#define BW_UNIT (1 << BW_SCALE)
+
+#define BBR_SCALE 8	/* scaling factor for fractions in BBR (e.g. gains) */
+#define BBR_UNIT (1 << BBR_SCALE)
+
+#define FLAG_DEBUG_VERBOSE	0x1	/* Verbose debugging messages */
+#define FLAG_DEBUG_LOOPBACK	0x2	/* Do NOT skip loopback addr */
+
+#define CYCLE_LEN		8	/* number of phases in a pacing gain cycle */
+
+/* BBR has the following modes for deciding how fast to send: */
+enum bbr_mode {
+	BBR_STARTUP,	/* ramp up sending rate rapidly to fill pipe */
+	BBR_DRAIN,	/* drain any queue created during startup */
+	BBR_PROBE_BW,	/* discover, share bw: pace around estimated bw */
+	BBR_PROBE_RTT,	/* cut inflight to min to probe min_rtt */
+};
+
+/* How does the incoming ACK stream relate to our bandwidth probing? */
+enum bbr_ack_phase {
+	BBR_ACKS_INIT,		  /* not probing; not getting probe feedback */
+	BBR_ACKS_REFILLING,	  /* sending at est. bw to fill pipe */
+	BBR_ACKS_PROBE_STARTING,  /* inflight rising to probe bw */
+	BBR_ACKS_PROBE_FEEDBACK,  /* getting feedback from bw probing */
+	BBR_ACKS_PROBE_STOPPING,  /* stopped probing; still getting feedback */
+};
+
+/* BBR congestion control block */
+struct bbr {
+	u32	min_rtt_us;	        /* min RTT in min_rtt_win_sec window */
+	u32	min_rtt_stamp;	        /* timestamp of min_rtt_us */
+	u32	probe_rtt_done_stamp;   /* end time for BBR_PROBE_RTT mode */
+	u32	probe_rtt_min_us;	/* min RTT in bbr_probe_rtt_win_ms window */
+	u32	probe_rtt_min_stamp;	/* timestamp of probe_rtt_min_us*/
+	u32     next_rtt_delivered; /* scb->tx.delivered at end of round */
+	u32	prior_rcv_nxt;	/* tp->rcv_nxt when CE state last changed */
+	u64	cycle_mstamp;	     /* time of this cycle phase start */
+	u32     mode:3,		     /* current bbr_mode in state machine */
+		prev_ca_state:3,     /* CA state on previous ACK */
+		packet_conservation:1,  /* use packet conservation? */
+		round_start:1,	     /* start of packet-timed tx->ack round? */
+		ce_state:1,          /* If most recent data has CE bit set */
+		bw_probe_up_rounds:5,   /* cwnd-limited rounds in PROBE_UP */
+		try_fast_path:1, 	/* can we take fast path? */
+		unused2:11,
+		idle_restart:1,	     /* restarting after idle? */
+		probe_rtt_round_done:1,  /* a BBR_PROBE_RTT round at 4 pkts? */
+		cycle_idx:3,	/* current index in pacing_gain cycle array */
+		has_seen_rtt:1;	     /* have we seen an RTT sample yet? */
+	u32	pacing_gain:11,	/* current gain for setting pacing rate */
+		cwnd_gain:11,	/* current gain for setting cwnd */
+		full_bw_reached:1,   /* reached full bw in Startup? */
+		full_bw_cnt:2,	/* number of rounds without large bw gains */
+		init_cwnd:7;	/* initial cwnd */
+	u32	prior_cwnd;	/* prior cwnd upon entering loss recovery */
+	u32	full_bw;	/* recent bw, to estimate if pipe is full */
+
+	/* For tracking ACK aggregation: */
+	u64	ack_epoch_mstamp;	/* start of ACK sampling epoch */
+	u16	extra_acked[2];		/* max excess data ACKed in epoch */
+	u32	ack_epoch_acked:20,	/* packets (S)ACKed in sampling epoch */
+		extra_acked_win_rtts:5,	/* age of extra_acked, in round trips */
+		extra_acked_win_idx:1,	/* current index in extra_acked array */
+	/* BBR v2 state: */
+		unused1:2,
+		startup_ecn_rounds:2,	/* consecutive hi ECN STARTUP rounds */
+		loss_in_cycle:1,	/* packet loss in this cycle? */
+		ecn_in_cycle:1;		/* ECN in this cycle? */
+	u32	loss_round_delivered; /* scb->tx.delivered ending loss round */
+	u32	undo_bw_lo;	     /* bw_lo before latest losses */
+	u32	undo_inflight_lo;    /* inflight_lo before latest losses */
+	u32	undo_inflight_hi;    /* inflight_hi before latest losses */
+	u32	bw_latest;	 /* max delivered bw in last round trip */
+	u32	bw_lo;		 /* lower bound on sending bandwidth */
+	u32	bw_hi[2];	 /* upper bound of sending bandwidth range*/
+	u32	inflight_latest; /* max delivered data in last round trip */
+	u32	inflight_lo;	 /* lower bound of inflight data range */
+	u32	inflight_hi;	 /* upper bound of inflight data range */
+	u32	bw_probe_up_cnt; /* packets delivered per inflight_hi incr */
+	u32	bw_probe_up_acks;  /* packets (S)ACKed since inflight_hi incr */
+	u32	probe_wait_us;	 /* PROBE_DOWN until next clock-driven probe */
+	u32	ecn_eligible:1,	/* sender can use ECN (RTT, handshake)? */
+		ecn_alpha:9,	/* EWMA delivered_ce/delivered; 0..256 */
+		bw_probe_samples:1,    /* rate samples reflect bw probing? */
+		prev_probe_too_high:1, /* did last PROBE_UP go too high? */
+		stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */
+		rounds_since_probe:8,  /* packet-timed rounds since probed bw */
+		loss_round_start:1,    /* loss_round_delivered round trip? */
+		loss_in_round:1,       /* loss marked in this round trip? */
+		ecn_in_round:1,	       /* ECN marked in this round trip? */
+		ack_phase:3,	       /* bbr_ack_phase: meaning of ACKs */
+		loss_events_in_round:4,/* losses in STARTUP round */
+		initialized:1;	       /* has bbr_init() been called? */
+	u32	alpha_last_delivered;	 /* tp->delivered    at alpha update */
+	u32	alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */
+
+	/* Params configurable using setsockopt. Refer to correspoding
+	 * module param for detailed description of params.
+	 */
+	struct bbr_params {
+		u32	high_gain:11,		/* max allowed value: 2047 */
+			drain_gain:10,		/* max allowed value: 1023 */
+			cwnd_gain:11;		/* max allowed value: 2047 */
+		u32	cwnd_min_target:4,	/* max allowed value: 15 */
+			min_rtt_win_sec:5,	/* max allowed value: 31 */
+			probe_rtt_mode_ms:9,	/* max allowed value: 511 */
+			full_bw_cnt:3,		/* max allowed value: 7 */
+			cwnd_tso_budget:1,	/* allowed values: {0, 1} */
+			unused3:6,
+			drain_to_target:1,	/* boolean */
+			precise_ece_ack:1,	/* boolean */
+			extra_acked_in_startup:1, /* allowed values: {0, 1} */
+			fast_path:1;		/* boolean */
+		u32	full_bw_thresh:10,	/* max allowed value: 1023 */
+			startup_cwnd_gain:11,	/* max allowed value: 2047 */
+			bw_probe_pif_gain:9,	/* max allowed value: 511 */
+			usage_based_cwnd:1, 	/* boolean */
+			unused2:1;
+		u16	probe_rtt_win_ms:14,	/* max allowed value: 16383 */
+			refill_add_inc:2;	/* max allowed value: 3 */
+		u16	extra_acked_gain:11,	/* max allowed value: 2047 */
+			extra_acked_win_rtts:5; /* max allowed value: 31*/
+		u16	pacing_gain[CYCLE_LEN]; /* max allowed value: 1023 */
+		/* Mostly BBR v2 parameters below here: */
+		u32	ecn_alpha_gain:8,	/* max allowed value: 255 */
+			ecn_factor:8,		/* max allowed value: 255 */
+			ecn_thresh:8,		/* max allowed value: 255 */
+			beta:8;			/* max allowed value: 255 */
+		u32	ecn_max_rtt_us:19,	/* max allowed value: 524287 */
+			bw_probe_reno_gain:9,	/* max allowed value: 511 */
+			full_loss_cnt:4;	/* max allowed value: 15 */
+		u32	probe_rtt_cwnd_gain:8,	/* max allowed value: 255 */
+			inflight_headroom:8,	/* max allowed value: 255 */
+			loss_thresh:8,		/* max allowed value: 255 */
+			bw_probe_max_rounds:8;	/* max allowed value: 255 */
+		u32	bw_probe_rand_rounds:4, /* max allowed value: 15 */
+			bw_probe_base_us:26,	/* usecs: 0..2^26-1 (67 secs) */
+			full_ecn_cnt:2;		/* max allowed value: 3 */
+		u32	bw_probe_rand_us:26,	/* usecs: 0..2^26-1 (67 secs) */
+			undo:1,			/* boolean */
+			tso_rtt_shift:4,	/* max allowed value: 15 */
+			unused5:1;
+		u32	ecn_reprobe_gain:9,	/* max allowed value: 511 */
+			unused1:14,
+			ecn_alpha_init:9;	/* max allowed value: 256 */
+	} params;
+
+	struct {
+		u32	snd_isn; /* Initial sequence number */
+		u32	rs_bw; 	 /* last valid rate sample bw */
+		u32	target_cwnd; /* target cwnd, based on BDP */
+		u8	undo:1,  /* Undo even happened but not yet logged */
+			unused:7;
+		char	event;	 /* single-letter event debug codes */
+		u16	unused2;
+	} debug;
+};
+
+struct bbr_context {
+	u32 sample_bw;
+	u32 target_cwnd;
+	u32 log:1;
+};
+
+/* Window length of min_rtt filter (in sec). Max allowed value is 31 (0x1F) */
+static u32 bbr_min_rtt_win_sec = 10;
+/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode.
+ * Max allowed value is 511 (0x1FF).
+ */
+static u32 bbr_probe_rtt_mode_ms = 200;
+/* Window length of probe_rtt_min_us filter (in ms), and consequently the
+ * typical interval between PROBE_RTT mode entries.
+ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC
+ */
+static u32 bbr_probe_rtt_win_ms = 5000;
+/* Skip TSO below the following bandwidth (bits/sec): */
+static int bbr_min_tso_rate = 1200000;
+
+/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting
+ * in bigger TSO bursts. By default we cut the RTT-based allowance in half
+ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance
+ * is below 1500 bytes after 6 * ~500 usec = 3ms.
+ */
+static u32 bbr_tso_rtt_shift = 9;  /* halve allowance per 2^9 usecs, 512us */
+
+/* Select cwnd TSO budget approach:
+ *  0: padding
+ *  1: flooring
+ */
+static uint bbr_cwnd_tso_budget = 1;
+
+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck.
+ * In order to help drive the network toward lower queues and low latency while
+ * maintaining high utilization, the average pacing rate aims to be slightly
+ * lower than the estimated bandwidth. This is an important aspect of the
+ * design.
+ */
+static const int bbr_pacing_margin_percent = 1;
+
+/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
+ * that will allow a smoothly increasing pacing rate that will double each RTT
+ * and send the same number of packets per RTT that an un-paced, slow-starting
+ * Reno or CUBIC flow would. Max allowed value is 2047 (0x7FF).
+ */
+static int bbr_high_gain  = BBR_UNIT * 2885 / 1000 + 1;
+/* The gain for deriving startup cwnd. Max allowed value is 2047 (0x7FF). */
+static int bbr_startup_cwnd_gain  = BBR_UNIT * 2885 / 1000 + 1;
+/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain
+ * the queue created in BBR_STARTUP in a single round. Max allowed value
+ * is 1023 (0x3FF).
+ */
+static int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
+/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs.
+ * Max allowed value is 2047 (0x7FF).
+ */
+static int bbr_cwnd_gain  = BBR_UNIT * 2;
+/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw.
+ * Max allowed value for each element is 1023 (0x3FF).
+ */
+enum bbr_pacing_gain_phase {
+	BBR_BW_PROBE_UP		= 0,  /* push up inflight to probe for bw/vol */
+	BBR_BW_PROBE_DOWN	= 1,  /* drain excess inflight from the queue */
+	BBR_BW_PROBE_CRUISE	= 2,  /* use pipe, w/ headroom in queue/pipe */
+	BBR_BW_PROBE_REFILL	= 3,  /* v2: refill the pipe again to 100% */
+};
+static int bbr_pacing_gain[] = {
+	BBR_UNIT * 5 / 4,	/* probe for more available bw */
+	BBR_UNIT * 3 / 4,	/* drain queue and/or yield bw to other flows */
+	BBR_UNIT, BBR_UNIT, BBR_UNIT,	/* cruise at 1.0*bw to utilize pipe, */
+	BBR_UNIT, BBR_UNIT, BBR_UNIT	/* without creating excess queue... */
+};
+
+/* Try to keep at least this many packets in flight, if things go smoothly. For
+ * smooth functioning, a sliding window protocol ACKing every other packet
+ * needs at least 4 packets in flight. Max allowed value is 15 (0xF).
+ */
+static u32 bbr_cwnd_min_target = 4;
+
+/* Cwnd to BDP proportion in PROBE_RTT mode scaled by BBR_UNIT. Default: 50%.
+ * Use 0 to disable. Max allowed value is 255.
+ */
+static u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2;
+
+/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */
+/* If bw has increased significantly (1.25x), there may be more bw available.
+ * Max allowed value is 1023 (0x3FF).
+ */
+static u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4;
+/* But after 3 rounds w/o significant bw growth, estimate pipe is full.
+ * Max allowed value is 7 (0x7).
+ */
+static u32 bbr_full_bw_cnt = 3;
+
+static u32 bbr_flags;		/* Debugging related stuff */
+
+/* Whether to debug using printk.
+ */
+static bool bbr_debug_with_printk;
+
+/* Whether to debug using ftrace event tcp:tcp_bbr_event.
+ * Ignored when bbr_debug_with_printk is set.
+ */
+static bool bbr_debug_ftrace;
+
+/* Experiment: each cycle, try to hold sub-unity gain until inflight <= BDP. */
+static bool bbr_drain_to_target = true;		/* default: enabled */
+
+/* Experiment: Flags to control BBR with ECN behavior.
+ */
+static bool bbr_precise_ece_ack = true;		/* default: enabled */
+
+/* The max rwin scaling shift factor is 14 (RFC 1323), so the max sane rwin is
+ * (2^(16+14) B)/(1024 B/packet) = 1M packets.
+ */
+static u32 bbr_cwnd_warn_val	= 1U << 20;
+
+static u16 bbr_debug_port_mask;
+
+/* BBR module parameters. These are module parameters only in Google prod.
+ * Upstream these are intentionally not module parameters.
+ */
+static int bbr_pacing_gain_size = CYCLE_LEN;
+
+/* Gain factor for adding extra_acked to target cwnd: */
+static int bbr_extra_acked_gain = 256;
+
+/* Window length of extra_acked window. Max allowed val is 31. */
+static u32 bbr_extra_acked_win_rtts = 5;
+
+/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */
+static u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20;
+
+/* Time period for clamping cwnd increment due to ack aggregation */
+static u32 bbr_extra_acked_max_us = 100 * 1000;
+
+/* Use extra acked in startup ?
+ * 0: disabled
+ * 1: use latest extra_acked value from 1-2 rtt in startup
+ */
+static int bbr_extra_acked_in_startup = 1;		/* default: enabled */
+
+/* Experiment: don't grow cwnd beyond twice of what we just probed. */
+static bool bbr_usage_based_cwnd;		/* default: disabled */
+
+/* For lab testing, researchers can enable BBRv2 ECN support with this flag,
+ * when they know that any ECN marks that the connections experience will be
+ * DCTCP/L4S-style ECN marks, rather than RFC3168 ECN marks.
+ * TODO(ncardwell): Production use of the BBRv2 ECN functionality depends on
+ * negotiation or configuration that is outside the scope of the BBRv2
+ * alpha release.
+ */
+static bool bbr_ecn_enable = false;
+
+module_param_named(min_tso_rate,      bbr_min_tso_rate,      int,    0644);
+module_param_named(tso_rtt_shift,     bbr_tso_rtt_shift,     int,    0644);
+module_param_named(high_gain,         bbr_high_gain,         int,    0644);
+module_param_named(drain_gain,        bbr_drain_gain,        int,    0644);
+module_param_named(startup_cwnd_gain, bbr_startup_cwnd_gain, int,    0644);
+module_param_named(cwnd_gain,         bbr_cwnd_gain,         int,    0644);
+module_param_array_named(pacing_gain, bbr_pacing_gain,       int,
+			 &bbr_pacing_gain_size, 0644);
+module_param_named(cwnd_min_target,   bbr_cwnd_min_target,   uint,   0644);
+module_param_named(probe_rtt_cwnd_gain,
+		   bbr_probe_rtt_cwnd_gain,		     uint,   0664);
+module_param_named(cwnd_warn_val,     bbr_cwnd_warn_val,     uint,   0664);
+module_param_named(debug_port_mask,   bbr_debug_port_mask,   ushort, 0644);
+module_param_named(flags,             bbr_flags,             uint,   0644);
+module_param_named(debug_ftrace,      bbr_debug_ftrace, bool,   0644);
+module_param_named(debug_with_printk, bbr_debug_with_printk, bool,   0644);
+module_param_named(min_rtt_win_sec,   bbr_min_rtt_win_sec,   uint,   0644);
+module_param_named(probe_rtt_mode_ms, bbr_probe_rtt_mode_ms, uint,   0644);
+module_param_named(probe_rtt_win_ms,  bbr_probe_rtt_win_ms,  uint,   0644);
+module_param_named(full_bw_thresh,    bbr_full_bw_thresh,    uint,   0644);
+module_param_named(full_bw_cnt,       bbr_full_bw_cnt,       uint,   0644);
+module_param_named(cwnd_tso_bduget,   bbr_cwnd_tso_budget,   uint,   0664);
+module_param_named(extra_acked_gain,  bbr_extra_acked_gain,  int,    0664);
+module_param_named(extra_acked_win_rtts,
+		   bbr_extra_acked_win_rtts, uint,   0664);
+module_param_named(extra_acked_max_us,
+		   bbr_extra_acked_max_us, uint,   0664);
+module_param_named(ack_epoch_acked_reset_thresh,
+		   bbr_ack_epoch_acked_reset_thresh, uint,   0664);
+module_param_named(drain_to_target,   bbr_drain_to_target,   bool,   0664);
+module_param_named(precise_ece_ack,   bbr_precise_ece_ack,   bool,   0664);
+module_param_named(extra_acked_in_startup,
+		   bbr_extra_acked_in_startup, int, 0664);
+module_param_named(usage_based_cwnd, bbr_usage_based_cwnd, bool,   0664);
+module_param_named(ecn_enable,       bbr_ecn_enable,         bool,   0664);
+
+static void bbr2_exit_probe_rtt(struct sock *sk);
+static void bbr2_reset_congestion_signals(struct sock *sk);
+
+static void bbr_check_probe_rtt_done(struct sock *sk);
+
+/* Do we estimate that STARTUP filled the pipe? */
+static bool bbr_full_bw_reached(const struct sock *sk)
+{
+	const struct bbr *bbr = inet_csk_ca(sk);
+
+	return bbr->full_bw_reached;
+}
+
+/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
+static u32 bbr_max_bw(const struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	return max(bbr->bw_hi[0], bbr->bw_hi[1]);
+}
+
+/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */
+static u32 bbr_bw(const struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	return min(bbr_max_bw(sk), bbr->bw_lo);
+}
+
+/* Return maximum extra acked in past k-2k round trips,
+ * where k = bbr_extra_acked_win_rtts.
+ */
+static u16 bbr_extra_acked(const struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	return max(bbr->extra_acked[0], bbr->extra_acked[1]);
+}
+
+/* Return rate in bytes per second, optionally with a gain.
+ * The order here is chosen carefully to avoid overflow of u64. This should
+ * work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
+ */
+static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain,
+				  int margin)
+{
+	unsigned int mss = tcp_sk(sk)->mss_cache;
+
+	rate *= mss;
+	rate *= gain;
+	rate >>= BBR_SCALE;
+	rate *= USEC_PER_SEC / 100 * (100 - margin);
+	rate >>= BW_SCALE;
+	rate = max(rate, 1ULL);
+	return rate;
+}
+
+static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate)
+{
+	return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0);
+}
+
+static u64 bbr_rate_kbps(struct sock *sk, u64 rate)
+{
+	rate = bbr_bw_bytes_per_sec(sk, rate);
+	rate *= 8;
+	do_div(rate, 1000);
+	return rate;
+}
+
+static u32 bbr_tso_segs_goal(struct sock *sk);
+static void bbr_debug(struct sock *sk, u32 acked,
+		      const struct rate_sample *rs, struct bbr_context *ctx)
+{
+	static const char ca_states[] = {
+		[TCP_CA_Open]		= 'O',
+		[TCP_CA_Disorder]	= 'D',
+		[TCP_CA_CWR]		= 'C',
+		[TCP_CA_Recovery]	= 'R',
+		[TCP_CA_Loss]		= 'L',
+	};
+	static const char mode[] = {
+		'G',  /* Growing   - BBR_STARTUP */
+		'D',  /* Drain     - BBR_DRAIN */
+		'W',  /* Window    - BBR_PROBE_BW */
+		'M',  /* Min RTT   - BBR_PROBE_RTT */
+	};
+	static const char ack_phase[] = { /* bbr_ack_phase strings */
+		'I',	/* BBR_ACKS_INIT	   - 'Init' */
+		'R',	/* BBR_ACKS_REFILLING	   - 'Refilling' */
+		'B',	/* BBR_ACKS_PROBE_STARTING - 'Before' */
+		'F',	/* BBR_ACKS_PROBE_FEEDBACK - 'Feedback' */
+		'A',	/* BBR_ACKS_PROBE_STOPPING - 'After' */
+	};
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	const u32 una = tp->snd_una - bbr->debug.snd_isn;
+	const u32 fack = tcp_highest_sack_seq(tp);
+	const u16 dport = ntohs(inet_sk(sk)->inet_dport);
+	bool is_port_match = (bbr_debug_port_mask &&
+			      ((dport & bbr_debug_port_mask) == 0));
+	char debugmsg[320];
+
+	if (sk->sk_state == TCP_SYN_SENT)
+		return;  /* no bbr_init() yet if SYN retransmit -> CA_Loss */
+
+	if (!tp->snd_cwnd || tp->snd_cwnd > bbr_cwnd_warn_val) {
+		char addr[INET6_ADDRSTRLEN + 10] = { 0 };
+
+		if (sk->sk_family == AF_INET)
+			snprintf(addr, sizeof(addr), "%pI4:%u",
+				 &inet_sk(sk)->inet_daddr, dport);
+		else if (sk->sk_family == AF_INET6)
+			snprintf(addr, sizeof(addr), "%pI6:%u",
+				 &sk->sk_v6_daddr, dport);
+
+		WARN_ONCE(1,
+			"BBR %s cwnd alert: %u "
+			"snd_una: %u ca: %d pacing_gain: %u cwnd_gain: %u "
+			"bw: %u rtt: %u min_rtt: %u "
+			"acked: %u tso_segs: %u "
+			"bw: %d %ld %d pif: %u\n",
+			addr, tp->snd_cwnd,
+			una, inet_csk(sk)->icsk_ca_state,
+			bbr->pacing_gain, bbr->cwnd_gain,
+			bbr_max_bw(sk), (tp->srtt_us >> 3), bbr->min_rtt_us,
+			acked, bbr_tso_segs_goal(sk),
+			rs->delivered, rs->interval_us, rs->is_retrans,
+			tcp_packets_in_flight(tp));
+	}
+
+	if (likely(!bbr_debug_with_printk && !bbr_debug_ftrace))
+		return;
+
+	if (!sock_flag(sk, SOCK_DBG) && !is_port_match)
+		return;
+
+	if (!ctx->log && !tp->app_limited && !(bbr_flags & FLAG_DEBUG_VERBOSE))
+		return;
+
+	if (ipv4_is_loopback(inet_sk(sk)->inet_daddr) &&
+	    !(bbr_flags & FLAG_DEBUG_LOOPBACK))
+		return;
+
+	snprintf(debugmsg, sizeof(debugmsg) - 1,
+		 "BBR %pI4:%-5u %5u,%03u:%-7u %c "
+		 "%c %2u br %2u cr %2d rtt %5ld d %2d i %5ld mrtt %d %cbw %llu "
+		 "bw %llu lb %llu ib %llu qb %llu "
+		 "a %u if %2u %c %c dl %u l %u al %u # %u t %u %c %c "
+		 "lr %d er %d ea %d bwl %lld il %d ih %d c %d "
+		 "v %d %c %u %c %s\n",
+		 &inet_sk(sk)->inet_daddr, dport,
+		 una / 1000, una % 1000, fack - tp->snd_una,
+		 ca_states[inet_csk(sk)->icsk_ca_state],
+		 bbr->debug.undo ? '@' : mode[bbr->mode],
+		 tp->snd_cwnd,
+		 bbr_extra_acked(sk),	/* br (legacy): extra_acked */
+		 rs->tx_in_flight,	/* cr (legacy): tx_inflight */
+		 rs->rtt_us,
+		 rs->delivered,
+		 rs->interval_us,
+		 bbr->min_rtt_us,
+		 rs->is_app_limited ? '_' : 'l',
+		 bbr_rate_kbps(sk, ctx->sample_bw), /* lbw: latest sample bw */
+		 bbr_rate_kbps(sk, bbr_max_bw(sk)), /* bw: max bw */
+		 0ULL,				    /* lb: [obsolete] */
+		 0ULL,				    /* ib: [obsolete] */
+		 div_u64((u64)sk->sk_pacing_rate * 8, 1000),
+		 acked,
+		 tcp_packets_in_flight(tp),
+		 rs->is_ack_delayed ? 'd' : '.',
+		 bbr->round_start ? '*' : '.',
+		 tp->delivered, tp->lost,
+		 tp->app_limited,
+		 0,			    	    /* #: [obsolete] */
+		 ctx->target_cwnd,
+		 tp->reord_seen ? 'r' : '.',  /* r: reordering seen? */
+		 ca_states[bbr->prev_ca_state],
+		 (rs->lost + rs->delivered) > 0 ?
+		 (1000 * rs->lost /
+		  (rs->lost + rs->delivered)) : 0,    /* lr: loss rate x1000 */
+		 (rs->delivered) > 0 ?
+		 (1000 * rs->delivered_ce /
+		  (rs->delivered)) : 0,		      /* er: ECN rate x1000 */
+		 1000 * bbr->ecn_alpha >> BBR_SCALE,  /* ea: ECN alpha x1000 */
+		 bbr->bw_lo == ~0U ?
+		   -1 : (s64)bbr_rate_kbps(sk, bbr->bw_lo), /* bwl */
+		 bbr->inflight_lo,	/* il */
+		 bbr->inflight_hi,	/* ih */
+		 bbr->bw_probe_up_cnt,	/* c */
+		 2,			/* v: version */
+		 bbr->debug.event,
+		 bbr->cycle_idx,
+		 ack_phase[bbr->ack_phase],
+		 bbr->bw_probe_samples ? "Y" : "N");
+	debugmsg[sizeof(debugmsg) - 1] = 0;
+
+	/* printk takes a higher precedence. */
+	if (bbr_debug_with_printk)
+		printk(KERN_DEBUG "%s", debugmsg);
+
+	if (unlikely(bbr->debug.undo))
+		bbr->debug.undo = 0;
+}
+
+/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
+static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
+{
+	u64 rate = bw;
+
+	rate = bbr_rate_bytes_per_sec(sk, rate, gain,
+				      bbr_pacing_margin_percent);
+	rate = min_t(u64, rate, sk->sk_max_pacing_rate);
+	return rate;
+}
+
+/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
+static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u64 bw;
+	u32 rtt_us;
+
+	if (tp->srtt_us) {		/* any RTT sample yet? */
+		rtt_us = max(tp->srtt_us >> 3, 1U);
+		bbr->has_seen_rtt = 1;
+	} else {			 /* no RTT sample yet */
+		rtt_us = USEC_PER_MSEC;	 /* use nominal default RTT */
+	}
+	bw = (u64)tp->snd_cwnd * BW_UNIT;
+	do_div(bw, rtt_us);
+	sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr->params.high_gain);
+}
+
+/* Pace using current bw estimate and a gain factor. */
+static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain);
+
+	if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
+		bbr_init_pacing_rate_from_rtt(sk);
+	if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate)
+		sk->sk_pacing_rate = rate;
+}
+
+static u32 bbr_min_tso_segs(struct sock *sk)
+{
+	return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
+}
+
+/* Return the number of segments BBR would like in a TSO/GSO skb, given
+ * a particular max gso size as a constraint.
+ */
+static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now,
+				u32 gso_max_size)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 segs, r;
+	u64 bytes;
+
+	/* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */
+	bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift;
+
+	/* Budget a TSO/GSO burst size allowance based on min_rtt. For every
+	 * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst.
+	 * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K)
+	 */
+	if (bbr->params.tso_rtt_shift) {
+		r = bbr->min_rtt_us >> bbr->params.tso_rtt_shift;
+		if (r < BITS_PER_TYPE(u32))   /* prevent undefined behavior */
+			bytes += GSO_MAX_SIZE >> r;
+	}
+
+	bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER);
+	segs = max_t(u32, div_u64(bytes, mss_now), bbr_min_tso_segs(sk));
+	return segs;
+}
+
+/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */
+static u32  bbr_tso_segs(struct sock *sk, unsigned int mss_now)
+{
+	return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size);
+}
+
+/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */
+static u32 bbr_tso_segs_goal(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	return  bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE);
+}
+
+/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
+static void bbr_save_cwnd(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT)
+		bbr->prior_cwnd = tp->snd_cwnd;  /* this cwnd is good enough */
+	else  /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */
+		bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd);
+}
+
+static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (event == CA_EVENT_TX_START && tp->app_limited) {
+		bbr->idle_restart = 1;
+		bbr->ack_epoch_mstamp = tp->tcp_mstamp;
+		bbr->ack_epoch_acked = 0;
+		/* Avoid pointless buffer overflows: pace at est. bw if we don't
+		 * need more speed (we're restarting from idle and app-limited).
+		 */
+		if (bbr->mode == BBR_PROBE_BW)
+			bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
+		else if (bbr->mode == BBR_PROBE_RTT)
+			bbr_check_probe_rtt_done(sk);
+	} else if ((event == CA_EVENT_ECN_IS_CE ||
+		    event == CA_EVENT_ECN_NO_CE) &&
+		    bbr_ecn_enable &&
+		    bbr->params.precise_ece_ack) {
+		u32 state = bbr->ce_state;
+		dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state);
+		bbr->ce_state = state;
+		if (tp->fast_ack_mode == 2 && event == CA_EVENT_ECN_IS_CE)
+			tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
+	}
+}
+
+/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth:
+ *
+ * bdp = ceil(bw * min_rtt * gain)
+ *
+ * The key factor, gain, controls the amount of queue. While a small gain
+ * builds a smaller queue, it becomes more vulnerable to noise in RTT
+ * measurements (e.g., delayed ACKs or other ACK compression effects). This
+ * noise may cause BBR to under-estimate the rate.
+ */
+static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 bdp;
+	u64 w;
+
+	/* If we've never had a valid RTT sample, cap cwnd at the initial
+	 * default. This should only happen when the connection is not using TCP
+	 * timestamps and has retransmitted all of the SYN/SYNACK/data packets
+	 * ACKed so far. In this case, an RTO can cut cwnd to 1, in which
+	 * case we need to slow-start up toward something safe: initial cwnd.
+	 */
+	if (unlikely(bbr->min_rtt_us == ~0U))	 /* no valid RTT samples yet? */
+		return bbr->init_cwnd;  /* be safe: cap at initial cwnd */
+
+	w = (u64)bw * bbr->min_rtt_us;
+
+	/* Apply a gain to the given value, remove the BW_SCALE shift, and
+	 * round the value up to avoid a negative feedback loop.
+	 */
+	bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
+
+	return bdp;
+}
+
+/* To achieve full performance in high-speed paths, we budget enough cwnd to
+ * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
+ *   - one skb in sending host Qdisc,
+ *   - one skb in sending host TSO/GSO engine
+ *   - one skb being received by receiver host LRO/GRO/delayed-ACK engine
+ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
+ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
+ * which allows 2 outstanding 2-packet sequences, to try to keep pipe
+ * full even with ACK-every-other-packet delayed ACKs.
+ */
+static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 tso_segs_goal;
+
+	tso_segs_goal = 3 * bbr_tso_segs_goal(sk);
+
+	/* Allow enough full-sized skbs in flight to utilize end systems. */
+	if (bbr->params.cwnd_tso_budget == 1) {
+		cwnd = max_t(u32, cwnd, tso_segs_goal);
+		cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target);
+	} else {
+		cwnd += tso_segs_goal;
+		cwnd = (cwnd + 1) & ~1U;
+	}
+	/* Ensure gain cycling gets inflight above BDP even for small BDPs. */
+	if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP)
+		cwnd += 2;
+
+	return cwnd;
+}
+
+/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */
+static u32 bbr_inflight(struct sock *sk, u32 bw, int gain)
+{
+	u32 inflight;
+
+	inflight = bbr_bdp(sk, bw, gain);
+	inflight = bbr_quantization_budget(sk, inflight);
+
+	return inflight;
+}
+
+/* With pacing at lower layers, there's often less data "in the network" than
+ * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq),
+ * we often have several skbs queued in the pacing layer with a pre-scheduled
+ * earliest departure time (EDT). BBR adapts its pacing rate based on the
+ * inflight level that it estimates has already been "baked in" by previous
+ * departure time decisions. We calculate a rough estimate of the number of our
+ * packets that might be in the network at the earliest departure time for the
+ * next skb scheduled:
+ *   in_network_at_edt = inflight_at_edt - (EDT - now) * bw
+ * If we're increasing inflight, then we want to know if the transmit of the
+ * EDT skb will push inflight above the target, so inflight_at_edt includes
+ * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight,
+ * then estimate if inflight will sink too low just before the EDT transmit.
+ */
+static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u64 now_ns, edt_ns, interval_us;
+	u32 interval_delivered, inflight_at_edt;
+
+	now_ns = tp->tcp_clock_cache;
+	edt_ns = max(tp->tcp_wstamp_ns, now_ns);
+	interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC);
+	interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE;
+	inflight_at_edt = inflight_now;
+	if (bbr->pacing_gain > BBR_UNIT)              /* increasing inflight */
+		inflight_at_edt += bbr_tso_segs_goal(sk);  /* include EDT skb */
+	if (interval_delivered >= inflight_at_edt)
+		return 0;
+	return inflight_at_edt - interval_delivered;
+}
+
+/* Find the cwnd increment based on estimate of ack aggregation */
+static u32 bbr_ack_aggregation_cwnd(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 max_aggr_cwnd, aggr_cwnd = 0;
+
+	if (bbr->params.extra_acked_gain &&
+	    (bbr_full_bw_reached(sk) || bbr->params.extra_acked_in_startup)) {
+		max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us)
+				/ BW_UNIT;
+		aggr_cwnd = (bbr->params.extra_acked_gain * bbr_extra_acked(sk))
+			     >> BBR_SCALE;
+		aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd);
+	}
+
+	return aggr_cwnd;
+}
+
+/* Returns the cwnd for PROBE_RTT mode. */
+static u32 bbr_probe_rtt_cwnd(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr->params.probe_rtt_cwnd_gain == 0)
+		return bbr->params.cwnd_min_target;
+	return max_t(u32, bbr->params.cwnd_min_target,
+		     bbr_bdp(sk, bbr_bw(sk), bbr->params.probe_rtt_cwnd_gain));
+}
+
+/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss
+ * has drawn us down below target), or snap down to target if we're above it.
+ */
+static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
+			 u32 acked, u32 bw, int gain, u32 cwnd,
+			 struct bbr_context *ctx)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 target_cwnd = 0, prev_cwnd = tp->snd_cwnd, max_probe;
+
+	if (!acked)
+		goto done;  /* no packet fully ACKed; just apply caps */
+
+	target_cwnd = bbr_bdp(sk, bw, gain);
+
+	/* Increment the cwnd to account for excess ACKed data that seems
+	 * due to aggregation (of data and/or ACKs) visible in the ACK stream.
+	 */
+	target_cwnd += bbr_ack_aggregation_cwnd(sk);
+	target_cwnd = bbr_quantization_budget(sk, target_cwnd);
+
+	/* If we're below target cwnd, slow start cwnd toward target cwnd. */
+	bbr->debug.target_cwnd = target_cwnd;
+
+	/* Update cwnd and enable fast path if cwnd reaches target_cwnd. */
+	bbr->try_fast_path = 0;
+	if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */
+		cwnd += acked;
+		if (cwnd >= target_cwnd) {
+			cwnd = target_cwnd;
+			bbr->try_fast_path = 1;
+		}
+	} else if (cwnd < target_cwnd || cwnd  < 2 * bbr->init_cwnd) {
+		cwnd += acked;
+	} else {
+		bbr->try_fast_path = 1;
+	}
+
+	/* When growing cwnd, don't grow beyond twice what we just probed. */
+	if (bbr->params.usage_based_cwnd) {
+		max_probe = max(2 * tp->max_packets_out, tp->snd_cwnd);
+		cwnd = min(cwnd, max_probe);
+	}
+
+	cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target);
+done:
+	tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);	/* apply global cap */
+	if (bbr->mode == BBR_PROBE_RTT)  /* drain queue, refresh min_rtt */
+		tp->snd_cwnd = min_t(u32, tp->snd_cwnd, bbr_probe_rtt_cwnd(sk));
+
+	ctx->target_cwnd = target_cwnd;
+	ctx->log = (tp->snd_cwnd != prev_cwnd);
+}
+
+/* See if we have reached next round trip */
+static void bbr_update_round_start(struct sock *sk,
+		const struct rate_sample *rs, struct bbr_context *ctx)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->round_start = 0;
+
+	/* See if we've reached the next RTT */
+	if (rs->interval_us > 0 &&
+	    !before(rs->prior_delivered, bbr->next_rtt_delivered)) {
+		bbr->next_rtt_delivered = tp->delivered;
+		bbr->round_start = 1;
+	}
+}
+
+/* Calculate the bandwidth based on how fast packets are delivered */
+static void bbr_calculate_bw_sample(struct sock *sk,
+			const struct rate_sample *rs, struct bbr_context *ctx)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u64 bw = 0;
+
+	/* Divide delivered by the interval to find a (lower bound) bottleneck
+	 * bandwidth sample. Delivered is in packets and interval_us in uS and
+	 * ratio will be <<1 for most connections. So delivered is first scaled.
+	 * Round up to allow growth at low rates, even with integer division.
+	 */
+	if (rs->interval_us > 0) {
+		if (WARN_ONCE(rs->delivered < 0,
+			      "negative delivered: %d interval_us: %ld\n",
+			      rs->delivered, rs->interval_us))
+			return;
+
+		bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us);
+	}
+
+	ctx->sample_bw = bw;
+	bbr->debug.rs_bw = bw;
+}
+
+/* Estimates the windowed max degree of ack aggregation.
+ * This is used to provision extra in-flight data to keep sending during
+ * inter-ACK silences.
+ *
+ * Degree of ack aggregation is estimated as extra data acked beyond expected.
+ *
+ * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval"
+ * cwnd += max_extra_acked
+ *
+ * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms).
+ * Max filter is an approximate sliding window of 5-10 (packet timed) round
+ * trips for non-startup phase, and 1-2 round trips for startup.
+ */
+static void bbr_update_ack_aggregation(struct sock *sk,
+				       const struct rate_sample *rs)
+{
+	u32 epoch_us, expected_acked, extra_acked;
+	struct bbr *bbr = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 extra_acked_win_rtts_thresh = bbr->params.extra_acked_win_rtts;
+
+	if (!bbr->params.extra_acked_gain || rs->acked_sacked <= 0 ||
+	    rs->delivered < 0 || rs->interval_us <= 0)
+		return;
+
+	if (bbr->round_start) {
+		bbr->extra_acked_win_rtts = min(0x1F,
+						bbr->extra_acked_win_rtts + 1);
+		if (bbr->params.extra_acked_in_startup &&
+		    !bbr_full_bw_reached(sk))
+			extra_acked_win_rtts_thresh = 1;
+		if (bbr->extra_acked_win_rtts >=
+		    extra_acked_win_rtts_thresh) {
+			bbr->extra_acked_win_rtts = 0;
+			bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ?
+						   0 : 1;
+			bbr->extra_acked[bbr->extra_acked_win_idx] = 0;
+		}
+	}
+
+	/* Compute how many packets we expected to be delivered over epoch. */
+	epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp,
+				      bbr->ack_epoch_mstamp);
+	expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT;
+
+	/* Reset the aggregation epoch if ACK rate is below expected rate or
+	 * significantly large no. of ack received since epoch (potentially
+	 * quite old epoch).
+	 */
+	if (bbr->ack_epoch_acked <= expected_acked ||
+	    (bbr->ack_epoch_acked + rs->acked_sacked >=
+	     bbr_ack_epoch_acked_reset_thresh)) {
+		bbr->ack_epoch_acked = 0;
+		bbr->ack_epoch_mstamp = tp->delivered_mstamp;
+		expected_acked = 0;
+	}
+
+	/* Compute excess data delivered, beyond what was expected. */
+	bbr->ack_epoch_acked = min_t(u32, 0xFFFFF,
+				   bbr->ack_epoch_acked + rs->acked_sacked);
+	extra_acked = bbr->ack_epoch_acked - expected_acked;
+	extra_acked = min(extra_acked, tp->snd_cwnd);
+	if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx])
+		bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked;
+}
+
+/* Estimate when the pipe is full, using the change in delivery rate: BBR
+ * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
+ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
+ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
+ * higher rwin, 3: we get higher delivery rate samples. Or transient
+ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
+ * design goal, but uses delay and inter-ACK spacing instead of bandwidth.
+ */
+static void bbr_check_full_bw_reached(struct sock *sk,
+				      const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 bw_thresh;
+
+	if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited)
+		return;
+
+	bw_thresh = (u64)bbr->full_bw * bbr->params.full_bw_thresh >> BBR_SCALE;
+	if (bbr_max_bw(sk) >= bw_thresh) {
+		bbr->full_bw = bbr_max_bw(sk);
+		bbr->full_bw_cnt = 0;
+		return;
+	}
+	++bbr->full_bw_cnt;
+	bbr->full_bw_reached = bbr->full_bw_cnt >= bbr->params.full_bw_cnt;
+}
+
+/* If pipe is probably full, drain the queue and then enter steady-state. */
+static bool bbr_check_drain(struct sock *sk, const struct rate_sample *rs,
+			    struct bbr_context *ctx)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
+		bbr->mode = BBR_DRAIN;	/* drain queue we created */
+		tcp_sk(sk)->snd_ssthresh =
+				bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
+		bbr2_reset_congestion_signals(sk);
+	}	/* fall through to check if in-flight is already small: */
+	if (bbr->mode == BBR_DRAIN &&
+	    bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
+	    bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))
+		return true;  /* exiting DRAIN now */
+	return false;
+}
+
+static void bbr_check_probe_rtt_done(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (!(bbr->probe_rtt_done_stamp &&
+	      after(tcp_jiffies32, bbr->probe_rtt_done_stamp)))
+		return;
+
+	bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */
+	tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd);
+	bbr2_exit_probe_rtt(sk);
+}
+
+/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
+ * periodically drain the bottleneck queue, to converge to measure the true
+ * min_rtt (unloaded propagation delay). This allows the flows to keep queues
+ * small (reducing queuing delay and packet loss) and achieve fairness among
+ * BBR flows.
+ *
+ * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires,
+ * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets.
+ * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed
+ * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and
+ * re-enter the previous mode. BBR uses 200ms to approximately bound the
+ * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s).
+ *
+ * Note that flows need only pay 2% if they are busy sending over the last 10
+ * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have
+ * natural silences or low-rate periods within 10 seconds where the rate is low
+ * enough for long enough to drain its queue in the bottleneck. We pick up
+ * these min RTT measurements opportunistically with our min_rtt filter. :-)
+ */
+static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	bool probe_rtt_expired, min_rtt_expired;
+	u32 expire;
+
+	/* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */
+	expire = bbr->probe_rtt_min_stamp +
+		 msecs_to_jiffies(bbr->params.probe_rtt_win_ms);
+	probe_rtt_expired = after(tcp_jiffies32, expire);
+	if (rs->rtt_us >= 0 &&
+	    (rs->rtt_us <= bbr->probe_rtt_min_us ||
+	     (probe_rtt_expired && !rs->is_ack_delayed))) {
+		bbr->probe_rtt_min_us = rs->rtt_us;
+		bbr->probe_rtt_min_stamp = tcp_jiffies32;
+	}
+	/* Track min RTT seen in the min_rtt_win_sec filter window: */
+	expire = bbr->min_rtt_stamp + bbr->params.min_rtt_win_sec * HZ;
+	min_rtt_expired = after(tcp_jiffies32, expire);
+	if (bbr->probe_rtt_min_us <= bbr->min_rtt_us ||
+	    min_rtt_expired) {
+		bbr->min_rtt_us = bbr->probe_rtt_min_us;
+		bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp;
+	}
+
+	if (bbr->params.probe_rtt_mode_ms > 0 && probe_rtt_expired &&
+	    !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
+		bbr->mode = BBR_PROBE_RTT;  /* dip, drain queue */
+		bbr_save_cwnd(sk);  /* note cwnd so we can restore it */
+		bbr->probe_rtt_done_stamp = 0;
+		bbr->ack_phase = BBR_ACKS_PROBE_STOPPING;
+		bbr->next_rtt_delivered = tp->delivered;
+	}
+
+	if (bbr->mode == BBR_PROBE_RTT) {
+		/* Ignore low rate samples during this mode. */
+		tp->app_limited =
+			(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
+		/* Maintain min packets in flight for max(200 ms, 1 round). */
+		if (!bbr->probe_rtt_done_stamp &&
+		    tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) {
+			bbr->probe_rtt_done_stamp = tcp_jiffies32 +
+				msecs_to_jiffies(bbr->params.probe_rtt_mode_ms);
+			bbr->probe_rtt_round_done = 0;
+			bbr->next_rtt_delivered = tp->delivered;
+		} else if (bbr->probe_rtt_done_stamp) {
+			if (bbr->round_start)
+				bbr->probe_rtt_round_done = 1;
+			if (bbr->probe_rtt_round_done)
+				bbr_check_probe_rtt_done(sk);
+		}
+	}
+	/* Restart after idle ends only once we process a new S/ACK for data */
+	if (rs->delivered > 0)
+		bbr->idle_restart = 0;
+}
+
+static void bbr_update_gains(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	switch (bbr->mode) {
+	case BBR_STARTUP:
+		bbr->pacing_gain = bbr->params.high_gain;
+		bbr->cwnd_gain	 = bbr->params.startup_cwnd_gain;
+		break;
+	case BBR_DRAIN:
+		bbr->pacing_gain = bbr->params.drain_gain;  /* slow, to drain */
+		bbr->cwnd_gain = bbr->params.startup_cwnd_gain;  /* keep cwnd */
+		break;
+	case BBR_PROBE_BW:
+		bbr->pacing_gain = bbr->params.pacing_gain[bbr->cycle_idx];
+		bbr->cwnd_gain = bbr->params.cwnd_gain;
+		break;
+	case BBR_PROBE_RTT:
+		bbr->pacing_gain = BBR_UNIT;
+		bbr->cwnd_gain = BBR_UNIT;
+		break;
+	default:
+		WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode);
+		break;
+	}
+}
+
+static void bbr_init(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	int i;
+
+	WARN_ON_ONCE(tp->snd_cwnd >= bbr_cwnd_warn_val);
+
+	bbr->initialized = 1;
+	bbr->params.high_gain = min(0x7FF, bbr_high_gain);
+	bbr->params.drain_gain = min(0x3FF, bbr_drain_gain);
+	bbr->params.startup_cwnd_gain = min(0x7FF, bbr_startup_cwnd_gain);
+	bbr->params.cwnd_gain = min(0x7FF, bbr_cwnd_gain);
+	bbr->params.cwnd_tso_budget = min(0x1U, bbr_cwnd_tso_budget);
+	bbr->params.cwnd_min_target = min(0xFU, bbr_cwnd_min_target);
+	bbr->params.min_rtt_win_sec = min(0x1FU, bbr_min_rtt_win_sec);
+	bbr->params.probe_rtt_mode_ms = min(0x1FFU, bbr_probe_rtt_mode_ms);
+	bbr->params.full_bw_cnt = min(0x7U, bbr_full_bw_cnt);
+	bbr->params.full_bw_thresh = min(0x3FFU, bbr_full_bw_thresh);
+	bbr->params.extra_acked_gain = min(0x7FF, bbr_extra_acked_gain);
+	bbr->params.extra_acked_win_rtts = min(0x1FU, bbr_extra_acked_win_rtts);
+	bbr->params.drain_to_target = bbr_drain_to_target ? 1 : 0;
+	bbr->params.precise_ece_ack = bbr_precise_ece_ack ? 1 : 0;
+	bbr->params.extra_acked_in_startup = bbr_extra_acked_in_startup ? 1 : 0;
+	bbr->params.probe_rtt_cwnd_gain = min(0xFFU, bbr_probe_rtt_cwnd_gain);
+	bbr->params.probe_rtt_win_ms =
+		min(0x3FFFU,
+		    min_t(u32, bbr_probe_rtt_win_ms,
+			  bbr->params.min_rtt_win_sec * MSEC_PER_SEC));
+	for (i = 0; i < CYCLE_LEN; i++)
+		bbr->params.pacing_gain[i] = min(0x3FF, bbr_pacing_gain[i]);
+	bbr->params.usage_based_cwnd = bbr_usage_based_cwnd ? 1 : 0;
+	bbr->params.tso_rtt_shift =  min(0xFU, bbr_tso_rtt_shift);
+
+	bbr->debug.snd_isn = tp->snd_una;
+	bbr->debug.target_cwnd = 0;
+	bbr->debug.undo = 0;
+
+	bbr->init_cwnd = min(0x7FU, tp->snd_cwnd);
+	bbr->prior_cwnd = tp->prior_cwnd;
+	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+	bbr->next_rtt_delivered = 0;
+	bbr->prev_ca_state = TCP_CA_Open;
+	bbr->packet_conservation = 0;
+
+	bbr->probe_rtt_done_stamp = 0;
+	bbr->probe_rtt_round_done = 0;
+	bbr->probe_rtt_min_us = tcp_min_rtt(tp);
+	bbr->probe_rtt_min_stamp = tcp_jiffies32;
+	bbr->min_rtt_us = tcp_min_rtt(tp);
+	bbr->min_rtt_stamp = tcp_jiffies32;
+
+	bbr->has_seen_rtt = 0;
+	bbr_init_pacing_rate_from_rtt(sk);
+
+	bbr->round_start = 0;
+	bbr->idle_restart = 0;
+	bbr->full_bw_reached = 0;
+	bbr->full_bw = 0;
+	bbr->full_bw_cnt = 0;
+	bbr->cycle_mstamp = 0;
+	bbr->cycle_idx = 0;
+	bbr->mode = BBR_STARTUP;
+	bbr->debug.rs_bw = 0;
+
+	bbr->ack_epoch_mstamp = tp->tcp_mstamp;
+	bbr->ack_epoch_acked = 0;
+	bbr->extra_acked_win_rtts = 0;
+	bbr->extra_acked_win_idx = 0;
+	bbr->extra_acked[0] = 0;
+	bbr->extra_acked[1] = 0;
+
+	bbr->ce_state = 0;
+	bbr->prior_rcv_nxt = tp->rcv_nxt;
+	bbr->try_fast_path = 0;
+
+	cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
+}
+
+static u32 bbr_sndbuf_expand(struct sock *sk)
+{
+	/* Provision 3 * cwnd since BBR may slow-start even during recovery. */
+	return 3;
+}
+
+/* __________________________________________________________________________
+ *
+ * Functions new to BBR v2 ("bbr") congestion control are below here.
+ * __________________________________________________________________________
+ */
+
+/* Incorporate a new bw sample into the current window of our max filter. */
+static void bbr2_take_bw_hi_sample(struct sock *sk, u32 bw)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]);
+}
+
+/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */
+static void bbr2_advance_bw_hi_filter(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (!bbr->bw_hi[1])
+		return;  /* no samples in this window; remember old window */
+	bbr->bw_hi[0] = bbr->bw_hi[1];
+	bbr->bw_hi[1] = 0;
+}
+
+/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */
+static u32 bbr2_target_inflight(struct sock *sk)
+{
+	u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT);
+
+	return min(bdp, tcp_sk(sk)->snd_cwnd);
+}
+
+static bool bbr2_is_probing_bandwidth(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	return (bbr->mode == BBR_STARTUP) ||
+		(bbr->mode == BBR_PROBE_BW &&
+		 (bbr->cycle_idx == BBR_BW_PROBE_REFILL ||
+		  bbr->cycle_idx == BBR_BW_PROBE_UP));
+}
+
+/* Has the given amount of time elapsed since we marked the phase start? */
+static bool bbr2_has_elapsed_in_phase(const struct sock *sk, u32 interval_us)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct bbr *bbr = inet_csk_ca(sk);
+
+	return tcp_stamp_us_delta(tp->tcp_mstamp,
+				  bbr->cycle_mstamp + interval_us) > 0;
+}
+
+static void bbr2_handle_queue_too_high_in_startup(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->full_bw_reached = 1;
+	bbr->inflight_hi = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
+}
+
+/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */
+static void bbr2_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible ||
+	    !bbr->params.full_ecn_cnt || !bbr->params.ecn_thresh)
+		return;
+
+	if (ce_ratio >= bbr->params.ecn_thresh)
+		bbr->startup_ecn_rounds++;
+	else
+		bbr->startup_ecn_rounds = 0;
+
+	if (bbr->startup_ecn_rounds >= bbr->params.full_ecn_cnt) {
+		bbr->debug.event = 'E';  /* ECN caused STARTUP exit */
+		bbr2_handle_queue_too_high_in_startup(sk);
+		return;
+	}
+}
+
+static void bbr2_update_ecn_alpha(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	s32 delivered, delivered_ce;
+	u64 alpha, ce_ratio;
+	u32 gain;
+
+	if (bbr->params.ecn_factor == 0)
+		return;
+
+	delivered = tp->delivered - bbr->alpha_last_delivered;
+	delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce;
+
+	if (delivered == 0 ||		/* avoid divide by zero */
+	    WARN_ON_ONCE(delivered < 0 || delivered_ce < 0))  /* backwards? */
+		return;
+
+	/* See if we should use ECN sender logic for this connection. */
+	if (!bbr->ecn_eligible && bbr_ecn_enable &&
+	    (bbr->min_rtt_us <= bbr->params.ecn_max_rtt_us ||
+	     !bbr->params.ecn_max_rtt_us))
+		bbr->ecn_eligible = 1;
+
+	ce_ratio = (u64)delivered_ce << BBR_SCALE;
+	do_div(ce_ratio, delivered);
+	gain = bbr->params.ecn_alpha_gain;
+	alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE;
+	alpha += (gain * ce_ratio) >> BBR_SCALE;
+	bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT);
+
+	bbr->alpha_last_delivered = tp->delivered;
+	bbr->alpha_last_delivered_ce = tp->delivered_ce;
+
+	bbr2_check_ecn_too_high_in_startup(sk, ce_ratio);
+}
+
+/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */
+static void bbr2_raise_inflight_hi_slope(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 growth_this_round, cnt;
+
+	/* Calculate "slope": packets S/Acked per inflight_hi increment. */
+	growth_this_round = 1 << bbr->bw_probe_up_rounds;
+	bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30);
+	cnt = tp->snd_cwnd / growth_this_round;
+	cnt = max(cnt, 1U);
+	bbr->bw_probe_up_cnt = cnt;
+	bbr->debug.event = 'G';  /* Grow inflight_hi slope */
+}
+
+/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */
+static void bbr2_probe_inflight_hi_upward(struct sock *sk,
+					  const struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 delta;
+
+	if (!tp->is_cwnd_limited || tp->snd_cwnd < bbr->inflight_hi) {
+		bbr->bw_probe_up_acks = 0;  /* don't accmulate unused credits */
+		return;  /* not fully using inflight_hi, so don't grow it */
+	}
+
+	/* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */
+	bbr->bw_probe_up_acks += rs->acked_sacked;
+	if (bbr->bw_probe_up_acks >=  bbr->bw_probe_up_cnt) {
+		delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt;
+		bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt;
+		bbr->inflight_hi += delta;
+		bbr->debug.event = 'I';  /* Increment inflight_hi */
+	}
+
+	if (bbr->round_start)
+		bbr2_raise_inflight_hi_slope(sk);
+}
+
+/* Does loss/ECN rate for this sample say inflight is "too high"?
+ * This is used by both the bbr_check_loss_too_high_in_startup() function,
+ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which
+ * uses it to notice when loss/ECN rates suggest inflight is too high.
+ */
+static bool bbr2_is_inflight_too_high(const struct sock *sk,
+				     const struct rate_sample *rs)
+{
+	const struct bbr *bbr = inet_csk_ca(sk);
+	u32 loss_thresh, ecn_thresh;
+
+	if (rs->lost > 0 && rs->tx_in_flight) {
+		loss_thresh = (u64)rs->tx_in_flight * bbr->params.loss_thresh >>
+				BBR_SCALE;
+		if (rs->lost > loss_thresh)
+			return true;
+	}
+
+	if (rs->delivered_ce > 0 && rs->delivered > 0 &&
+	    bbr->ecn_eligible && bbr->params.ecn_thresh) {
+		ecn_thresh = (u64)rs->delivered * bbr->params.ecn_thresh >>
+				BBR_SCALE;
+		if (rs->delivered_ce >= ecn_thresh)
+			return true;
+	}
+
+	return false;
+}
+
+/* Calculate the tx_in_flight level that corresponded to excessive loss.
+ * We find "lost_prefix" segs of the skb where loss rate went too high,
+ * by solving for "lost_prefix" in the following equation:
+ *   lost                     /  inflight                     >= loss_thresh
+ *  (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh
+ * Then we take that equation, convert it to fixed point, and
+ * round up to the nearest packet.
+ */
+static u32 bbr2_inflight_hi_from_lost_skb(const struct sock *sk,
+					  const struct rate_sample *rs,
+					  const struct sk_buff *skb)
+{
+	const struct bbr *bbr = inet_csk_ca(sk);
+	u32 loss_thresh  = bbr->params.loss_thresh;
+	u32 pcount, divisor, inflight_hi;
+	s32 inflight_prev, lost_prev;
+	u64 loss_budget, lost_prefix;
+
+	pcount = tcp_skb_pcount(skb);
+
+	/* How much data was in flight before this skb? */
+	inflight_prev = rs->tx_in_flight - pcount;
+	if (WARN_ONCE(inflight_prev < 0,
+		      "tx_in_flight: %u pcount: %u reneg: %u",
+		      rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg))
+		return ~0U;
+
+	/* How much inflight data was marked lost before this skb? */
+	lost_prev = rs->lost - pcount;
+	if (WARN_ON_ONCE(lost_prev < 0))
+		return ~0U;
+
+	/* At what prefix of this lost skb did losss rate exceed loss_thresh? */
+	loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1;
+	loss_budget >>= BBR_SCALE;
+	if (lost_prev >= loss_budget) {
+		lost_prefix = 0;   /* previous losses crossed loss_thresh */
+	} else {
+		lost_prefix = loss_budget - lost_prev;
+		lost_prefix <<= BBR_SCALE;
+		divisor = BBR_UNIT - loss_thresh;
+		if (WARN_ON_ONCE(!divisor))  /* loss_thresh is 8 bits */
+			return ~0U;
+		do_div(lost_prefix, divisor);
+	}
+
+	inflight_hi = inflight_prev + lost_prefix;
+	return inflight_hi;
+}
+
+/* If loss/ECN rates during probing indicated we may have overfilled a
+ * buffer, return an operating point that tries to leave unutilized headroom in
+ * the path for other flows, for fairness convergence and lower RTTs and loss.
+ */
+static u32 bbr2_inflight_with_headroom(const struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 headroom, headroom_fraction;
+
+	if (bbr->inflight_hi == ~0U)
+		return ~0U;
+
+	headroom_fraction = bbr->params.inflight_headroom;
+	headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE;
+	headroom = max(headroom, 1U);
+	return max_t(s32, bbr->inflight_hi - headroom,
+		     bbr->params.cwnd_min_target);
+}
+
+/* Bound cwnd to a sensible level, based on our current probing state
+ * machine phase and model of a good inflight level (inflight_lo, inflight_hi).
+ */
+static void bbr2_bound_cwnd_for_inflight_model(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 cap;
+
+	/* tcp_rcv_synsent_state_process() currently calls tcp_ack()
+	 * and thus cong_control() without first initializing us(!).
+	 */
+	if (!bbr->initialized)
+		return;
+
+	cap = ~0U;
+	if (bbr->mode == BBR_PROBE_BW &&
+	    bbr->cycle_idx != BBR_BW_PROBE_CRUISE) {
+		/* Probe to see if more packets fit in the path. */
+		cap = bbr->inflight_hi;
+	} else {
+		if (bbr->mode == BBR_PROBE_RTT ||
+		    (bbr->mode == BBR_PROBE_BW &&
+		     bbr->cycle_idx == BBR_BW_PROBE_CRUISE))
+			cap = bbr2_inflight_with_headroom(sk);
+	}
+	/* Adapt to any loss/ECN since our last bw probe. */
+	cap = min(cap, bbr->inflight_lo);
+
+	cap = max_t(u32, cap, bbr->params.cwnd_min_target);
+	tp->snd_cwnd = min(cap, tp->snd_cwnd);
+}
+
+/* Estimate a short-term lower bound on the capacity available now, based
+ * on measurements of the current delivery process and recent history. When we
+ * are seeing loss/ECN at times when we are not probing bw, then conservatively
+ * move toward flow balance by multiplicatively cutting our short-term
+ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a
+ * multiplicative decrease in order to converge to a lower capacity in time
+ * logarithmic in the magnitude of the decrease.
+ *
+ * However, we do not cut our short-term estimates lower than the current rate
+ * and volume of delivered data from this round trip, since from the current
+ * delivery process we can estimate the measured capacity available now.
+ *
+ * Anything faster than that approach would knowingly risk high loss, which can
+ * cause low bw for Reno/CUBIC and high loss recovery latency for
+ * request/response flows using any congestion control.
+ */
+static void bbr2_adapt_lower_bounds(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 ecn_cut, ecn_inflight_lo, beta;
+
+	/* We only use lower-bound estimates when not probing bw.
+	 * When probing we need to push inflight higher to probe bw.
+	 */
+	if (bbr2_is_probing_bandwidth(sk))
+		return;
+
+	/* ECN response. */
+	if (bbr->ecn_in_round && bbr->ecn_eligible && bbr->params.ecn_factor) {
+		/* Reduce inflight to (1 - alpha*ecn_factor). */
+		ecn_cut = (BBR_UNIT -
+			   ((bbr->ecn_alpha * bbr->params.ecn_factor) >>
+			    BBR_SCALE));
+		if (bbr->inflight_lo == ~0U)
+			bbr->inflight_lo = tp->snd_cwnd;
+		ecn_inflight_lo = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE;
+	} else {
+		ecn_inflight_lo = ~0U;
+	}
+
+	/* Loss response. */
+	if (bbr->loss_in_round) {
+		/* Reduce bw and inflight to (1 - beta). */
+		if (bbr->bw_lo == ~0U)
+			bbr->bw_lo = bbr_max_bw(sk);
+		if (bbr->inflight_lo == ~0U)
+			bbr->inflight_lo = tp->snd_cwnd;
+		beta = bbr->params.beta;
+		bbr->bw_lo =
+			max_t(u32, bbr->bw_latest,
+			      (u64)bbr->bw_lo *
+			      (BBR_UNIT - beta) >> BBR_SCALE);
+		bbr->inflight_lo =
+			max_t(u32, bbr->inflight_latest,
+			      (u64)bbr->inflight_lo *
+			      (BBR_UNIT - beta) >> BBR_SCALE);
+	}
+
+	/* Adjust to the lower of the levels implied by loss or ECN. */
+	bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo);
+}
+
+/* Reset any short-term lower-bound adaptation to congestion, so that we can
+ * push our inflight up.
+ */
+static void bbr2_reset_lower_bounds(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->bw_lo = ~0U;
+	bbr->inflight_lo = ~0U;
+}
+
+/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state
+ * machine phase where we adapt our lower bound based on congestion signals.
+ */
+static void bbr2_reset_congestion_signals(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->loss_in_round = 0;
+	bbr->ecn_in_round = 0;
+	bbr->loss_in_cycle = 0;
+	bbr->ecn_in_cycle = 0;
+	bbr->bw_latest = 0;
+	bbr->inflight_latest = 0;
+}
+
+/* Update (most of) our congestion signals: track the recent rate and volume of
+ * delivered data, presence of loss, and EWMA degree of ECN marking.
+ */
+static void bbr2_update_congestion_signals(
+	struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u64 bw;
+
+	bbr->loss_round_start = 0;
+	if (rs->interval_us <= 0 || !rs->acked_sacked)
+		return; /* Not a valid observation */
+	bw = ctx->sample_bw;
+
+	if (!rs->is_app_limited || bw >= bbr_max_bw(sk))
+		bbr2_take_bw_hi_sample(sk, bw);
+
+	bbr->loss_in_round |= (rs->losses > 0);
+
+	/* Update rate and volume of delivered data from latest round trip: */
+	bbr->bw_latest       = max_t(u32, bbr->bw_latest,       ctx->sample_bw);
+	bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered);
+
+	if (before(rs->prior_delivered, bbr->loss_round_delivered))
+		return;		/* skip the per-round-trip updates */
+	/* Now do per-round-trip updates. */
+	bbr->loss_round_delivered = tp->delivered;  /* mark round trip */
+	bbr->loss_round_start = 1;
+	bbr2_adapt_lower_bounds(sk);
+
+	/* Update windowed "latest" (single-round-trip) filters. */
+	bbr->loss_in_round = 0;
+	bbr->ecn_in_round  = 0;
+	bbr->bw_latest = ctx->sample_bw;
+	bbr->inflight_latest = rs->delivered;
+}
+
+/* Bandwidth probing can cause loss. To help coexistence with loss-based
+ * congestion control we spread out our probing in a Reno-conscious way. Due to
+ * the shape of the Reno sawtooth, the time required between loss epochs for an
+ * idealized Reno flow is a number of round trips that is the BDP of that
+ * flow. We count packet-timed round trips directly, since measured RTT can
+ * vary widely, and Reno is driven by packet-timed round trips.
+ */
+static bool bbr2_is_reno_coexistence_probe_time(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 inflight, rounds, reno_gain, reno_rounds;
+
+	/* Random loss can shave some small percentage off of our inflight
+	 * in each round. To survive this, flows need robust periodic probes.
+	 */
+	rounds = bbr->params.bw_probe_max_rounds;
+
+	reno_gain = bbr->params.bw_probe_reno_gain;
+	if (reno_gain) {
+		inflight = bbr2_target_inflight(sk);
+		reno_rounds = ((u64)inflight * reno_gain) >> BBR_SCALE;
+		rounds = min(rounds, reno_rounds);
+	}
+	return bbr->rounds_since_probe >= rounds;
+}
+
+/* How long do we want to wait before probing for bandwidth (and risking
+ * loss)? We randomize the wait, for better mixing and fairness convergence.
+ *
+ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips.
+ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow,
+ * (eg 4K video to a broadband user):
+ *   BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets
+ *
+ * We bound the BBR-native inter-bw-probe wall clock time to be:
+ *  (a) higher than 2 sec: to try to avoid causing loss for a long enough time
+ *      to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must
+ *      be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs
+ *  (b) lower than 3 sec: to ensure flows can start probing in a reasonable
+ *      amount of time to discover unutilized bw on human-scale interactive
+ *      time-scales (e.g. perhaps traffic from a web page download that we
+ *      were competing with is now complete).
+ */
+static void bbr2_pick_probe_wait(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	/* Decide the random round-trip bound for wait until probe: */
+	bbr->rounds_since_probe =
+		get_random_u32_below(bbr->params.bw_probe_rand_rounds);
+	/* Decide the random wall clock bound for wait until probe: */
+	bbr->probe_wait_us = bbr->params.bw_probe_base_us +
+			     get_random_u32_below(bbr->params.bw_probe_rand_us);
+}
+
+static void bbr2_set_cycle_idx(struct sock *sk, int cycle_idx)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->cycle_idx = cycle_idx;
+	/* New phase, so need to update cwnd and pacing rate. */
+	bbr->try_fast_path = 0;
+}
+
+/* Send at estimated bw to fill the pipe, but not queue. We need this phase
+ * before PROBE_UP, because as soon as we send faster than the available bw
+ * we will start building a queue, and if the buffer is shallow we can cause
+ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and
+ * inflight_hi estimates will underestimate.
+ */
+static void bbr2_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr2_reset_lower_bounds(sk);
+	if (bbr->inflight_hi != ~0U)
+		bbr->inflight_hi += bbr->params.refill_add_inc;
+	bbr->bw_probe_up_rounds = bw_probe_up_rounds;
+	bbr->bw_probe_up_acks = 0;
+	bbr->stopped_risky_probe = 0;
+	bbr->ack_phase = BBR_ACKS_REFILLING;
+	bbr->next_rtt_delivered = tp->delivered;
+	bbr2_set_cycle_idx(sk, BBR_BW_PROBE_REFILL);
+}
+
+/* Now probe max deliverable data rate and volume. */
+static void bbr2_start_bw_probe_up(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->ack_phase = BBR_ACKS_PROBE_STARTING;
+	bbr->next_rtt_delivered = tp->delivered;
+	bbr->cycle_mstamp = tp->tcp_mstamp;
+	bbr2_set_cycle_idx(sk, BBR_BW_PROBE_UP);
+	bbr2_raise_inflight_hi_slope(sk);
+}
+
+/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall
+ * clock time at which to probe beyond an inflight that we think to be
+ * safe. This will knowingly risk packet loss, so we want to do this rarely, to
+ * keep packet loss rates low. Also start a round-trip counter, to probe faster
+ * if we estimate a Reno flow at our BDP would probe faster.
+ */
+static void bbr2_start_bw_probe_down(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr2_reset_congestion_signals(sk);
+	bbr->bw_probe_up_cnt = ~0U;     /* not growing inflight_hi any more */
+	bbr2_pick_probe_wait(sk);
+	bbr->cycle_mstamp = tp->tcp_mstamp;		/* start wall clock */
+	bbr->ack_phase = BBR_ACKS_PROBE_STOPPING;
+	bbr->next_rtt_delivered = tp->delivered;
+	bbr2_set_cycle_idx(sk, BBR_BW_PROBE_DOWN);
+}
+
+/* Cruise: maintain what we estimate to be a neutral, conservative
+ * operating point, without attempting to probe up for bandwidth or down for
+ * RTT, and only reducing inflight in response to loss/ECN signals.
+ */
+static void bbr2_start_bw_probe_cruise(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr->inflight_lo != ~0U)
+		bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi);
+
+	bbr2_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE);
+}
+
+/* Loss and/or ECN rate is too high while probing.
+ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle.
+ */
+static void bbr2_handle_inflight_too_high(struct sock *sk,
+					  const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	const u32 beta = bbr->params.beta;
+
+	bbr->prev_probe_too_high = 1;
+	bbr->bw_probe_samples = 0;  /* only react once per probe */
+	bbr->debug.event = 'L';     /* Loss/ECN too high */
+	/* If we are app-limited then we are not robustly
+	 * probing the max volume of inflight data we think
+	 * might be safe (analogous to how app-limited bw
+	 * samples are not known to be robustly probing bw).
+	 */
+	if (!rs->is_app_limited)
+		bbr->inflight_hi = max_t(u32, rs->tx_in_flight,
+					 (u64)bbr2_target_inflight(sk) *
+					 (BBR_UNIT - beta) >> BBR_SCALE);
+	if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP)
+		bbr2_start_bw_probe_down(sk);
+}
+
+/* If we're seeing bw and loss samples reflecting our bw probing, adapt
+ * using the signals we see. If loss or ECN mark rate gets too high, then adapt
+ * inflight_hi downward. If we're able to push inflight higher without such
+ * signals, push higher: adapt inflight_hi upward.
+ */
+static bool bbr2_adapt_upper_bounds(struct sock *sk,
+				   const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	/* Track when we'll see bw/loss samples resulting from our bw probes. */
+	if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start)
+		bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK;
+	if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) {
+		/* End of samples from bw probing phase. */
+		bbr->bw_probe_samples = 0;
+		bbr->ack_phase = BBR_ACKS_INIT;
+		/* At this point in the cycle, our current bw sample is also
+		 * our best recent chance at finding the highest available bw
+		 * for this flow. So now is the best time to forget the bw
+		 * samples from the previous cycle, by advancing the window.
+		 */
+		if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited)
+			bbr2_advance_bw_hi_filter(sk);
+		/* If we had an inflight_hi, then probed and pushed inflight all
+		 * the way up to hit that inflight_hi without seeing any
+		 * high loss/ECN in all the resulting ACKs from that probing,
+		 * then probe up again, this time letting inflight persist at
+		 * inflight_hi for a round trip, then accelerating beyond.
+		 */
+		if (bbr->mode == BBR_PROBE_BW &&
+		    bbr->stopped_risky_probe && !bbr->prev_probe_too_high) {
+			bbr->debug.event = 'R';  /* reprobe */
+			bbr2_start_bw_probe_refill(sk, 0);
+			return true;  /* yes, decided state transition */
+		}
+	}
+
+	if (bbr2_is_inflight_too_high(sk, rs)) {
+		if (bbr->bw_probe_samples)  /*  sample is from bw probing? */
+			bbr2_handle_inflight_too_high(sk, rs);
+	} else {
+		/* Loss/ECN rate is declared safe. Adjust upper bound upward. */
+		if (bbr->inflight_hi == ~0U)  /* no excess queue signals yet? */
+			return false;
+
+		/* To be resilient to random loss, we must raise inflight_hi
+		 * if we observe in any phase that a higher level is safe.
+		 */
+		if (rs->tx_in_flight > bbr->inflight_hi) {
+			bbr->inflight_hi = rs->tx_in_flight;
+			bbr->debug.event = 'U';  /* raise up inflight_hi */
+		}
+
+		if (bbr->mode == BBR_PROBE_BW &&
+		    bbr->cycle_idx == BBR_BW_PROBE_UP)
+			bbr2_probe_inflight_hi_upward(sk, rs);
+	}
+
+	return false;
+}
+
+/* Check if it's time to probe for bandwidth now, and if so, kick it off. */
+static bool bbr2_check_time_to_probe_bw(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 n;
+
+	/* If we seem to be at an operating point where we are not seeing loss
+	 * but we are seeing ECN marks, then when the ECN marks cease we reprobe
+	 * quickly (in case a burst of cross-traffic has ceased and freed up bw,
+	 * or in case we are sharing with multiplicatively probing traffic).
+	 */
+	if (bbr->params.ecn_reprobe_gain && bbr->ecn_eligible &&
+	    bbr->ecn_in_cycle && !bbr->loss_in_cycle &&
+	    inet_csk(sk)->icsk_ca_state == TCP_CA_Open) {
+		bbr->debug.event = 'A';  /* *A*ll clear to probe *A*gain */
+		/* Calculate n so that when bbr2_raise_inflight_hi_slope()
+		 * computes growth_this_round as 2^n it will be roughly the
+		 * desired volume of data (inflight_hi*ecn_reprobe_gain).
+		 */
+		n = ilog2((((u64)bbr->inflight_hi *
+			    bbr->params.ecn_reprobe_gain) >> BBR_SCALE));
+		bbr2_start_bw_probe_refill(sk, n);
+		return true;
+	}
+
+	if (bbr2_has_elapsed_in_phase(sk, bbr->probe_wait_us) ||
+	    bbr2_is_reno_coexistence_probe_time(sk)) {
+		bbr2_start_bw_probe_refill(sk, 0);
+		return true;
+	}
+	return false;
+}
+
+/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */
+static bool bbr2_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	bool is_under_bdp, is_long_enough;
+
+	/* Always need to pull inflight down to leave headroom in queue. */
+	if (inflight > bbr2_inflight_with_headroom(sk))
+		return false;
+
+	is_under_bdp = inflight <= bbr_inflight(sk, bw, BBR_UNIT);
+	if (bbr->params.drain_to_target)
+		return is_under_bdp;
+
+	is_long_enough = bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us);
+	return is_under_bdp || is_long_enough;
+}
+
+/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */
+static void bbr2_update_cycle_phase(struct sock *sk,
+				    const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	bool is_risky = false, is_queuing = false;
+	u32 inflight, bw;
+
+	if (!bbr_full_bw_reached(sk))
+		return;
+
+	/* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */
+	if (bbr2_adapt_upper_bounds(sk, rs))
+		return;		/* already decided state transition */
+
+	if (bbr->mode != BBR_PROBE_BW)
+		return;
+
+	inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
+	bw = bbr_max_bw(sk);
+
+	switch (bbr->cycle_idx) {
+	/* First we spend most of our time cruising with a pacing_gain of 1.0,
+	 * which paces at the estimated bw, to try to fully use the pipe
+	 * without building queue. If we encounter loss/ECN marks, we adapt
+	 * by slowing down.
+	 */
+	case BBR_BW_PROBE_CRUISE:
+		if (bbr2_check_time_to_probe_bw(sk))
+			return;		/* already decided state transition */
+		break;
+
+	/* After cruising, when it's time to probe, we first "refill": we send
+	 * at the estimated bw to fill the pipe, before probing higher and
+	 * knowingly risking overflowing the bottleneck buffer (causing loss).
+	 */
+	case BBR_BW_PROBE_REFILL:
+		if (bbr->round_start) {
+			/* After one full round trip of sending in REFILL, we
+			 * start to see bw samples reflecting our REFILL, which
+			 * may be putting too much data in flight.
+			 */
+			bbr->bw_probe_samples = 1;
+			bbr2_start_bw_probe_up(sk);
+		}
+		break;
+
+	/* After we refill the pipe, we probe by using a pacing_gain > 1.0, to
+	 * probe for bw. If we have not seen loss/ECN, we try to raise inflight
+	 * to at least pacing_gain*BDP; note that this may take more than
+	 * min_rtt if min_rtt is small (e.g. on a LAN).
+	 *
+	 * We terminate PROBE_UP bandwidth probing upon any of the following:
+	 *
+	 * (1) We've pushed inflight up to hit the inflight_hi target set in the
+	 *     most recent previous bw probe phase. Thus we want to start
+	 *     draining the queue immediately because it's very likely the most
+	 *     recently sent packets will fill the queue and cause drops.
+	 *     (checked here)
+	 * (2) We have probed for at least 1*min_rtt_us, and the
+	 *     estimated queue is high enough (inflight > 1.25 * estimated_bdp).
+	 *     (checked here)
+	 * (3) Loss filter says loss rate is "too high".
+	 *     (checked in bbr_is_inflight_too_high())
+	 * (4) ECN filter says ECN mark rate is "too high".
+	 *     (checked in bbr_is_inflight_too_high())
+	 */
+	case BBR_BW_PROBE_UP:
+		if (bbr->prev_probe_too_high &&
+		    inflight >= bbr->inflight_hi) {
+			bbr->stopped_risky_probe = 1;
+			is_risky = true;
+			bbr->debug.event = 'D';   /* D for danger */
+		} else if (bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us) &&
+			   inflight >=
+			   bbr_inflight(sk, bw,
+					bbr->params.bw_probe_pif_gain)) {
+			is_queuing = true;
+			bbr->debug.event = 'Q'; /* building Queue */
+		}
+		if (is_risky || is_queuing) {
+			bbr->prev_probe_too_high = 0;  /* no loss/ECN (yet) */
+			bbr2_start_bw_probe_down(sk);  /* restart w/ down */
+		}
+		break;
+
+	/* After probing in PROBE_UP, we have usually accumulated some data in
+	 * the bottleneck buffer (if bw probing didn't find more bw). We next
+	 * enter PROBE_DOWN to try to drain any excess data from the queue. To
+	 * do this, we use a pacing_gain < 1.0. We hold this pacing gain until
+	 * our inflight is less then that target cruising point, which is the
+	 * minimum of (a) the amount needed to leave headroom, and (b) the
+	 * estimated BDP. Once inflight falls to match the target, we estimate
+	 * the queue is drained; persisting would underutilize the pipe.
+	 */
+	case BBR_BW_PROBE_DOWN:
+		if (bbr2_check_time_to_probe_bw(sk))
+			return;		/* already decided state transition */
+		if (bbr2_check_time_to_cruise(sk, inflight, bw))
+			bbr2_start_bw_probe_cruise(sk);
+		break;
+
+	default:
+		WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx);
+	}
+}
+
+/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */
+static void bbr2_exit_probe_rtt(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr2_reset_lower_bounds(sk);
+	if (bbr_full_bw_reached(sk)) {
+		bbr->mode = BBR_PROBE_BW;
+		/* Raising inflight after PROBE_RTT may cause loss, so reset
+		 * the PROBE_BW clock and schedule the next bandwidth probe for
+		 * a friendly and randomized future point in time.
+		 */
+		bbr2_start_bw_probe_down(sk);
+		/* Since we are exiting PROBE_RTT, we know inflight is
+		 * below our estimated BDP, so it is reasonable to cruise.
+		 */
+		bbr2_start_bw_probe_cruise(sk);
+	} else {
+		bbr->mode = BBR_STARTUP;
+	}
+}
+
+/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until
+ * the end of the round in recovery to get a good estimate of how many packets
+ * have been lost, and how many we need to drain with a low pacing rate.
+ */
+static void bbr2_check_loss_too_high_in_startup(struct sock *sk,
+					       const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr_full_bw_reached(sk))
+		return;
+
+	/* For STARTUP exit, check the loss rate at the end of each round trip
+	 * of Recovery episodes in STARTUP. We check the loss rate at the end
+	 * of the round trip to filter out noisy/low loss and have a better
+	 * sense of inflight (extent of loss), so we can drain more accurately.
+	 */
+	if (rs->losses && bbr->loss_events_in_round < 0xf)
+		bbr->loss_events_in_round++;  /* update saturating counter */
+	if (bbr->params.full_loss_cnt && bbr->loss_round_start &&
+	    inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery &&
+	    bbr->loss_events_in_round >= bbr->params.full_loss_cnt &&
+	    bbr2_is_inflight_too_high(sk, rs)) {
+		bbr->debug.event = 'P';  /* Packet loss caused STARTUP exit */
+		bbr2_handle_queue_too_high_in_startup(sk);
+		return;
+	}
+	if (bbr->loss_round_start)
+		bbr->loss_events_in_round = 0;
+}
+
+/* If we are done draining, advance into steady state operation in PROBE_BW. */
+static void bbr2_check_drain(struct sock *sk, const struct rate_sample *rs,
+			     struct bbr_context *ctx)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr_check_drain(sk, rs, ctx)) {
+		bbr->mode = BBR_PROBE_BW;
+		bbr2_start_bw_probe_down(sk);
+	}
+}
+
+static void bbr2_update_model(struct sock *sk, const struct rate_sample *rs,
+			      struct bbr_context *ctx)
+{
+	bbr2_update_congestion_signals(sk, rs, ctx);
+	bbr_update_ack_aggregation(sk, rs);
+	bbr2_check_loss_too_high_in_startup(sk, rs);
+	bbr_check_full_bw_reached(sk, rs);
+	bbr2_check_drain(sk, rs, ctx);
+	bbr2_update_cycle_phase(sk, rs);
+	bbr_update_min_rtt(sk, rs);
+}
+
+/* Fast path for app-limited case.
+ *
+ * On each ack, we execute bbr state machine, which primarily consists of:
+ * 1) update model based on new rate sample, and
+ * 2) update control based on updated model or state change.
+ *
+ * There are certain workload/scenarios, e.g. app-limited case, where
+ * either we can skip updating model or we can skip update of both model
+ * as well as control. This provides signifcant softirq cpu savings for
+ * processing incoming acks.
+ *
+ * In case of app-limited, if there is no congestion (loss/ecn) and
+ * if observed bw sample is less than current estimated bw, then we can
+ * skip some of the computation in bbr state processing:
+ *
+ * - if there is no rtt/mode/phase change: In this case, since all the
+ *   parameters of the network model are constant, we can skip model
+ *   as well control update.
+ *
+ * - else we can skip rest of the model update. But we still need to
+ *   update the control to account for the new rtt/mode/phase.
+ *
+ * Returns whether we can take fast path or not.
+ */
+static bool bbr2_fast_path(struct sock *sk, bool *update_model,
+		const struct rate_sample *rs, struct bbr_context *ctx)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 prev_min_rtt_us, prev_mode;
+
+	if (bbr->params.fast_path && bbr->try_fast_path &&
+	    rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) &&
+	    !bbr->loss_in_round && !bbr->ecn_in_round) {
+		prev_mode = bbr->mode;
+		prev_min_rtt_us = bbr->min_rtt_us;
+		bbr2_check_drain(sk, rs, ctx);
+		bbr2_update_cycle_phase(sk, rs);
+		bbr_update_min_rtt(sk, rs);
+
+		if (bbr->mode == prev_mode &&
+		    bbr->min_rtt_us == prev_min_rtt_us &&
+		    bbr->try_fast_path)
+			return true;
+
+		/* Skip model update, but control still needs to be updated */
+		*update_model = false;
+	}
+	return false;
+}
+
+static void bbr2_main(struct sock *sk, const struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	struct bbr_context ctx = { 0 };
+	bool update_model = true;
+	u32 bw;
+
+	bbr->debug.event = '.';  /* init to default NOP (no event yet) */
+
+	bbr_update_round_start(sk, rs, &ctx);
+	if (bbr->round_start) {
+		bbr->rounds_since_probe =
+			min_t(s32, bbr->rounds_since_probe + 1, 0xFF);
+		bbr2_update_ecn_alpha(sk);
+	}
+
+	bbr->ecn_in_round  |= rs->is_ece;
+	bbr_calculate_bw_sample(sk, rs, &ctx);
+
+	if (bbr2_fast_path(sk, &update_model, rs, &ctx))
+		goto out;
+
+	if (update_model)
+		bbr2_update_model(sk, rs, &ctx);
+
+	bbr_update_gains(sk);
+	bw = bbr_bw(sk);
+	bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
+	bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain,
+		     tp->snd_cwnd, &ctx);
+	bbr2_bound_cwnd_for_inflight_model(sk);
+
+out:
+	bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state;
+	bbr->loss_in_cycle |= rs->lost > 0;
+	bbr->ecn_in_cycle  |= rs->delivered_ce > 0;
+
+	bbr_debug(sk, rs->acked_sacked, rs, &ctx);
+}
+
+/* Module parameters that are settable by TCP_CONGESTION_PARAMS are declared
+ * down here, so that the algorithm functions that use the parameters must use
+ * the per-socket parameters; if they accidentally use the global version
+ * then there will be a compile error.
+ * TODO(ncardwell): move all per-socket parameters down to this section.
+ */
+
+/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE.
+ * No loss response when 0. Max allwed value is 255.
+ */
+static u32 bbr_beta = BBR_UNIT * 30 / 100;
+
+/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE.
+ * Max allowed value is 255.
+ */
+static u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16;  /* 1/16 = 6.25% */
+
+/* The initial value for the ecn_alpha state variable. Default and max
+ * BBR_UNIT (256), representing 1.0. This allows a flow to respond quickly
+ * to congestion if the bottleneck is congested when the flow starts up.
+ */
+static u32 bbr_ecn_alpha_init = BBR_UNIT;	/* 1.0, to respond quickly */
+
+/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE.
+ * No ECN based bounding when 0. Max allwed value is 255.
+ */
+static u32 bbr_ecn_factor = BBR_UNIT * 1 / 3;	    /* 1/3 = 33% */
+
+/* Estimate bw probing has gone too far if CE ratio exceeds this threshold.
+ * Scaled by BBR_SCALE. Disabled when 0. Max allowed is 255.
+ */
+static u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2;  /* 1/2 = 50% */
+
+/* Max RTT (in usec) at which to use sender-side ECN logic.
+ * Disabled when 0 (ECN allowed at any RTT).
+ * Max allowed for the parameter is 524287 (0x7ffff) us, ~524 ms.
+ */
+static u32 bbr_ecn_max_rtt_us = 5000;
+
+/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN
+ * clears then use a multiplicative increase to quickly reprobe bw by
+ * starting inflight probing at the given multiple of inflight_hi.
+ * Default for this experimental knob is 0 (disabled).
+ * Planned value for experiments: BBR_UNIT * 1 / 2 = 128, representing 0.5.
+ */
+static u32 bbr_ecn_reprobe_gain;
+
+/* Estimate bw probing has gone too far if loss rate exceeds this level. */
+static u32 bbr_loss_thresh = BBR_UNIT * 2 / 100;  /* 2% loss */
+
+/* Exit STARTUP if number of loss marking events in a Recovery round is >= N,
+ * and loss rate is higher than bbr_loss_thresh.
+ * Disabled if 0. Max allowed value is 15 (0xF).
+ */
+static u32 bbr_full_loss_cnt = 8;
+
+/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh
+ * meets this count. Max allowed value is 3.
+ */
+static u32 bbr_full_ecn_cnt = 2;
+
+/* Fraction of unutilized headroom to try to leave in path upon high loss. */
+static u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100;
+
+/* Multiplier to get target inflight (as multiple of BDP) for PROBE_UP phase.
+ * Default is 1.25x, as in BBR v1. Max allowed is 511.
+ */
+static u32 bbr_bw_probe_pif_gain = BBR_UNIT * 5 / 4;
+
+/* Multiplier to get Reno-style probe epoch duration as: k * BDP round trips.
+ * If zero, disables this BBR v2 Reno-style BDP-scaled coexistence mechanism.
+ * Max allowed is 511.
+ */
+static u32 bbr_bw_probe_reno_gain = BBR_UNIT;
+
+/* Max number of packet-timed rounds to wait before probing for bandwidth.  If
+ * we want to tolerate 1% random loss per round, and not have this cut our
+ * inflight too much, we must probe for bw periodically on roughly this scale.
+ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance.
+ * We aim to be fair with Reno/CUBIC up to a BDP of at least:
+ *  BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets
+ */
+static u32 bbr_bw_probe_max_rounds = 63;
+
+/* Max amount of randomness to inject in round counting for Reno-coexistence.
+ * Max value is 15.
+ */
+static u32 bbr_bw_probe_rand_rounds = 2;
+
+/* Use BBR-native probe time scale starting at this many usec.
+ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least:
+ *  BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs
+ */
+static u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC;  /* 2 secs */
+
+/* Use BBR-native probes spread over this many usec: */
+static u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC;  /* 1 secs */
+
+/* Undo the model changes made in loss recovery if recovery was spurious? */
+static bool bbr_undo = true;
+
+/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */
+static bool bbr_fast_path = true;	/* default: enabled */
+
+/* Use fast ack mode ? */
+static int bbr_fast_ack_mode = 1;	/* default: rwnd check off */
+
+/* How much to additively increase inflight_hi when entering REFILL? */
+static u32 bbr_refill_add_inc;		/* default: disabled */
+
+module_param_named(beta,                 bbr_beta,                 uint, 0644);
+module_param_named(ecn_alpha_gain,       bbr_ecn_alpha_gain,       uint, 0644);
+module_param_named(ecn_alpha_init,       bbr_ecn_alpha_init,       uint, 0644);
+module_param_named(ecn_factor,           bbr_ecn_factor,           uint, 0644);
+module_param_named(ecn_thresh,           bbr_ecn_thresh,           uint, 0644);
+module_param_named(ecn_max_rtt_us,       bbr_ecn_max_rtt_us,       uint, 0644);
+module_param_named(ecn_reprobe_gain,     bbr_ecn_reprobe_gain,     uint, 0644);
+module_param_named(loss_thresh,          bbr_loss_thresh,          uint, 0664);
+module_param_named(full_loss_cnt,        bbr_full_loss_cnt,        uint, 0664);
+module_param_named(full_ecn_cnt,         bbr_full_ecn_cnt,         uint, 0664);
+module_param_named(inflight_headroom,    bbr_inflight_headroom,    uint, 0664);
+module_param_named(bw_probe_pif_gain,    bbr_bw_probe_pif_gain,    uint, 0664);
+module_param_named(bw_probe_reno_gain,   bbr_bw_probe_reno_gain,   uint, 0664);
+module_param_named(bw_probe_max_rounds,  bbr_bw_probe_max_rounds,  uint, 0664);
+module_param_named(bw_probe_rand_rounds, bbr_bw_probe_rand_rounds, uint, 0664);
+module_param_named(bw_probe_base_us,     bbr_bw_probe_base_us,     uint, 0664);
+module_param_named(bw_probe_rand_us,     bbr_bw_probe_rand_us,     uint, 0664);
+module_param_named(undo,                 bbr_undo,                 bool, 0664);
+module_param_named(fast_path,		 bbr_fast_path,		   bool, 0664);
+module_param_named(fast_ack_mode,	 bbr_fast_ack_mode,	   uint, 0664);
+module_param_named(refill_add_inc,       bbr_refill_add_inc,       uint, 0664);
+
+static void bbr2_init(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr_init(sk);	/* run shared init code for v1 and v2 */
+
+	/* BBR v2 parameters: */
+	bbr->params.beta = min_t(u32, 0xFFU, bbr_beta);
+	bbr->params.ecn_alpha_gain = min_t(u32, 0xFFU, bbr_ecn_alpha_gain);
+	bbr->params.ecn_alpha_init = min_t(u32, BBR_UNIT, bbr_ecn_alpha_init);
+	bbr->params.ecn_factor = min_t(u32, 0xFFU, bbr_ecn_factor);
+	bbr->params.ecn_thresh = min_t(u32, 0xFFU, bbr_ecn_thresh);
+	bbr->params.ecn_max_rtt_us = min_t(u32, 0x7ffffU, bbr_ecn_max_rtt_us);
+	bbr->params.ecn_reprobe_gain = min_t(u32, 0x1FF, bbr_ecn_reprobe_gain);
+	bbr->params.loss_thresh = min_t(u32, 0xFFU, bbr_loss_thresh);
+	bbr->params.full_loss_cnt = min_t(u32, 0xFU, bbr_full_loss_cnt);
+	bbr->params.full_ecn_cnt = min_t(u32, 0x3U, bbr_full_ecn_cnt);
+	bbr->params.inflight_headroom =
+		min_t(u32, 0xFFU, bbr_inflight_headroom);
+	bbr->params.bw_probe_pif_gain =
+		min_t(u32, 0x1FFU, bbr_bw_probe_pif_gain);
+	bbr->params.bw_probe_reno_gain =
+		min_t(u32, 0x1FFU, bbr_bw_probe_reno_gain);
+	bbr->params.bw_probe_max_rounds =
+		min_t(u32, 0xFFU, bbr_bw_probe_max_rounds);
+	bbr->params.bw_probe_rand_rounds =
+		min_t(u32, 0xFU, bbr_bw_probe_rand_rounds);
+	bbr->params.bw_probe_base_us =
+		min_t(u32, (1 << 26) - 1, bbr_bw_probe_base_us);
+	bbr->params.bw_probe_rand_us =
+		min_t(u32, (1 << 26) - 1, bbr_bw_probe_rand_us);
+	bbr->params.undo = bbr_undo;
+	bbr->params.fast_path = bbr_fast_path ? 1 : 0;
+	bbr->params.refill_add_inc = min_t(u32, 0x3U, bbr_refill_add_inc);
+
+	/* BBR v2 state: */
+	bbr->initialized = 1;
+	/* Start sampling ECN mark rate after first full flight is ACKed: */
+	bbr->loss_round_delivered = tp->delivered + 1;
+	bbr->loss_round_start = 0;
+	bbr->undo_bw_lo = 0;
+	bbr->undo_inflight_lo = 0;
+	bbr->undo_inflight_hi = 0;
+	bbr->loss_events_in_round = 0;
+	bbr->startup_ecn_rounds = 0;
+	bbr2_reset_congestion_signals(sk);
+	bbr->bw_lo = ~0U;
+	bbr->bw_hi[0] = 0;
+	bbr->bw_hi[1] = 0;
+	bbr->inflight_lo = ~0U;
+	bbr->inflight_hi = ~0U;
+	bbr->bw_probe_up_cnt = ~0U;
+	bbr->bw_probe_up_acks = 0;
+	bbr->bw_probe_up_rounds = 0;
+	bbr->probe_wait_us = 0;
+	bbr->stopped_risky_probe = 0;
+	bbr->ack_phase = BBR_ACKS_INIT;
+	bbr->rounds_since_probe = 0;
+	bbr->bw_probe_samples = 0;
+	bbr->prev_probe_too_high = 0;
+	bbr->ecn_eligible = 0;
+	bbr->ecn_alpha = bbr->params.ecn_alpha_init;
+	bbr->alpha_last_delivered = 0;
+	bbr->alpha_last_delivered_ce = 0;
+
+	tp->fast_ack_mode = min_t(u32, 0x2U, bbr_fast_ack_mode);
+
+	if ((tp->ecn_flags & TCP_ECN_OK) && bbr_ecn_enable)
+		tp->ecn_flags |= TCP_ECN_ECT_PERMANENT;
+}
+
+/* Core TCP stack informs us that the given skb was just marked lost. */
+static void bbr2_skb_marked_lost(struct sock *sk, const struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+	struct rate_sample rs;
+
+	/* Capture "current" data over the full round trip of loss,
+	 * to have a better chance to see the full capacity of the path.
+	*/
+	if (!bbr->loss_in_round)  /* first loss in this round trip? */
+		bbr->loss_round_delivered = tp->delivered;  /* set round trip */
+	bbr->loss_in_round = 1;
+	bbr->loss_in_cycle = 1;
+
+	if (!bbr->bw_probe_samples)
+		return;  /* not an skb sent while probing for bandwidth */
+	if (unlikely(!scb->tx.delivered_mstamp))
+		return;  /* skb was SACKed, reneged, marked lost; ignore it */
+	/* We are probing for bandwidth. Construct a rate sample that
+	 * estimates what happened in the flight leading up to this lost skb,
+	 * then see if the loss rate went too high, and if so at which packet.
+	 */
+	memset(&rs, 0, sizeof(rs));
+	rs.tx_in_flight = scb->tx.in_flight;
+	rs.lost = tp->lost - scb->tx.lost;
+	rs.is_app_limited = scb->tx.is_app_limited;
+	if (bbr2_is_inflight_too_high(sk, &rs)) {
+		rs.tx_in_flight = bbr2_inflight_hi_from_lost_skb(sk, &rs, skb);
+		bbr2_handle_inflight_too_high(sk, &rs);
+	}
+}
+
+/* Revert short-term model if current loss recovery event was spurious. */
+static u32 bbr2_undo_cwnd(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->debug.undo = 1;
+	bbr->full_bw = 0;   /* spurious slow-down; reset full pipe detection */
+	bbr->full_bw_cnt = 0;
+	bbr->loss_in_round = 0;
+
+	if (!bbr->params.undo)
+		return tp->snd_cwnd;
+
+	/* Revert to cwnd and other state saved before loss episode. */
+	bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo);
+	bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo);
+	bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi);
+	return bbr->prior_cwnd;
+}
+
+/* Entering loss recovery, so save state for when we undo recovery. */
+static u32 bbr2_ssthresh(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr_save_cwnd(sk);
+	/* For undo, save state that adapts based on loss signal. */
+	bbr->undo_bw_lo		= bbr->bw_lo;
+	bbr->undo_inflight_lo	= bbr->inflight_lo;
+	bbr->undo_inflight_hi	= bbr->inflight_hi;
+	return tcp_sk(sk)->snd_ssthresh;
+}
+
+static enum tcp_bbr2_phase bbr2_get_phase(struct bbr *bbr)
+{
+	switch (bbr->mode) {
+	case BBR_STARTUP:
+		return BBR2_PHASE_STARTUP;
+	case BBR_DRAIN:
+		return BBR2_PHASE_DRAIN;
+	case BBR_PROBE_BW:
+		break;
+	case BBR_PROBE_RTT:
+		return BBR2_PHASE_PROBE_RTT;
+	default:
+		return BBR2_PHASE_INVALID;
+	}
+	switch (bbr->cycle_idx) {
+	case BBR_BW_PROBE_UP:
+		return BBR2_PHASE_PROBE_BW_UP;
+	case BBR_BW_PROBE_DOWN:
+		return BBR2_PHASE_PROBE_BW_DOWN;
+	case BBR_BW_PROBE_CRUISE:
+		return BBR2_PHASE_PROBE_BW_CRUISE;
+	case BBR_BW_PROBE_REFILL:
+		return BBR2_PHASE_PROBE_BW_REFILL;
+	default:
+		return BBR2_PHASE_INVALID;
+	}
+}
+
+static size_t bbr2_get_info(struct sock *sk, u32 ext, int *attr,
+			    union tcp_cc_info *info)
+{
+	if (ext & (1 << (INET_DIAG_BBRINFO - 1)) ||
+	    ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+		struct bbr *bbr = inet_csk_ca(sk);
+		u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk));
+		u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk));
+		u64 bw_lo = bbr->bw_lo == ~0U ?
+			~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo);
+
+		memset(&info->bbr2, 0, sizeof(info->bbr2));
+		info->bbr2.bbr_bw_lsb		= (u32)bw;
+		info->bbr2.bbr_bw_msb		= (u32)(bw >> 32);
+		info->bbr2.bbr_min_rtt		= bbr->min_rtt_us;
+		info->bbr2.bbr_pacing_gain	= bbr->pacing_gain;
+		info->bbr2.bbr_cwnd_gain	= bbr->cwnd_gain;
+		info->bbr2.bbr_bw_hi_lsb	= (u32)bw_hi;
+		info->bbr2.bbr_bw_hi_msb	= (u32)(bw_hi >> 32);
+		info->bbr2.bbr_bw_lo_lsb	= (u32)bw_lo;
+		info->bbr2.bbr_bw_lo_msb	= (u32)(bw_lo >> 32);
+		info->bbr2.bbr_mode		= bbr->mode;
+		info->bbr2.bbr_phase		= (__u8)bbr2_get_phase(bbr);
+		info->bbr2.bbr_version		= (__u8)2;
+		info->bbr2.bbr_inflight_lo	= bbr->inflight_lo;
+		info->bbr2.bbr_inflight_hi	= bbr->inflight_hi;
+		info->bbr2.bbr_extra_acked	= bbr_extra_acked(sk);
+		*attr = INET_DIAG_BBRINFO;
+		return sizeof(info->bbr2);
+	}
+	return 0;
+}
+
+static void bbr2_set_state(struct sock *sk, u8 new_state)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (new_state == TCP_CA_Loss) {
+		struct rate_sample rs = { .losses = 1 };
+		struct bbr_context ctx = { 0 };
+
+		bbr->prev_ca_state = TCP_CA_Loss;
+		bbr->full_bw = 0;
+		if (!bbr2_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) {
+			/* bbr_adapt_lower_bounds() needs cwnd before
+			 * we suffered an RTO, to update inflight_lo:
+			 */
+			bbr->inflight_lo =
+				max(tp->snd_cwnd, bbr->prior_cwnd);
+		}
+		bbr_debug(sk, 0, &rs, &ctx);
+	} else if (bbr->prev_ca_state == TCP_CA_Loss &&
+		   new_state != TCP_CA_Loss) {
+		tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd);
+		bbr->try_fast_path = 0; /* bound cwnd using latest model */
+	}
+}
+
+static struct tcp_congestion_ops tcp_bbr2_cong_ops __read_mostly = {
+	.flags		= TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS,
+	.name		= "bbr2",
+	.owner		= THIS_MODULE,
+	.init		= bbr2_init,
+	.cong_control	= bbr2_main,
+	.sndbuf_expand	= bbr_sndbuf_expand,
+	.skb_marked_lost = bbr2_skb_marked_lost,
+	.undo_cwnd	= bbr2_undo_cwnd,
+	.cwnd_event	= bbr_cwnd_event,
+	.ssthresh	= bbr2_ssthresh,
+	.tso_segs	= bbr_tso_segs,
+	.get_info	= bbr2_get_info,
+	.set_state	= bbr2_set_state,
+};
+
+static int __init bbr_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);
+	return tcp_register_congestion_control(&tcp_bbr2_cong_ops);
+}
+
+static void __exit bbr_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_bbr2_cong_ops);
+}
+
+module_init(bbr_register);
+module_exit(bbr_unregister);
+
+MODULE_AUTHOR("Van Jacobson <vanj@google.com>");
+MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>");
+MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>");
+MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>");
+MODULE_AUTHOR("Priyaranjan Jha <priyarjha@google.com>");
+MODULE_AUTHOR("Yousuk Seung <ysseung@google.com>");
+MODULE_AUTHOR("Kevin Yang <yyd@google.com>");
+MODULE_AUTHOR("Arjun Roy <arjunroy@google.com>");
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)");
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index db8b4b488c31..0d6d1a949e11 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -189,6 +189,7 @@ void tcp_init_congestion_control(struct sock *sk)
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	tcp_sk(sk)->prior_ssthresh = 0;
+	tcp_sk(sk)->fast_ack_mode = 0;
 	if (icsk->icsk_ca_ops->init)
 		icsk->icsk_ca_ops->init(sk);
 	if (tcp_ca_needs_ecn(sk))
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index dee174c40e87..5c6ce3971dbf 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -349,7 +349,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
 			tcp_enter_quickack_mode(sk, 2);
 		break;
 	case INET_ECN_CE:
-		if (tcp_ca_needs_ecn(sk))
+		if (tcp_ca_wants_ce_events(sk))
 			tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
 
 		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
@@ -360,7 +360,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
 		tp->ecn_flags |= TCP_ECN_SEEN;
 		break;
 	default:
-		if (tcp_ca_needs_ecn(sk))
+		if (tcp_ca_wants_ce_events(sk))
 			tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
 		tp->ecn_flags |= TCP_ECN_SEEN;
 		break;
@@ -1079,7 +1079,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
  */
 static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
 {
+	struct sock *sk = (struct sock *)tp;
+	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+
 	tp->lost += tcp_skb_pcount(skb);
+	if (ca_ops->skb_marked_lost)
+		ca_ops->skb_marked_lost(sk, skb);
 }
 
 void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
@@ -1460,6 +1465,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
 	WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
 	tcp_skb_pcount_add(skb, -pcount);
 
+	/* Adjust tx.in_flight as pcount is shifted from skb to prev. */
+	if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount,
+		      "prev in_flight: %u skb in_flight: %u pcount: %u",
+		      TCP_SKB_CB(prev)->tx.in_flight,
+		      TCP_SKB_CB(skb)->tx.in_flight,
+		      pcount))
+		TCP_SKB_CB(skb)->tx.in_flight = 0;
+	else
+		TCP_SKB_CB(skb)->tx.in_flight -= pcount;
+	TCP_SKB_CB(prev)->tx.in_flight += pcount;
+
 	/* When we're adding to gso_segs == 1, gso_size will be zero,
 	 * in theory this shouldn't be necessary but as long as DSACK
 	 * code can come after this skb later on it's better to keep
@@ -3813,6 +3829,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
 	rs.prior_in_flight = tcp_packets_in_flight(tp);
+	tcp_rate_check_app_limited(sk);
 
 	/* ts_recent update must be made after we are sure that the packet
 	 * is in window.
@@ -3911,6 +3928,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	delivered = tcp_newly_delivered(sk, delivered, flag);
 	lost = tp->lost - lost;			/* freshly marked lost */
 	rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
+	rs.is_ece = !!(flag & FLAG_ECE);
 	tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
 	tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
 	tcp_xmit_recovery(sk, rexmit);
@@ -5521,13 +5539,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 
 	    /* More than one full frame received... */
 	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
+	     (tp->fast_ack_mode == 1 ||
 	     /* ... and right edge of window advances far enough.
 	      * (tcp_recvmsg() will send ACK otherwise).
 	      * If application uses SO_RCVLOWAT, we want send ack now if
 	      * we have not received enough bytes to satisfy the condition.
 	      */
-	    (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
-	     __tcp_select_window(sk) >= tp->rcv_wnd)) ||
+	      (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
+	       __tcp_select_window(sk) >= tp->rcv_wnd))) ||
 	    /* We ACK each frame or... */
 	    tcp_in_quickack_mode(sk) ||
 	    /* Protocol state mandates a one-time immediate ACK */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ba839e441450..5ffec885e66f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -375,7 +375,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
 				th->cwr = 1;
 				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
 			}
-		} else if (!tcp_ca_needs_ecn(sk)) {
+		} else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) &&
+			!tcp_ca_needs_ecn(sk)) {
 			/* ACK or retransmitted segment: clear ECT|CE */
 			INET_ECN_dontxmit(sk);
 		}
@@ -1530,7 +1531,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *buff;
-	int nsize, old_factor;
+	int nsize, old_factor, inflight_prev;
 	long limit;
 	int nlen;
 	u8 flags;
@@ -1607,6 +1608,15 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 
 		if (diff)
 			tcp_adjust_pcount(sk, skb, diff);
+
+		/* Set buff tx.in_flight as if buff were sent by itself. */
+		inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor;
+		if (WARN_ONCE(inflight_prev < 0,
+			      "inconsistent: tx.in_flight: %u old_factor: %d",
+			      TCP_SKB_CB(skb)->tx.in_flight, old_factor))
+			inflight_prev = 0;
+		TCP_SKB_CB(buff)->tx.in_flight = inflight_prev +
+						 tcp_skb_pcount(buff);
 	}
 
 	/* Link BUFF into the send queue. */
@@ -1990,13 +2000,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
 static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
 {
 	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
-	u32 min_tso, tso_segs;
-
-	min_tso = ca_ops->min_tso_segs ?
-			ca_ops->min_tso_segs(sk) :
-			READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
+	u32 tso_segs;
 
-	tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
+	tso_segs = ca_ops->tso_segs ?
+		ca_ops->tso_segs(sk, mss_now) :
+		tcp_tso_autosize(sk, mss_now,
+				 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs));
 	return min_t(u32, tso_segs, sk->sk_gso_max_segs);
 }
 
@@ -2632,6 +2641,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true);
 			list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
 			tcp_init_tso_segs(skb, mss_now);
+			tcp_set_tx_in_flight(sk, skb);
 			goto repair; /* Skip network transmission */
 		}
 
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index a8f6d9d06f2e..a8b4c9504570 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -34,6 +34,24 @@
  * ready to send in the write queue.
  */
 
+void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 in_flight;
+
+	/* Check, sanitize, and record packets in flight after skb was sent. */
+	in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb);
+	if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX,
+		      "insane in_flight %u cc %s mss %u "
+		      "cwnd %u pif %u %u %u %u\n",
+		      in_flight, inet_csk(sk)->icsk_ca_ops->name,
+		      tp->mss_cache, tp->snd_cwnd,
+		      tp->packets_out, tp->retrans_out,
+		      tp->sacked_out, tp->lost_out))
+		in_flight = TCPCB_IN_FLIGHT_MAX;
+	TCP_SKB_CB(skb)->tx.in_flight = in_flight;
+}
+
 /* Snapshot the current delivery information in the skb, to generate
  * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
  */
@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
 	TCP_SKB_CB(skb)->tx.delivered_mstamp	= tp->delivered_mstamp;
 	TCP_SKB_CB(skb)->tx.delivered		= tp->delivered;
 	TCP_SKB_CB(skb)->tx.delivered_ce	= tp->delivered_ce;
+	TCP_SKB_CB(skb)->tx.lost		= tp->lost;
 	TCP_SKB_CB(skb)->tx.is_app_limited	= tp->app_limited ? 1 : 0;
+	tcp_set_tx_in_flight(sk, skb);
 }
 
 /* When an skb is sacked or acked, we fill in the rate sample with the (prior)
@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
 	if (!rs->prior_delivered ||
 	    tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
 			       scb->end_seq, rs->last_end_seq)) {
+		rs->prior_lost	     = scb->tx.lost;
 		rs->prior_delivered_ce  = scb->tx.delivered_ce;
 		rs->prior_delivered  = scb->tx.delivered;
 		rs->prior_mstamp     = scb->tx.delivered_mstamp;
 		rs->is_app_limited   = scb->tx.is_app_limited;
 		rs->is_retrans	     = scb->sacked & TCPCB_RETRANS;
 		rs->last_end_seq     = scb->end_seq;
+		rs->tx_in_flight     = scb->tx.in_flight;
 
 		/* Record send time of most recently ACKed packet: */
 		tp->first_tx_mstamp  = tx_tstamp;
 		/* Find the duration of the "send phase" of this window: */
-		rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
-						     scb->tx.first_tx_mstamp);
+		rs->interval_us      = tcp_stamp32_us_delta(
+						tp->first_tx_mstamp,
+						scb->tx.first_tx_mstamp);
 
 	}
 	/* Mark off the skb delivered once it's sacked to avoid being
@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
 		return;
 	}
 	rs->delivered   = tp->delivered - rs->prior_delivered;
+	rs->lost        = tp->lost - rs->prior_lost;
 
 	rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
 	/* delivered_ce occupies less than 32 bits in the skb control block */
@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
 	 * longer phase.
 	 */
 	snd_us = rs->interval_us;				/* send phase */
-	ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
+	ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp,
 				    rs->prior_mstamp); /* ack phase */
 	rs->interval_us = max(snd_us, ack_us);
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0b5d0a2867a8..c2bc56da653c 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -615,6 +615,7 @@ void tcp_write_timer_handler(struct sock *sk)
 		return;
 	}
 
+	tcp_rate_check_app_limited(sk);
 	tcp_mstamp_refresh(tcp_sk(sk));
 	event = icsk->icsk_pending;
 
-- 
2.41.0

From 3795ed7f7992f4d0a04466cbd8625b88f233a376 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Thu, 25 May 2023 17:26:50 +0200
Subject: [PATCH 02/12] bfq

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 MAINTAINERS         |  2 +-
 block/bfq-iosched.c | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index c6545eb54104..ee59cc25b910 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3532,7 +3532,7 @@ F:	Documentation/filesystems/befs.rst
 F:	fs/befs/
 
 BFQ I/O SCHEDULER
-M:	Paolo Valente <paolo.valente@linaro.org>
+M:	Paolo Valente <paolo.valente@unimore.it>
 M:	Jens Axboe <axboe@kernel.dk>
 L:	linux-block@vger.kernel.org
 S:	Maintained
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index bac977da4eb5..abeb6e560bed 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5404,6 +5404,10 @@ void bfq_put_queue(struct bfq_queue *bfqq)
 	if (bfqq->bfqd->last_completed_rq_bfqq == bfqq)
 		bfqq->bfqd->last_completed_rq_bfqq = NULL;
 
+	WARN_ON_ONCE(!list_empty(&bfqq->fifo));
+	WARN_ON_ONCE(!RB_EMPTY_ROOT(&bfqq->sort_list));
+	WARN_ON_ONCE(bfqq->dispatched);
+
 	kmem_cache_free(bfq_pool, bfqq);
 	bfqg_and_blkg_put(bfqg);
 }
@@ -7136,6 +7140,7 @@ static void bfq_exit_queue(struct elevator_queue *e)
 {
 	struct bfq_data *bfqd = e->elevator_data;
 	struct bfq_queue *bfqq, *n;
+	unsigned int actuator;
 
 	hrtimer_cancel(&bfqd->idle_slice_timer);
 
@@ -7144,6 +7149,10 @@ static void bfq_exit_queue(struct elevator_queue *e)
 		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
 	spin_unlock_irq(&bfqd->lock);
 
+	for (actuator = 0; actuator < bfqd->num_actuators; actuator++)
+		WARN_ON_ONCE(bfqd->rq_in_driver[actuator]);
+	WARN_ON_ONCE(bfqd->tot_rq_in_driver);
+
 	hrtimer_cancel(&bfqd->idle_slice_timer);
 
 	/* release oom-queue reference to root group */
@@ -7619,6 +7628,7 @@ MODULE_ALIAS("bfq-iosched");
 static int __init bfq_init(void)
 {
 	int ret;
+	char msg[60] = "BFQ I/O-scheduler: BFQ-CachyOS v6.3";
 
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 	ret = blkcg_policy_register(&blkcg_policy_bfq);
@@ -7650,6 +7660,11 @@ static int __init bfq_init(void)
 	if (ret)
 		goto slab_kill;
 
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	strcat(msg, " (with cgroups support)");
+#endif
+	pr_info("%s", msg);
+
 	return 0;
 
 slab_kill:
-- 
2.41.0

From 3a914995866b9ffd940a8573af7e5228f2068ed0 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Fri, 9 Jun 2023 20:00:54 +0200
Subject: [PATCH 03/12] cachy

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 .gitignore                                    |    1 +
 .../admin-guide/kernel-parameters.txt         |   12 +
 Documentation/dontdiff                        |    1 +
 Makefile                                      |    8 +-
 arch/arc/configs/axs101_defconfig             |    1 +
 arch/arc/configs/axs103_defconfig             |    1 +
 arch/arc/configs/axs103_smp_defconfig         |    1 +
 arch/arc/configs/haps_hs_defconfig            |    1 +
 arch/arc/configs/haps_hs_smp_defconfig        |    1 +
 arch/arc/configs/hsdk_defconfig               |    1 +
 arch/arc/configs/nsim_700_defconfig           |    1 +
 arch/arc/configs/nsimosci_defconfig           |    1 +
 arch/arc/configs/nsimosci_hs_defconfig        |    1 +
 arch/arc/configs/nsimosci_hs_smp_defconfig    |    1 +
 arch/arc/configs/tb10x_defconfig              |    1 +
 arch/arc/configs/vdk_hs38_defconfig           |    1 +
 arch/arc/configs/vdk_hs38_smp_defconfig       |    1 +
 arch/x86/Kconfig.cpu                          |  427 ++-
 arch/x86/Makefile                             |   46 +-
 arch/x86/Makefile.postlink                    |   41 +
 arch/x86/boot/compressed/.gitignore           |    1 -
 arch/x86/boot/compressed/Makefile             |   10 +-
 arch/x86/include/asm/pci.h                    |    6 +
 arch/x86/include/asm/vermagic.h               |   74 +
 arch/x86/pci/common.c                         |    7 +-
 drivers/Makefile                              |   15 +-
 drivers/ata/ahci.c                            |   23 +-
 drivers/cpufreq/Kconfig.x86                   |    2 -
 drivers/cpufreq/intel_pstate.c                |    2 +
 drivers/i2c/busses/Kconfig                    |    9 +
 drivers/i2c/busses/Makefile                   |    1 +
 drivers/i2c/busses/i2c-nct6775.c              |  647 ++++
 drivers/i2c/busses/i2c-piix4.c                |    4 +-
 drivers/md/dm-crypt.c                         |    5 +
 drivers/pci/controller/Makefile               |    6 +
 drivers/pci/controller/intel-nvme-remap.c     |  462 +++
 drivers/pci/quirks.c                          |  101 +
 drivers/platform/x86/Kconfig                  |   24 +
 drivers/platform/x86/Makefile                 |    4 +
 drivers/platform/x86/legion-laptop.c          | 2783 +++++++++++++++++
 drivers/platform/x86/steamdeck.c              |  523 ++++
 include/linux/mm.h                            |    2 +-
 include/linux/pagemap.h                       |    2 +-
 include/linux/user_namespace.h                |    4 +
 init/Kconfig                                  |   39 +
 kernel/Kconfig.hz                             |   24 +
 kernel/fork.c                                 |   14 +
 kernel/module/Kconfig                         |   25 +
 kernel/sched/fair.c                           |   20 +-
 kernel/sysctl.c                               |   12 +
 kernel/user_namespace.c                       |    7 +
 mm/Kconfig                                    |    2 +-
 mm/compaction.c                               |    4 +
 mm/page-writeback.c                           |    8 +
 mm/swap.c                                     |    5 +
 mm/vmpressure.c                               |    4 +
 mm/vmscan.c                                   |    8 +
 scripts/Makefile.lib                          |   13 +-
 scripts/Makefile.modinst                      |    7 +-
 59 files changed, 5393 insertions(+), 65 deletions(-)
 create mode 100644 arch/x86/Makefile.postlink
 create mode 100644 drivers/i2c/busses/i2c-nct6775.c
 create mode 100644 drivers/pci/controller/intel-nvme-remap.c
 create mode 100644 drivers/platform/x86/legion-laptop.c
 create mode 100644 drivers/platform/x86/steamdeck.c

diff --git a/.gitignore b/.gitignore
index 70ec6037fa7a..9bafd3c6bb5f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,6 +65,7 @@ modules.order
 /vmlinux
 /vmlinux.32
 /vmlinux.map
+/vmlinux.relocs
 /vmlinux.symvers
 /vmlinux-gdb.py
 /vmlinuz
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 7016cb12dc4e..97303fa40350 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2084,6 +2084,9 @@
 			disable
 			  Do not enable intel_pstate as the default
 			  scaling driver for the supported processors
+			enable
+			  Enable intel_pstate in-case "disable" was passed
+			  previously in the kernel boot parameters
 			passive
 			  Use intel_pstate as a scaling driver, but configure it
 			  to work with generic cpufreq governors (instead of
@@ -4190,6 +4193,15 @@
 		nomsi		[MSI] If the PCI_MSI kernel config parameter is
 				enabled, this kernel boot option can be used to
 				disable the use of MSI interrupts system-wide.
+		pcie_acs_override =
+					[PCIE] Override missing PCIe ACS support for:
+				downstream
+					All downstream ports - full ACS capabilities
+				multfunction
+					All multifunction devices - multifunction ACS subset
+				id:nnnn:nnnn
+					Specfic device - full ACS capabilities
+					Specified as vid:did (vendor/device ID) in hex
 		noioapicquirk	[APIC] Disable all boot interrupt quirks.
 				Safety option to keep boot IRQs enabled. This
 				should never be necessary.
diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index 3c399f132e2d..a62ad01e6d11 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -254,6 +254,7 @@ vmlinux.aout
 vmlinux.bin.all
 vmlinux.lds
 vmlinux.map
+vmlinux.relocs
 vmlinux.symvers
 vmlinuz
 voffset.h
diff --git a/Makefile b/Makefile
index f8ee72375885..26e622ff9827 100644
--- a/Makefile
+++ b/Makefile
@@ -818,6 +818,9 @@ KBUILD_CFLAGS	+= $(call cc-disable-warning, address-of-packed-member)
 ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
 KBUILD_CFLAGS += -O2
 KBUILD_RUSTFLAGS += -Copt-level=2
+else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3
+KBUILD_CFLAGS += -O3
+KBUILD_RUSTFLAGS += -Copt-level=3
 else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
 KBUILD_CFLAGS += -Os
 KBUILD_RUSTFLAGS += -Copt-level=s
@@ -1060,11 +1063,6 @@ KBUILD_CFLAGS	+= -fno-strict-overflow
 # Make sure -fstack-check isn't enabled (like gentoo apparently did)
 KBUILD_CFLAGS  += -fno-stack-check
 
-# conserve stack if available
-ifdef CONFIG_CC_IS_GCC
-KBUILD_CFLAGS   += -fconserve-stack
-endif
-
 # Prohibit date/time macros, which would make the build non-deterministic
 KBUILD_CFLAGS   += -Werror=date-time
 
diff --git a/arch/arc/configs/axs101_defconfig b/arch/arc/configs/axs101_defconfig
index 81764160451f..2c15d3bf747a 100644
--- a/arch/arc/configs/axs101_defconfig
+++ b/arch/arc/configs/axs101_defconfig
@@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y
 # CONFIG_UTS_NS is not set
 # CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_EMBEDDED=y
 CONFIG_PERF_EVENTS=y
 # CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/arc/configs/axs103_defconfig b/arch/arc/configs/axs103_defconfig
index d5181275490e..7d868e148d9a 100644
--- a/arch/arc/configs/axs103_defconfig
+++ b/arch/arc/configs/axs103_defconfig
@@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y
 # CONFIG_UTS_NS is not set
 # CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_EMBEDDED=y
 CONFIG_PERF_EVENTS=y
 # CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/arc/configs/axs103_smp_defconfig b/arch/arc/configs/axs103_smp_defconfig
index 2f336d99a8cf..777a9f21eb6b 100644
--- a/arch/arc/configs/axs103_smp_defconfig
+++ b/arch/arc/configs/axs103_smp_defconfig
@@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y
 # CONFIG_UTS_NS is not set
 # CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_EMBEDDED=y
 CONFIG_PERF_EVENTS=y
 # CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/arc/configs/haps_hs_defconfig b/arch/arc/configs/haps_hs_defconfig
index 899b2fd5c71d..bda15a876849 100644
--- a/arch/arc/configs/haps_hs_defconfig
+++ b/arch/arc/configs/haps_hs_defconfig
@@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y
 # CONFIG_UTS_NS is not set
 # CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_EXPERT=y
 CONFIG_PERF_EVENTS=y
 # CONFIG_COMPAT_BRK is not set
diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig
index 0d32aac8069f..dbd74fea69aa 100644
--- a/arch/arc/configs/haps_hs_smp_defconfig
+++ b/arch/arc/configs/haps_hs_smp_defconfig
@@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y
 # CONFIG_UTS_NS is not set
 # CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_EMBEDDED=y
 CONFIG_PERF_EVENTS=y
 # CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/arc/configs/hsdk_defconfig b/arch/arc/configs/hsdk_defconfig
index d18378d2c2a6..2396ca417182 100644
--- a/arch/arc/configs/hsdk_defconfig
+++ b/arch/arc/configs/hsdk_defconfig
@@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y
 # CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_BLK_DEV_RAM=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_EMBEDDED=y
 CONFIG_PERF_EVENTS=y
 # CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig
index 3e9829775992..5044609540cc 100644
--- a/arch/arc/configs/nsim_700_defconfig
+++ b/arch/arc/configs/nsim_700_defconfig
@@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y
 # CONFIG_UTS_NS is not set
 # CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_KALLSYMS_ALL=y
 CONFIG_EMBEDDED=y
 CONFIG_PERF_EVENTS=y
diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig
index 502c87f351c8..748c809d1c4c 100644
--- a/arch/arc/configs/nsimosci_defconfig
+++ b/arch/arc/configs/nsimosci_defconfig
@@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y
 # CONFIG_UTS_NS is not set
 # CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_KALLSYMS_ALL=y
 CONFIG_EMBEDDED=y
 CONFIG_PERF_EVENTS=y
diff --git a/arch/arc/configs/nsimosci_hs_defconfig b/arch/arc/configs/nsimosci_hs_defconfig
index f721cc3997d0..205c32b0074c 100644
--- a/arch/arc/configs/nsimosci_hs_defconfig
+++ b/arch/arc/configs/nsimosci_hs_defconfig
@@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y
 # CONFIG_UTS_NS is not set
 # CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_KALLSYMS_ALL=y
 CONFIG_EMBEDDED=y
 CONFIG_PERF_EVENTS=y
diff --git a/arch/arc/configs/nsimosci_hs_smp_defconfig b/arch/arc/configs/nsimosci_hs_smp_defconfig
index 1419fc946a08..2477b7c80977 100644
--- a/arch/arc/configs/nsimosci_hs_smp_defconfig
+++ b/arch/arc/configs/nsimosci_hs_smp_defconfig
@@ -8,6 +8,7 @@ CONFIG_IKCONFIG_PROC=y
 # CONFIG_UTS_NS is not set
 # CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_PERF_EVENTS=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_KPROBES=y
diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig
index 6f0d2be9d926..cf02ad0fc210 100644
--- a/arch/arc/configs/tb10x_defconfig
+++ b/arch/arc/configs/tb10x_defconfig
@@ -14,6 +14,7 @@ CONFIG_INITRAMFS_SOURCE="../tb10x-rootfs.cpio"
 CONFIG_INITRAMFS_ROOT_UID=2100
 CONFIG_INITRAMFS_ROOT_GID=501
 # CONFIG_RD_GZIP is not set
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_AIO is not set
 CONFIG_EMBEDDED=y
diff --git a/arch/arc/configs/vdk_hs38_defconfig b/arch/arc/configs/vdk_hs38_defconfig
index d3ef189c75f8..922b1b24f518 100644
--- a/arch/arc/configs/vdk_hs38_defconfig
+++ b/arch/arc/configs/vdk_hs38_defconfig
@@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_EMBEDDED=y
 CONFIG_PERF_EVENTS=y
 # CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/arc/configs/vdk_hs38_smp_defconfig b/arch/arc/configs/vdk_hs38_smp_defconfig
index 944b347025fd..ed64319f7eb2 100644
--- a/arch/arc/configs/vdk_hs38_smp_defconfig
+++ b/arch/arc/configs/vdk_hs38_smp_defconfig
@@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_EMBEDDED=y
 CONFIG_PERF_EVENTS=y
 # CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 542377cd419d..c54f7f062288 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -157,7 +157,7 @@ config MPENTIUM4
 
 
 config MK6
-	bool "K6/K6-II/K6-III"
+	bool "AMD K6/K6-II/K6-III"
 	depends on X86_32
 	help
 	  Select this for an AMD K6-family processor.  Enables use of
@@ -165,7 +165,7 @@ config MK6
 	  flags to GCC.
 
 config MK7
-	bool "Athlon/Duron/K7"
+	bool "AMD Athlon/Duron/K7"
 	depends on X86_32
 	help
 	  Select this for an AMD Athlon K7-family processor.  Enables use of
@@ -173,12 +173,106 @@ config MK7
 	  flags to GCC.
 
 config MK8
-	bool "Opteron/Athlon64/Hammer/K8"
+	bool "AMD Opteron/Athlon64/Hammer/K8"
 	help
 	  Select this for an AMD Opteron or Athlon64 Hammer-family processor.
 	  Enables use of some extended instructions, and passes appropriate
 	  optimization flags to GCC.
 
+config MK8SSE3
+	bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3"
+	help
+	  Select this for improved AMD Opteron or Athlon64 Hammer-family processors.
+	  Enables use of some extended instructions, and passes appropriate
+	  optimization flags to GCC.
+
+config MK10
+	bool "AMD 61xx/7x50/PhenomX3/X4/II/K10"
+	help
+	  Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50,
+	  Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor.
+	  Enables use of some extended instructions, and passes appropriate
+	  optimization flags to GCC.
+
+config MBARCELONA
+	bool "AMD Barcelona"
+	help
+	  Select this for AMD Family 10h Barcelona processors.
+
+	  Enables -march=barcelona
+
+config MBOBCAT
+	bool "AMD Bobcat"
+	help
+	  Select this for AMD Family 14h Bobcat processors.
+
+	  Enables -march=btver1
+
+config MJAGUAR
+	bool "AMD Jaguar"
+	help
+	  Select this for AMD Family 16h Jaguar processors.
+
+	  Enables -march=btver2
+
+config MBULLDOZER
+	bool "AMD Bulldozer"
+	help
+	  Select this for AMD Family 15h Bulldozer processors.
+
+	  Enables -march=bdver1
+
+config MPILEDRIVER
+	bool "AMD Piledriver"
+	help
+	  Select this for AMD Family 15h Piledriver processors.
+
+	  Enables -march=bdver2
+
+config MSTEAMROLLER
+	bool "AMD Steamroller"
+	help
+	  Select this for AMD Family 15h Steamroller processors.
+
+	  Enables -march=bdver3
+
+config MEXCAVATOR
+	bool "AMD Excavator"
+	help
+	  Select this for AMD Family 15h Excavator processors.
+
+	  Enables -march=bdver4
+
+config MZEN
+	bool "AMD Zen"
+	help
+	  Select this for AMD Family 17h Zen processors.
+
+	  Enables -march=znver1
+
+config MZEN2
+	bool "AMD Zen 2"
+	help
+	  Select this for AMD Family 17h Zen 2 processors.
+
+	  Enables -march=znver2
+
+config MZEN3
+	bool "AMD Zen 3"
+	depends on (CC_IS_GCC && GCC_VERSION >= 100300) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	help
+	  Select this for AMD Family 19h Zen 3 processors.
+
+	  Enables -march=znver3
+
+config MZEN4
+	bool "AMD Zen 4"
+	depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 160000)
+	help
+	  Select this for AMD Family 19h Zen 4 processors.
+
+	  Enables -march=znver4
+
 config MCRUSOE
 	bool "Crusoe"
 	depends on X86_32
@@ -270,7 +364,7 @@ config MPSC
 	  in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one.
 
 config MCORE2
-	bool "Core 2/newer Xeon"
+	bool "Intel Core 2"
 	help
 
 	  Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and
@@ -278,6 +372,8 @@ config MCORE2
 	  family in /proc/cpuinfo. Newer ones have 6 and older ones 15
 	  (not a typo)
 
+	  Enables -march=core2
+
 config MATOM
 	bool "Intel Atom"
 	help
@@ -287,6 +383,212 @@ config MATOM
 	  accordingly optimized code. Use a recent GCC with specific Atom
 	  support in order to fully benefit from selecting this option.
 
+config MNEHALEM
+	bool "Intel Nehalem"
+	select X86_P6_NOP
+	help
+
+	  Select this for 1st Gen Core processors in the Nehalem family.
+
+	  Enables -march=nehalem
+
+config MWESTMERE
+	bool "Intel Westmere"
+	select X86_P6_NOP
+	help
+
+	  Select this for the Intel Westmere formerly Nehalem-C family.
+
+	  Enables -march=westmere
+
+config MSILVERMONT
+	bool "Intel Silvermont"
+	select X86_P6_NOP
+	help
+
+	  Select this for the Intel Silvermont platform.
+
+	  Enables -march=silvermont
+
+config MGOLDMONT
+	bool "Intel Goldmont"
+	select X86_P6_NOP
+	help
+
+	  Select this for the Intel Goldmont platform including Apollo Lake and Denverton.
+
+	  Enables -march=goldmont
+
+config MGOLDMONTPLUS
+	bool "Intel Goldmont Plus"
+	select X86_P6_NOP
+	help
+
+	  Select this for the Intel Goldmont Plus platform including Gemini Lake.
+
+	  Enables -march=goldmont-plus
+
+config MSANDYBRIDGE
+	bool "Intel Sandy Bridge"
+	select X86_P6_NOP
+	help
+
+	  Select this for 2nd Gen Core processors in the Sandy Bridge family.
+
+	  Enables -march=sandybridge
+
+config MIVYBRIDGE
+	bool "Intel Ivy Bridge"
+	select X86_P6_NOP
+	help
+
+	  Select this for 3rd Gen Core processors in the Ivy Bridge family.
+
+	  Enables -march=ivybridge
+
+config MHASWELL
+	bool "Intel Haswell"
+	select X86_P6_NOP
+	help
+
+	  Select this for 4th Gen Core processors in the Haswell family.
+
+	  Enables -march=haswell
+
+config MBROADWELL
+	bool "Intel Broadwell"
+	select X86_P6_NOP
+	help
+
+	  Select this for 5th Gen Core processors in the Broadwell family.
+
+	  Enables -march=broadwell
+
+config MSKYLAKE
+	bool "Intel Skylake"
+	select X86_P6_NOP
+	help
+
+	  Select this for 6th Gen Core processors in the Skylake family.
+
+	  Enables -march=skylake
+
+config MSKYLAKEX
+	bool "Intel Skylake X"
+	select X86_P6_NOP
+	help
+
+	  Select this for 6th Gen Core processors in the Skylake X family.
+
+	  Enables -march=skylake-avx512
+
+config MCANNONLAKE
+	bool "Intel Cannon Lake"
+	select X86_P6_NOP
+	help
+
+	  Select this for 8th Gen Core processors
+
+	  Enables -march=cannonlake
+
+config MICELAKE
+	bool "Intel Ice Lake"
+	select X86_P6_NOP
+	help
+
+	  Select this for 10th Gen Core processors in the Ice Lake family.
+
+	  Enables -march=icelake-client
+
+config MCASCADELAKE
+	bool "Intel Cascade Lake"
+	select X86_P6_NOP
+	help
+
+	  Select this for Xeon processors in the Cascade Lake family.
+
+	  Enables -march=cascadelake
+
+config MCOOPERLAKE
+	bool "Intel Cooper Lake"
+	depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000)
+	select X86_P6_NOP
+	help
+
+	  Select this for Xeon processors in the Cooper Lake family.
+
+	  Enables -march=cooperlake
+
+config MTIGERLAKE
+	bool "Intel Tiger Lake"
+	depends on  (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000)
+	select X86_P6_NOP
+	help
+
+	  Select this for third-generation 10 nm process processors in the Tiger Lake family.
+
+	  Enables -march=tigerlake
+
+config MSAPPHIRERAPIDS
+	bool "Intel Sapphire Rapids"
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	select X86_P6_NOP
+	help
+
+	  Select this for fourth-generation 10 nm process processors in the Sapphire Rapids family.
+
+	  Enables -march=sapphirerapids
+
+config MROCKETLAKE
+	bool "Intel Rocket Lake"
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	select X86_P6_NOP
+	help
+
+	  Select this for eleventh-generation processors in the Rocket Lake family.
+
+	  Enables -march=rocketlake
+
+config MALDERLAKE
+	bool "Intel Alder Lake"
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	select X86_P6_NOP
+	help
+
+	  Select this for twelfth-generation processors in the Alder Lake family.
+
+	  Enables -march=alderlake
+
+config MRAPTORLAKE
+	bool "Intel Raptor Lake"
+	depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500)
+	select X86_P6_NOP
+	help
+
+	  Select this for thirteenth-generation processors in the Raptor Lake family.
+
+	  Enables -march=raptorlake
+
+config MMETEORLAKE
+	bool "Intel Meteor Lake"
+	depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500)
+	select X86_P6_NOP
+	help
+
+	  Select this for fourteenth-generation processors in the Meteor Lake family.
+
+	  Enables -march=meteorlake
+
+config MEMERALDRAPIDS
+	bool "Intel Emerald Rapids"
+	depends on (CC_IS_GCC && GCC_VERSION > 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500)
+	select X86_P6_NOP
+	help
+
+	  Select this for fifth-generation 10 nm process processors in the Emerald Rapids family.
+
+	  Enables -march=emeraldrapids
+
 config GENERIC_CPU
 	bool "Generic-x86-64"
 	depends on X86_64
@@ -294,6 +596,50 @@ config GENERIC_CPU
 	  Generic x86-64 CPU.
 	  Run equally well on all x86-64 CPUs.
 
+config GENERIC_CPU2
+	bool "Generic-x86-64-v2"
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	depends on X86_64
+	help
+	  Generic x86-64 CPU.
+	  Run equally well on all x86-64 CPUs with min support of x86-64-v2.
+
+config GENERIC_CPU3
+	bool "Generic-x86-64-v3"
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	depends on X86_64
+	help
+	  Generic x86-64-v3 CPU with v3 instructions.
+	  Run equally well on all x86-64 CPUs with min support of x86-64-v3.
+
+config GENERIC_CPU4
+	bool "Generic-x86-64-v4"
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	depends on X86_64
+	help
+	  Generic x86-64 CPU with v4 instructions.
+	  Run equally well on all x86-64 CPUs with min support of x86-64-v4.
+
+config MNATIVE_INTEL
+	bool "Intel-Native optimizations autodetected by the compiler"
+	help
+
+	  Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects
+	  the optimum settings to use based on your processor. Do NOT use this
+	  for AMD CPUs.  Intel Only!
+
+	  Enables -march=native
+
+config MNATIVE_AMD
+	bool "AMD-Native optimizations autodetected by the compiler"
+	help
+
+	  Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects
+	  the optimum settings to use based on your processor. Do NOT use this
+	  for Intel CPUs.  AMD Only!
+
+	  Enables -march=native
+
 endchoice
 
 config X86_GENERIC
@@ -318,9 +664,17 @@ config X86_INTERNODE_CACHE_SHIFT
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || MPSC
-	default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
+	default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || MK8SSE3 || MK10 \
+	|| MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \
+	|| MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \
+	|| MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \
+	|| MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \
+	|| MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE \
+	|| MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD || X86_GENERIC || GENERIC_CPU || GENERIC_CPU2 \
+	|| GENERIC_CPU3 || GENERIC_CPU4
 	default "4" if MELAN || M486SX || M486 || MGEODEGX1
-	default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
+	default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII \
+	|| MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
 
 config X86_F00F_BUG
 	def_bool y
@@ -332,15 +686,27 @@ config X86_INVD_BUG
 
 config X86_ALIGNMENT_16
 	def_bool y
-	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1
+	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC \
+	|| M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1
 
 config X86_INTEL_USERCOPY
 	def_bool y
-	depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
+	depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC \
+	|| MK8 || MK7 || MEFFICEON || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \
+	|| MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \
+	|| MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \
+	|| MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL
 
 config X86_USE_PPRO_CHECKSUM
 	def_bool y
-	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
+	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \
+	|| MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX \
+	|| MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \
+	|| MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \
+	|| MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE \
+	|| MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE \
+	|| MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \
+	|| MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD
 
 #
 # P6_NOPs are a relatively minor optimization that require a family >=
@@ -356,32 +722,63 @@ config X86_USE_PPRO_CHECKSUM
 config X86_P6_NOP
 	def_bool y
 	depends on X86_64
-	depends on (MCORE2 || MPENTIUM4 || MPSC)
+	depends on (MCORE2 || MPENTIUM4 || MPSC || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \
+	|| MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE \
+	|| MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE \
+	|| MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS \
+	|| MNATIVE_INTEL)
 
 config X86_TSC
 	def_bool y
-	depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64
+	depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \
+	|| MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 \
+	|| MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \
+	|| MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \
+	|| MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL \
+	|| MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \
+	|| MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS \
+	|| MNATIVE_INTEL || MNATIVE_AMD) || X86_64
 
 config X86_CMPXCHG64
 	def_bool y
-	depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8
+	depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \
+	|| M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 || MK8SSE3 || MK10 \
+	|| MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN \
+	|| MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS \
+	|| MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE \
+	|| MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \
+	|| MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD
 
 # this should be set for all -march=.. options where the compiler
 # generates cmov.
 config X86_CMOV
 	def_bool y
-	depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX)
+	depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \
+	|| MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX || MK8SSE3 || MK10 \
+	|| MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR \
+	|| MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \
+	|| MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \
+	|| MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \
+	|| MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD)
 
 config X86_MINIMUM_CPU_FAMILY
 	int
 	default "64" if X86_64
-	default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8)
+	default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \
+	|| MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8 ||  MK8SSE3 \
+	|| MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \
+	|| MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \
+	|| MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \
+	|| MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \
+	|| MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MRAPTORLAKE \
+	|| MNATIVE_INTEL || MNATIVE_AMD)
 	default "5" if X86_32 && X86_CMPXCHG64
 	default "4"
 
 config X86_DEBUGCTLMSR
 	def_bool y
-	depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486SX || M486) && !UML
+	depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 \
+	|| M486SX || M486) && !UML
 
 config IA32_FEAT_CTL
 	def_bool y
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b39975977c03..eae5bcbfed69 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -67,7 +67,7 @@ export BITS
 #
 #    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
 #
-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx
+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -O3 -fno-tree-vectorize
 KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2
 
 ifeq ($(CONFIG_X86_KERNEL_IBT),y)
@@ -151,8 +151,48 @@ else
         # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
         cflags-$(CONFIG_MK8)		+= -march=k8
         cflags-$(CONFIG_MPSC)		+= -march=nocona
-        cflags-$(CONFIG_MCORE2)		+= -march=core2
-        cflags-$(CONFIG_MATOM)		+= -march=atom
+        cflags-$(CONFIG_MK8SSE3)	+= -march=k8-sse3
+        cflags-$(CONFIG_MK10) 		+= -march=amdfam10
+        cflags-$(CONFIG_MBARCELONA) 	+= -march=barcelona
+        cflags-$(CONFIG_MBOBCAT) 	+= -march=btver1
+        cflags-$(CONFIG_MJAGUAR) 	+= -march=btver2
+        cflags-$(CONFIG_MBULLDOZER) 	+= -march=bdver1
+        cflags-$(CONFIG_MPILEDRIVER)	+= -march=bdver2 -mno-tbm
+        cflags-$(CONFIG_MSTEAMROLLER) 	+= -march=bdver3 -mno-tbm
+        cflags-$(CONFIG_MEXCAVATOR) 	+= -march=bdver4 -mno-tbm
+        cflags-$(CONFIG_MZEN) 		+= -march=znver1
+        cflags-$(CONFIG_MZEN2) 	+= -march=znver2
+        cflags-$(CONFIG_MZEN3) 	+= -march=znver3
+        cflags-$(CONFIG_MZEN4) 	+= -march=znver4
+        cflags-$(CONFIG_MNATIVE_INTEL) += -march=native
+        cflags-$(CONFIG_MNATIVE_AMD) 	+= -march=native
+        cflags-$(CONFIG_MATOM) 	+= -march=bonnell
+        cflags-$(CONFIG_MCORE2) 	+= -march=core2
+        cflags-$(CONFIG_MNEHALEM) 	+= -march=nehalem
+        cflags-$(CONFIG_MWESTMERE) 	+= -march=westmere
+        cflags-$(CONFIG_MSILVERMONT) 	+= -march=silvermont
+        cflags-$(CONFIG_MGOLDMONT) 	+= -march=goldmont
+        cflags-$(CONFIG_MGOLDMONTPLUS) += -march=goldmont-plus
+        cflags-$(CONFIG_MSANDYBRIDGE) 	+= -march=sandybridge
+        cflags-$(CONFIG_MIVYBRIDGE) 	+= -march=ivybridge
+        cflags-$(CONFIG_MHASWELL) 	+= -march=haswell
+        cflags-$(CONFIG_MBROADWELL) 	+= -march=broadwell
+        cflags-$(CONFIG_MSKYLAKE) 	+= -march=skylake
+        cflags-$(CONFIG_MSKYLAKEX) 	+= -march=skylake-avx512
+        cflags-$(CONFIG_MCANNONLAKE) 	+= -march=cannonlake
+        cflags-$(CONFIG_MICELAKE) 	+= -march=icelake-client
+        cflags-$(CONFIG_MCASCADELAKE) 	+= -march=cascadelake
+        cflags-$(CONFIG_MCOOPERLAKE) 	+= -march=cooperlake
+        cflags-$(CONFIG_MTIGERLAKE) 	+= -march=tigerlake
+        cflags-$(CONFIG_MSAPPHIRERAPIDS) += -march=sapphirerapids
+        cflags-$(CONFIG_MROCKETLAKE) 	+= -march=rocketlake
+        cflags-$(CONFIG_MALDERLAKE) 	+= -march=alderlake
+        cflags-$(CONFIG_MRAPTORLAKE) 	+= -march=raptorlake
+        cflags-$(CONFIG_MMETEORLAKE) 	+= -march=meteorlake
+        cflags-$(CONFIG_MEMERALDRAPIDS)	+= -march=emeraldrapids
+        cflags-$(CONFIG_GENERIC_CPU2) 	+= -march=x86-64-v2
+        cflags-$(CONFIG_GENERIC_CPU3) 	+= -march=x86-64-v3
+        cflags-$(CONFIG_GENERIC_CPU4) 	+= -march=x86-64-v4
         cflags-$(CONFIG_GENERIC_CPU)	+= -mtune=generic
         KBUILD_CFLAGS += $(cflags-y)
 
diff --git a/arch/x86/Makefile.postlink b/arch/x86/Makefile.postlink
new file mode 100644
index 000000000000..195af937aa4d
--- /dev/null
+++ b/arch/x86/Makefile.postlink
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: GPL-2.0
+# ===========================================================================
+# Post-link x86 pass
+# ===========================================================================
+#
+# 1. Separate relocations from vmlinux into vmlinux.relocs.
+# 2. Strip relocations from vmlinux.
+
+PHONY := __archpost
+__archpost:
+
+-include include/config/auto.conf
+include $(srctree)/scripts/Kbuild.include
+
+CMD_RELOCS = arch/x86/tools/relocs
+quiet_cmd_relocs = RELOCS  $@.relocs
+      cmd_relocs = $(CMD_RELOCS) $@ > $@.relocs;$(CMD_RELOCS) --abs-relocs $@
+
+quiet_cmd_strip_relocs = RSTRIP  $@
+      cmd_strip_relocs = $(OBJCOPY) --remove-section='.rel.*' --remove-section='.rel__*' --remove-section='.rela.*' --remove-section='.rela__*' $@
+
+# `@true` prevents complaint when there is nothing to be done
+
+vmlinux: FORCE
+	@true
+ifeq ($(CONFIG_X86_NEED_RELOCS),y)
+	$(call cmd,relocs)
+	$(call cmd,strip_relocs)
+endif
+
+%.ko: FORCE
+	@true
+
+clean:
+	@rm -f vmlinux.relocs
+
+PHONY += FORCE clean
+
+FORCE:
+
+.PHONY: $(PHONY)
diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore
index 25805199a506..b2968175fc27 100644
--- a/arch/x86/boot/compressed/.gitignore
+++ b/arch/x86/boot/compressed/.gitignore
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 relocs
 vmlinux.bin.all
-vmlinux.relocs
 vmlinux.lds
 mkpiggy
 piggy.S
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 6b6cfe607bdb..19d1fb601796 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -121,14 +121,12 @@ $(obj)/vmlinux.bin: vmlinux FORCE
 
 targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs
 
-CMD_RELOCS = arch/x86/tools/relocs
-quiet_cmd_relocs = RELOCS  $@
-      cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $<
-$(obj)/vmlinux.relocs: vmlinux FORCE
-	$(call if_changed,relocs)
+# vmlinux.relocs is created by the vmlinux postlink step.
+vmlinux.relocs: vmlinux
+	@true
 
 vmlinux.bin.all-y := $(obj)/vmlinux.bin
-vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs
+vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += vmlinux.relocs
 
 $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
 	$(call if_changed,gzip)
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index b40c462b4af3..c4e66e60d559 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -27,6 +27,7 @@ struct pci_sysdata {
 #if IS_ENABLED(CONFIG_VMD)
 	struct pci_dev	*vmd_dev;	/* VMD Device if in Intel VMD domain */
 #endif
+	struct pci_dev	*nvme_remap_dev;	/* AHCI Device if NVME remapped bus */
 };
 
 extern int pci_routeirq;
@@ -70,6 +71,11 @@ static inline bool is_vmd(struct pci_bus *bus)
 #define is_vmd(bus)		false
 #endif /* CONFIG_VMD */
 
+static inline bool is_nvme_remap(struct pci_bus *bus)
+{
+	return to_pci_sysdata(bus)->nvme_remap_dev != NULL;
+}
+
 /* Can be used to override the logic in pci_scan_bus for skipping
    already-configured bus numbers - to be used for buggy BIOSes
    or architectures with incomplete PCI setup by the loader */
diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h
index 75884d2cdec3..02c1386eb653 100644
--- a/arch/x86/include/asm/vermagic.h
+++ b/arch/x86/include/asm/vermagic.h
@@ -17,6 +17,54 @@
 #define MODULE_PROC_FAMILY "586MMX "
 #elif defined CONFIG_MCORE2
 #define MODULE_PROC_FAMILY "CORE2 "
+#elif defined CONFIG_MNATIVE_INTEL
+#define MODULE_PROC_FAMILY "NATIVE_INTEL "
+#elif defined CONFIG_MNATIVE_AMD
+#define MODULE_PROC_FAMILY "NATIVE_AMD "
+#elif defined CONFIG_MNEHALEM
+#define MODULE_PROC_FAMILY "NEHALEM "
+#elif defined CONFIG_MWESTMERE
+#define MODULE_PROC_FAMILY "WESTMERE "
+#elif defined CONFIG_MSILVERMONT
+#define MODULE_PROC_FAMILY "SILVERMONT "
+#elif defined CONFIG_MGOLDMONT
+#define MODULE_PROC_FAMILY "GOLDMONT "
+#elif defined CONFIG_MGOLDMONTPLUS
+#define MODULE_PROC_FAMILY "GOLDMONTPLUS "
+#elif defined CONFIG_MSANDYBRIDGE
+#define MODULE_PROC_FAMILY "SANDYBRIDGE "
+#elif defined CONFIG_MIVYBRIDGE
+#define MODULE_PROC_FAMILY "IVYBRIDGE "
+#elif defined CONFIG_MHASWELL
+#define MODULE_PROC_FAMILY "HASWELL "
+#elif defined CONFIG_MBROADWELL
+#define MODULE_PROC_FAMILY "BROADWELL "
+#elif defined CONFIG_MSKYLAKE
+#define MODULE_PROC_FAMILY "SKYLAKE "
+#elif defined CONFIG_MSKYLAKEX
+#define MODULE_PROC_FAMILY "SKYLAKEX "
+#elif defined CONFIG_MCANNONLAKE
+#define MODULE_PROC_FAMILY "CANNONLAKE "
+#elif defined CONFIG_MICELAKE
+#define MODULE_PROC_FAMILY "ICELAKE "
+#elif defined CONFIG_MCASCADELAKE
+#define MODULE_PROC_FAMILY "CASCADELAKE "
+#elif defined CONFIG_MCOOPERLAKE
+#define MODULE_PROC_FAMILY "COOPERLAKE "
+#elif defined CONFIG_MTIGERLAKE
+#define MODULE_PROC_FAMILY "TIGERLAKE "
+#elif defined CONFIG_MSAPPHIRERAPIDS
+#define MODULE_PROC_FAMILY "SAPPHIRERAPIDS "
+#elif defined CONFIG_ROCKETLAKE
+#define MODULE_PROC_FAMILY "ROCKETLAKE "
+#elif defined CONFIG_MALDERLAKE
+#define MODULE_PROC_FAMILY "ALDERLAKE "
+#elif defined CONFIG_MRAPTORLAKE
+#define MODULE_PROC_FAMILY "RAPTORLAKE "
+#elif defined CONFIG_MMETEORLAKE
+#define MODULE_PROC_FAMILY "METEORLAKE "
+#elif defined CONFIG_MEMERALDRAPIDS
+#define MODULE_PROC_FAMILY "EMERALDRAPIDS "
 #elif defined CONFIG_MATOM
 #define MODULE_PROC_FAMILY "ATOM "
 #elif defined CONFIG_M686
@@ -35,6 +83,32 @@
 #define MODULE_PROC_FAMILY "K7 "
 #elif defined CONFIG_MK8
 #define MODULE_PROC_FAMILY "K8 "
+#elif defined CONFIG_MK8SSE3
+#define MODULE_PROC_FAMILY "K8SSE3 "
+#elif defined CONFIG_MK10
+#define MODULE_PROC_FAMILY "K10 "
+#elif defined CONFIG_MBARCELONA
+#define MODULE_PROC_FAMILY "BARCELONA "
+#elif defined CONFIG_MBOBCAT
+#define MODULE_PROC_FAMILY "BOBCAT "
+#elif defined CONFIG_MBULLDOZER
+#define MODULE_PROC_FAMILY "BULLDOZER "
+#elif defined CONFIG_MPILEDRIVER
+#define MODULE_PROC_FAMILY "PILEDRIVER "
+#elif defined CONFIG_MSTEAMROLLER
+#define MODULE_PROC_FAMILY "STEAMROLLER "
+#elif defined CONFIG_MJAGUAR
+#define MODULE_PROC_FAMILY "JAGUAR "
+#elif defined CONFIG_MEXCAVATOR
+#define MODULE_PROC_FAMILY "EXCAVATOR "
+#elif defined CONFIG_MZEN
+#define MODULE_PROC_FAMILY "ZEN "
+#elif defined CONFIG_MZEN2
+#define MODULE_PROC_FAMILY "ZEN2 "
+#elif defined CONFIG_MZEN3
+#define MODULE_PROC_FAMILY "ZEN3 "
+#elif defined CONFIG_MZEN4
+#define MODULE_PROC_FAMILY "ZEN4 "
 #elif defined CONFIG_MELAN
 #define MODULE_PROC_FAMILY "ELAN "
 #elif defined CONFIG_MCRUSOE
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index ddb798603201..7c20387d8202 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -723,12 +723,15 @@ int pci_ext_cfg_avail(void)
 		return 0;
 }
 
-#if IS_ENABLED(CONFIG_VMD)
 struct pci_dev *pci_real_dma_dev(struct pci_dev *dev)
 {
+#if IS_ENABLED(CONFIG_VMD)
 	if (is_vmd(dev->bus))
 		return to_pci_sysdata(dev->bus)->vmd_dev;
+#endif
+
+	if (is_nvme_remap(dev->bus))
+		return to_pci_sysdata(dev->bus)->nvme_remap_dev;
 
 	return dev;
 }
-#endif
diff --git a/drivers/Makefile b/drivers/Makefile
index 20b118dca999..c19dee206e53 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -64,15 +64,8 @@ obj-y				+= char/
 # iommu/ comes before gpu as gpu are using iommu controllers
 obj-y				+= iommu/
 
-# gpu/ comes after char for AGP vs DRM startup and after iommu
-obj-y				+= gpu/
-
 obj-$(CONFIG_CONNECTOR)		+= connector/
 
-# i810fb and intelfb depend on char/agp/
-obj-$(CONFIG_FB_I810)           += video/fbdev/i810/
-obj-$(CONFIG_FB_INTEL)          += video/fbdev/intelfb/
-
 obj-$(CONFIG_PARPORT)		+= parport/
 obj-y				+= base/ block/ misc/ mfd/ nfc/
 obj-$(CONFIG_LIBNVDIMM)		+= nvdimm/
@@ -84,6 +77,14 @@ obj-y				+= macintosh/
 obj-y				+= scsi/
 obj-y				+= nvme/
 obj-$(CONFIG_ATA)		+= ata/
+
+# gpu/ comes after char for AGP vs DRM startup and after iommu
+obj-y				+= gpu/
+
+# i810fb and intelfb depend on char/agp/
+obj-$(CONFIG_FB_I810)           += video/fbdev/i810/
+obj-$(CONFIG_FB_INTEL)          += video/fbdev/intelfb/
+
 obj-$(CONFIG_TARGET_CORE)	+= target/
 obj-$(CONFIG_MTD)		+= mtd/
 obj-$(CONFIG_SPI)		+= spi/
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 14a1c0d14916..7f5a77ddc7d4 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -1522,7 +1522,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance)
 }
 #endif
 
-static void ahci_remap_check(struct pci_dev *pdev, int bar,
+static int ahci_remap_check(struct pci_dev *pdev, int bar,
 		struct ahci_host_priv *hpriv)
 {
 	int i;
@@ -1535,7 +1535,7 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar,
 	    pci_resource_len(pdev, bar) < SZ_512K ||
 	    bar != AHCI_PCI_BAR_STANDARD ||
 	    !(readl(hpriv->mmio + AHCI_VSCAP) & 1))
-		return;
+		return 0;
 
 	cap = readq(hpriv->mmio + AHCI_REMAP_CAP);
 	for (i = 0; i < AHCI_MAX_REMAP; i++) {
@@ -1550,18 +1550,11 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar,
 	}
 
 	if (!hpriv->remapped_nvme)
-		return;
-
-	dev_warn(&pdev->dev, "Found %u remapped NVMe devices.\n",
-		 hpriv->remapped_nvme);
-	dev_warn(&pdev->dev,
-		 "Switch your BIOS from RAID to AHCI mode to use them.\n");
+		return 0;
 
-	/*
-	 * Don't rely on the msi-x capability in the remap case,
-	 * share the legacy interrupt across ahci and remapped devices.
-	 */
-	hpriv->flags |= AHCI_HFLAG_NO_MSI;
+	/* Abort probe, allowing intel-nvme-remap to step in when available */
+	dev_info(&pdev->dev, "Device will be handled by intel-nvme-remap.\n");
+	return -ENODEV;
 }
 
 static int ahci_get_irq_vector(struct ata_host *host, int port)
@@ -1781,7 +1774,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	hpriv->mmio = pcim_iomap_table(pdev)[ahci_pci_bar];
 
 	/* detect remapped nvme devices */
-	ahci_remap_check(pdev, ahci_pci_bar, hpriv);
+	rc = ahci_remap_check(pdev, ahci_pci_bar, hpriv);
+	if (rc)
+		return rc;
 
 	sysfs_add_file_to_group(&pdev->dev.kobj,
 				&dev_attr_remapped_nvme.attr,
diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
index 00476e94db90..c3a219218fac 100644
--- a/drivers/cpufreq/Kconfig.x86
+++ b/drivers/cpufreq/Kconfig.x86
@@ -9,7 +9,6 @@ config X86_INTEL_PSTATE
 	select ACPI_PROCESSOR if ACPI
 	select ACPI_CPPC_LIB if X86_64 && ACPI && SCHED_MC_PRIO
 	select CPU_FREQ_GOV_PERFORMANCE
-	select CPU_FREQ_GOV_SCHEDUTIL if SMP
 	help
 	  This driver provides a P state for Intel core processors.
 	  The driver implements an internal governor and will become
@@ -39,7 +38,6 @@ config X86_AMD_PSTATE
 	depends on X86 && ACPI
 	select ACPI_PROCESSOR
 	select ACPI_CPPC_LIB if X86_64
-	select CPU_FREQ_GOV_SCHEDUTIL if SMP
 	help
 	  This driver adds a CPUFreq driver which utilizes a fine grain
 	  processor performance frequency control range instead of legacy
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 48a4613cef1e..ad9414c32060 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -3491,6 +3491,8 @@ static int __init intel_pstate_setup(char *str)
 
 	if (!strcmp(str, "disable"))
 		no_load = 1;
+	else if (!strcmp(str, "enable"))
+		no_load = 0;
 	else if (!strcmp(str, "active"))
 		default_driver = &intel_pstate;
 	else if (!strcmp(str, "passive"))
diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
index 25eb4e8fd22f..2f95d74ad0b4 100644
--- a/drivers/i2c/busses/Kconfig
+++ b/drivers/i2c/busses/Kconfig
@@ -229,6 +229,15 @@ config I2C_CHT_WC
 	  combined with a FUSB302 Type-C port-controller as such it is advised
 	  to also select CONFIG_TYPEC_FUSB302=m.
 
+config I2C_NCT6775
+	tristate "Nuvoton NCT6775 and compatible SMBus controller"
+	help
+		If you say yes to this option, support will be included for the
+		Nuvoton NCT6775 and compatible SMBus controllers.
+
+		This driver can also be built as a module.  If so, the module
+		will be called i2c-nct6775.
+
 config I2C_NFORCE2
 	tristate "Nvidia nForce2, nForce3 and nForce4"
 	depends on PCI
diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile
index af56fe2c75c0..76be74584719 100644
--- a/drivers/i2c/busses/Makefile
+++ b/drivers/i2c/busses/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_I2C_CHT_WC)	+= i2c-cht-wc.o
 obj-$(CONFIG_I2C_I801)		+= i2c-i801.o
 obj-$(CONFIG_I2C_ISCH)		+= i2c-isch.o
 obj-$(CONFIG_I2C_ISMT)		+= i2c-ismt.o
+obj-$(CONFIG_I2C_NCT6775)   += i2c-nct6775.o
 obj-$(CONFIG_I2C_NFORCE2)	+= i2c-nforce2.o
 obj-$(CONFIG_I2C_NFORCE2_S4985)	+= i2c-nforce2-s4985.o
 obj-$(CONFIG_I2C_NVIDIA_GPU)	+= i2c-nvidia-gpu.o
diff --git a/drivers/i2c/busses/i2c-nct6775.c b/drivers/i2c/busses/i2c-nct6775.c
new file mode 100644
index 000000000000..0462f0952043
--- /dev/null
+++ b/drivers/i2c/busses/i2c-nct6775.c
@@ -0,0 +1,647 @@
+/*
+ * i2c-nct6775 - Driver for the SMBus master functionality of
+ *	       Nuvoton NCT677x Super-I/O chips
+ *
+ * Copyright (C) 2019  Adam Honse <calcprogrammer1@gmail.com>
+ *
+ * Derived from nct6775 hwmon driver
+ * Copyright (C) 2012  Guenter Roeck <linux@roeck-us.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+#include <linux/platform_device.h>
+#include <linux/hwmon.h>
+#include <linux/hwmon-sysfs.h>
+#include <linux/hwmon-vid.h>
+#include <linux/err.h>
+#include <linux/mutex.h>
+#include <linux/delay.h>
+#include <linux/ioport.h>
+#include <linux/i2c.h>
+#include <linux/acpi.h>
+#include <linux/bitops.h>
+#include <linux/dmi.h>
+#include <linux/io.h>
+#include <linux/nospec.h>
+
+#define DRVNAME "i2c-nct6775"
+
+/* Nuvoton SMBus address offsets */
+#define SMBHSTDAT       (0 + nuvoton_nct6793d_smba)
+#define SMBBLKSZ        (1 + nuvoton_nct6793d_smba)
+#define SMBHSTCMD       (2 + nuvoton_nct6793d_smba)
+#define SMBHSTIDX       (3 + nuvoton_nct6793d_smba)  //Index field is the Command field on other controllers
+#define SMBHSTCTL       (4 + nuvoton_nct6793d_smba)
+#define SMBHSTADD       (5 + nuvoton_nct6793d_smba)
+#define SMBHSTERR       (9 + nuvoton_nct6793d_smba)
+#define SMBHSTSTS       (0xE + nuvoton_nct6793d_smba)
+
+/* Command register */
+#define NCT6793D_READ_BYTE      0
+#define NCT6793D_READ_WORD      1
+#define NCT6793D_READ_BLOCK     2
+#define NCT6793D_BLOCK_WRITE_READ_PROC_CALL 3
+#define NCT6793D_PROC_CALL      4
+#define NCT6793D_WRITE_BYTE     8
+#define NCT6793D_WRITE_WORD     9
+#define NCT6793D_WRITE_BLOCK    10
+
+/* Control register */
+#define NCT6793D_MANUAL_START   128
+#define NCT6793D_SOFT_RESET     64
+
+/* Error register */
+#define NCT6793D_NO_ACK         32
+
+/* Status register */
+#define NCT6793D_FIFO_EMPTY     1
+#define NCT6793D_FIFO_FULL      2
+#define NCT6793D_MANUAL_ACTIVE  4
+
+#define NCT6775_LD_SMBUS		0x0B
+
+/* Other settings */
+#define MAX_RETRIES		400
+
+enum kinds { nct6106, nct6775, nct6776, nct6779, nct6791, nct6792, nct6793,
+	     nct6795, nct6796, nct6798 };
+
+struct nct6775_sio_data {
+	int sioreg;
+	enum kinds kind;
+};
+
+/* used to set data->name = nct6775_device_names[data->sio_kind] */
+static const char * const nct6775_device_names[] = {
+	"nct6106",
+	"nct6775",
+	"nct6776",
+	"nct6779",
+	"nct6791",
+	"nct6792",
+	"nct6793",
+	"nct6795",
+	"nct6796",
+	"nct6798",
+};
+
+static const char * const nct6775_sio_names[] __initconst = {
+	"NCT6106D",
+	"NCT6775F",
+	"NCT6776D/F",
+	"NCT6779D",
+	"NCT6791D",
+	"NCT6792D",
+	"NCT6793D",
+	"NCT6795D",
+	"NCT6796D",
+	"NCT6798D",
+};
+
+#define SIO_REG_LDSEL		0x07	/* Logical device select */
+#define SIO_REG_DEVID		0x20	/* Device ID (2 bytes) */
+#define SIO_REG_SMBA		0x62	/* SMBus base address register */
+
+#define SIO_NCT6106_ID		0xc450
+#define SIO_NCT6775_ID		0xb470
+#define SIO_NCT6776_ID		0xc330
+#define SIO_NCT6779_ID		0xc560
+#define SIO_NCT6791_ID		0xc800
+#define SIO_NCT6792_ID		0xc910
+#define SIO_NCT6793_ID		0xd120
+#define SIO_NCT6795_ID		0xd350
+#define SIO_NCT6796_ID		0xd420
+#define SIO_NCT6798_ID		0xd428
+#define SIO_ID_MASK			0xFFF0
+
+static inline void
+superio_outb(int ioreg, int reg, int val)
+{
+	outb(reg, ioreg);
+	outb(val, ioreg + 1);
+}
+
+static inline int
+superio_inb(int ioreg, int reg)
+{
+	outb(reg, ioreg);
+	return inb(ioreg + 1);
+}
+
+static inline void
+superio_select(int ioreg, int ld)
+{
+	outb(SIO_REG_LDSEL, ioreg);
+	outb(ld, ioreg + 1);
+}
+
+static inline int
+superio_enter(int ioreg)
+{
+	/*
+	 * Try to reserve <ioreg> and <ioreg + 1> for exclusive access.
+	 */
+	if (!request_muxed_region(ioreg, 2, DRVNAME))
+		return -EBUSY;
+
+	outb(0x87, ioreg);
+	outb(0x87, ioreg);
+
+	return 0;
+}
+
+static inline void
+superio_exit(int ioreg)
+{
+	outb(0xaa, ioreg);
+	outb(0x02, ioreg);
+	outb(0x02, ioreg + 1);
+	release_region(ioreg, 2);
+}
+
+/*
+ * ISA constants
+ */
+
+#define IOREGION_ALIGNMENT	(~7)
+#define IOREGION_LENGTH		2
+#define ADDR_REG_OFFSET		0
+#define DATA_REG_OFFSET		1
+
+#define NCT6775_REG_BANK	0x4E
+#define NCT6775_REG_CONFIG	0x40
+
+static struct i2c_adapter *nct6775_adapter;
+
+struct i2c_nct6775_adapdata {
+	unsigned short smba;
+};
+
+/* Return negative errno on error. */
+static s32 nct6775_access(struct i2c_adapter * adap, u16 addr,
+		 unsigned short flags, char read_write,
+		 u8 command, int size, union i2c_smbus_data * data)
+{
+	struct i2c_nct6775_adapdata *adapdata = i2c_get_adapdata(adap);
+	unsigned short nuvoton_nct6793d_smba = adapdata->smba;
+	int i, len, cnt;
+	union i2c_smbus_data tmp_data;
+	int timeout = 0;
+
+	tmp_data.word = 0;
+	cnt = 0;
+	len = 0;
+
+	outb_p(NCT6793D_SOFT_RESET, SMBHSTCTL);
+
+	switch (size) {
+		case I2C_SMBUS_QUICK:
+			outb_p((addr << 1) | read_write,
+			       SMBHSTADD);
+			break;
+		case I2C_SMBUS_BYTE_DATA:
+			tmp_data.byte = data->byte;
+		case I2C_SMBUS_BYTE:
+			outb_p((addr << 1) | read_write,
+			       SMBHSTADD);
+			outb_p(command, SMBHSTIDX);
+			if (read_write == I2C_SMBUS_WRITE) {
+				outb_p(tmp_data.byte, SMBHSTDAT);
+				outb_p(NCT6793D_WRITE_BYTE, SMBHSTCMD);
+			}
+			else {
+				outb_p(NCT6793D_READ_BYTE, SMBHSTCMD);
+			}
+			break;
+		case I2C_SMBUS_WORD_DATA:
+			outb_p((addr << 1) | read_write,
+			       SMBHSTADD);
+			outb_p(command, SMBHSTIDX);
+			if (read_write == I2C_SMBUS_WRITE) {
+				outb_p(data->word & 0xff, SMBHSTDAT);
+				outb_p((data->word & 0xff00) >> 8, SMBHSTDAT);
+				outb_p(NCT6793D_WRITE_WORD, SMBHSTCMD);
+			}
+			else {
+				outb_p(NCT6793D_READ_WORD, SMBHSTCMD);
+			}
+			break;
+		case I2C_SMBUS_BLOCK_DATA:
+			outb_p((addr << 1) | read_write,
+			       SMBHSTADD);
+			outb_p(command, SMBHSTIDX);
+			if (read_write == I2C_SMBUS_WRITE) {
+				len = data->block[0];
+				if (len == 0 || len > I2C_SMBUS_BLOCK_MAX)
+					return -EINVAL;
+				outb_p(len, SMBBLKSZ);
+
+				cnt = 1;
+				if (len >= 4) {
+					for (i = cnt; i <= 4; i++) {
+						outb_p(data->block[i], SMBHSTDAT);
+					}
+
+					len -= 4;
+					cnt += 4;
+				}
+				else {
+					for (i = cnt; i <= len; i++ ) {
+						outb_p(data->block[i], SMBHSTDAT);
+					}
+
+					len = 0;
+				}
+
+				outb_p(NCT6793D_WRITE_BLOCK, SMBHSTCMD);
+			}
+			else {
+				return -ENOTSUPP;
+			}
+			break;
+		default:
+			dev_warn(&adap->dev, "Unsupported transaction %d\n", size);
+			return -EOPNOTSUPP;
+	}
+
+	outb_p(NCT6793D_MANUAL_START, SMBHSTCTL);
+
+	while ((size == I2C_SMBUS_BLOCK_DATA) && (len > 0)) {
+		if (read_write == I2C_SMBUS_WRITE) {
+			timeout = 0;
+			while ((inb_p(SMBHSTSTS) & NCT6793D_FIFO_EMPTY) == 0)
+			{
+				if(timeout > MAX_RETRIES)
+				{
+					return -ETIMEDOUT;
+				}
+				usleep_range(250, 500);
+				timeout++;
+			}
+
+			//Load more bytes into FIFO
+			if (len >= 4) {
+				for (i = cnt; i <= (cnt + 4); i++) {
+					outb_p(data->block[i], SMBHSTDAT);
+				}
+
+				len -= 4;
+				cnt += 4;
+			}
+			else {
+				for (i = cnt; i <= (cnt + len); i++) {
+					outb_p(data->block[i], SMBHSTDAT);
+				}
+
+				len = 0;
+			}
+		}
+		else {
+			return -ENOTSUPP;
+		}
+		
+	}
+
+	//wait for manual mode to complete
+	timeout = 0;
+	while ((inb_p(SMBHSTSTS) & NCT6793D_MANUAL_ACTIVE) != 0)
+	{
+		if(timeout > MAX_RETRIES)
+		{
+			return -ETIMEDOUT;
+		}
+		usleep_range(250, 500);
+		timeout++;
+	}
+
+	if ((inb_p(SMBHSTERR) & NCT6793D_NO_ACK) != 0) {    	
+		return -ENXIO;
+	}
+	else if ((read_write == I2C_SMBUS_WRITE) || (size == I2C_SMBUS_QUICK)) {
+		return 0;
+	}
+
+	switch (size) {
+		case I2C_SMBUS_QUICK:
+		case I2C_SMBUS_BYTE_DATA:
+			data->byte = inb_p(SMBHSTDAT);
+			break;
+		case I2C_SMBUS_WORD_DATA:
+			data->word = inb_p(SMBHSTDAT) + (inb_p(SMBHSTDAT) << 8);
+			break;
+	}
+	return 0;
+}
+
+static u32 nct6775_func(struct i2c_adapter *adapter)
+{
+	return I2C_FUNC_SMBUS_QUICK | I2C_FUNC_SMBUS_BYTE |
+	    I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA |
+	    I2C_FUNC_SMBUS_BLOCK_DATA;
+}
+
+static const struct i2c_algorithm smbus_algorithm = {
+	.smbus_xfer	= nct6775_access,
+	.functionality	= nct6775_func,
+};
+
+static int nct6775_add_adapter(unsigned short smba, const char *name, struct i2c_adapter **padap)
+{
+	struct i2c_adapter *adap;
+	struct i2c_nct6775_adapdata *adapdata;
+	int retval;
+
+	adap = kzalloc(sizeof(*adap), GFP_KERNEL);
+	if (adap == NULL) {
+		return -ENOMEM;
+	}
+
+	adap->owner = THIS_MODULE;
+	adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD;
+	adap->algo = &smbus_algorithm;
+
+	adapdata = kzalloc(sizeof(*adapdata), GFP_KERNEL);
+	if (adapdata == NULL) {
+		kfree(adap);
+		return -ENOMEM;
+	}
+
+	adapdata->smba = smba;
+
+	snprintf(adap->name, sizeof(adap->name),
+		"SMBus NCT67xx adapter%s at %04x", name, smba);
+
+	i2c_set_adapdata(adap, adapdata);
+
+	retval = i2c_add_adapter(adap);
+	if (retval) {
+		kfree(adapdata);
+		kfree(adap);
+		return retval;
+	}
+
+	*padap = adap;
+	return 0;
+}
+
+static void nct6775_remove_adapter(struct i2c_adapter *adap)
+{
+	struct i2c_nct6775_adapdata *adapdata = i2c_get_adapdata(adap);
+
+	if (adapdata->smba) {
+		i2c_del_adapter(adap);
+		kfree(adapdata);
+		kfree(adap);
+	}
+}
+
+//static SIMPLE_DEV_PM_OPS(nct6775_dev_pm_ops, nct6775_suspend, nct6775_resume);
+
+/*
+ * when Super-I/O functions move to a separate file, the Super-I/O
+ * bus will manage the lifetime of the device and this module will only keep
+ * track of the nct6775 driver. But since we use platform_device_alloc(), we
+ * must keep track of the device
+ */
+static struct platform_device *pdev[2];
+
+static int nct6775_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct nct6775_sio_data *sio_data = dev_get_platdata(dev);
+	struct resource *res;
+
+	res = platform_get_resource(pdev, IORESOURCE_IO, 0);
+	if (!devm_request_region(&pdev->dev, res->start, IOREGION_LENGTH,
+				 DRVNAME))
+		return -EBUSY;
+
+	switch (sio_data->kind) {
+	case nct6791:
+	case nct6792:
+	case nct6793:
+	case nct6795:
+	case nct6796:
+	case nct6798:
+		nct6775_add_adapter(res->start, "", &nct6775_adapter);
+		break;
+	default:
+		return -ENODEV;
+	}
+
+	return 0;
+}
+/*
+static void nct6791_enable_io_mapping(int sioaddr)
+{
+	int val;
+
+	val = superio_inb(sioaddr, NCT6791_REG_HM_IO_SPACE_LOCK_ENABLE);
+	if (val & 0x10) {
+		pr_info("Enabling hardware monitor logical device mappings.\n");
+		superio_outb(sioaddr, NCT6791_REG_HM_IO_SPACE_LOCK_ENABLE,
+			     val & ~0x10);
+	}
+}*/
+
+static struct platform_driver i2c_nct6775_driver = {
+	.driver = {
+		.name	= DRVNAME,
+//		.pm	= &nct6775_dev_pm_ops,
+	},
+	.probe		= nct6775_probe,
+};
+
+static void __exit i2c_nct6775_exit(void)
+{
+	int i;
+
+	if(nct6775_adapter)
+		nct6775_remove_adapter(nct6775_adapter);
+
+	for (i = 0; i < ARRAY_SIZE(pdev); i++) {
+		if (pdev[i])
+			platform_device_unregister(pdev[i]);
+	}
+	platform_driver_unregister(&i2c_nct6775_driver);
+}
+
+/* nct6775_find() looks for a '627 in the Super-I/O config space */
+static int __init nct6775_find(int sioaddr, struct nct6775_sio_data *sio_data)
+{
+	u16 val;
+	int err;
+	int addr;
+
+	err = superio_enter(sioaddr);
+	if (err)
+		return err;
+
+	val = (superio_inb(sioaddr, SIO_REG_DEVID) << 8) |
+		superio_inb(sioaddr, SIO_REG_DEVID + 1);
+
+	switch (val & SIO_ID_MASK) {
+	case SIO_NCT6106_ID:
+		sio_data->kind = nct6106;
+		break;
+	case SIO_NCT6775_ID:
+		sio_data->kind = nct6775;
+		break;
+	case SIO_NCT6776_ID:
+		sio_data->kind = nct6776;
+		break;
+	case SIO_NCT6779_ID:
+		sio_data->kind = nct6779;
+		break;
+	case SIO_NCT6791_ID:
+		sio_data->kind = nct6791;
+		break;
+	case SIO_NCT6792_ID:
+		sio_data->kind = nct6792;
+		break;
+	case SIO_NCT6793_ID:
+		sio_data->kind = nct6793;
+		break;
+	case SIO_NCT6795_ID:
+		sio_data->kind = nct6795;
+		break;
+	case SIO_NCT6796_ID:
+		sio_data->kind = nct6796;
+		break;
+	case SIO_NCT6798_ID:
+		sio_data->kind = nct6798;
+		break;
+	default:
+		if (val != 0xffff)
+			pr_debug("unsupported chip ID: 0x%04x\n", val);
+		superio_exit(sioaddr);
+		return -ENODEV;
+	}
+
+	/* We have a known chip, find the SMBus I/O address */
+	superio_select(sioaddr, NCT6775_LD_SMBUS);
+	val = (superio_inb(sioaddr, SIO_REG_SMBA) << 8)
+	    | superio_inb(sioaddr, SIO_REG_SMBA + 1);
+	addr = val & IOREGION_ALIGNMENT;
+	if (addr == 0) {
+		pr_err("Refusing to enable a Super-I/O device with a base I/O port 0\n");
+		superio_exit(sioaddr);
+		return -ENODEV;
+	}
+
+	//if (sio_data->kind == nct6791 || sio_data->kind == nct6792 ||
+	//    sio_data->kind == nct6793 || sio_data->kind == nct6795 ||
+	//    sio_data->kind == nct6796)
+	//	nct6791_enable_io_mapping(sioaddr);
+
+	superio_exit(sioaddr);
+	pr_info("Found %s or compatible chip at %#x:%#x\n",
+		nct6775_sio_names[sio_data->kind], sioaddr, addr);
+	sio_data->sioreg = sioaddr;
+
+	return addr;
+}
+
+static int __init i2c_nct6775_init(void)
+{
+	int i, err;
+	bool found = false;
+	int address;
+	struct resource res;
+	struct nct6775_sio_data sio_data;
+	int sioaddr[2] = { 0x2e, 0x4e };
+
+	err = platform_driver_register(&i2c_nct6775_driver);
+	if (err)
+		return err;
+
+	/*
+	 * initialize sio_data->kind and sio_data->sioreg.
+	 *
+	 * when Super-I/O functions move to a separate file, the Super-I/O
+	 * driver will probe 0x2e and 0x4e and auto-detect the presence of a
+	 * nct6775 hardware monitor, and call probe()
+	 */
+	for (i = 0; i < ARRAY_SIZE(pdev); i++) {
+		address = nct6775_find(sioaddr[i], &sio_data);
+		if (address <= 0)
+			continue;
+
+		found = true;
+
+		pdev[i] = platform_device_alloc(DRVNAME, address);
+		if (!pdev[i]) {
+			err = -ENOMEM;
+			goto exit_device_unregister;
+		}
+
+		err = platform_device_add_data(pdev[i], &sio_data,
+					       sizeof(struct nct6775_sio_data));
+		if (err)
+			goto exit_device_put;
+
+		memset(&res, 0, sizeof(res));
+		res.name = DRVNAME;
+		res.start = address;
+		res.end = address + IOREGION_LENGTH - 1;
+		res.flags = IORESOURCE_IO;
+
+		err = acpi_check_resource_conflict(&res);
+		if (err) {
+			platform_device_put(pdev[i]);
+			pdev[i] = NULL;
+			continue;
+		}
+
+		err = platform_device_add_resources(pdev[i], &res, 1);
+		if (err)
+			goto exit_device_put;
+
+		/* platform_device_add calls probe() */
+		err = platform_device_add(pdev[i]);
+		if (err)
+			goto exit_device_put;
+	}
+	if (!found) {
+		err = -ENODEV;
+		goto exit_unregister;
+	}
+
+	return 0;
+
+exit_device_put:
+	platform_device_put(pdev[i]);
+exit_device_unregister:
+	while (--i >= 0) {
+		if (pdev[i])
+			platform_device_unregister(pdev[i]);
+	}
+exit_unregister:
+	platform_driver_unregister(&i2c_nct6775_driver);
+	return err;
+}
+
+MODULE_AUTHOR("Adam Honse <calcprogrammer1@gmail.com>");
+MODULE_DESCRIPTION("SMBus driver for NCT6775F and compatible chips");
+MODULE_LICENSE("GPL");
+
+module_init(i2c_nct6775_init);
+module_exit(i2c_nct6775_exit);
diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c
index 809fbd014cd6..d54b35b147ee 100644
--- a/drivers/i2c/busses/i2c-piix4.c
+++ b/drivers/i2c/busses/i2c-piix4.c
@@ -568,11 +568,11 @@ static int piix4_transaction(struct i2c_adapter *piix4_adapter)
 	if (srvrworks_csb5_delay) /* Extra delay for SERVERWORKS_CSB5 */
 		usleep_range(2000, 2100);
 	else
-		usleep_range(250, 500);
+		usleep_range(25, 50);
 
 	while ((++timeout < MAX_TIMEOUT) &&
 	       ((temp = inb_p(SMBHSTSTS)) & 0x01))
-		usleep_range(250, 500);
+		usleep_range(25, 50);
 
 	/* If the SMBus is still busy, we give up */
 	if (timeout == MAX_TIMEOUT) {
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 3ba53dc3cc3f..0fde1b3ced78 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -3213,6 +3213,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 			goto bad;
 	}
 
+#ifdef CONFIG_CACHY
+	set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags);
+	set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags);
+#endif
+
 	ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
 	if (ret < 0)
 		goto bad;
diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile
index 37c8663de7fe..897d19f92ede 100644
--- a/drivers/pci/controller/Makefile
+++ b/drivers/pci/controller/Makefile
@@ -1,4 +1,10 @@
 # SPDX-License-Identifier: GPL-2.0
+ifdef CONFIG_X86_64
+ifdef CONFIG_SATA_AHCI
+obj-y += intel-nvme-remap.o
+endif
+endif
+
 obj-$(CONFIG_PCIE_CADENCE) += cadence/
 obj-$(CONFIG_PCI_FTPCI100) += pci-ftpci100.o
 obj-$(CONFIG_PCI_IXP4XX) += pci-ixp4xx.o
diff --git a/drivers/pci/controller/intel-nvme-remap.c b/drivers/pci/controller/intel-nvme-remap.c
new file mode 100644
index 000000000000..e105e6f5cc91
--- /dev/null
+++ b/drivers/pci/controller/intel-nvme-remap.c
@@ -0,0 +1,462 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel remapped NVMe device support.
+ *
+ * Copyright (c) 2019 Endless Mobile, Inc.
+ * Author: Daniel Drake <drake@endlessm.com>
+ *
+ * Some products ship by default with the SATA controller in "RAID" or
+ * "Intel RST Premium With Intel Optane System Acceleration" mode. Under this
+ * mode, which we refer to as "remapped NVMe" mode, any installed NVMe
+ * devices disappear from the PCI bus, and instead their I/O memory becomes
+ * available within the AHCI device BARs.
+ *
+ * This scheme is understood to be a way of avoiding usage of the standard
+ * Windows NVMe driver under that OS, instead mandating usage of Intel's
+ * driver instead, which has better power management, and presumably offers
+ * some RAID/disk-caching solutions too.
+ *
+ * Here in this driver, we support the remapped NVMe mode by claiming the
+ * AHCI device and creating a fake PCIe root port. On the new bus, the
+ * original AHCI device is exposed with only minor tweaks. Then, fake PCI
+ * devices corresponding to the remapped NVMe devices are created. The usual
+ * ahci and nvme drivers are then expected to bind to these devices and
+ * operate as normal.
+ *
+ * The PCI configuration space for the NVMe devices is completely
+ * unavailable, so we fake a minimal one and hope for the best.
+ *
+ * Interrupts are shared between the AHCI and NVMe devices. For simplicity,
+ * we only support the legacy interrupt here, although MSI support
+ * could potentially be added later.
+ */
+
+#define MODULE_NAME "intel-nvme-remap"
+
+#include <linux/ahci-remap.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#define AHCI_PCI_BAR_STANDARD 5
+
+struct nvme_remap_dev {
+	struct pci_dev		*dev;		/* AHCI device */
+	struct pci_bus		*bus;		/* our fake PCI bus */
+	struct pci_sysdata	sysdata;
+	int			irq_base;	/* our fake interrupts */
+
+	/*
+	 * When we detect an all-ones write to a BAR register, this flag
+	 * is set, so that we return the BAR size on the next read (a
+	 * standard PCI behaviour).
+	 * This includes the assumption that an all-ones BAR write is
+	 * immediately followed by a read of the same register.
+	 */
+	bool			bar_sizing;
+
+	/*
+	 * Resources copied from the AHCI device, to be regarded as
+	 * resources on our fake bus.
+	 */
+	struct resource		ahci_resources[PCI_NUM_RESOURCES];
+
+	/* Resources corresponding to the NVMe devices. */
+	struct resource		remapped_dev_mem[AHCI_MAX_REMAP];
+
+	/* Number of remapped NVMe devices found. */
+	int			num_remapped_devices;
+};
+
+static inline struct nvme_remap_dev *nrdev_from_bus(struct pci_bus *bus)
+{
+	return container_of(bus->sysdata, struct nvme_remap_dev, sysdata);
+}
+
+
+/******** PCI configuration space **********/
+
+/*
+ * Helper macros for tweaking returned contents of PCI configuration space.
+ *
+ * value contains len bytes of data read from reg.
+ * If fixup_reg is included in that range, fix up the contents of that
+ * register to fixed_value.
+ */
+#define NR_FIX8(fixup_reg, fixed_value) do { \
+		if (reg <= fixup_reg && fixup_reg < reg + len) \
+			((u8 *) value)[fixup_reg - reg] = (u8) (fixed_value); \
+	} while (0)
+
+#define NR_FIX16(fixup_reg, fixed_value) do { \
+		NR_FIX8(fixup_reg, fixed_value); \
+		NR_FIX8(fixup_reg + 1, fixed_value >> 8); \
+	} while (0)
+
+#define NR_FIX24(fixup_reg, fixed_value) do { \
+		NR_FIX8(fixup_reg, fixed_value); \
+		NR_FIX8(fixup_reg + 1, fixed_value >> 8); \
+		NR_FIX8(fixup_reg + 2, fixed_value >> 16); \
+	} while (0)
+
+#define NR_FIX32(fixup_reg, fixed_value) do { \
+		NR_FIX16(fixup_reg, (u16) fixed_value); \
+		NR_FIX16(fixup_reg + 2, fixed_value >> 16); \
+	} while (0)
+
+/*
+ * Read PCI config space of the slot 0 (AHCI) device.
+ * We pass through the read request to the underlying device, but
+ * tweak the results in some cases.
+ */
+static int nvme_remap_pci_read_slot0(struct pci_bus *bus, int reg,
+				     int len, u32 *value)
+{
+	struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+	struct pci_bus *ahci_dev_bus = nrdev->dev->bus;
+	int ret;
+
+	ret = ahci_dev_bus->ops->read(ahci_dev_bus, nrdev->dev->devfn,
+				      reg, len, value);
+	if (ret)
+		return ret;
+
+	/*
+	 * Adjust the device class, to prevent this driver from attempting to
+	 * additionally probe the device we're simulating here.
+	 */
+	NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_SATA_AHCI);
+
+	/*
+	 * Unset interrupt pin, otherwise ACPI tries to find routing
+	 * info for our virtual IRQ, fails, and complains.
+	 */
+	NR_FIX8(PCI_INTERRUPT_PIN, 0);
+
+	/*
+	 * Truncate the AHCI BAR to not include the region that covers the
+	 * hidden devices. This will cause the ahci driver to successfully
+	 * probe th new device (instead of handing it over to this driver).
+	 */
+	if (nrdev->bar_sizing) {
+		NR_FIX32(PCI_BASE_ADDRESS_5, ~(SZ_16K - 1));
+		nrdev->bar_sizing = false;
+	}
+
+	return PCIBIOS_SUCCESSFUL;
+}
+
+/*
+ * Read PCI config space of a remapped device.
+ * Since the original PCI config space is inaccessible, we provide a minimal,
+ * fake config space instead.
+ */
+static int nvme_remap_pci_read_remapped(struct pci_bus *bus, unsigned int port,
+					int reg, int len, u32 *value)
+{
+	struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+	struct resource *remapped_mem;
+
+	if (port > nrdev->num_remapped_devices)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	*value = 0;
+	remapped_mem = &nrdev->remapped_dev_mem[port - 1];
+
+	/* Set a Vendor ID, otherwise Linux assumes no device is present */
+	NR_FIX16(PCI_VENDOR_ID, PCI_VENDOR_ID_INTEL);
+
+	/* Always appear on & bus mastering */
+	NR_FIX16(PCI_COMMAND, PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER);
+
+	/* Set class so that nvme driver probes us */
+	NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_EXPRESS);
+
+	if (nrdev->bar_sizing) {
+		NR_FIX32(PCI_BASE_ADDRESS_0,
+			 ~(resource_size(remapped_mem) - 1));
+		nrdev->bar_sizing = false;
+	} else {
+		resource_size_t mem_start = remapped_mem->start;
+
+		mem_start |= PCI_BASE_ADDRESS_MEM_TYPE_64;
+		NR_FIX32(PCI_BASE_ADDRESS_0, mem_start);
+		mem_start >>= 32;
+		NR_FIX32(PCI_BASE_ADDRESS_1, mem_start);
+	}
+
+	return PCIBIOS_SUCCESSFUL;
+}
+
+/* Read PCI configuration space. */
+static int nvme_remap_pci_read(struct pci_bus *bus, unsigned int devfn,
+			       int reg, int len, u32 *value)
+{
+	if (PCI_SLOT(devfn) == 0)
+		return nvme_remap_pci_read_slot0(bus, reg, len, value);
+	else
+		return nvme_remap_pci_read_remapped(bus, PCI_SLOT(devfn),
+						    reg, len, value);
+}
+
+/*
+ * Write PCI config space of the slot 0 (AHCI) device.
+ * Apart from the special case of BAR sizing, we disable all writes.
+ * Otherwise, the ahci driver could make changes (e.g. unset PCI bus master)
+ * that would affect the operation of the NVMe devices.
+ */
+static int nvme_remap_pci_write_slot0(struct pci_bus *bus, int reg,
+				      int len, u32 value)
+{
+	struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+	struct pci_bus *ahci_dev_bus = nrdev->dev->bus;
+
+	if (reg >= PCI_BASE_ADDRESS_0 && reg <= PCI_BASE_ADDRESS_5) {
+		/*
+		 * Writing all-ones to a BAR means that the size of the
+		 * memory region is being checked. Flag this so that we can
+		 * reply with an appropriate size on the next read.
+		 */
+		if (value == ~0)
+			nrdev->bar_sizing = true;
+
+		return ahci_dev_bus->ops->write(ahci_dev_bus,
+						nrdev->dev->devfn,
+						reg, len, value);
+	}
+
+	return PCIBIOS_SET_FAILED;
+}
+
+/*
+ * Write PCI config space of a remapped device.
+ * Since the original PCI config space is inaccessible, we reject all
+ * writes, except for the special case of BAR probing.
+ */
+static int nvme_remap_pci_write_remapped(struct pci_bus *bus,
+					 unsigned int port,
+					 int reg, int len, u32 value)
+{
+	struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+
+	if (port > nrdev->num_remapped_devices)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	/*
+	 * Writing all-ones to a BAR means that the size of the memory
+	 * region is being checked. Flag this so that we can reply with
+	 * an appropriate size on the next read.
+	 */
+	if (value == ~0 && reg >= PCI_BASE_ADDRESS_0
+			&& reg <= PCI_BASE_ADDRESS_5) {
+		nrdev->bar_sizing = true;
+		return PCIBIOS_SUCCESSFUL;
+	}
+
+	return PCIBIOS_SET_FAILED;
+}
+
+/* Write PCI configuration space. */
+static int nvme_remap_pci_write(struct pci_bus *bus, unsigned int devfn,
+				int reg, int len, u32 value)
+{
+	if (PCI_SLOT(devfn) == 0)
+		return nvme_remap_pci_write_slot0(bus, reg, len, value);
+	else
+		return nvme_remap_pci_write_remapped(bus, PCI_SLOT(devfn),
+						     reg, len, value);
+}
+
+static struct pci_ops nvme_remap_pci_ops = {
+	.read	= nvme_remap_pci_read,
+	.write	= nvme_remap_pci_write,
+};
+
+
+/******** Initialization & exit **********/
+
+/*
+ * Find a PCI domain ID to use for our fake bus.
+ * Start at 0x10000 to not clash with ACPI _SEG domains (16 bits).
+ */
+static int find_free_domain(void)
+{
+	int domain = 0xffff;
+	struct pci_bus *bus = NULL;
+
+	while ((bus = pci_find_next_bus(bus)) != NULL)
+		domain = max_t(int, domain, pci_domain_nr(bus));
+
+	return domain + 1;
+}
+
+static int find_remapped_devices(struct nvme_remap_dev *nrdev,
+				 struct list_head *resources)
+{
+	void __iomem *mmio;
+	int i, count = 0;
+	u32 cap;
+
+	mmio = pcim_iomap(nrdev->dev, AHCI_PCI_BAR_STANDARD,
+			  pci_resource_len(nrdev->dev,
+					   AHCI_PCI_BAR_STANDARD));
+	if (!mmio)
+		return -ENODEV;
+
+	/* Check if this device might have remapped nvme devices. */
+	if (pci_resource_len(nrdev->dev, AHCI_PCI_BAR_STANDARD) < SZ_512K ||
+	    !(readl(mmio + AHCI_VSCAP) & 1))
+		return -ENODEV;
+
+	cap = readq(mmio + AHCI_REMAP_CAP);
+	for (i = AHCI_MAX_REMAP-1; i >= 0; i--) {
+		struct resource *remapped_mem;
+
+		if ((cap & (1 << i)) == 0)
+			continue;
+		if (readl(mmio + ahci_remap_dcc(i))
+				!= PCI_CLASS_STORAGE_EXPRESS)
+			continue;
+
+		/* We've found a remapped device */
+		remapped_mem = &nrdev->remapped_dev_mem[count++];
+		remapped_mem->start =
+			pci_resource_start(nrdev->dev, AHCI_PCI_BAR_STANDARD)
+			+ ahci_remap_base(i);
+		remapped_mem->end = remapped_mem->start
+			+ AHCI_REMAP_N_SIZE - 1;
+		remapped_mem->flags = IORESOURCE_MEM | IORESOURCE_PCI_FIXED;
+		pci_add_resource(resources, remapped_mem);
+	}
+
+	pcim_iounmap(nrdev->dev, mmio);
+
+	if (count == 0)
+		return -ENODEV;
+
+	nrdev->num_remapped_devices = count;
+	dev_info(&nrdev->dev->dev, "Found %d remapped NVMe devices\n",
+		 nrdev->num_remapped_devices);
+	return 0;
+}
+
+static void nvme_remap_remove_root_bus(void *data)
+{
+	struct pci_bus *bus = data;
+
+	pci_stop_root_bus(bus);
+	pci_remove_root_bus(bus);
+}
+
+static int nvme_remap_probe(struct pci_dev *dev,
+			    const struct pci_device_id *id)
+{
+	struct nvme_remap_dev *nrdev;
+	LIST_HEAD(resources);
+	int i;
+	int ret;
+	struct pci_dev *child;
+
+	nrdev = devm_kzalloc(&dev->dev, sizeof(*nrdev), GFP_KERNEL);
+	nrdev->sysdata.domain = find_free_domain();
+	nrdev->sysdata.nvme_remap_dev = dev;
+	nrdev->dev = dev;
+	pci_set_drvdata(dev, nrdev);
+
+	ret = pcim_enable_device(dev);
+	if (ret < 0)
+		return ret;
+
+	pci_set_master(dev);
+
+	ret = find_remapped_devices(nrdev, &resources);
+	if (ret)
+		return ret;
+
+	/* Add resources from the original AHCI device */
+	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+		struct resource *res = &dev->resource[i];
+
+		if (res->start) {
+			struct resource *nr_res = &nrdev->ahci_resources[i];
+
+			nr_res->start = res->start;
+			nr_res->end = res->end;
+			nr_res->flags = res->flags;
+			pci_add_resource(&resources, nr_res);
+		}
+	}
+
+	/* Create virtual interrupts */
+	nrdev->irq_base = devm_irq_alloc_descs(&dev->dev, -1, 0,
+					       nrdev->num_remapped_devices + 1,
+					       0);
+	if (nrdev->irq_base < 0)
+		return nrdev->irq_base;
+
+	/* Create and populate PCI bus */
+	nrdev->bus = pci_create_root_bus(&dev->dev, 0, &nvme_remap_pci_ops,
+					 &nrdev->sysdata, &resources);
+	if (!nrdev->bus)
+		return -ENODEV;
+
+	if (devm_add_action_or_reset(&dev->dev, nvme_remap_remove_root_bus,
+				     nrdev->bus))
+		return -ENOMEM;
+
+	/* We don't support sharing MSI interrupts between these devices */
+	nrdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_MSI;
+
+	pci_scan_child_bus(nrdev->bus);
+
+	list_for_each_entry(child, &nrdev->bus->devices, bus_list) {
+		/*
+		 * Prevent PCI core from trying to move memory BARs around.
+		 * The hidden NVMe devices are at fixed locations.
+		 */
+		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+			struct resource *res = &child->resource[i];
+
+			if (res->flags & IORESOURCE_MEM)
+				res->flags |= IORESOURCE_PCI_FIXED;
+		}
+
+		/* Share the legacy IRQ between all devices */
+		child->irq = dev->irq;
+	}
+
+	pci_assign_unassigned_bus_resources(nrdev->bus);
+	pci_bus_add_devices(nrdev->bus);
+
+	return 0;
+}
+
+static const struct pci_device_id nvme_remap_ids[] = {
+	/*
+	 * Match all Intel RAID controllers.
+	 *
+	 * There's overlap here with the set of devices detected by the ahci
+	 * driver, but ahci will only successfully probe when there
+	 * *aren't* any remapped NVMe devices, and this driver will only
+	 * successfully probe when there *are* remapped NVMe devices that
+	 * need handling.
+	 */
+	{
+		PCI_VDEVICE(INTEL, PCI_ANY_ID),
+		.class = PCI_CLASS_STORAGE_RAID << 8,
+		.class_mask = 0xffffff00,
+	},
+	{0,}
+};
+MODULE_DEVICE_TABLE(pci, nvme_remap_ids);
+
+static struct pci_driver nvme_remap_drv = {
+	.name		= MODULE_NAME,
+	.id_table	= nvme_remap_ids,
+	.probe		= nvme_remap_probe,
+};
+module_pci_driver(nvme_remap_drv);
+
+MODULE_AUTHOR("Daniel Drake <drake@endlessm.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index c525867760bf..9232b0fcb945 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -3625,6 +3625,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev)
 	dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET;
 }
 
+static bool acs_on_downstream;
+static bool acs_on_multifunction;
+
+#define NUM_ACS_IDS 16
+struct acs_on_id {
+	unsigned short vendor;
+	unsigned short device;
+};
+static struct acs_on_id acs_on_ids[NUM_ACS_IDS];
+static u8 max_acs_id;
+
+static __init int pcie_acs_override_setup(char *p)
+{
+	if (!p)
+		return -EINVAL;
+
+	while (*p) {
+		if (!strncmp(p, "downstream", 10))
+			acs_on_downstream = true;
+		if (!strncmp(p, "multifunction", 13))
+			acs_on_multifunction = true;
+		if (!strncmp(p, "id:", 3)) {
+			char opt[5];
+			int ret;
+			long val;
+
+			if (max_acs_id >= NUM_ACS_IDS - 1) {
+				pr_warn("Out of PCIe ACS override slots (%d)\n",
+						NUM_ACS_IDS);
+				goto next;
+			}
+
+			p += 3;
+			snprintf(opt, 5, "%s", p);
+			ret = kstrtol(opt, 16, &val);
+			if (ret) {
+				pr_warn("PCIe ACS ID parse error %d\n", ret);
+				goto next;
+			}
+			acs_on_ids[max_acs_id].vendor = val;
+
+			p += strcspn(p, ":");
+			if (*p != ':') {
+				pr_warn("PCIe ACS invalid ID\n");
+				goto next;
+			}
+
+			p++;
+			snprintf(opt, 5, "%s", p);
+			ret = kstrtol(opt, 16, &val);
+			if (ret) {
+				pr_warn("PCIe ACS ID parse error %d\n", ret);
+				goto next;
+			}
+			acs_on_ids[max_acs_id].device = val;
+			max_acs_id++;
+		}
+next:
+		p += strcspn(p, ",");
+		if (*p == ',')
+			p++;
+	}
+
+	if (acs_on_downstream || acs_on_multifunction || max_acs_id)
+		pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n");
+
+	return 0;
+}
+early_param("pcie_acs_override", pcie_acs_override_setup);
+
+static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags)
+{
+	int i;
+
+	/* Never override ACS for legacy devices or devices with ACS caps */
+	if (!pci_is_pcie(dev) ||
+		pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS))
+			return -ENOTTY;
+
+	for (i = 0; i < max_acs_id; i++)
+		if (acs_on_ids[i].vendor == dev->vendor &&
+			acs_on_ids[i].device == dev->device)
+				return 1;
+
+	switch (pci_pcie_type(dev)) {
+	case PCI_EXP_TYPE_DOWNSTREAM:
+	case PCI_EXP_TYPE_ROOT_PORT:
+		if (acs_on_downstream)
+			return 1;
+		break;
+	case PCI_EXP_TYPE_ENDPOINT:
+	case PCI_EXP_TYPE_UPSTREAM:
+	case PCI_EXP_TYPE_LEG_END:
+	case PCI_EXP_TYPE_RC_END:
+		if (acs_on_multifunction && dev->multifunction)
+			return 1;
+	}
+
+	return -ENOTTY;
+}
 /*
  * Some NVIDIA GPU devices do not work with bus reset, SBR needs to be
  * prevented for those affected devices.
@@ -5015,6 +5115,7 @@ static const struct pci_dev_acs_enabled {
 	{ PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs },
 	/* Wangxun nics */
 	{ PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs },
+	{ PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides },
 	{ 0 }
 };
 
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 4a01b315e0a9..e4a6c31a80df 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -641,6 +641,16 @@ config THINKPAD_LMI
 	  To compile this driver as a module, choose M here: the module will
 	  be called think-lmi.
 
+config LEGION_LAPTOP
+	tristate "Lenovo Legion Laptop Extras"
+	depends on ACPI
+	depends on ACPI_WMI || ACPI_WMI = n
+	depends on HWMON || HWMON = n
+	select ACPI_PLATFORM_PROFILE
+	help
+	  This is a driver for Lenovo Legion laptops and contains drivers for
+	  hotkey, fan control, and power mode.
+
 source "drivers/platform/x86/intel/Kconfig"
 
 config MSI_LAPTOP
@@ -1099,6 +1109,20 @@ config WINMATE_FM07_KEYS
 	  buttons below the display. This module adds an input device
 	  that delivers key events when these buttons are pressed.
 
+config STEAMDECK
+	tristate "Valve Steam Deck platform driver"
+	depends on X86_64
+	help
+	  Driver exposing various bits and pieces of functionality
+	  provided by Steam Deck specific VLV0100 device presented by
+	  EC firmware. This includes but not limited to:
+	     - CPU/device's fan control
+	     - Read-only access to DDIC registers
+	     - Battery tempreature measurements
+	     - Various display related control knobs
+	     - USB Type-C connector event notification
+	  Say N unless you are running on a Steam Deck.
+
 endif # X86_PLATFORM_DEVICES
 
 config P2SB
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index 1d3d1b02541b..fde9a683103e 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -66,6 +66,7 @@ obj-$(CONFIG_IDEAPAD_LAPTOP)	+= ideapad-laptop.o
 obj-$(CONFIG_SENSORS_HDAPS)	+= hdaps.o
 obj-$(CONFIG_THINKPAD_ACPI)	+= thinkpad_acpi.o
 obj-$(CONFIG_THINKPAD_LMI)	+= think-lmi.o
+obj-$(CONFIG_LEGION_LAPTOP)	+= legion-laptop.o
 
 # Intel
 obj-y				+= intel/
@@ -134,3 +135,6 @@ obj-$(CONFIG_SIEMENS_SIMATIC_IPC)	+= simatic-ipc.o
 
 # Winmate
 obj-$(CONFIG_WINMATE_FM07_KEYS)		+= winmate-fm07-keys.o
+
+# Steam Deck
+obj-$(CONFIG_STEAMDECK)			+= steamdeck.o
diff --git a/drivers/platform/x86/legion-laptop.c b/drivers/platform/x86/legion-laptop.c
new file mode 100644
index 000000000000..d1268d239cc5
--- /dev/null
+++ b/drivers/platform/x86/legion-laptop.c
@@ -0,0 +1,2783 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *  legion-laptop.c - Extra Lenovo Legion laptop support, in
+ *   particular for fan curve control and power mode.
+ *
+ *  Copyright (C) 2022 johnfan <johnfan (at) example (dot) com>
+ *
+ *
+ *  This driver might work on other Lenovo Legion models. If you
+ *  want to try it you can pass force=1 as argument
+ *  to the module which will force it to load even when the DMI
+ *  data doesn't match the model AND FIRMWARE.
+ *
+ *  Support for other hardware of this model is already partially
+ *  provided by the module ideapd-laptop.
+ *
+ *  The development page for this driver is located at
+ *  https://github.com/johnfanv2/LenovoLegionLinux
+ *
+ *  This driver exports the files:
+ *    - /sys/kernel/debug/legion/fancurve (ro)
+ *        The fan curve in the form stored in the firmware in an
+ *        human readable table.
+ *
+ *    - /sys/module/legion_laptop/drivers/platform\:legion/PNP0C09\:00/powermode (rw)
+ *       0: balanced mode (white)
+ *       1: performance mode (red)
+ *       2: quiet mode (blue)
+ *       ?: custom mode (pink)
+ *
+ *  NOTE: Writing to this will load the default fan curve from
+ *        the firmware for this mode, so the fan curve might
+ *        have to be reconfigured if needed.
+ *
+ *  It implements the usual hwmon interface to monitor fan speed and temmperature
+ *  and allows to set the fan curve inside the firware.
+ *
+ *    - /sys/class/hwmon/X/fan1_input or /sys/class/hwmon/X/fan2_input  (ro)
+ *        Current fan speed of fan1/fan2.
+ *    - /sys/class/hwmon/X/temp1_input (ro)
+ *    - /sys/class/hwmon/X/temp2_input (ro)
+ *    - /sys/class/hwmon/X/temp3_input (ro)
+ *        Temperature (Celsius) of CPU, GPU, and IC used for fan control.
+ *    - /sys/class/hwmon/X/pwmY_auto_pointZ_pwm (rw)
+ *          PWM (0-255) of the fan at the Y-level in the fan curve
+ *    - /sys/class/hwmon/X/pwmY_auto_pointZ_temp (rw)
+ *          upper temperature of tempZ (CPU, GPU, or IC) at the Y-level in the fan curve
+ *    - /sys/class/hwmon/X/pwmY_auto_pointZ_temp_hyst (rw)
+ *          hysteris (CPU, GPU, or IC) at the Y-level in the fan curve. The lower
+ *          temperatue of the level is the upper temperature minus the hysteris
+ *
+ *
+ *  Credits for reverse engineering the firmware to:
+ *      - David Woodhouse: heavily inspired by lenovo_laptop.c
+ *      - Luke Cama: Windows version "LegionFanControl"
+ *      - SmokelessCPU: reverse engineering of custom registers in EC
+ *                      and commincation method with EC via ports
+ *      - 0x1F9F1: additional reverse engineering for complete fan curve
+ */
+
+#include <linux/acpi.h>
+#include <asm/io.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/dmi.h>
+#include <linux/hwmon.h>
+#include <linux/hwmon-sysfs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/platform_device.h>
+#include <linux/platform_profile.h>
+#include <linux/types.h>
+#include <linux/wmi.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("johnfan");
+MODULE_DESCRIPTION("Lenovo Legion laptop extras");
+
+static bool force;
+module_param(force, bool, 0440);
+MODULE_PARM_DESC(
+	force,
+	"Force loading this module even if model or BIOS does not match.");
+
+static bool ec_readonly;
+module_param(ec_readonly, bool, 0440);
+MODULE_PARM_DESC(
+	ec_readonly,
+	"Only read from embedded controller but do not write or change settings.");
+
+#define LEGIONFEATURES \
+	"fancurve powermode platformprofile platformprofilenotify minifancurve"
+
+//Size of fancurve stored in embedded controller
+#define MAXFANCURVESIZE 10
+
+#define LEGION_DRVR_SHORTNAME "legion"
+#define LEGION_HWMON_NAME LEGION_DRVR_SHORTNAME "_hwmon"
+
+/* =============================== */
+/* Embedded Controller Description */
+/* =============================== */
+
+/* The configuration and registers to access the embedded controller
+ * depending on different the version of the software on the
+ * embedded controller or and the BIOS/UEFI firmware.
+ *
+ * To control fan curve in the embedded controller (EC) one has to
+ * write to its "RAM". There are different possibilities:
+ *  - EC RAM is memory mapped (write to it with ioremap)
+ *  - access EC RAM via ported mapped IO (outb/inb)
+ *  - access EC RAM via ACPI methods. It is only possible to write
+ *    to part of it (first 0xFF bytes?)
+ *
+ * In later models the firmware directly exposes ACPI methods to
+ * set the fan curve direclty, without writing to EC RAM. This
+ * is done inside the ACPI method.
+ */
+
+/**
+ * Offsets for interseting values inside the EC RAM  (0 = start of
+ * EC RAM. These might change depending on the software inside of
+ * the EC, which can be updated by a BIOS update from Lenovo.
+ */
+// TODO: same order as in initialization
+struct ec_register_offsets {
+	// Super I/O Configuration Registers
+	// 7.15 General Control (GCTRL)
+	// General Control (GCTRL)
+	// (see EC Interface Registers  and 6.2 Plug and Play Configuration (PNPCFG)) in datasheet
+	// note: these are in two places saved
+	// in EC Interface Registers  and in super io configuraion registers
+	// Chip ID
+	u16 ECHIPID1;
+	u16 ECHIPID2;
+	// Chip Version
+	u16 ECHIPVER;
+	u16 ECDEBUG;
+
+	// Lenovo Custom OEM extension
+	// Firmware of ITE can be extended by
+	// custom program using its own "variables"
+	// These are the offsets to these "variables"
+	u16 EXT_FAN_CUR_POINT;
+	u16 EXT_FAN_POINTS_SIZE;
+	u16 EXT_FAN1_BASE;
+	u16 EXT_FAN2_BASE;
+	u16 EXT_FAN_ACC_BASE;
+	u16 EXT_FAN_DEC_BASE;
+	u16 EXT_CPU_TEMP;
+	u16 EXT_CPU_TEMP_HYST;
+	u16 EXT_GPU_TEMP;
+	u16 EXT_GPU_TEMP_HYST;
+	u16 EXT_VRM_TEMP;
+	u16 EXT_VRM_TEMP_HYST;
+	u16 EXT_FAN1_RPM_LSB;
+	u16 EXT_FAN1_RPM_MSB;
+	u16 EXT_FAN2_RPM_LSB;
+	u16 EXT_FAN2_RPM_MSB;
+	u16 EXT_FAN1_TARGET_RPM;
+	u16 EXT_FAN2_TARGET_RPM;
+	u16 EXT_POWERMODE;
+	u16 EXT_MINIFANCURVE_ON_COOL;
+	// values
+	// 0x04: enable mini fan curve if very long on cool level
+	//      - this might be due to potential temp failure
+	//      - or just because really so cool
+	// 0xA0: disable it
+	u16 EXT_LOCKFANCONTROLLER;
+	u16 EXT_MAXIMUMFANSPEED;
+	u16 EXT_WHITE_KEYBOARD_BACKLIGHT;
+	u16 EXT_IC_TEMP_INPUT;
+	u16 EXT_CPU_TEMP_INPUT;
+	u16 EXT_GPU_TEMP_INPUT;
+};
+
+struct model_config {
+	const struct ec_register_offsets *registers;
+	bool check_embedded_controller_id;
+	u16 embedded_controller_id;
+
+	// first addr in EC we access/scan
+	phys_addr_t memoryio_physical_ec_start;
+	size_t memoryio_size;
+
+	// TODO: maybe use bitfield
+	bool has_minifancurve;
+};
+
+/* =================================== */
+/* Coinfiguration for different models */
+/* =================================== */
+
+// Idea by SmokelesssCPU (modified)
+// - all default names and register addresses are supported by datasheet
+// - register addresses for custom firmware by SmokelesssCPU
+static const struct ec_register_offsets ec_register_offsets_v0 = {
+	.ECHIPID1 = 0x2000,
+	.ECHIPID2 = 0x2001,
+	.ECHIPVER = 0x2002,
+	.ECDEBUG = 0x2003,
+	.EXT_FAN_CUR_POINT = 0xC534,
+	.EXT_FAN_POINTS_SIZE = 0xC535,
+	.EXT_FAN1_BASE = 0xC540,
+	.EXT_FAN2_BASE = 0xC550,
+	.EXT_FAN_ACC_BASE = 0xC560,
+	.EXT_FAN_DEC_BASE = 0xC570,
+	.EXT_CPU_TEMP = 0xC580,
+	.EXT_CPU_TEMP_HYST = 0xC590,
+	.EXT_GPU_TEMP = 0xC5A0,
+	.EXT_GPU_TEMP_HYST = 0xC5B0,
+	.EXT_VRM_TEMP = 0xC5C0,
+	.EXT_VRM_TEMP_HYST = 0xC5D0,
+	.EXT_FAN1_RPM_LSB = 0xC5E0,
+	.EXT_FAN1_RPM_MSB = 0xC5E1,
+	.EXT_FAN2_RPM_LSB = 0xC5E2,
+	.EXT_FAN2_RPM_MSB = 0xC5E3,
+	.EXT_MINIFANCURVE_ON_COOL = 0xC536,
+	.EXT_LOCKFANCONTROLLER = 0xc4AB,
+	.EXT_CPU_TEMP_INPUT = 0xc538,
+	.EXT_GPU_TEMP_INPUT = 0xc539,
+	.EXT_IC_TEMP_INPUT = 0xC5E8,
+	.EXT_POWERMODE = 0xc420,
+	.EXT_FAN1_TARGET_RPM = 0xc600,
+	.EXT_FAN2_TARGET_RPM = 0xc601,
+	.EXT_MAXIMUMFANSPEED = 0xBD,
+	.EXT_WHITE_KEYBOARD_BACKLIGHT = (0x3B + 0xC400)
+};
+
+static const struct model_config model_v0 = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = true,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = true
+};
+
+static const struct model_config model_kfcn = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = true,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = false
+};
+
+static const struct model_config model_hacn = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = false,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = false
+};
+
+
+static const struct model_config model_k9cn = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = false,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400, // or replace 0xC400 by 0x0400  ?
+	.memoryio_size = 0x300,
+	.has_minifancurve = false
+};
+
+
+
+static const struct dmi_system_id denylist[] = { {} };
+
+static const struct dmi_system_id optimistic_allowlist[] = {
+	{
+		// modelyear: 2021
+		// generation: 6
+		// name: Legion 5, Legion 5 pro, Legion 7
+		// Family: Legion 5 15ACH6H, ...
+		.ident = "GKCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "GKCN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2020
+		.ident = "EUCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "EUCN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2020
+		.ident = "EFCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "EFCN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2020
+		.ident = "FSCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "FSCN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2021
+		.ident = "HHCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "HHCN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2022
+		.ident = "H1CN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "H1CN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2022
+		.ident = "J2CN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "J2CN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2022
+		.ident = "JUCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "JUCN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2022
+		.ident = "KFCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "KFCN"),
+		},
+		.driver_data = (void *)&model_kfcn
+	},
+	{
+		// modelyear: 2021
+		.ident = "HACN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "HACN"),
+		},
+		.driver_data = (void *)&model_hacn
+	},
+	{
+		// modelyear: 2021
+		.ident = "G9CN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "G9CN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2022
+		.ident = "K9CN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "K9CN"),
+		},
+		.driver_data = (void *)&model_k9cn
+	},
+	{}
+};
+
+/* ================================= */
+/* ACPI access                       */
+/* ================================= */
+
+// function from ideapad-laptop.c
+static int eval_int(acpi_handle handle, const char *name, unsigned long *res)
+{
+	unsigned long long result;
+	acpi_status status;
+
+	status = acpi_evaluate_integer(handle, (char *)name, NULL, &result);
+	if (ACPI_FAILURE(status))
+		return -EIO;
+
+	*res = result;
+
+	return 0;
+}
+
+// function from ideapad-laptop.c
+static int exec_simple_method(acpi_handle handle, const char *name,
+			      unsigned long arg)
+{
+	acpi_status status =
+		acpi_execute_simple_method(handle, (char *)name, arg);
+
+	return ACPI_FAILURE(status) ? -EIO : 0;
+}
+
+// function from ideapad-laptop.c
+static int exec_sbmc(acpi_handle handle, unsigned long arg)
+{
+	// \_SB.PCI0.LPC0.EC0.VPC0.SBMC
+	return exec_simple_method(handle, "SBMC", arg);
+}
+
+static int eval_qcho(acpi_handle handle, unsigned long *res)
+{
+	// \_SB.PCI0.LPC0.EC0.QCHO
+	return eval_int(handle, "QCHO", res);
+}
+
+/* ================================= */
+/* EC RAM Access with port-mapped IO */
+/* ================================= */
+
+/*
+ * See datasheet of e.g. IT8502E/F/G, e.g.
+ * 6.2 Plug and Play Configuration (PNPCFG)
+ *
+ * Depending on configured BARDSEL register
+ * the ports
+ *   ECRAM_PORTIO_ADDR_PORT and
+ *   ECRAM_PORTIO_DATA_PORT
+ * are configured.
+ *
+ * By performing IO on these ports one can
+ * read/write to registers in the EC.
+ *
+ * "To access a register of PNPCFG, write target index to
+ *  address port and access this PNPCFG register via
+ *  data port" [datasheet, 6.2 Plug and Play Configuration]
+ */
+
+// IO ports used to write to communicate with embedded controller
+// Start of used ports
+#define ECRAM_PORTIO_START_PORT 0x4E
+// Number of used ports
+#define ECRAM_PORTIO_PORTS_SIZE 2
+// Port used to specify address in EC RAM to read/write
+// 0x4E/0x4F is the usual port for IO super controler
+// 0x2E/0x2F also common (ITE can also be configure to use these)
+#define ECRAM_PORTIO_ADDR_PORT 0x4E
+// Port to send/receive the value  to write/read
+#define ECRAM_PORTIO_DATA_PORT 0x4F
+// Name used to request ports
+#define ECRAM_PORTIO_NAME "legion"
+
+struct ecram_portio {
+	/* protects read/write to EC RAM performed
+	 * as a certain sequence of outb, inb
+	 * commands on the IO ports. There can
+	 * be at most one.
+	 */
+	struct mutex io_port_mutex;
+};
+
+ssize_t ecram_portio_init(struct ecram_portio *ec_portio)
+{
+	if (!request_region(ECRAM_PORTIO_START_PORT, ECRAM_PORTIO_PORTS_SIZE,
+			    ECRAM_PORTIO_NAME)) {
+		pr_info("Cannot init ecram_portio the %x ports starting at %x\n",
+			ECRAM_PORTIO_PORTS_SIZE, ECRAM_PORTIO_START_PORT);
+		return -ENODEV;
+	}
+	//pr_info("Reserved %x ports starting at %x\n", ECRAM_PORTIO_PORTS_SIZE, ECRAM_PORTIO_START_PORT);
+	mutex_init(&ec_portio->io_port_mutex);
+	return 0;
+}
+
+void ecram_portio_exit(struct ecram_portio *ec_portio)
+{
+	release_region(ECRAM_PORTIO_START_PORT, ECRAM_PORTIO_PORTS_SIZE);
+}
+
+/* Read a byte from the EC RAM.
+ *
+ * Return status because of commong signature for alle
+ * methods to access EC RAM.
+ */
+ssize_t ecram_portio_read(struct ecram_portio *ec_portio, u16 offset, u8 *value)
+{
+	mutex_lock(&ec_portio->io_port_mutex);
+
+	outb(0x2E, ECRAM_PORTIO_ADDR_PORT);
+	outb(0x11, ECRAM_PORTIO_DATA_PORT);
+	outb(0x2F, ECRAM_PORTIO_ADDR_PORT);
+	// TODO: no explicit cast between types seems to be sometimes
+	// done and sometimes not
+	outb((u8)((offset >> 8) & 0xFF), ECRAM_PORTIO_DATA_PORT);
+
+	outb(0x2E, ECRAM_PORTIO_ADDR_PORT);
+	outb(0x10, ECRAM_PORTIO_DATA_PORT);
+	outb(0x2F, ECRAM_PORTIO_ADDR_PORT);
+	outb((u8)(offset & 0xFF), ECRAM_PORTIO_DATA_PORT);
+
+	outb(0x2E, ECRAM_PORTIO_ADDR_PORT);
+	outb(0x12, ECRAM_PORTIO_DATA_PORT);
+	outb(0x2F, ECRAM_PORTIO_ADDR_PORT);
+	*value = inb(ECRAM_PORTIO_DATA_PORT);
+
+	mutex_unlock(&ec_portio->io_port_mutex);
+	return 0;
+}
+
+/* Write a byte to the EC RAM.
+ *
+ * Return status because of commong signature for alle
+ * methods to access EC RAM.
+ */
+ssize_t ecram_portio_write(struct ecram_portio *ec_portio, u16 offset, u8 value)
+{
+	mutex_lock(&ec_portio->io_port_mutex);
+
+	outb(0x2E, ECRAM_PORTIO_ADDR_PORT);
+	outb(0x11, ECRAM_PORTIO_DATA_PORT);
+	outb(0x2F, ECRAM_PORTIO_ADDR_PORT);
+	// TODO: no explicit cast between types seems to be sometimes
+	// done and sometimes not
+	outb((u8)((offset >> 8) & 0xFF), ECRAM_PORTIO_DATA_PORT);
+
+	outb(0x2E, ECRAM_PORTIO_ADDR_PORT);
+	outb(0x10, ECRAM_PORTIO_DATA_PORT);
+	outb(0x2F, ECRAM_PORTIO_ADDR_PORT);
+	outb((u8)(offset & 0xFF), ECRAM_PORTIO_DATA_PORT);
+
+	outb(0x2E, ECRAM_PORTIO_ADDR_PORT);
+	outb(0x12, ECRAM_PORTIO_DATA_PORT);
+	outb(0x2F, ECRAM_PORTIO_ADDR_PORT);
+	outb(value, ECRAM_PORTIO_DATA_PORT);
+
+	mutex_unlock(&ec_portio->io_port_mutex);
+	return 0;
+}
+
+/* =================================== */
+/* EC RAM Access                       */
+/* =================================== */
+
+struct ecram {
+	struct ecram_portio portio;
+};
+
+ssize_t ecram_init(struct ecram *ecram, phys_addr_t memoryio_ec_physical_start,
+		   size_t region_size)
+{
+	ssize_t err;
+
+	err = ecram_portio_init(&ecram->portio);
+	if (err) {
+		pr_info("Failed ecram_portio_init\n");
+		goto err_ecram_portio_init;
+	}
+
+	return 0;
+
+err_ecram_portio_init:
+	return err;
+}
+
+void ecram_exit(struct ecram *ecram)
+{
+	pr_info("Unloading legion ecram\n");
+	ecram_portio_exit(&ecram->portio);
+	pr_info("Unloading legion ecram done\n");
+}
+
+/**
+ * ecram_offset address on the EC
+ */
+static u8 ecram_read(struct ecram *ecram, u16 ecram_offset)
+{
+	u8 value;
+	int err;
+
+	err = ecram_portio_read(&ecram->portio, ecram_offset, &value);
+	if (err)
+		pr_info("Error reading EC RAM at 0x%x\n", ecram_offset);
+	return value;
+}
+
+static void ecram_write(struct ecram *ecram, u16 ecram_offset, u8 value)
+{
+	int err;
+
+	if (ec_readonly) {
+		pr_info("Skipping writing EC RAM at 0x%x because readonly.\n",
+			ecram_offset);
+		return;
+	}
+	err = ecram_portio_write(&ecram->portio, ecram_offset, value);
+	if (err)
+		pr_info("Error writing EC RAM at 0x%x\n", ecram_offset);
+}
+
+/* =============================== */
+/* Reads from EC  */
+/* ===============================  */
+
+u16 read_ec_id(struct ecram *ecram, const struct model_config *model)
+{
+	u8 id1 = ecram_read(ecram, model->registers->ECHIPID1);
+	u8 id2 = ecram_read(ecram, model->registers->ECHIPID2);
+
+	return (id1 << 8) + id2;
+}
+
+u16 read_ec_version(struct ecram *ecram, const struct model_config *model)
+{
+	u8 vers = ecram_read(ecram, model->registers->ECHIPVER);
+	u8 debug = ecram_read(ecram, model->registers->ECDEBUG);
+
+	return (vers << 8) + debug;
+}
+
+/* ============================= */
+/* Data model for sensor values  */
+/* ============================  */
+
+struct sensor_values {
+	u16 fan1_rpm; // current speed in rpm of fan 1
+	u16 fan2_rpm; // current speed in rpm of fan2
+	u16 fan1_target_rpm; // target speed in rpm of fan 1
+	u16 fan2_target_rpm; // target speed in rpm of fan 2
+	u8 cpu_temp_celsius; // cpu temperature in celcius
+	u8 gpu_temp_celsius; // gpu temperature in celcius
+	u8 ic_temp_celsius; // ic temperature in celcius
+};
+
+enum SENSOR_ATTR {
+	SENSOR_CPU_TEMP_ID = 1,
+	SENSOR_GPU_TEMP_ID = 2,
+	SENSOR_IC_TEMP_ID = 3,
+	SENSOR_FAN1_RPM_ID = 4,
+	SENSOR_FAN2_RPM_ID = 5,
+	SENSOR_FAN1_TARGET_RPM_ID = 6,
+	SENSOR_FAN2_TARGET_RPM_ID = 7
+};
+
+static int read_sensor_values(struct ecram *ecram,
+			      const struct model_config *model,
+			      struct sensor_values *values)
+{
+	values->fan1_target_rpm =
+		100 * ecram_read(ecram, model->registers->EXT_FAN1_TARGET_RPM);
+	values->fan2_target_rpm =
+		100 * ecram_read(ecram, model->registers->EXT_FAN2_TARGET_RPM);
+
+	values->fan1_rpm =
+		ecram_read(ecram, model->registers->EXT_FAN1_RPM_LSB) +
+		(((int)ecram_read(ecram, model->registers->EXT_FAN1_RPM_MSB))
+		 << 8);
+	values->fan2_rpm =
+		ecram_read(ecram, model->registers->EXT_FAN2_RPM_LSB) +
+		(((int)ecram_read(ecram, model->registers->EXT_FAN2_RPM_MSB))
+		 << 8);
+
+	values->cpu_temp_celsius =
+		ecram_read(ecram, model->registers->EXT_CPU_TEMP_INPUT);
+	values->gpu_temp_celsius =
+		ecram_read(ecram, model->registers->EXT_GPU_TEMP_INPUT);
+	values->ic_temp_celsius =
+		ecram_read(ecram, model->registers->EXT_IC_TEMP_INPUT);
+
+	values->cpu_temp_celsius = ecram_read(ecram, 0xC5E6);
+	values->gpu_temp_celsius = ecram_read(ecram, 0xC5E7);
+	values->ic_temp_celsius = ecram_read(ecram, 0xC5E8);
+
+	return 0;
+}
+
+/* =============================== */
+/* Behaviour changing functions    */
+/* =============================== */
+
+int read_powermode(struct ecram *ecram, const struct model_config *model)
+{
+	return ecram_read(ecram, model->registers->EXT_POWERMODE);
+}
+
+ssize_t write_powermode(struct ecram *ecram, const struct model_config *model,
+			u8 value)
+{
+	if (!(value >= 0 && value <= 2)) {
+		pr_info("Unexpected power mode value ignored: %d\n", value);
+		return -ENOMEM;
+	}
+	ecram_write(ecram, model->registers->EXT_POWERMODE, value);
+	return 0;
+}
+
+/**
+ * Shortly toggle powermode to a different mode
+ * and switch back, e.g. to reset fan curve.
+ */
+void toggle_powermode(struct ecram *ecram, const struct model_config *model)
+{
+	int old_powermode = read_powermode(ecram, model);
+	int next_powermode = old_powermode == 0 ? 1 : 0;
+
+	write_powermode(ecram, model, next_powermode);
+	mdelay(1500);
+	write_powermode(ecram, model, old_powermode);
+}
+
+#define lockfancontroller_ON 8
+#define lockfancontroller_OFF 0
+
+ssize_t write_lockfancontroller(struct ecram *ecram,
+				const struct model_config *model, bool state)
+{
+	u8 val = state ? lockfancontroller_ON : lockfancontroller_OFF;
+
+	ecram_write(ecram, model->registers->EXT_LOCKFANCONTROLLER, val);
+	return 0;
+}
+
+int read_lockfancontroller(struct ecram *ecram,
+			   const struct model_config *model, bool *state)
+{
+	int value = ecram_read(ecram, model->registers->EXT_LOCKFANCONTROLLER);
+
+	switch (value) {
+	case lockfancontroller_ON:
+		*state = true;
+		break;
+	case lockfancontroller_OFF:
+		*state = false;
+		break;
+	default:
+		pr_info("Unexpected value in lockfanspeed register:%d\n",
+			value);
+		return -1;
+	}
+	return 0;
+}
+
+#define MAXIMUMFANSPEED_ON 0x40
+#define MAXIMUMFANSPEED_OFF 0x00
+
+int read_maximumfanspeed(struct ecram *ecram, const struct model_config *model,
+			 bool *state)
+{
+	int value = ecram_read(ecram, model->registers->EXT_MAXIMUMFANSPEED);
+
+	switch (value) {
+	case MAXIMUMFANSPEED_ON:
+		*state = true;
+		break;
+	case MAXIMUMFANSPEED_OFF:
+		*state = false;
+		break;
+	default:
+		pr_info("Unexpected value in maximumfanspeed register:%d\n",
+			value);
+		return -1;
+	}
+	return 0;
+}
+
+ssize_t write_maximumfanspeed(struct ecram *ecram,
+			      const struct model_config *model, bool state)
+{
+	u8 val = state ? MAXIMUMFANSPEED_ON : MAXIMUMFANSPEED_OFF;
+
+	ecram_write(ecram, model->registers->EXT_MAXIMUMFANSPEED, val);
+	return 0;
+}
+
+#define MINIFANCUVE_ON_COOL_ON 0x04
+#define MINIFANCUVE_ON_COOL_OFF 0xA0
+
+int read_minifancurve(struct ecram *ecram, const struct model_config *model,
+		      bool *state)
+{
+	int value =
+		ecram_read(ecram, model->registers->EXT_MINIFANCURVE_ON_COOL);
+
+	switch (value) {
+	case MINIFANCUVE_ON_COOL_ON:
+		*state = true;
+		break;
+	case MINIFANCUVE_ON_COOL_OFF:
+		*state = false;
+		break;
+	default:
+		pr_info("Unexpected value in MINIFANCURVE register:%d\n",
+			value);
+		return -1;
+	}
+	return 0;
+}
+
+ssize_t write_minifancurve(struct ecram *ecram,
+			   const struct model_config *model, bool state)
+{
+	u8 val = state ? MINIFANCUVE_ON_COOL_ON : MINIFANCUVE_ON_COOL_OFF;
+
+	ecram_write(ecram, model->registers->EXT_MINIFANCURVE_ON_COOL, val);
+	return 0;
+}
+
+#define KEYBOARD_BACKLIGHT_OFF 18
+#define KEYBOARD_BACKLIGHT_ON1 21
+#define KEYBOARD_BACKLIGHT_ON2 23
+
+int read_keyboard_backlight(struct ecram *ecram,
+			    const struct model_config *model, int *state)
+{
+	int value = ecram_read(ecram,
+			       model->registers->EXT_WHITE_KEYBOARD_BACKLIGHT);
+
+	//switch (value) {
+	//case MINIFANCUVE_ON_COOL_ON:
+	//	*state = true;
+	//	break;
+	//case MINIFANCUVE_ON_COOL_OFF:
+	//	*state = false;
+	//	break;
+	//default:
+	//	pr_info("Unexpected value in MINIFANCURVE register:%d\n",
+	//		value);
+	//	return -1;
+	//}
+	*state = value;
+	return 0;
+}
+
+int write_keyboard_backlight(struct ecram *ecram,
+			     const struct model_config *model, int state)
+{
+	u8 val = state > 0 ? KEYBOARD_BACKLIGHT_ON1 : KEYBOARD_BACKLIGHT_OFF;
+
+	ecram_write(ecram, model->registers->EXT_WHITE_KEYBOARD_BACKLIGHT, val);
+	return 0;
+}
+
+#define FCT_RAPID_CHARGE_ON 0x07
+#define FCT_RAPID_CHARGE_OFF 0x08
+#define RAPID_CHARGE_ON 0x0
+#define RAPID_CHARGE_OFF 0x1
+
+int read_rapidcharge(acpi_handle acpihandle, int *state)
+{
+	unsigned long result;
+	int err;
+
+	err = eval_qcho(acpihandle, &result);
+	if (err)
+		return err;
+
+	*state = result;
+	return 0;
+}
+
+int write_rapidcharge(acpi_handle acpihandle, bool state)
+{
+	unsigned long fct_nr = state > 0 ? FCT_RAPID_CHARGE_ON :
+					   FCT_RAPID_CHARGE_OFF;
+	return exec_sbmc(acpihandle, fct_nr);
+}
+
+/* ============================= */
+/* Data model for fan curve      */
+/* ============================  */
+
+struct fancurve_point {
+	// rpm1 devided by 100
+	u8 rpm1_raw;
+	// rpm2 devided by 100
+	u8 rpm2_raw;
+	// >=2 , <=5 (lower is faster); must be increasing by level
+	u8 accel;
+	// >=2 , <=5 (lower is faster); must be increasing by level
+	u8 decel;
+
+	// min must be lower or equal than max
+	// last level max must be 127
+	// <=127 cpu max temp for this level; must be increasing by level
+	u8 cpu_max_temp_celsius;
+	// <=127 cpu min temp for this level; must be increasing by level
+	u8 cpu_min_temp_celsius;
+	// <=127 gpu min temp for this level; must be increasing by level
+	u8 gpu_max_temp_celsius;
+	// <=127 gpu max temp for this level; must be increasing by level
+	u8 gpu_min_temp_celsius;
+	// <=127 ic max temp for this level; must be increasing by level
+	u8 ic_max_temp_celsius;
+	// <=127 ic max temp for this level; must be increasing by level
+	u8 ic_min_temp_celsius;
+};
+
+enum FANCURVE_ATTR {
+	FANCURVE_ATTR_PWM1 = 1,
+	FANCURVE_ATTR_PWM2 = 2,
+	FANCURVE_ATTR_CPU_TEMP = 3,
+	FANCURVE_ATTR_CPU_HYST = 4,
+	FANCURVE_ATTR_GPU_TEMP = 5,
+	FANCURVE_ATTR_GPU_HYST = 6,
+	FANCURVE_ATTR_IC_TEMP = 7,
+	FANCURVE_ATTR_IC_HYST = 8,
+	FANCURVE_ATTR_ACCEL = 9,
+	FANCURVE_ATTR_DECEL = 10,
+	FANCURVE_SIZE = 11,
+	FANCURVE_MINIFANCURVE_ON_COOL = 12
+};
+
+// used for clearing table entries
+static const struct fancurve_point fancurve_point_zero = { 0, 0, 0, 0, 0,
+							   0, 0, 0, 0, 0 };
+
+struct fancurve {
+	struct fancurve_point points[MAXFANCURVESIZE];
+	// number of points used; must be <= MAXFANCURVESIZE
+	size_t size;
+	// the point that at which fans are run currently
+	size_t current_point_i;
+};
+
+// calculate derived values
+
+int fancurve_get_cpu_deltahyst(struct fancurve_point *point)
+{
+	return ((int)point->cpu_max_temp_celsius) -
+	       ((int)point->cpu_min_temp_celsius);
+}
+
+int fancurve_get_gpu_deltahyst(struct fancurve_point *point)
+{
+	return ((int)point->gpu_max_temp_celsius) -
+	       ((int)point->gpu_min_temp_celsius);
+}
+
+int fancurve_get_ic_deltahyst(struct fancurve_point *point)
+{
+	return ((int)point->ic_max_temp_celsius) -
+	       ((int)point->ic_min_temp_celsius);
+}
+
+// validation functions
+
+bool fancurve_is_valid_min_temp(int min_temp)
+{
+	return min_temp >= 0 && min_temp <= 127;
+}
+
+bool fancurve_is_valid_max_temp(int max_temp)
+{
+	return max_temp >= 0 && max_temp <= 127;
+}
+
+// setters with validation
+// - make hwmon implementation easier
+// - keep fancurve valid, otherwise EC will not properly control fan
+
+bool fancurve_set_rpm1(struct fancurve *fancurve, int point_id, int rpm)
+{
+	bool valid = point_id == 0 ? rpm == 0 : (rpm >= 0 && rpm <= 4500);
+
+	if (valid)
+		fancurve->points[point_id].rpm1_raw = rpm / 100;
+	return valid;
+}
+
+bool fancurve_set_rpm2(struct fancurve *fancurve, int point_id, int rpm)
+{
+	bool valid = point_id == 0 ? rpm == 0 : (rpm >= 0 && rpm <= 4500);
+
+	if (valid)
+		fancurve->points[point_id].rpm2_raw = rpm / 100;
+	return valid;
+}
+
+// TODO: remove { ... } from single line if body
+
+bool fancurve_set_accel(struct fancurve *fancurve, int point_id, int accel)
+{
+	bool valid = accel >= 2 && accel <= 5;
+
+	if (valid)
+		fancurve->points[point_id].accel = accel;
+	return valid;
+}
+
+bool fancurve_set_decel(struct fancurve *fancurve, int point_id, int decel)
+{
+	bool valid = decel >= 2 && decel <= 5;
+
+	if (valid)
+		fancurve->points[point_id].decel = decel;
+	return valid;
+}
+
+bool fancurve_set_cpu_temp_max(struct fancurve *fancurve, int point_id,
+			       int value)
+{
+	bool valid = fancurve_is_valid_max_temp(value);
+
+	if (valid)
+		fancurve->points[point_id].cpu_max_temp_celsius = value;
+
+	return valid;
+}
+
+bool fancurve_set_gpu_temp_max(struct fancurve *fancurve, int point_id,
+			       int value)
+{
+	bool valid = fancurve_is_valid_max_temp(value);
+
+	if (valid)
+		fancurve->points[point_id].gpu_max_temp_celsius = value;
+	return valid;
+}
+
+bool fancurve_set_ic_temp_max(struct fancurve *fancurve, int point_id,
+			      int value)
+{
+	bool valid = fancurve_is_valid_max_temp(value);
+
+	if (valid)
+		fancurve->points[point_id].ic_max_temp_celsius = value;
+	return valid;
+}
+
+bool fancurve_set_cpu_temp_min(struct fancurve *fancurve, int point_id,
+			       int value)
+{
+	bool valid = fancurve_is_valid_max_temp(value);
+
+	if (valid)
+		fancurve->points[point_id].cpu_min_temp_celsius = value;
+	return valid;
+}
+
+bool fancurve_set_gpu_temp_min(struct fancurve *fancurve, int point_id,
+			       int value)
+{
+	bool valid = fancurve_is_valid_max_temp(value);
+
+	if (valid)
+		fancurve->points[point_id].gpu_min_temp_celsius = value;
+	return valid;
+}
+
+bool fancurve_set_ic_temp_min(struct fancurve *fancurve, int point_id,
+			      int value)
+{
+	bool valid = fancurve_is_valid_max_temp(value);
+
+	if (valid)
+		fancurve->points[point_id].ic_min_temp_celsius = value;
+	return valid;
+}
+
+bool fancurve_set_size(struct fancurve *fancurve, int size, bool init_values)
+{
+	bool valid = size >= 1 && size <= MAXFANCURVESIZE;
+
+	if (!valid)
+		return false;
+	if (init_values && size < fancurve->size) {
+		// fancurve size is decreased, but last etnry alwasy needs 127 temperatures
+		// Note: size >=1
+		fancurve->points[size - 1].cpu_max_temp_celsius = 127;
+		fancurve->points[size - 1].ic_max_temp_celsius = 127;
+		fancurve->points[size - 1].gpu_max_temp_celsius = 127;
+	}
+	if (init_values && size > fancurve->size) {
+		// fancurve increased, so new entries need valid values
+		int i;
+		int last = fancurve->size > 0 ? fancurve->size - 1 : 0;
+
+		for (i = fancurve->size; i < size; ++i)
+			fancurve->points[i] = fancurve->points[last];
+	}
+	return true;
+}
+
+/* Read the fan curve from the EC.
+ *
+ * In newer models (>=2022) there is an ACPI/WMI to read fan curve as
+ * a whole. So read/write fan table as a whole to use
+ * same interface for both cases.
+ *
+ * It reads all points from EC memory, even if stored fancurve is smaller, so
+ * it can contain 0 entries.
+ */
+static int read_fancurve(struct ecram *ecram, const struct model_config *model,
+			 struct fancurve *fancurve)
+{
+	size_t i = 0;
+
+	for (i = 0; i < MAXFANCURVESIZE; ++i) {
+		struct fancurve_point *point = &fancurve->points[i];
+
+		point->rpm1_raw =
+			ecram_read(ecram, model->registers->EXT_FAN1_BASE + i);
+		point->rpm2_raw =
+			ecram_read(ecram, model->registers->EXT_FAN2_BASE + i);
+
+		point->accel = ecram_read(
+			ecram, model->registers->EXT_FAN_ACC_BASE + i);
+		point->decel = ecram_read(
+			ecram, model->registers->EXT_FAN_DEC_BASE + i);
+		point->cpu_max_temp_celsius =
+			ecram_read(ecram, model->registers->EXT_CPU_TEMP + i);
+		point->cpu_min_temp_celsius = ecram_read(
+			ecram, model->registers->EXT_CPU_TEMP_HYST + i);
+		point->gpu_max_temp_celsius =
+			ecram_read(ecram, model->registers->EXT_GPU_TEMP + i);
+		point->gpu_min_temp_celsius = ecram_read(
+			ecram, model->registers->EXT_GPU_TEMP_HYST + i);
+		point->ic_max_temp_celsius =
+			ecram_read(ecram, model->registers->EXT_VRM_TEMP + i);
+		point->ic_min_temp_celsius = ecram_read(
+			ecram, model->registers->EXT_VRM_TEMP_HYST + i);
+	}
+
+	// Do not trust that hardware; It might suddendly report
+	// a larger size, so clamp it.
+	fancurve->size =
+		ecram_read(ecram, model->registers->EXT_FAN_POINTS_SIZE);
+	fancurve->size =
+		min(fancurve->size, (typeof(fancurve->size))(MAXFANCURVESIZE));
+	fancurve->current_point_i =
+		ecram_read(ecram, model->registers->EXT_FAN_CUR_POINT);
+	fancurve->current_point_i =
+		min(fancurve->current_point_i, fancurve->size);
+	return 0;
+}
+
+static int write_fancurve(struct ecram *ecram, const struct model_config *model,
+			  const struct fancurve *fancurve, bool write_size)
+{
+	size_t i;
+	// Reset fan update counters (try to avoid any race conditions)
+	ecram_write(ecram, 0xC5FE, 0);
+	ecram_write(ecram, 0xC5FF, 0);
+	for (i = 0; i < MAXFANCURVESIZE; ++i) {
+		// Entries for points larger than fancurve size should be cleared
+		// to 0
+		const struct fancurve_point *point =
+			i < fancurve->size ? &fancurve->points[i] :
+					     &fancurve_point_zero;
+
+		ecram_write(ecram, model->registers->EXT_FAN1_BASE + i,
+			    point->rpm1_raw);
+		ecram_write(ecram, model->registers->EXT_FAN2_BASE + i,
+			    point->rpm2_raw);
+
+		ecram_write(ecram, model->registers->EXT_FAN_ACC_BASE + i,
+			    point->accel);
+		ecram_write(ecram, model->registers->EXT_FAN_DEC_BASE + i,
+			    point->decel);
+
+		ecram_write(ecram, model->registers->EXT_CPU_TEMP + i,
+			    point->cpu_max_temp_celsius);
+		ecram_write(ecram, model->registers->EXT_CPU_TEMP_HYST + i,
+			    point->cpu_min_temp_celsius);
+		ecram_write(ecram, model->registers->EXT_GPU_TEMP + i,
+			    point->gpu_max_temp_celsius);
+		ecram_write(ecram, model->registers->EXT_GPU_TEMP_HYST + i,
+			    point->gpu_min_temp_celsius);
+		ecram_write(ecram, model->registers->EXT_VRM_TEMP + i,
+			    point->ic_max_temp_celsius);
+		ecram_write(ecram, model->registers->EXT_VRM_TEMP_HYST + i,
+			    point->ic_min_temp_celsius);
+	}
+
+	if (write_size) {
+		ecram_write(ecram, model->registers->EXT_FAN_POINTS_SIZE,
+			    fancurve->size);
+	}
+
+	// Reset current fan level to 0, so algorithm in EC
+	// selects fan curve point again and resetting hysterisis
+	// effects
+	ecram_write(ecram, model->registers->EXT_FAN_CUR_POINT, 0);
+
+	// Reset internal fan levels
+	ecram_write(ecram, 0xC634, 0); // CPU
+	ecram_write(ecram, 0xC635, 0); // GPU
+	ecram_write(ecram, 0xC636, 0); // SENSOR
+
+	return 0;
+}
+
+static ssize_t fancurve_print_seqfile(const struct fancurve *fancurve,
+				      struct seq_file *s)
+{
+	int i;
+
+	seq_printf(
+		s,
+		"rpm1|rpm2|acceleration|deceleration|cpu_min_temp|cpu_max_temp|gpu_min_temp|gpu_max_temp|ic_min_temp|ic_max_temp\n");
+	for (i = 0; i < fancurve->size; ++i) {
+		const struct fancurve_point *point = &fancurve->points[i];
+
+		seq_printf(
+			s, "%d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\n",
+			point->rpm1_raw * 100, point->rpm2_raw * 100,
+			point->accel, point->decel, point->cpu_min_temp_celsius,
+			point->cpu_max_temp_celsius,
+			point->gpu_min_temp_celsius,
+			point->gpu_max_temp_celsius, point->ic_min_temp_celsius,
+			point->ic_max_temp_celsius);
+	}
+	return 0;
+}
+
+/* =============================  */
+/* Global and shared data between */
+/* all calls to this module       */
+/* ============================   */
+// Implemented like ideapad-laptop.c but currenlty still
+// wihtout dynamic memory allocation (instaed global _priv)
+
+struct legion_private {
+	struct platform_device *platform_device;
+	// TODO: remove or keep? init?
+	// struct acpi_device *adev;
+
+	// Method to access ECRAM
+	struct ecram ecram;
+	// Configuration with registers an ECRAM access method
+	const struct model_config *conf;
+
+	// TODO: maybe refactor an keep only local to each function
+	// last known fan curve
+	struct fancurve fancurve;
+	// configured fan curve from user space
+	struct fancurve fancurve_configured;
+
+	// update lock, when partial values of fancurve are changed
+	struct mutex fancurve_mutex;
+
+	//interfaces
+	struct dentry *debugfs_dir;
+	struct device *hwmon_dev;
+	struct platform_profile_handler platform_profile_handler;
+
+	// TODO: remove?
+	bool loaded;
+};
+
+// shared between different drivers: WMI, platform and proteced by mutex
+static struct legion_private *legion_shared;
+static struct legion_private _priv;
+static DEFINE_MUTEX(legion_shared_mutex);
+
+static int legion_shared_init(struct legion_private *priv)
+{
+	int ret;
+
+	mutex_lock(&legion_shared_mutex);
+
+	if (!legion_shared) {
+		legion_shared = priv;
+		mutex_init(&legion_shared->fancurve_mutex);
+		ret = 0;
+	} else {
+		pr_warn("Found multiple platform devices\n");
+		ret = -EINVAL;
+	}
+
+	priv->loaded = true;
+	mutex_unlock(&legion_shared_mutex);
+
+	return ret;
+}
+
+static void legion_shared_exit(struct legion_private *priv)
+{
+	pr_info("Unloading legion shared\n");
+	mutex_lock(&legion_shared_mutex);
+
+	if (legion_shared == priv)
+		legion_shared = NULL;
+
+	mutex_unlock(&legion_shared_mutex);
+	pr_info("Unloading legion shared done\n");
+}
+
+/* =============================  */
+/* debugfs interface              */
+/* ============================   */
+
+static int debugfs_ecmemory_show(struct seq_file *s, void *unused)
+{
+	struct legion_private *priv = s->private;
+	size_t offset;
+
+	for (offset = 0; offset < priv->conf->memoryio_size; ++offset) {
+		char value = ecram_read(&priv->ecram,
+					priv->conf->memoryio_physical_ec_start +
+						offset);
+
+		seq_write(s, &value, 1);
+	}
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(debugfs_ecmemory);
+
+static int debugfs_fancurve_show(struct seq_file *s, void *unused)
+{
+	struct legion_private *priv = s->private;
+	bool is_minifancurve;
+	bool is_lockfancontroller;
+	bool is_maximumfanspeed;
+	int err;
+
+	seq_printf(s, "EC Chip ID: %x\n", read_ec_id(&priv->ecram, priv->conf));
+	seq_printf(s, "EC Chip Version: %x\n",
+		   read_ec_version(&priv->ecram, priv->conf));
+	seq_printf(s, "legion_laptop features: %s\n", LEGIONFEATURES);
+	seq_printf(s, "legion_laptop ec_readonly: %d\n", ec_readonly);
+	read_fancurve(&priv->ecram, priv->conf, &priv->fancurve);
+
+	seq_printf(s, "minifancurve feature enabled: %d\n",
+		   priv->conf->has_minifancurve);
+	err = read_minifancurve(&priv->ecram, priv->conf, &is_minifancurve);
+	seq_printf(s, "minifancurve on cool: %s\n",
+		   err ? "error" : (is_minifancurve ? "true" : "false"));
+	err = read_lockfancontroller(&priv->ecram, priv->conf,
+				     &is_lockfancontroller);
+	seq_printf(s, "lock fan controller: %s\n",
+		   err ? "error" : (is_lockfancontroller ? "true" : "false"));
+	err = read_maximumfanspeed(&priv->ecram, priv->conf,
+				   &is_maximumfanspeed);
+	seq_printf(s, "enable maximumfanspeed: %s\n",
+		   err ? "error" : (is_maximumfanspeed ? "true" : "false"));
+	seq_printf(s, "enable maximumfanspeed status: %d\n", err);
+
+	seq_printf(s, "fan curve current point id: %ld\n",
+		   priv->fancurve.current_point_i);
+	seq_printf(s, "fan curve points size: %ld\n", priv->fancurve.size);
+
+	seq_puts(s, "Current fan curve in hardware (embedded controller):\n");
+	fancurve_print_seqfile(&priv->fancurve, s);
+	seq_puts(s, "=====================\n");
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(debugfs_fancurve);
+
+static void legion_debugfs_init(struct legion_private *priv)
+{
+	struct dentry *dir;
+
+	// TODO: remove this note
+	// Note: as other kernel modules, do not catch errors here
+	// because if kernel is build without debugfs this
+	// will return an error but module still has to
+	// work, just without debugfs
+	// TODO: what permissions; some modules do 400
+	// other do 444
+	dir = debugfs_create_dir(LEGION_DRVR_SHORTNAME, NULL);
+	debugfs_create_file("fancurve", 0444, dir, priv,
+			    &debugfs_fancurve_fops);
+	debugfs_create_file("ecmemory", 0444, dir, priv,
+			    &debugfs_ecmemory_fops);
+
+	priv->debugfs_dir = dir;
+}
+
+static void legion_debugfs_exit(struct legion_private *priv)
+{
+	pr_info("Unloading legion dubugfs\n");
+	// The following is does nothing if pointer is NULL
+	debugfs_remove_recursive(priv->debugfs_dir);
+	priv->debugfs_dir = NULL;
+	pr_info("Unloading legion dubugfs done\n");
+}
+
+/* =============================  */
+/* sysfs interface                */
+/* ============================   */
+
+static ssize_t powermode_show(struct device *dev, struct device_attribute *attr,
+			      char *buf)
+{
+	struct legion_private *priv = dev_get_drvdata(dev);
+	int power_mode = read_powermode(&priv->ecram, priv->conf);
+
+	return sysfs_emit(buf, "%d\n", power_mode);
+}
+
+static ssize_t powermode_store(struct device *dev,
+			       struct device_attribute *attr, const char *buf,
+			       size_t count)
+{
+	struct legion_private *priv = dev_get_drvdata(dev);
+	int powermode;
+	int err;
+
+	err = kstrtouint(buf, 0, &powermode);
+	if (err)
+		return err;
+
+	err = write_powermode(&priv->ecram, priv->conf, powermode);
+	if (err)
+		return -EINVAL;
+
+	// TODO: better?
+	// we have to wait a bit before change is done in hardware and
+	// readback done after notifying returns correct value, otherwise
+	// the notified reader will read old value
+	msleep(500);
+	platform_profile_notify();
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(powermode);
+
+static ssize_t lockfancontroller_show(struct device *dev,
+				      struct device_attribute *attr, char *buf)
+{
+	struct legion_private *priv = dev_get_drvdata(dev);
+	bool is_lockfancontroller;
+	int err;
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = read_lockfancontroller(&priv->ecram, priv->conf,
+				     &is_lockfancontroller);
+	mutex_unlock(&priv->fancurve_mutex);
+	if (err)
+		return -EINVAL;
+
+	return sysfs_emit(buf, "%d\n", is_lockfancontroller);
+}
+
+static ssize_t lockfancontroller_store(struct device *dev,
+				       struct device_attribute *attr,
+				       const char *buf, size_t count)
+{
+	struct legion_private *priv = dev_get_drvdata(dev);
+	bool is_lockfancontroller;
+	int err;
+
+	err = kstrtobool(buf, &is_lockfancontroller);
+	if (err)
+		return err;
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = write_lockfancontroller(&priv->ecram, priv->conf,
+				      is_lockfancontroller);
+	mutex_unlock(&priv->fancurve_mutex);
+	if (err)
+		return -EINVAL;
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(lockfancontroller);
+
+static ssize_t keyboard_backlight_show(struct device *dev,
+				       struct device_attribute *attr, char *buf)
+{
+	int state;
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	read_keyboard_backlight(&priv->ecram, priv->conf, &state);
+	return sysfs_emit(buf, "%d\n", state);
+}
+
+static ssize_t keyboard_backlight_store(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t count)
+{
+	struct legion_private *priv = dev_get_drvdata(dev);
+	int state;
+	int err;
+
+	err = kstrtouint(buf, 0, &state);
+	if (err)
+		return err;
+
+	err = write_keyboard_backlight(&priv->ecram, priv->conf, state);
+	if (err)
+		return -EINVAL;
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(keyboard_backlight);
+
+static struct attribute *legion_sysfs_attributes[] = {
+	&dev_attr_powermode.attr, &dev_attr_lockfancontroller.attr,
+	&dev_attr_keyboard_backlight.attr, NULL
+};
+
+static const struct attribute_group legion_attribute_group = {
+	.attrs = legion_sysfs_attributes
+};
+
+static int legion_sysfs_init(struct legion_private *priv)
+{
+	return device_add_group(&priv->platform_device->dev,
+				&legion_attribute_group);
+}
+
+static void legion_sysfs_exit(struct legion_private *priv)
+{
+	pr_info("Unloading legion sysfs\n");
+	device_remove_group(&priv->platform_device->dev,
+			    &legion_attribute_group);
+	pr_info("Unloading legion sysfs done\n");
+}
+
+/* =============================  */
+/* WMI + ACPI                     */
+/* ============================   */
+// heavily based on ideapad_laptop.c
+
+// TODO: proper names if meaning of all events is clear
+enum LEGION_WMI_EVENT {
+	LEGION_WMI_EVENT_GAMEZONE = 1,
+	LEGION_EVENT_A,
+	LEGION_EVENT_B,
+	LEGION_EVENT_C,
+	LEGION_EVENT_D,
+	LEGION_EVENT_E,
+	LEGION_EVENT_F,
+	LEGION_EVENT_G
+};
+
+struct legion_wmi_private {
+	enum LEGION_WMI_EVENT event;
+};
+
+//static void legion_wmi_notify2(u32 value, void *context)
+//    {
+//	pr_info("WMI notify\n" );
+//    }
+
+static void legion_wmi_notify(struct wmi_device *wdev, union acpi_object *data)
+{
+	struct legion_wmi_private *wpriv;
+	struct legion_private *priv;
+
+	mutex_lock(&legion_shared_mutex);
+	priv = legion_shared;
+	if ((!priv) && (priv->loaded)) {
+		pr_info("Received WMI event while not initialized!\n");
+		goto unlock;
+	}
+
+	wpriv = dev_get_drvdata(&wdev->dev);
+	switch (wpriv->event) {
+	case LEGION_EVENT_A:
+		pr_info("Fan event: legion type: %d;  acpi type: %d (%d=integer)",
+			wpriv->event, data->type, ACPI_TYPE_INTEGER);
+		// TODO: here it is too early (first unlock mutext, then wait a bit)
+		//platform_profile_notify();
+		break;
+	default:
+		pr_info("Event: legion type: %d;  acpi type: %d (%d=integer)",
+			wpriv->event, data->type, ACPI_TYPE_INTEGER);
+		break;
+	}
+
+unlock:
+	mutex_unlock(&legion_shared_mutex);
+	// todo; fix that!
+	// problem: we get a event just before the powermode change (from the key?),
+	// so if we notify to early, it will read the old power mode/platform profile
+	msleep(500);
+	platform_profile_notify();
+}
+
+static int legion_wmi_probe(struct wmi_device *wdev, const void *context)
+{
+	struct legion_wmi_private *wpriv;
+
+	wpriv = devm_kzalloc(&wdev->dev, sizeof(*wpriv), GFP_KERNEL);
+	if (!wpriv)
+		return -ENOMEM;
+
+	*wpriv = *(const struct legion_wmi_private *)context;
+
+	dev_set_drvdata(&wdev->dev, wpriv);
+	dev_info(&wdev->dev, "Register after probing for WMI.\n");
+	return 0;
+}
+
+static const struct legion_wmi_private legion_wmi_context_gamezone = {
+	.event = LEGION_WMI_EVENT_GAMEZONE
+};
+static const struct legion_wmi_private legion_wmi_context_a = {
+	.event = LEGION_EVENT_A
+};
+static const struct legion_wmi_private legion_wmi_context_b = {
+	.event = LEGION_EVENT_B
+};
+static const struct legion_wmi_private legion_wmi_context_c = {
+	.event = LEGION_EVENT_C
+};
+static const struct legion_wmi_private legion_wmi_context_d = {
+	.event = LEGION_EVENT_D
+};
+static const struct legion_wmi_private legion_wmi_context_e = {
+	.event = LEGION_EVENT_E
+};
+static const struct legion_wmi_private legion_wmi_context_f = {
+	.event = LEGION_EVENT_F
+};
+
+// check if really a method
+#define LEGION_WMI_GAMEZONE_GUID "887B54E3-DDDC-4B2C-8B88-68A26A8835D0"
+
+#define LEGION_WMI_GUID_FAN_EVENT "D320289E-8FEA-41E0-86F9-611D83151B5F"
+#define LEGION_WMI_GUID_FAN2_EVENT "bc72a435-e8c1-4275-b3e2-d8b8074aba59"
+#define LEGION_WMI_GUID_GAMEZONE_KEY_EVENT \
+	"10afc6d9-ea8b-4590-a2e7-1cd3c84bb4b1"
+#define LEGION_WMI_GUID_GAMEZONE_GPU_EVENT \
+	"bfd42481-aee3-4502-a107-afb68425c5f8"
+#define LEGION_WMI_GUID_GAMEZONE_OC_EVENT "d062906b-12d4-4510-999d-4831ee80e985"
+#define LEGION_WMI_GUID_GAMEZONE_TEMP_EVENT \
+	"bfd42481-aee3-4501-a107-afb68425c5f8"
+//#define LEGION_WMI_GUID_GAMEZONE_DATA_EVENT  "887b54e3-dddc-4b2c-8b88-68a26a8835d0"
+
+static const struct wmi_device_id legion_wmi_ids[] = {
+	{ LEGION_WMI_GAMEZONE_GUID, &legion_wmi_context_gamezone },
+	{ LEGION_WMI_GUID_FAN_EVENT, &legion_wmi_context_a },
+	{ LEGION_WMI_GUID_FAN2_EVENT, &legion_wmi_context_b },
+	{ LEGION_WMI_GUID_GAMEZONE_KEY_EVENT, &legion_wmi_context_c },
+	{ LEGION_WMI_GUID_GAMEZONE_GPU_EVENT, &legion_wmi_context_d },
+	{ LEGION_WMI_GUID_GAMEZONE_OC_EVENT, &legion_wmi_context_e },
+	{ LEGION_WMI_GUID_GAMEZONE_TEMP_EVENT, &legion_wmi_context_f },
+	{ "8FC0DE0C-B4E4-43FD-B0F3-8871711C1294",
+	  &legion_wmi_context_gamezone }, /* Legion 5 */
+	{},
+};
+MODULE_DEVICE_TABLE(wmi, legion_wmi_ids);
+
+static struct wmi_driver legion_wmi_driver = {
+	.driver = {
+		.name = "legion_wmi",
+	},
+	.id_table = legion_wmi_ids,
+	.probe = legion_wmi_probe,
+	.notify = legion_wmi_notify,
+};
+
+//acpi_status status = wmi_install_notify_handler(LEGION_WMI_GAMEZONE_GUID,
+//				legion_wmi_notify2, NULL);
+//if (ACPI_FAILURE(status)) {
+//    return -ENODEV;
+//}
+//return 0;
+
+static int legion_wmi_init(void)
+{
+	return wmi_driver_register(&legion_wmi_driver);
+}
+
+static void legion_wmi_exit(void)
+{
+	// TODO: remove this
+	pr_info("Unloading legion WMI\n");
+
+	//wmi_remove_notify_handler(LEGION_WMI_GAMEZONE_GUID);
+	wmi_driver_unregister(&legion_wmi_driver);
+	pr_info("Unloading legion WMI done\n");
+}
+
+/* =============================  */
+/* Platform profile               */
+/* ============================   */
+
+enum LEGION_POWERMODE {
+	LEGION_POWERMODE_BALANCED = 0,
+	LEGION_POWERMODE_PERFORMANCE = 1,
+	LEGION_POWERMODE_QUIET = 2,
+};
+
+static int legion_platform_profile_get(struct platform_profile_handler *pprof,
+				       enum platform_profile_option *profile)
+{
+	int powermode;
+	struct legion_private *priv;
+
+	priv = container_of(pprof, struct legion_private,
+			    platform_profile_handler);
+	powermode = read_powermode(&priv->ecram, priv->conf);
+
+	switch (powermode) {
+	case LEGION_POWERMODE_BALANCED:
+		*profile = PLATFORM_PROFILE_BALANCED;
+		break;
+	case LEGION_POWERMODE_PERFORMANCE:
+		*profile = PLATFORM_PROFILE_PERFORMANCE;
+		break;
+	case LEGION_POWERMODE_QUIET:
+		*profile = PLATFORM_PROFILE_QUIET;
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int legion_platform_profile_set(struct platform_profile_handler *pprof,
+				       enum platform_profile_option profile)
+{
+	int powermode;
+	struct legion_private *priv;
+
+	priv = container_of(pprof, struct legion_private,
+			    platform_profile_handler);
+
+	switch (profile) {
+	case PLATFORM_PROFILE_BALANCED:
+		powermode = LEGION_POWERMODE_BALANCED;
+		break;
+	case PLATFORM_PROFILE_PERFORMANCE:
+		powermode = LEGION_POWERMODE_PERFORMANCE;
+		break;
+	case PLATFORM_PROFILE_QUIET:
+		powermode = LEGION_POWERMODE_QUIET;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return write_powermode(&priv->ecram, priv->conf, powermode);
+}
+
+static int legion_platform_profile_init(struct legion_private *priv)
+{
+	int err;
+
+	priv->platform_profile_handler.profile_get =
+		legion_platform_profile_get;
+	priv->platform_profile_handler.profile_set =
+		legion_platform_profile_set;
+
+	set_bit(PLATFORM_PROFILE_QUIET, priv->platform_profile_handler.choices);
+	set_bit(PLATFORM_PROFILE_BALANCED,
+		priv->platform_profile_handler.choices);
+	set_bit(PLATFORM_PROFILE_PERFORMANCE,
+		priv->platform_profile_handler.choices);
+
+	err = platform_profile_register(&priv->platform_profile_handler);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static void legion_platform_profile_exit(struct legion_private *priv)
+{
+	pr_info("Unloading legion platform profile\n");
+	platform_profile_remove();
+	pr_info("Unloading legion platform profile done\n");
+}
+
+/* =============================  */
+/* hwom interface              */
+/* ============================   */
+
+// hw-mon interface
+
+// todo: register_group or register_info?
+
+// TODO: use one common function (like here) or one function per attribute?
+static ssize_t sensor_label_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	int sensor_id = (to_sensor_dev_attr(attr))->index;
+	const char *label;
+
+	switch (sensor_id) {
+	case SENSOR_CPU_TEMP_ID:
+		label = "CPU Temperature\n";
+		break;
+	case SENSOR_GPU_TEMP_ID:
+		label = "GPU Temperature\n";
+		break;
+	case SENSOR_IC_TEMP_ID:
+		label = "IC Temperature\n";
+		break;
+	case SENSOR_FAN1_RPM_ID:
+		label = "Fan 1\n";
+		break;
+	case SENSOR_FAN2_RPM_ID:
+		label = "Fan 2\n";
+		break;
+	case SENSOR_FAN1_TARGET_RPM_ID:
+		label = "Fan 1 Target\n";
+		break;
+	case SENSOR_FAN2_TARGET_RPM_ID:
+		label = "Fan 2 Target\n";
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return sprintf(buf, label);
+}
+
+// TODO: use one common function (like here) or one function per attribute?
+static ssize_t sensor_show(struct device *dev, struct device_attribute *devattr,
+			   char *buf)
+{
+	struct legion_private *priv = dev_get_drvdata(dev);
+	int sensor_id = (to_sensor_dev_attr(devattr))->index;
+	struct sensor_values values;
+	int outval;
+
+	read_sensor_values(&priv->ecram, priv->conf, &values);
+
+	switch (sensor_id) {
+	case SENSOR_CPU_TEMP_ID:
+		outval = 1000 * values.cpu_temp_celsius;
+		break;
+	case SENSOR_GPU_TEMP_ID:
+		outval = 1000 * values.gpu_temp_celsius;
+		break;
+	case SENSOR_IC_TEMP_ID:
+		outval = 1000 * values.ic_temp_celsius;
+		break;
+	case SENSOR_FAN1_RPM_ID:
+		outval = values.fan1_rpm;
+		break;
+	case SENSOR_FAN2_RPM_ID:
+		outval = values.fan2_rpm;
+		break;
+	case SENSOR_FAN1_TARGET_RPM_ID:
+		outval = values.fan1_target_rpm;
+		break;
+	case SENSOR_FAN2_TARGET_RPM_ID:
+		outval = values.fan2_target_rpm;
+		break;
+	default:
+		pr_info("Error reading sensor value with id %d\n", sensor_id);
+		return -EOPNOTSUPP;
+	}
+
+	return sprintf(buf, "%d\n", outval);
+}
+
+static SENSOR_DEVICE_ATTR_RO(temp1_input, sensor, SENSOR_CPU_TEMP_ID);
+static SENSOR_DEVICE_ATTR_RO(temp1_label, sensor_label, SENSOR_CPU_TEMP_ID);
+static SENSOR_DEVICE_ATTR_RO(temp2_input, sensor, SENSOR_GPU_TEMP_ID);
+static SENSOR_DEVICE_ATTR_RO(temp2_label, sensor_label, SENSOR_GPU_TEMP_ID);
+static SENSOR_DEVICE_ATTR_RO(temp3_input, sensor, SENSOR_IC_TEMP_ID);
+static SENSOR_DEVICE_ATTR_RO(temp3_label, sensor_label, SENSOR_IC_TEMP_ID);
+static SENSOR_DEVICE_ATTR_RO(fan1_input, sensor, SENSOR_FAN1_RPM_ID);
+static SENSOR_DEVICE_ATTR_RO(fan1_label, sensor_label, SENSOR_FAN1_RPM_ID);
+static SENSOR_DEVICE_ATTR_RO(fan2_input, sensor, SENSOR_FAN2_RPM_ID);
+static SENSOR_DEVICE_ATTR_RO(fan2_label, sensor_label, SENSOR_FAN2_RPM_ID);
+static SENSOR_DEVICE_ATTR_RO(fan1_target, sensor, SENSOR_FAN1_TARGET_RPM_ID);
+static SENSOR_DEVICE_ATTR_RO(fan2_target, sensor, SENSOR_FAN2_TARGET_RPM_ID);
+
+static struct attribute *sensor_hwmon_attributes[] = {
+	&sensor_dev_attr_temp1_input.dev_attr.attr,
+	&sensor_dev_attr_temp1_label.dev_attr.attr,
+	&sensor_dev_attr_temp2_input.dev_attr.attr,
+	&sensor_dev_attr_temp2_label.dev_attr.attr,
+	&sensor_dev_attr_temp3_input.dev_attr.attr,
+	&sensor_dev_attr_temp3_label.dev_attr.attr,
+	&sensor_dev_attr_fan1_input.dev_attr.attr,
+	&sensor_dev_attr_fan1_label.dev_attr.attr,
+	&sensor_dev_attr_fan2_input.dev_attr.attr,
+	&sensor_dev_attr_fan2_label.dev_attr.attr,
+	&sensor_dev_attr_fan1_target.dev_attr.attr,
+	&sensor_dev_attr_fan2_target.dev_attr.attr,
+	NULL
+};
+
+static ssize_t autopoint_show(struct device *dev,
+			      struct device_attribute *devattr, char *buf)
+{
+	struct fancurve fancurve;
+	int err;
+	int value;
+	struct legion_private *priv = dev_get_drvdata(dev);
+	int fancurve_attr_id = to_sensor_dev_attr_2(devattr)->nr;
+	int point_id = to_sensor_dev_attr_2(devattr)->index;
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = read_fancurve(&priv->ecram, priv->conf, &fancurve);
+	mutex_unlock(&priv->fancurve_mutex);
+
+	if (err) {
+		pr_info("Reading fancurve failed\n");
+		return -EOPNOTSUPP;
+	}
+	if (!(point_id >= 0 && point_id < MAXFANCURVESIZE)) {
+		pr_info("Reading fancurve failed due to wrong point id: %d\n",
+			point_id);
+		return -EOPNOTSUPP;
+	}
+
+	switch (fancurve_attr_id) {
+	case FANCURVE_ATTR_PWM1:
+		value = fancurve.points[point_id].rpm1_raw * 100;
+		break;
+	case FANCURVE_ATTR_PWM2:
+		value = fancurve.points[point_id].rpm2_raw * 100;
+		break;
+	case FANCURVE_ATTR_CPU_TEMP:
+		value = fancurve.points[point_id].cpu_max_temp_celsius;
+		break;
+	case FANCURVE_ATTR_CPU_HYST:
+		value = fancurve.points[point_id].cpu_min_temp_celsius;
+		break;
+	case FANCURVE_ATTR_GPU_TEMP:
+		value = fancurve.points[point_id].gpu_max_temp_celsius;
+		break;
+	case FANCURVE_ATTR_GPU_HYST:
+		value = fancurve.points[point_id].gpu_min_temp_celsius;
+		break;
+	case FANCURVE_ATTR_IC_TEMP:
+		value = fancurve.points[point_id].ic_max_temp_celsius;
+		break;
+	case FANCURVE_ATTR_IC_HYST:
+		value = fancurve.points[point_id].ic_min_temp_celsius;
+		break;
+	case FANCURVE_ATTR_ACCEL:
+		value = fancurve.points[point_id].accel;
+		break;
+	case FANCURVE_ATTR_DECEL:
+		value = fancurve.points[point_id].decel;
+		break;
+	case FANCURVE_SIZE:
+		value = fancurve.size;
+		break;
+	default:
+		pr_info("Reading fancurve failed due to wrong attribute id: %d\n",
+			fancurve_attr_id);
+		return -EOPNOTSUPP;
+	}
+
+	return sprintf(buf, "%d\n", value);
+}
+
+static ssize_t autopoint_store(struct device *dev,
+			       struct device_attribute *devattr,
+			       const char *buf, size_t count)
+{
+	struct fancurve fancurve;
+	int err;
+	int value;
+	bool valid;
+	struct legion_private *priv = dev_get_drvdata(dev);
+	int fancurve_attr_id = to_sensor_dev_attr_2(devattr)->nr;
+	int point_id = to_sensor_dev_attr_2(devattr)->index;
+
+	if (!(point_id >= 0 && point_id < MAXFANCURVESIZE)) {
+		pr_info("Reading fancurve failed due to wrong point id: %d\n",
+			point_id);
+		err = -EOPNOTSUPP;
+		goto error;
+	}
+
+	err = kstrtoint(buf, 0, &value);
+	if (err) {
+		pr_info("Parse for hwmon store is not succesful: error:%d; point_id: %d; fancurve_attr_id: %d\\n",
+			err, point_id, fancurve_attr_id);
+		goto error;
+	}
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = read_fancurve(&priv->ecram, priv->conf, &fancurve);
+
+	if (err) {
+		pr_info("Reading fancurve failed\n");
+		err = -EOPNOTSUPP;
+		goto error_mutex;
+	}
+
+	switch (fancurve_attr_id) {
+	case FANCURVE_ATTR_PWM1:
+		valid = fancurve_set_rpm1(&fancurve, point_id, value);
+		break;
+	case FANCURVE_ATTR_PWM2:
+		valid = fancurve_set_rpm2(&fancurve, point_id, value);
+		break;
+	case FANCURVE_ATTR_CPU_TEMP:
+		valid = fancurve_set_cpu_temp_max(&fancurve, point_id, value);
+		break;
+	case FANCURVE_ATTR_CPU_HYST:
+		valid = fancurve_set_cpu_temp_min(&fancurve, point_id, value);
+		break;
+	case FANCURVE_ATTR_GPU_TEMP:
+		valid = fancurve_set_gpu_temp_max(&fancurve, point_id, value);
+		break;
+	case FANCURVE_ATTR_GPU_HYST:
+		valid = fancurve_set_gpu_temp_min(&fancurve, point_id, value);
+		break;
+	case FANCURVE_ATTR_IC_TEMP:
+		valid = fancurve_set_ic_temp_max(&fancurve, point_id, value);
+		break;
+	case FANCURVE_ATTR_IC_HYST:
+		valid = fancurve_set_ic_temp_min(&fancurve, point_id, value);
+		break;
+	case FANCURVE_ATTR_ACCEL:
+		valid = fancurve_set_accel(&fancurve, point_id, value);
+		break;
+	case FANCURVE_ATTR_DECEL:
+		valid = fancurve_set_decel(&fancurve, point_id, value);
+		break;
+	case FANCURVE_SIZE:
+		valid = fancurve_set_size(&fancurve, value, true);
+		break;
+	default:
+		pr_info("Writing fancurve failed due to wrong attribute id: %d\n",
+			fancurve_attr_id);
+		err = -EOPNOTSUPP;
+		goto error_mutex;
+	}
+
+	if (!valid) {
+		pr_info("Ignoring invalid fancurve value %d for attribute %d at point %d\n",
+			value, fancurve_attr_id, point_id);
+		err = -EOPNOTSUPP;
+		goto error_mutex;
+	}
+
+	err = write_fancurve(&priv->ecram, priv->conf, &fancurve, false);
+	if (err) {
+		pr_info("Writing fancurve failed for accessing hwmon at point_id: %d\n",
+			point_id);
+		err = -EOPNOTSUPP;
+		goto error_mutex;
+	}
+
+	mutex_unlock(&priv->fancurve_mutex);
+	return count;
+
+error_mutex:
+	mutex_unlock(&priv->fancurve_mutex);
+error:
+	return count;
+}
+
+// rpm1
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 9);
+// rpm2
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point1_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point2_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point3_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point4_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point5_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point6_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point7_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point8_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point9_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point10_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 9);
+// CPU temp
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 9);
+// CPU temp hyst
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 9);
+// GPU temp
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point1_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point2_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point3_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point4_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point5_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point6_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point7_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point8_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point9_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point10_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 9);
+// GPU temp hyst
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point1_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point2_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point3_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point4_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point5_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point6_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point7_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point8_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point9_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point10_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 9);
+// IC temp
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point1_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point2_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point3_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point4_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point5_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point6_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point7_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point8_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point9_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point10_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 9);
+// IC temp hyst
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point1_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point2_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point3_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point4_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point5_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point6_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point7_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point8_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point9_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point10_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 9);
+// accel
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 9);
+// decel
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 9);
+//size
+static SENSOR_DEVICE_ATTR_2_RW(auto_points_size, autopoint, FANCURVE_SIZE, 0);
+
+static ssize_t minifancurve_show(struct device *dev,
+				 struct device_attribute *devattr, char *buf)
+{
+	bool value;
+	int err;
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = read_minifancurve(&priv->ecram, priv->conf, &value);
+	if (err) {
+		err = -1;
+		pr_info("Reading minifancurve not succesful\n");
+		goto error_unlock;
+	}
+	mutex_unlock(&priv->fancurve_mutex);
+	return sprintf(buf, "%d\n", value);
+
+error_unlock:
+	mutex_unlock(&priv->fancurve_mutex);
+	return -1;
+}
+
+static ssize_t minifancurve_store(struct device *dev,
+				  struct device_attribute *devattr,
+				  const char *buf, size_t count)
+{
+	int value;
+	int err;
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	err = kstrtoint(buf, 0, &value);
+	if (err) {
+		err = -1;
+		pr_info("Parse for hwmon store is not succesful: error:%d\n",
+			err);
+		goto error;
+	}
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = write_minifancurve(&priv->ecram, priv->conf, value);
+	if (err) {
+		err = -1;
+		pr_info("Writing minifancurve not succesful\n");
+		goto error_unlock;
+	}
+	mutex_unlock(&priv->fancurve_mutex);
+	return count;
+
+error_unlock:
+	mutex_unlock(&priv->fancurve_mutex);
+error:
+	return err;
+}
+
+static SENSOR_DEVICE_ATTR_RW(minifancurve, minifancurve, 0);
+
+static ssize_t pwm1_mode_show(struct device *dev,
+			      struct device_attribute *devattr, char *buf)
+{
+	bool value;
+	int err;
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = read_maximumfanspeed(&priv->ecram, priv->conf, &value);
+	if (err) {
+		err = -1;
+		pr_info("Reading pwm1_mode/maximumfanspeed not succesful\n");
+		goto error_unlock;
+	}
+	mutex_unlock(&priv->fancurve_mutex);
+	return sprintf(buf, "%d\n", value ? 0 : 2);
+
+error_unlock:
+	mutex_unlock(&priv->fancurve_mutex);
+	return -1;
+}
+
+static ssize_t pwm1_mode_store(struct device *dev,
+			       struct device_attribute *devattr,
+			       const char *buf, size_t count)
+{
+	int value;
+	int is_maximumfanspeed;
+	int err;
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	err = kstrtoint(buf, 0, &value);
+	if (err) {
+		err = -1;
+		pr_info("Parse for hwmon store is not succesful: error:%d\n",
+			err);
+		goto error;
+	}
+	is_maximumfanspeed = value == 0;
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = write_maximumfanspeed(&priv->ecram, priv->conf,
+				    is_maximumfanspeed);
+	if (err) {
+		err = -1;
+		pr_info("Writing pwm1_mode/maximumfanspeed not succesful\n");
+		goto error_unlock;
+	}
+	mutex_unlock(&priv->fancurve_mutex);
+	return count;
+
+error_unlock:
+	mutex_unlock(&priv->fancurve_mutex);
+error:
+	return err;
+}
+
+static SENSOR_DEVICE_ATTR_RW(pwm1_mode, pwm1_mode, 0);
+
+static struct attribute *fancurve_hwmon_attributes[] = {
+	&sensor_dev_attr_pwm1_auto_point1_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point2_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point3_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point4_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point5_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point6_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point7_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point8_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point9_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point10_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point1_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point2_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point3_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point4_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point5_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point6_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point7_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point8_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point9_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point10_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point1_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point2_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point3_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point4_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point5_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point6_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point7_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point8_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point9_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point10_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point1_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point2_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point3_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point4_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point5_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point6_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point7_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point8_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point9_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point10_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point1_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point2_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point3_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point4_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point5_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point6_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point7_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point8_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point9_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point10_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point1_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point2_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point3_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point4_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point5_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point6_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point7_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point8_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point9_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point10_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point1_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point2_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point3_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point4_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point5_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point6_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point7_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point8_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point9_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point10_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point1_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point2_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point3_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point4_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point5_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point6_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point7_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point8_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point9_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point10_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point1_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point2_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point3_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point4_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point5_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point6_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point7_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point8_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point9_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point10_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point1_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point2_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point3_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point4_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point5_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point6_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point7_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point8_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point9_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point10_decel.dev_attr.attr,
+	//
+	&sensor_dev_attr_auto_points_size.dev_attr.attr,
+	&sensor_dev_attr_minifancurve.dev_attr.attr,
+	&sensor_dev_attr_pwm1_mode.dev_attr.attr, NULL
+};
+
+static umode_t legion_is_visible(struct kobject *kobj, struct attribute *attr,
+				 int idx)
+{
+	bool supported = true;
+	struct device *dev = kobj_to_dev(kobj);
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	if (attr == &sensor_dev_attr_minifancurve.dev_attr.attr)
+		supported = priv->conf->has_minifancurve;
+
+	return supported ? attr->mode : 0;
+}
+
+static const struct attribute_group legion_hwmon_sensor_group = {
+	.attrs = sensor_hwmon_attributes,
+	.is_visible = NULL
+};
+
+static const struct attribute_group legion_hwmon_fancurve_group = {
+	.attrs = fancurve_hwmon_attributes,
+	.is_visible = legion_is_visible,
+};
+
+static const struct attribute_group *legion_hwmon_groups[] = {
+	&legion_hwmon_sensor_group, &legion_hwmon_fancurve_group, NULL
+};
+
+ssize_t legion_hwmon_init(struct legion_private *priv)
+{
+	//TODO: use hwmon_device_register_with_groups or
+	// hwmon_device_register_with_info (latter means all hwmon functions have to be
+	// changed)
+	// some laptop driver do it in one way, some in the other
+	// TODO: Use devm_hwmon_device_register_with_groups ?
+	// some laptop drivers use this, some
+	struct device *hwmon_dev = hwmon_device_register_with_groups(
+		&priv->platform_device->dev, "legion_hwmon", priv,
+		legion_hwmon_groups);
+	if (IS_ERR_OR_NULL(hwmon_dev)) {
+		pr_err("hwmon_device_register failed!\n");
+		return PTR_ERR(hwmon_dev);
+	}
+	dev_set_drvdata(hwmon_dev, priv);
+	priv->hwmon_dev = hwmon_dev;
+	return 0;
+}
+
+void legion_hwmon_exit(struct legion_private *priv)
+{
+	pr_info("Unloading legion hwon\n");
+	if (priv->hwmon_dev) {
+		hwmon_device_unregister(priv->hwmon_dev);
+		priv->hwmon_dev = NULL;
+	}
+	pr_info("Unloading legion hwon done\n");
+}
+
+/* =============================  */
+/* Platform driver                */
+/* ============================   */
+
+int legion_add(struct platform_device *pdev)
+{
+	struct legion_private *priv;
+	const struct dmi_system_id *dmi_sys;
+	int err;
+	u16 ec_read_id;
+	bool is_denied = true;
+	bool is_allowed = false;
+	bool do_load_by_list = false;
+	bool do_load = false;
+	//struct legion_private *priv = dev_get_drvdata(&pdev->dev);
+	dev_info(&pdev->dev, "legion_laptop platform driver probing\n");
+
+	dev_info(&pdev->dev, "Read identifying information: DMI_SYS_VENDOR: %s; DMI_PRODUCT_NAME: %s; DMI_BIOS_VERSION:%s\n",
+		dmi_get_system_info(DMI_SYS_VENDOR),
+		dmi_get_system_info(DMI_PRODUCT_NAME),
+		dmi_get_system_info(DMI_BIOS_VERSION));
+
+	// TODO: allocate?
+	priv = &_priv;
+	priv->platform_device = pdev;
+	err = legion_shared_init(priv);
+	if (err) {
+		dev_info(&pdev->dev, "legion_laptop is forced to load.\n");
+		goto err_legion_shared_init;
+	}
+	dev_set_drvdata(&pdev->dev, priv);
+
+	// TODO: remove
+	pr_info("Read identifying information: DMI_SYS_VENDOR: %s; DMI_PRODUCT_NAME: %s; DMI_BIOS_VERSION:%s\n",
+		dmi_get_system_info(DMI_SYS_VENDOR),
+		dmi_get_system_info(DMI_PRODUCT_NAME),
+		dmi_get_system_info(DMI_BIOS_VERSION));
+
+	dmi_sys = dmi_first_match(optimistic_allowlist);
+	is_allowed = dmi_sys != NULL;
+	is_denied = dmi_check_system(denylist);
+	do_load_by_list = is_allowed && !is_denied;
+	do_load = do_load_by_list || force;
+
+	dev_info(
+		&pdev->dev,
+		"is_denied: %d; is_allowed: %d; do_load_by_list: %d; do_load: %d\n",
+		is_denied, is_allowed, do_load_by_list, do_load);
+
+	if (!(do_load)) {
+		dev_info(
+			&pdev->dev,
+			"Module not useable for this laptop because it is not in allowlist. Notify maintainer if you want to add your device or force load with param force.\n");
+		err = -ENOMEM;
+		goto err_model_mismtach;
+	}
+
+	if (force)
+		dev_info(&pdev->dev, "legion_laptop is forced to load.\n");
+
+	if (!do_load_by_list && do_load) {
+		dev_info(
+			&pdev->dev,
+			"legion_laptop is forced to load and would otherwise be not loaded\n");
+	}
+
+	// if forced and no module found, use config for first model
+	if (dmi_sys == NULL)
+		dmi_sys = &optimistic_allowlist[0];
+	dev_info(&pdev->dev, "Using configuration for system: %s\n",
+		 dmi_sys->ident);
+
+	priv->conf = dmi_sys->driver_data;
+
+	err = ecram_init(&priv->ecram, priv->conf->memoryio_physical_ec_start,
+			 priv->conf->memoryio_size);
+	if (err) {
+		dev_info(&pdev->dev,
+			 "Could not init access to embedded controller\n");
+		goto err_ecram_init;
+	}
+
+	ec_read_id = read_ec_id(&priv->ecram, priv->conf);
+	dev_info(&pdev->dev, "Read embedded controller ID 0x%x\n", ec_read_id);
+	if (priv->conf->check_embedded_controller_id &&
+	    !(ec_read_id == priv->conf->embedded_controller_id)) {
+		err = -ENOMEM;
+		dev_info(&pdev->dev, "Expected EC chip id 0x%x but read 0x%x\n",
+			 priv->conf->embedded_controller_id, ec_read_id);
+		goto err_ecram_id;
+	}
+	if (!priv->conf->check_embedded_controller_id) {
+		dev_info(&pdev->dev,
+			 "Skipped checking embedded controller id\n");
+	}
+
+	dev_info(&pdev->dev, "Creating debugfs inteface\n");
+	legion_debugfs_init(priv);
+
+	pr_info("Creating sysfs inteface\n");
+	err = legion_sysfs_init(priv);
+	if (err) {
+		dev_info(&pdev->dev, "Creating sysfs interface failed\n");
+		goto err_sysfs_init;
+	}
+
+	pr_info("Creating hwmon interface");
+	err = legion_hwmon_init(priv);
+	if (err)
+		goto err_hwmon_init;
+
+	pr_info("Creating platform profile support\n");
+	err = legion_platform_profile_init(priv);
+	if (err) {
+		dev_info(&pdev->dev, "Creating platform profile failed\n");
+		goto err_platform_profile;
+	}
+
+	pr_info("Init WMI driver support\n");
+	err = legion_wmi_init();
+	if (err) {
+		dev_info(&pdev->dev, "Init WMI driver failed\n");
+		goto err_wmi;
+	}
+
+	dev_info(&pdev->dev, "legion_laptop loaded for this device\n");
+	return 0;
+
+	// TODO: remove eventually
+	legion_wmi_exit();
+err_wmi:
+	legion_platform_profile_exit(priv);
+err_platform_profile:
+	legion_hwmon_exit(priv);
+err_hwmon_init:
+	legion_sysfs_exit(priv);
+err_sysfs_init:
+	legion_debugfs_exit(priv);
+err_ecram_id:
+	ecram_exit(&priv->ecram);
+err_ecram_init:
+	legion_shared_exit(priv);
+err_legion_shared_init:
+err_model_mismtach:
+	dev_info(&pdev->dev, "legion_laptop not loaded for this device\n");
+	return err;
+}
+
+int legion_remove(struct platform_device *pdev)
+{
+	struct legion_private *priv = dev_get_drvdata(&pdev->dev);
+
+	mutex_lock(&legion_shared_mutex);
+	priv->loaded = false;
+	mutex_unlock(&legion_shared_mutex);
+
+	// first unregister wmi, so toggling powermode does not
+	// generate events anymore that even might be delayed
+	legion_wmi_exit();
+	legion_platform_profile_exit(priv);
+
+	// toggle power mode to load default setting from embedded controller
+	// again
+	toggle_powermode(&priv->ecram, priv->conf);
+
+	legion_hwmon_exit(priv);
+	legion_sysfs_exit(priv);
+	legion_debugfs_exit(priv);
+	ecram_exit(&priv->ecram);
+	legion_shared_exit(priv);
+
+	pr_info("Legion platform unloaded\n");
+	return 0;
+}
+
+int legion_resume(struct platform_device *pdev)
+{
+	//struct legion_private *priv = dev_get_drvdata(&pdev->dev);
+	dev_info(&pdev->dev, "Resumed in legion-laptop\n");
+
+	return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int legion_pm_resume(struct device *dev)
+{
+	//struct legion_private *priv = dev_get_drvdata(dev);
+	dev_info(dev, "Resumed PM in legion-laptop\n");
+
+	return 0;
+}
+#endif
+static SIMPLE_DEV_PM_OPS(legion_pm, NULL, legion_pm_resume);
+
+// same as ideapad
+static const struct acpi_device_id legion_device_ids[] = {
+	{ "PNP0C09", 0 }, // todo: change to "VPC2004"
+	{ "", 0 },
+};
+MODULE_DEVICE_TABLE(acpi, legion_device_ids);
+
+static struct platform_driver legion_driver = {
+	.probe = legion_add,
+	.remove = legion_remove,
+	.resume = legion_resume,
+	.driver = {
+		.name   = "legion",
+		.pm     = &legion_pm,
+		.acpi_match_table = ACPI_PTR(legion_device_ids),
+	},
+};
+
+int __init legion_init(void)
+{
+	int err;
+
+	pr_info("legion_laptop starts loading\n");
+	err = platform_driver_register(&legion_driver);
+	if (err) {
+		pr_info("legion_laptop: platform_driver_register failed\n");
+		return err;
+	}
+
+	return 0;
+}
+
+module_init(legion_init);
+
+void __exit legion_exit(void)
+{
+	platform_driver_unregister(&legion_driver);
+	pr_info("legion_laptop exit\n");
+}
+
+module_exit(legion_exit);
diff --git a/drivers/platform/x86/steamdeck.c b/drivers/platform/x86/steamdeck.c
new file mode 100644
index 000000000000..77a6677ec19e
--- /dev/null
+++ b/drivers/platform/x86/steamdeck.c
@@ -0,0 +1,523 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+/*
+ * Steam Deck ACPI platform driver
+ *
+ * Copyright (C) 2021-2022 Valve Corporation
+ *
+ */
+#include <linux/acpi.h>
+#include <linux/hwmon.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/extcon-provider.h>
+
+#define ACPI_STEAMDECK_NOTIFY_STATUS	0x80
+
+/* 0 - port connected, 1 -port disconnected */
+#define ACPI_STEAMDECK_PORT_CONNECT	BIT(0)
+/* 0 - Upstream Facing Port, 1 - Downdstream Facing Port */
+#define ACPI_STEAMDECK_CUR_DATA_ROLE	BIT(3)
+/*
+ * Debouncing delay to allow negotiation process to settle. 2s value
+ * was arrived at via trial and error.
+ */
+#define STEAMDECK_ROLE_SWITCH_DELAY	(msecs_to_jiffies(2000))
+
+struct steamdeck {
+	struct acpi_device *adev;
+	struct device *hwmon;
+	void *regmap;
+	long fan_target;
+	struct delayed_work role_work;
+	struct extcon_dev *edev;
+	struct device *dev;
+};
+
+static ssize_t
+steamdeck_simple_store(struct device *dev, const char *buf, size_t count,
+		       const char *method,
+		       unsigned long upper_limit)
+{
+	struct steamdeck *fan = dev_get_drvdata(dev);
+	unsigned long value;
+
+	if (kstrtoul(buf, 10, &value) || value >= upper_limit)
+		return -EINVAL;
+
+	if (ACPI_FAILURE(acpi_execute_simple_method(fan->adev->handle,
+						    (char *)method, value)))
+		return -EIO;
+
+	return count;
+}
+
+#define STEAMDECK_ATTR_WO(_name, _method, _upper_limit)			\
+	static ssize_t _name##_store(struct device *dev,		\
+				     struct device_attribute *attr,	\
+				     const char *buf, size_t count)	\
+	{								\
+		return steamdeck_simple_store(dev, buf, count,		\
+					    _method,			\
+					    _upper_limit);		\
+	}								\
+	static DEVICE_ATTR_WO(_name)
+
+STEAMDECK_ATTR_WO(target_cpu_temp, "STCT", U8_MAX / 2);
+STEAMDECK_ATTR_WO(gain, "SGAN", U16_MAX);
+STEAMDECK_ATTR_WO(ramp_rate, "SFRR", U8_MAX);
+STEAMDECK_ATTR_WO(hysteresis, "SHTS",  U16_MAX);
+STEAMDECK_ATTR_WO(maximum_battery_charge_rate, "CHGR", U16_MAX);
+STEAMDECK_ATTR_WO(recalculate, "SCHG", U16_MAX);
+
+STEAMDECK_ATTR_WO(led_brightness, "CHBV", U8_MAX);
+STEAMDECK_ATTR_WO(content_adaptive_brightness, "CABC", U8_MAX);
+STEAMDECK_ATTR_WO(gamma_set, "GAMA", U8_MAX);
+STEAMDECK_ATTR_WO(display_brightness, "WDBV", U8_MAX);
+STEAMDECK_ATTR_WO(ctrl_display, "WCDV", U8_MAX);
+STEAMDECK_ATTR_WO(cabc_minimum_brightness, "WCMB", U8_MAX);
+STEAMDECK_ATTR_WO(memory_data_access_control, "MDAC", U8_MAX);
+
+#define STEAMDECK_ATTR_WO_NOARG(_name, _method)				\
+	static ssize_t _name##_store(struct device *dev,		\
+				     struct device_attribute *attr,	\
+				     const char *buf, size_t count)	\
+	{								\
+		struct steamdeck *fan = dev_get_drvdata(dev);		\
+									\
+		if (ACPI_FAILURE(acpi_evaluate_object(fan->adev->handle, \
+						      _method, NULL, NULL))) \
+			return -EIO;					\
+									\
+		return count;						\
+	}								\
+	static DEVICE_ATTR_WO(_name)
+
+STEAMDECK_ATTR_WO_NOARG(power_cycle_display, "DPCY");
+STEAMDECK_ATTR_WO_NOARG(display_normal_mode_on, "NORO");
+STEAMDECK_ATTR_WO_NOARG(display_inversion_off, "INOF");
+STEAMDECK_ATTR_WO_NOARG(display_inversion_on, "INON");
+STEAMDECK_ATTR_WO_NOARG(idle_mode_on, "WRNE");
+
+#define STEAMDECK_ATTR_RO(_name, _method)				\
+	static ssize_t _name##_show(struct device *dev,			\
+				    struct device_attribute *attr,	\
+				    char *buf)				\
+	{								\
+		struct steamdeck *jup = dev_get_drvdata(dev);		\
+		unsigned long long val;					\
+									\
+		if (ACPI_FAILURE(acpi_evaluate_integer(			\
+					 jup->adev->handle,		\
+					 _method, NULL, &val)))		\
+			return -EIO;					\
+									\
+		return sprintf(buf, "%llu\n", val);			\
+	}								\
+	static DEVICE_ATTR_RO(_name)
+
+STEAMDECK_ATTR_RO(firmware_version, "PDFW");
+STEAMDECK_ATTR_RO(board_id, "BOID");
+STEAMDECK_ATTR_RO(pdcs, "PDCS");
+
+static umode_t
+steamdeck_is_visible(struct kobject *kobj, struct attribute *attr, int index)
+{
+	return attr->mode;
+}
+
+static struct attribute *steamdeck_attributes[] = {
+	&dev_attr_target_cpu_temp.attr,
+	&dev_attr_gain.attr,
+	&dev_attr_ramp_rate.attr,
+	&dev_attr_hysteresis.attr,
+	&dev_attr_maximum_battery_charge_rate.attr,
+	&dev_attr_recalculate.attr,
+	&dev_attr_power_cycle_display.attr,
+
+	&dev_attr_led_brightness.attr,
+	&dev_attr_content_adaptive_brightness.attr,
+	&dev_attr_gamma_set.attr,
+	&dev_attr_display_brightness.attr,
+	&dev_attr_ctrl_display.attr,
+	&dev_attr_cabc_minimum_brightness.attr,
+	&dev_attr_memory_data_access_control.attr,
+
+	&dev_attr_display_normal_mode_on.attr,
+	&dev_attr_display_inversion_off.attr,
+	&dev_attr_display_inversion_on.attr,
+	&dev_attr_idle_mode_on.attr,
+
+	&dev_attr_firmware_version.attr,
+	&dev_attr_board_id.attr,
+	&dev_attr_pdcs.attr,
+
+	NULL
+};
+
+static const struct attribute_group steamdeck_group = {
+	.attrs = steamdeck_attributes,
+	.is_visible = steamdeck_is_visible,
+};
+
+static const struct attribute_group *steamdeck_groups[] = {
+	&steamdeck_group,
+	NULL
+};
+
+static int steamdeck_read_fan_speed(struct steamdeck *jup, long *speed)
+{
+	unsigned long long val;
+
+	if (ACPI_FAILURE(acpi_evaluate_integer(jup->adev->handle,
+					       "FANR", NULL, &val)))
+		return -EIO;
+
+	*speed = val;
+	return 0;
+}
+
+static int
+steamdeck_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
+		     u32 attr, int channel, long *out)
+{
+	struct steamdeck *sd = dev_get_drvdata(dev);
+	unsigned long long val;
+
+	switch (type) {
+	case hwmon_temp:
+		if (attr != hwmon_temp_input)
+			return -EOPNOTSUPP;
+
+		if (ACPI_FAILURE(acpi_evaluate_integer(sd->adev->handle,
+						       "BATT", NULL, &val)))
+			return -EIO;
+		/*
+		 * Assuming BATT returns deg C we need to mutiply it
+		 * by 1000 to convert to mC
+		 */
+		*out = val * 1000;
+		break;
+	case hwmon_fan:
+		switch (attr) {
+		case hwmon_fan_input:
+			return steamdeck_read_fan_speed(sd, out);
+		case hwmon_fan_target:
+			*out = sd->fan_target;
+			break;
+		case hwmon_fan_fault:
+			if (ACPI_FAILURE(acpi_evaluate_integer(
+						 sd->adev->handle,
+						 "FANC", NULL, &val)))
+				return -EIO;
+			/*
+			 * FANC (Fan check):
+			 * 0: Abnormal
+			 * 1: Normal
+			 */
+			*out = !val;
+			break;
+		default:
+			return -EOPNOTSUPP;
+		}
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int
+steamdeck_hwmon_read_string(struct device *dev, enum hwmon_sensor_types type,
+			    u32 attr, int channel, const char **str)
+{
+	switch (type) {
+	case hwmon_temp:
+		*str = "Battery Temp";
+		break;
+	case hwmon_fan:
+		*str = "System Fan";
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int
+steamdeck_hwmon_write(struct device *dev, enum hwmon_sensor_types type,
+		      u32 attr, int channel, long val)
+{
+	struct steamdeck *sd = dev_get_drvdata(dev);
+
+	if (type != hwmon_fan ||
+	    attr != hwmon_fan_target)
+		return -EOPNOTSUPP;
+
+	if (val > U16_MAX)
+		return -EINVAL;
+
+	sd->fan_target = val;
+
+	if (ACPI_FAILURE(acpi_execute_simple_method(sd->adev->handle,
+						    "FANS", val)))
+		return -EIO;
+
+	return 0;
+}
+
+static umode_t
+steamdeck_hwmon_is_visible(const void *data, enum hwmon_sensor_types type,
+			   u32 attr, int channel)
+{
+	if (type == hwmon_fan &&
+	    attr == hwmon_fan_target)
+		return 0644;
+
+	return 0444;
+}
+
+static const struct hwmon_channel_info *steamdeck_info[] = {
+	HWMON_CHANNEL_INFO(temp,
+			   HWMON_T_INPUT | HWMON_T_LABEL),
+	HWMON_CHANNEL_INFO(fan,
+			   HWMON_F_INPUT | HWMON_F_LABEL |
+			   HWMON_F_TARGET | HWMON_F_FAULT),
+	NULL
+};
+
+static const struct hwmon_ops steamdeck_hwmon_ops = {
+	.is_visible = steamdeck_hwmon_is_visible,
+	.read = steamdeck_hwmon_read,
+	.read_string = steamdeck_hwmon_read_string,
+	.write = steamdeck_hwmon_write,
+};
+
+static const struct hwmon_chip_info steamdeck_chip_info = {
+	.ops = &steamdeck_hwmon_ops,
+	.info = steamdeck_info,
+};
+
+#define STEAMDECK_STA_OK			\
+	(ACPI_STA_DEVICE_ENABLED |		\
+	 ACPI_STA_DEVICE_PRESENT |		\
+	 ACPI_STA_DEVICE_FUNCTIONING)
+
+static int
+steamdeck_ddic_reg_read(void *context, unsigned int reg, unsigned int *val)
+{
+	union acpi_object obj = { .type = ACPI_TYPE_INTEGER };
+	struct acpi_object_list arg_list = { .count = 1, .pointer = &obj, };
+	struct steamdeck *sd = context;
+	unsigned long long _val;
+
+	obj.integer.value = reg;
+
+	if (ACPI_FAILURE(acpi_evaluate_integer(sd->adev->handle,
+					       "RDDI", &arg_list, &_val)))
+		return -EIO;
+
+	*val = _val;
+	return 0;
+}
+
+static int steamdeck_read_pdcs(struct steamdeck *sd, unsigned long long *pdcs)
+{
+	acpi_status status;
+
+	status = acpi_evaluate_integer(sd->adev->handle, "PDCS", NULL, pdcs);
+	if (ACPI_FAILURE(status)) {
+		dev_err(sd->dev, "PDCS evaluation failed: %s\n",
+			acpi_format_exception(status));
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static void steamdeck_usb_role_work(struct work_struct *work)
+{
+	struct steamdeck *sd =
+		container_of(work, struct steamdeck, role_work.work);
+	unsigned long long pdcs;
+	bool usb_host;
+
+	if (steamdeck_read_pdcs(sd, &pdcs))
+		return;
+
+	/*
+	 * We only care about these two
+	 */
+	pdcs &= ACPI_STEAMDECK_PORT_CONNECT | ACPI_STEAMDECK_CUR_DATA_ROLE;
+
+	/*
+	 * For "connect" events our role is determined by a bit in
+	 * PDCS, for "disconnect" we switch to being a gadget
+	 * unconditionally. The thinking for the latter is we don't
+	 * want to start acting as a USB host until we get
+	 * confirmation from the firmware that we are a USB host
+	 */
+	usb_host = (pdcs & ACPI_STEAMDECK_PORT_CONNECT) ?
+		pdcs & ACPI_STEAMDECK_CUR_DATA_ROLE : false;
+
+	WARN_ON(extcon_set_state_sync(sd->edev, EXTCON_USB_HOST,
+				      usb_host));
+	dev_dbg(sd->dev, "USB role is %s\n", usb_host ? "host" : "device");
+}
+
+static void steamdeck_notify(acpi_handle handle, u32 event, void *context)
+{
+	struct device *dev = context;
+	struct steamdeck *sd = dev_get_drvdata(dev);
+	unsigned long long pdcs;
+	unsigned long delay;
+
+	switch (event) {
+	case ACPI_STEAMDECK_NOTIFY_STATUS:
+		if (steamdeck_read_pdcs(sd, &pdcs))
+			return;
+		/*
+		 * We process "disconnect" events immediately and
+		 * "connect" events with a delay to give the HW time
+		 * to settle. For example attaching USB hub (at least
+		 * for HW used for testing) will generate intermediary
+		 * event with "host" bit not set, followed by the one
+		 * that does have it set.
+		 */
+		delay = (pdcs & ACPI_STEAMDECK_PORT_CONNECT) ?
+			STEAMDECK_ROLE_SWITCH_DELAY : 0;
+
+		queue_delayed_work(system_long_wq, &sd->role_work, delay);
+		break;
+	default:
+		dev_err(dev, "Unsupported event [0x%x]\n", event);
+	}
+}
+
+static void steamdeck_remove_notify_handler(void *data)
+{
+	struct steamdeck *sd = data;
+
+	acpi_remove_notify_handler(sd->adev->handle, ACPI_DEVICE_NOTIFY,
+				   steamdeck_notify);
+	cancel_delayed_work_sync(&sd->role_work);
+}
+
+static const unsigned int steamdeck_extcon_cable[] = {
+	EXTCON_USB,
+	EXTCON_USB_HOST,
+	EXTCON_CHG_USB_SDP,
+	EXTCON_CHG_USB_CDP,
+	EXTCON_CHG_USB_DCP,
+	EXTCON_CHG_USB_ACA,
+	EXTCON_NONE,
+};
+
+static int steamdeck_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct steamdeck *sd;
+	acpi_status status;
+	unsigned long long sta;
+	int ret;
+
+	static const struct regmap_config regmap_config = {
+		.reg_bits = 8,
+		.val_bits = 8,
+		.max_register = 255,
+		.cache_type = REGCACHE_NONE,
+		.reg_read = steamdeck_ddic_reg_read,
+	};
+
+	sd = devm_kzalloc(dev, sizeof(*sd), GFP_KERNEL);
+	if (!sd)
+		return -ENOMEM;
+	sd->adev = ACPI_COMPANION(&pdev->dev);
+	sd->dev = dev;
+	platform_set_drvdata(pdev, sd);
+	INIT_DELAYED_WORK(&sd->role_work, steamdeck_usb_role_work);
+
+	status = acpi_evaluate_integer(sd->adev->handle, "_STA",
+				       NULL, &sta);
+	if (ACPI_FAILURE(status)) {
+		dev_err(dev, "Status check failed (0x%x)\n", status);
+		return -EINVAL;
+	}
+
+	if ((sta & STEAMDECK_STA_OK) != STEAMDECK_STA_OK) {
+		dev_err(dev, "Device is not ready\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Our ACPI interface doesn't expose a method to read current
+	 * fan target, so we use current fan speed as an
+	 * approximation.
+	 */
+	if (steamdeck_read_fan_speed(sd, &sd->fan_target))
+		dev_warn(dev, "Failed to read fan speed");
+
+	sd->hwmon = devm_hwmon_device_register_with_info(dev,
+							 "steamdeck",
+							 sd,
+							 &steamdeck_chip_info,
+							 steamdeck_groups);
+	if (IS_ERR(sd->hwmon)) {
+		dev_err(dev, "Failed to register HWMON device");
+		return PTR_ERR(sd->hwmon);
+	}
+
+	sd->regmap = devm_regmap_init(dev, NULL, sd, &regmap_config);
+	if (IS_ERR(sd->regmap))
+		dev_err(dev, "Failed to register REGMAP");
+
+	sd->edev = devm_extcon_dev_allocate(dev, steamdeck_extcon_cable);
+	if (IS_ERR(sd->edev))
+		return -ENOMEM;
+
+	ret = devm_extcon_dev_register(dev, sd->edev);
+	if (ret < 0) {
+		dev_err(dev, "Failed to register extcon device: %d\n", ret);
+		return ret;
+	}
+
+	/*
+	 * Set initial role value
+	 */
+	queue_delayed_work(system_long_wq, &sd->role_work, 0);
+	flush_delayed_work(&sd->role_work);
+
+	status = acpi_install_notify_handler(sd->adev->handle,
+					     ACPI_DEVICE_NOTIFY,
+					     steamdeck_notify,
+					     dev);
+	if (ACPI_FAILURE(status)) {
+		dev_err(dev, "Error installing ACPI notify handler\n");
+		return -EIO;
+	}
+
+	ret = devm_add_action_or_reset(dev, steamdeck_remove_notify_handler,
+				       sd);
+	return ret;
+}
+
+static const struct acpi_device_id steamdeck_device_ids[] = {
+	{ "VLV0100", 0 },
+	{ "", 0 },
+};
+MODULE_DEVICE_TABLE(acpi, steamdeck_device_ids);
+
+static struct platform_driver steamdeck_driver = {
+	.probe = steamdeck_probe,
+	.driver = {
+		.name = "steamdeck",
+		.acpi_match_table = steamdeck_device_ids,
+	},
+};
+module_platform_driver(steamdeck_driver);
+
+MODULE_AUTHOR("Andrey Smirnov <andrew.smirnov@gmail.com>");
+MODULE_DESCRIPTION("Steam Deck ACPI platform driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ced82b9c18e5..7e0a7b60c9f9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -200,7 +200,7 @@ static inline void __mm_zero_struct_page(struct page *page)
  * that.
  */
 #define MAPCOUNT_ELF_CORE_MARGIN	(5)
-#define DEFAULT_MAX_MAP_COUNT	(USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
+#define DEFAULT_MAX_MAP_COUNT	(INT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
 
 extern int sysctl_max_map_count;
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 0acb8e1fb7af..b0b49c8653b0 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1182,7 +1182,7 @@ struct readahead_control {
 		._index = i,						\
 	}
 
-#define VM_READAHEAD_PAGES	(SZ_128K / PAGE_SIZE)
+#define VM_READAHEAD_PAGES	(SZ_8M / PAGE_SIZE)
 
 void page_cache_ra_unbounded(struct readahead_control *,
 		unsigned long nr_to_read, unsigned long lookahead_count);
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 45f09bec02c4..87b20e2ee274 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -148,6 +148,8 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns,
 
 #ifdef CONFIG_USER_NS
 
+extern int unprivileged_userns_clone;
+
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 {
 	if (ns)
@@ -181,6 +183,8 @@ extern bool current_in_userns(const struct user_namespace *target_ns);
 struct ns_common *ns_get_owner(struct ns_common *ns);
 #else
 
+#define unprivileged_userns_clone 0
+
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 {
 	return &init_user_ns;
diff --git a/init/Kconfig b/init/Kconfig
index c88bb30a8b0b..908d045dbe10 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -123,6 +123,10 @@ config THREAD_INFO_IN_TASK
 
 menu "General setup"
 
+config CACHY
+    bool "Some kernel tweaks by CachyOS"
+    default y
+
 config BROKEN
 	bool
 
@@ -348,6 +352,19 @@ config KERNEL_UNCOMPRESSED
 
 endchoice
 
+menu "ZSTD compression options"
+	depends on KERNEL_ZSTD
+
+config ZSTD_COMPRESSION_LEVEL
+	int "Compression level (1-22)"
+	range 1 22
+	default "22"
+	help
+	  Choose a compression level for zstd kernel compression.
+	  Default is 22, which is the maximum.
+
+endmenu
+
 config DEFAULT_INIT
 	string "Default init path"
 	default ""
@@ -1249,6 +1266,22 @@ config USER_NS
 
 	  If unsure, say N.
 
+config USER_NS_UNPRIVILEGED
+	bool "Allow unprivileged users to create namespaces"
+	default y
+	depends on USER_NS
+	help
+	  When disabled, unprivileged users will not be able to create
+	  new namespaces. Allowing users to create their own namespaces
+	  has been part of several recent local privilege escalation
+	  exploits, so if you need user namespaces but are
+	  paranoid^Wsecurity-conscious you want to disable this.
+
+	  This setting can be overridden at runtime via the
+	  kernel.unprivileged_userns_clone sysctl.
+
+	  If unsure, say Y.
+
 config PID_NS
 	bool "PID Namespaces"
 	default y
@@ -1429,6 +1462,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE
 	  with the "-O2" compiler flag for best performance and most
 	  helpful compile-time warnings.
 
+config CC_OPTIMIZE_FOR_PERFORMANCE_O3
+	bool "Optimize more for performance (-O3)"
+	help
+	  Choosing this option will pass "-O3" to your compiler to optimize
+	  the kernel yet more for performance.
+
 config CC_OPTIMIZE_FOR_SIZE
 	bool "Optimize for size (-Os)"
 	help
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 38ef6d06888e..0f78364efd4f 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -40,6 +40,27 @@ choice
 	 on SMP and NUMA systems and exactly dividing by both PAL and
 	 NTSC frame rates for video and multimedia work.
 
+	config HZ_500
+		bool "500 HZ"
+	help
+	 500 Hz is a balanced timer frequency. Provides fast interactivity
+	 on desktops with good smoothness without increasing CPU power
+	 consumption and sacrificing the battery life on laptops.
+
+	config HZ_600
+		bool "600 HZ"
+	help
+	 600 Hz is a balanced timer frequency. Provides fast interactivity
+	 on desktops with good smoothness without increasing CPU power
+	 consumption and sacrificing the battery life on laptops.
+
+	config HZ_750
+		bool "750 HZ"
+	help
+	 750 Hz is a balanced timer frequency. Provides fast interactivity
+	 on desktops with good smoothness without increasing CPU power
+	 consumption and sacrificing the battery life on laptops.
+
 	config HZ_1000
 		bool "1000 HZ"
 	help
@@ -53,6 +74,9 @@ config HZ
 	default 100 if HZ_100
 	default 250 if HZ_250
 	default 300 if HZ_300
+	default 500 if HZ_500
+	default 600 if HZ_600
+	default 750 if HZ_750
 	default 1000 if HZ_1000
 
 config SCHED_HRTICK
diff --git a/kernel/fork.c b/kernel/fork.c
index 1ec1e9ea4bf8..90e639a39cf6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -98,6 +98,10 @@
 #include <linux/bpf.h>
 #include <linux/stackprotector.h>
 
+#ifdef CONFIG_USER_NS
+#include <linux/user_namespace.h>
+#endif
+
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
 #include <asm/mmu_context.h>
@@ -2032,6 +2036,10 @@ static __latent_entropy struct task_struct *copy_process(
 	if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
 		return ERR_PTR(-EINVAL);
 
+	if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone)
+		if (!capable(CAP_SYS_ADMIN))
+			return ERR_PTR(-EPERM);
+
 	/*
 	 * Thread groups must share signals as well, and detached threads
 	 * can only be started up within the thread group.
@@ -3182,6 +3190,12 @@ int ksys_unshare(unsigned long unshare_flags)
 	if (unshare_flags & CLONE_NEWNS)
 		unshare_flags |= CLONE_FS;
 
+	if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) {
+		err = -EPERM;
+		if (!capable(CAP_SYS_ADMIN))
+			goto bad_unshare_out;
+	}
+
 	err = check_unshare_flags(unshare_flags);
 	if (err)
 		goto bad_unshare_out;
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 424b3bc58f3f..ecf2798c5ccf 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -219,6 +219,31 @@ config MODULE_COMPRESS_ZSTD
 
 endchoice
 
+menu "ZSTD module compression options"
+    depends on MODULE_COMPRESS_ZSTD
+
+config MODULE_COMPRESS_ZSTD_LEVEL
+	int "Compression level (1-19)"
+	range 1 19
+	default 9
+	help
+	  Compression level used by zstd for compressing modules.
+
+config MODULE_COMPRESS_ZSTD_ULTRA
+	bool "Enable ZSTD ultra compression"
+	help
+	  Compress modules with ZSTD using the highest possible compression.
+
+config MODULE_COMPRESS_ZSTD_LEVEL_ULTRA
+	int "Compression level (20-22)"
+	depends on MODULE_COMPRESS_ZSTD_ULTRA
+	range 20 22
+	default 20
+	help
+	  Ultra compression level used by zstd for compressing modules.
+
+endmenu
+
 config MODULE_DECOMPRESS
 	bool "Support in-kernel module decompression"
 	depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ed89be0aa650..fce570ff3673 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -69,9 +69,13 @@
  *
  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  */
+#ifdef CONFIG_CACHY
+unsigned int sysctl_sched_latency			= 3000000ULL;
+static unsigned int normalized_sysctl_sched_latency	= 3000000ULL;
+#else
 unsigned int sysctl_sched_latency			= 6000000ULL;
 static unsigned int normalized_sysctl_sched_latency	= 6000000ULL;
-
+#endif
 /*
  * The initial- and re-scaling of tunables is configurable
  *
@@ -90,8 +94,13 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
  *
  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
+#ifdef CONFIG_CACHY
+unsigned int sysctl_sched_min_granularity			= 400000ULL;
+static unsigned int normalized_sysctl_sched_min_granularity	= 400000ULL;
+#else
 unsigned int sysctl_sched_min_granularity			= 750000ULL;
 static unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;
+#endif
 
 /*
  * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
@@ -121,8 +130,13 @@ unsigned int sysctl_sched_child_runs_first __read_mostly;
  *
  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
+#ifdef CONFIG_CACHY
+unsigned int sysctl_sched_wakeup_granularity			= 500000UL;
+static unsigned int normalized_sysctl_sched_wakeup_granularity	= 500000UL;
+#else
 unsigned int sysctl_sched_wakeup_granularity			= 1000000UL;
 static unsigned int normalized_sysctl_sched_wakeup_granularity	= 1000000UL;
+#endif
 
 const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;
 
@@ -175,8 +189,12 @@ int __weak arch_asym_cpu_priority(int cpu)
  *
  * (default: 5 msec, units: microseconds)
  */
+#ifdef CONFIG_CACHY
+static unsigned int sysctl_sched_cfs_bandwidth_slice		= 3000UL;
+#else
 static unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
 #endif
+#endif
 
 #ifdef CONFIG_NUMA_BALANCING
 /* Restrict the NUMA promotion throughput (MB/s) for each target node. */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1c240d2c99bc..98e1a7472fd2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -96,6 +96,9 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals);
 #ifdef CONFIG_PERF_EVENTS
 static const int six_hundred_forty_kb = 640 * 1024;
 #endif
+#ifdef CONFIG_USER_NS
+#include <linux/user_namespace.h>
+#endif
 
 
 static const int ngroups_max = NGROUPS_MAX;
@@ -1645,6 +1648,15 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_USER_NS
+	{
+		.procname	= "unprivileged_userns_clone",
+		.data		= &unprivileged_userns_clone,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
 #ifdef CONFIG_PROC_SYSCTL
 	{
 		.procname	= "tainted",
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 1d8e47bed3f1..fec01d016a35 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -22,6 +22,13 @@
 #include <linux/bsearch.h>
 #include <linux/sort.h>
 
+/* sysctl */
+#ifdef CONFIG_USER_NS_UNPRIVILEGED
+int unprivileged_userns_clone = 1;
+#else
+int unprivileged_userns_clone;
+#endif
+
 static struct kmem_cache *user_ns_cachep __read_mostly;
 static DEFINE_MUTEX(userns_state_mutex);
 
diff --git a/mm/Kconfig b/mm/Kconfig
index 4751031f3f05..cf2e47030fe8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -621,7 +621,7 @@ config COMPACTION
 config COMPACT_UNEVICTABLE_DEFAULT
 	int
 	depends on COMPACTION
-	default 0 if PREEMPT_RT
+	default 0 if PREEMPT_RT || CACHY
 	default 1
 
 #
diff --git a/mm/compaction.c b/mm/compaction.c
index 5a9501e0ae01..4d8c63b9cdca 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2735,7 +2735,11 @@ static void compact_nodes(void)
  * aggressively the kernel should compact memory in the
  * background. It takes values in the range [0, 100].
  */
+#ifdef CONFIG_CACHY
+unsigned int __read_mostly sysctl_compaction_proactiveness;
+#else
 unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
+#endif
 
 int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *length, loff_t *ppos)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 516b1aa247e8..78fb31d27ed7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -71,7 +71,11 @@ static long ratelimit_pages = 32;
 /*
  * Start background writeback (via writeback threads) at this percentage
  */
+#ifdef CONFIG_CACHY
+static int dirty_background_ratio = 5;
+#else
 static int dirty_background_ratio = 10;
+#endif
 
 /*
  * dirty_background_bytes starts at 0 (disabled) so that it is a function of
@@ -99,7 +103,11 @@ static unsigned long vm_dirty_bytes;
 /*
  * The interval between `kupdate'-style writebacks
  */
+#ifdef CONFIG_CACHY
+unsigned int dirty_writeback_interval = 10 * 100; /* centiseconds */
+#else
 unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
+#endif
 
 EXPORT_SYMBOL_GPL(dirty_writeback_interval);
 
diff --git a/mm/swap.c b/mm/swap.c
index 423199ee8478..adef27bd3f8b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1090,6 +1090,10 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
  */
 void __init swap_setup(void)
 {
+#ifdef CONFIG_CACHY
+	/* Only swap-in pages requested, avoid readahead */
+	page_cluster = 0;
+#else
 	unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);
 
 	/* Use a smaller cluster for small-memory machines */
@@ -1101,4 +1105,5 @@ void __init swap_setup(void)
 	 * Right now other parts of the system means that we
 	 * _really_ don't want to cluster much more
 	 */
+#endif
 }
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index b52644771cc4..11a4b0e3b583 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -43,7 +43,11 @@ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
  * essence, they are percents: the higher the value, the more number
  * unsuccessful reclaims there were.
  */
+#ifdef CONFIG_CACHY
+static const unsigned int vmpressure_level_med = 65;
+#else
 static const unsigned int vmpressure_level_med = 60;
+#endif
 static const unsigned int vmpressure_level_critical = 95;
 
 /*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2bb7ce0a934a..7f2767a2e821 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -186,7 +186,11 @@ struct scan_control {
 /*
  * From 0 .. 200.  Higher means more swappy.
  */
+#ifdef CONFIG_CACHY
+int vm_swappiness = 20;
+#else
 int vm_swappiness = 60;
+#endif
 
 static void set_task_reclaim_state(struct task_struct *task,
 				   struct reclaim_state *rs)
@@ -4546,7 +4550,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
 }
 
 /* to protect the working set of the last N jiffies */
+#ifdef CONFIG_CACHY
+static unsigned long lru_gen_min_ttl __read_mostly = HZ;
+#else
 static unsigned long lru_gen_min_ttl __read_mostly;
+#endif
 
 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 {
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 100a386fcd71..a3ec7265fb57 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -542,14 +542,21 @@ quiet_cmd_xzmisc = XZMISC  $@
 # decompression is used, like initramfs decompression, zstd22 should likely not
 # be used because it would require zstd to allocate a 128 MB buffer.
 
+ifdef CONFIG_ZSTD_COMPRESSION_LEVEL
+zstd_comp_val	:= $(CONFIG_ZSTD_COMPRESSION_LEVEL)
+ifeq ($(shell test $(zstd_comp_val) -gt 19; echo $$?),0)
+zstd_comp_val	+= --ultra
+endif
+endif
+
 quiet_cmd_zstd = ZSTD    $@
-      cmd_zstd = cat $(real-prereqs) | $(ZSTD) -19 > $@
+      cmd_zstd = cat $(real-prereqs) | $(ZSTD) -T0 -19 > $@
 
 quiet_cmd_zstd22 = ZSTD22  $@
-      cmd_zstd22 = cat $(real-prereqs) | $(ZSTD) -22 --ultra > $@
+      cmd_zstd22 = cat $(real-prereqs) | $(ZSTD) -T0 -22 --ultra > $@
 
 quiet_cmd_zstd22_with_size = ZSTD22  $@
-      cmd_zstd22_with_size = { cat $(real-prereqs) | $(ZSTD) -22 --ultra; $(size_append); } > $@
+      cmd_zstd22_with_size = { cat $(real-prereqs) | $(ZSTD) -T0 -$(zstd_comp_val); $(size_append); } > $@
 
 # ASM offsets
 # ---------------------------------------------------------------------------
diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst
index ab0c5bd1a60f..f4989f706d7f 100644
--- a/scripts/Makefile.modinst
+++ b/scripts/Makefile.modinst
@@ -100,8 +100,13 @@ quiet_cmd_gzip = GZIP    $@
       cmd_gzip = $(KGZIP) -n -f $<
 quiet_cmd_xz = XZ      $@
       cmd_xz = $(XZ) --lzma2=dict=2MiB -f $<
+ifdef CONFIG_MODULE_COMPRESS_ZSTD_ULTRA
 quiet_cmd_zstd = ZSTD    $@
-      cmd_zstd = $(ZSTD) -T0 --rm -f -q $<
+      cmd_zstd = $(ZSTD) -$(CONFIG_MODULE_COMPRESS_ZSTD_LEVEL_ULTRA) --ultra --zstd=wlog=21 -T0 --rm -f -q $<
+else
+quiet_cmd_zstd = ZSTD    $@
+      cmd_zstd = $(ZSTD) -$(CONFIG_MODULE_COMPRESS_ZSTD_LEVEL) --zstd=wlog=21 -T0 --rm -f -q $<
+endif
 
 $(dst)/%.ko.gz: $(dst)/%.ko FORCE
 	$(call cmd,gzip)
-- 
2.41.0

From a7470f9feb7252a6a8b8efd5a585fb5e382c3207 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Thu, 22 Jun 2023 19:55:23 +0200
Subject: [PATCH 04/12] fixes

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 Documentation/ABI/stable/sysfs-block          |   10 +
 .../testing/sysfs-class-led-trigger-blkdev    |   78 ++
 Documentation/leds/index.rst                  |    1 +
 Documentation/leds/ledtrig-blkdev.rst         |  158 +++
 Documentation/x86/topology.rst                |   26 +
 arch/x86/events/amd/core.c                    |    2 +-
 arch/x86/events/amd/ibs.c                     |   53 +-
 arch/x86/include/asm/cacheinfo.h              |    1 +
 arch/x86/include/asm/perf_event.h             |    2 +
 arch/x86/kernel/acpi/boot.c                   |   11 +-
 arch/x86/kernel/cpu/amd.c                     |    1 +
 arch/x86/kernel/cpu/cacheinfo.c               |   36 +
 arch/x86/kernel/cpu/hygon.c                   |    1 +
 arch/x86/net/bpf_jit_comp.c                   |    5 +-
 drivers/bluetooth/btusb.c                     |    2 +-
 drivers/leds/trigger/Kconfig                  |    9 +
 drivers/leds/trigger/Makefile                 |    1 +
 drivers/leds/trigger/ledtrig-blkdev.c         | 1221 +++++++++++++++++
 fs/eventpoll.c                                |  190 ++-
 include/linux/pageblock-flags.h               |    2 +-
 include/linux/perf_event.h                    |   11 +
 kernel/events/core.c                          |   79 +-
 kernel/padata.c                               |    4 +-
 lib/decompress_inflate.c                      |    2 +-
 lib/decompress_unxz.c                         |    2 +
 lib/decompress_unzstd.c                       |    2 +
 mm/page_alloc.c                               |   22 +-
 scripts/Makefile.vmlinux_o                    |    2 +-
 sound/pci/hda/cs35l41_hda.c                   |    2 +-
 tools/perf/tests/Build                        |    1 +
 tools/perf/tests/amd-ibs-via-core-pmu.c       |   72 +
 tools/perf/tests/builtin-test.c               |    1 +
 tools/perf/tests/tests.h                      |    1 +
 33 files changed, 1851 insertions(+), 160 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-class-led-trigger-blkdev
 create mode 100644 Documentation/leds/ledtrig-blkdev.rst
 create mode 100644 drivers/leds/trigger/ledtrig-blkdev.c
 create mode 100644 tools/perf/tests/amd-ibs-via-core-pmu.c

diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index 282de3680367..ac1dd2fbd855 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -101,6 +101,16 @@ Description:
 		devices that support receiving integrity metadata.
 
 
+What:		/sys/block/<disk>/linked_leds
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gmail.com>
+Description:
+		Directory that contains symbolic links to all LEDs that
+		are associated with (linked to) this block device by the
+		blkdev LED trigger.  Only present when at least one LED
+		is linked.  (See Documentation/leds/ledtrig-blkdev.rst.)
+
+
 What:		/sys/block/<disk>/<partition>/alignment_offset
 Date:		April 2009
 Contact:	Martin K. Petersen <martin.petersen@oracle.com>
diff --git a/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev
new file mode 100644
index 000000000000..28ce8c814fb7
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev
@@ -0,0 +1,78 @@
+What:		/sys/class/leds/<led>/blink_time
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gmail.com>
+Description:
+		Time (in milliseconds) that the LED will be on during a single
+		"blink".
+
+What:		/sys/class/leds/<led>/check_interval
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gmail.com>
+Description:
+		Interval (in milliseconds) between checks of the block devices
+		linked to this LED.  The LED will be blinked if the correct type
+		of activity (see blink_on_{read,write,discard,flush} attributes)
+		has occurred on any of the linked devices since the previous
+		check.
+
+What:		/sys/class/leds/<led>/blink_on_read
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gmail.com>
+Description:
+		Boolean that determines whether the LED will blink in response
+		to read activity on any of its linked block devices.
+
+What:		/sys/class/leds/<led>/blink_on_write
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gmail.com>
+Description:
+		Boolean that determines whether the LED will blink in response
+		to write activity on any of its linked block devices.
+
+What:		/sys/class/leds/<led>/blink_on_discard
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gmail.com>
+Description:
+		Boolean that determines whether the LED will blink in response
+		to discard activity on any of its linked block devices.
+
+What:		/sys/class/leds/<led>/blink_on_flush
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gamil.com>
+Description:
+		Boolean that determines whether the LED will blink in response
+		to cache flush activity on any of its linked block devices.
+
+What:		/sys/class/leds/<led>/link_dev_by_path
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gmail.com>
+Description:
+		Associate a block device with this LED by writing the path to
+		the device special file (e.g. /dev/sda) to this attribute.
+		Symbolic links are followed.
+
+What:		/sys/class/leds/<led>/unlink_dev_by_path
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gmail.com>
+Description:
+		Remove the association between this LED and a block device by
+		writing the path to the device special file (e.g. /dev/sda) to
+		this attribute.  Symbolic links are followed.
+
+What:		/sys/class/leds/<led>/unlink_dev_by_name
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gmail.com>
+Description:
+		Remove the association between this LED and a block device by
+		writing the kernel name of the device (e.g. sda) to this
+		attribute.
+
+What:		/sys/class/leds/<led>/linked_devices
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gmail.com>
+Description:
+		Directory containing links to all block devices that are
+		associated with this LED.  (Note that the names of the
+		symbolic links in this directory are *kernel* names, which
+		may not match the device special file paths written to
+		link_device and unlink_device.)
diff --git a/Documentation/leds/index.rst b/Documentation/leds/index.rst
index b9ca081fac71..5e37d8e7bd28 100644
--- a/Documentation/leds/index.rst
+++ b/Documentation/leds/index.rst
@@ -10,6 +10,7 @@ LEDs
    leds-class
    leds-class-flash
    leds-class-multicolor
+   ledtrig-blkdev
    ledtrig-oneshot
    ledtrig-transient
    ledtrig-usbport
diff --git a/Documentation/leds/ledtrig-blkdev.rst b/Documentation/leds/ledtrig-blkdev.rst
new file mode 100644
index 000000000000..9ff5b99de451
--- /dev/null
+++ b/Documentation/leds/ledtrig-blkdev.rst
@@ -0,0 +1,158 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================================
+Block Device (blkdev) LED Trigger
+=================================
+
+Available when ``CONFIG_LEDS_TRIGGER_BLKDEV=y`` or
+``CONFIG_LEDS_TRIGGER_BLKDEV=m``.
+
+See also:
+
+* ``Documentation/ABI/testing/sysfs-class-led-trigger-blkdev``
+* ``Documentation/ABI/stable/sysfs-block`` (``/sys/block/<disk>/linked_leds``)
+
+Overview
+========
+
+.. note::
+	The examples below use ``<LED>`` to refer to the name of a
+	system-specific LED.  If no suitable LED is available on a test
+	system (in a virtual machine, for example), it is possible to
+	use a userspace LED.  (See ``Documentation/leds/uleds.rst``.)
+
+Verify that the ``blkdev`` LED trigger is available::
+
+	# grep blkdev /sys/class/leds/<LED>/trigger
+	... rfkill-none blkdev
+
+(If the previous command produces no output, you may need to load the trigger
+module - ``modprobe ledtrig_blkdev``.  If the module is not available, check
+the value of ``CONFIG_LEDS_TRIGGER_BLKDEV`` in your kernel configuration.)
+
+Associate the LED with the ``blkdev`` LED trigger::
+
+	# echo blkdev > /sys/class/leds/<LED>/trigger
+
+	# cat /sys/class/leds/<LED>/trigger
+	... rfkill-none [blkdev]
+
+Note that several new device attributes are available in the
+``/sys/class/leds/<LED>`` directory.
+
+* ``link_dev_by_path``, ``unlink_dev_by_path``, and ``unlink_dev_by_name`` are
+  used to manage the set of block devices associated with this LED.  The LED
+  will blink when activity occurs on any of its linked devices.
+
+* ``blink_on_read``, ``blink_on_write``, ``blink_on_discard``, and
+  ``blink_on_flush`` are boolean values that determine whether the LED will
+  blink when a particular type of activity is detected on one of its linked
+  block devices.
+
+* ``blink_time`` is the duration (in milliseconds) of each blink of this LED.
+  (The minimum value is 10 milliseconds.)
+
+* ``check_interval`` is the frequency (in milliseconds) with which block devices
+  linked to this LED will be checked for activity and the LED blinked (if the
+  correct type of activity has occurred).
+
+* The ``linked_devices`` directory will contain a symbolic link to every device
+  that is associated with this LED.
+
+Link a block device to the LED::
+
+	# echo /dev/sda > /sys/class/leds/<LED>/link_dev_by_path
+
+	# ls /sys/class/leds/<LED>/linked_devices
+	sda
+
+(The value written to ``link_dev_by_path`` must be the path of the device
+special file, such as ``/dev/sda``, that represents the block device - or the
+path of a symbolic link to such a device special file.)
+
+Activity on the device will now cause the LED to blink.  The duration of each
+blink (in milliseconds) can be adjusted by setting
+``/sys/class/leds/<LED>/blink_time``.  (But see **check_interval and
+blink_time** below.)
+
+Associate a second device with the LED::
+
+	# echo /dev/sdb > /sys/class/leds/<LED>/link_dev_by_path
+
+	# ls /sys/class/leds/<LED>/linked_devices
+	sda  sdb
+
+When a block device is linked to one or more LEDs, the LEDs are linked from
+the device's ``linked_leds`` directory::
+
+	# ls /sys/class/block/sd{a,b}/linked_leds
+	/sys/class/block/sda/linked_leds:
+	<LED>
+
+	/sys/class/block/sdb/linked_leds:
+	<LED>
+
+(The ``linked_leds`` directory only exists when the block device is linked to
+at least one LED.)
+
+``check_interval`` and ``blink_time``
+=====================================
+
+* By default, linked block devices are checked for activity every 100
+  milliseconds.  This frequency can be changed for an LED via the
+  ``/sys/class/leds/<led>/check_interval`` attribute.  (The minimum value is 25
+  milliseconds.)
+
+* All block devices associated with an LED are checked for activity every
+  ``check_interval`` milliseconds, and a blink is triggered if the correct type
+  of activity (as determined by the LED's ``blink_on_*`` attributes) is
+  detected.  The duration of an LED's blink is determined by its ``blink_time``
+  attribute.  Thus (when the correct type of activity is detected), the LED will
+  be on for ``blink_time`` milliseconds and off for
+  ``check_interval - blink_time`` milliseconds.
+
+* The LED subsystem ignores new blink requests for an LED that is already in
+  in the process of blinking, so setting a ``blink_time`` greater than or equal
+  to ``check_interval`` will cause some blinks to be missed.
+
+* Because of processing times, scheduling latencies, etc., avoiding missed
+  blinks actually requires a difference of at least a few milliseconds between
+  the ``blink_time`` and ``check_interval``.  The required difference is likely
+  to vary from system to system.  As a  reference, a Thecus N5550 NAS requires a
+  difference of 7 milliseconds (e.g. ``check_interval == 100``,
+  ``blink_time == 93``).
+
+* The default values (``check_interval == 100``, ``blink_time == 75``) cause the
+  LED associated with a continuously active device to blink rapidly.  For a more
+  "always on" effect, increase the ``blink_time`` (but not too much; see the
+  previous bullet).
+
+Other Notes
+===========
+
+* Many (possibly all) types of block devices work with this trigger, including:
+
+  * SCSI (including SATA and USB) hard disk drives and SSDs
+  * SCSI (including SATA and USB) optical drives
+  * NVMe SSDs
+  * SD cards
+  * loopback block devices (``/dev/loop*``)
+  * device mapper devices, such as LVM logical volumes
+  * MD RAID devices
+  * zRAM compressed RAM-disks
+  * partitions on block devices that support them
+
+* The names of the symbolic links in ``/sys/class/leds/<LED>/linked_devices``
+  are **kernel** names, which may not match the paths used for
+  ``link_dev_by_path`` and ``unlink_dev_by_path``.  This is most likely when a
+  symbolic link is used to refer to the device (as is common with logical
+  volumes), but it can be true for any device, because nothing prevents the
+  creation of device special files with arbitrary names (e.g.
+  ``sudo mknod /foo b 8 0``).
+
+  Kernel names can be used to unlink block devices from LEDs by writing them to
+  the LED's ``unlink_dev_by_name`` attribute.
+
+* The ``blkdev`` LED trigger supports many-to-many device/LED associations.
+  A device can be associated with multiple LEDs, and an LED can be associated
+  with multiple devices.
diff --git a/Documentation/x86/topology.rst b/Documentation/x86/topology.rst
index 7f58010ea86a..9de14f3f7783 100644
--- a/Documentation/x86/topology.rst
+++ b/Documentation/x86/topology.rst
@@ -33,6 +33,7 @@ historical nature and should be cleaned up.
 The topology of a system is described in the units of:
 
     - packages
+    - cluster
     - cores
     - threads
 
@@ -90,6 +91,22 @@ Package-related topology information in the kernel:
         Cache. In general, it is a number identifying an LLC uniquely on the
         system.
 
+Clusters
+========
+A cluster consists of threads of one or more cores sharing the same L2 cache.
+
+Cluster-related topology information in the kernel:
+
+  - cluster_id:
+
+    A per-CPU variable containing:
+
+      - Upper bits extracted from the APIC ID.  CPUs which have the same value
+        in these bits share an L2 and have the same cluster_id.
+
+        CPUs for which cluster information is unavailable will show 65535
+        (BAD_APICID) as the cluster_id.
+
 Cores
 =====
 A core consists of 1 or more threads. It does not matter whether the threads
@@ -125,6 +142,11 @@ Thread-related topology information in the kernel:
 
     The number of online threads is also printed in /proc/cpuinfo "siblings."
 
+  - topology_cluster_cpumask():
+
+    The cpumask contains all online threads in the cluster to which a thread
+    belongs.
+
   - topology_sibling_cpumask():
 
     The cpumask contains all online threads in the core to which a thread
@@ -138,6 +160,10 @@ Thread-related topology information in the kernel:
 
     The physical package ID to which a thread belongs.
 
+  - topology_cluster_id();
+
+    The ID of the cluster to which a thread belongs.
+
   - topology_core_id();
 
     The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index bccea57dee81..abadd5f23425 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -374,7 +374,7 @@ static int amd_pmu_hw_config(struct perf_event *event)
 
 	/* pass precise event sampling to ibs: */
 	if (event->attr.precise_ip && get_ibs_caps())
-		return -ENOENT;
+		return forward_event_to_ibs(event);
 
 	if (has_branch_stack(event) && !x86_pmu.lbr_nr)
 		return -EOPNOTSUPP;
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 64582954b5f6..371014802191 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -190,7 +190,7 @@ static struct perf_ibs *get_ibs_pmu(int type)
 }
 
 /*
- * Use IBS for precise event sampling:
+ * core pmu config -> IBS config
  *
  *  perf record -a -e cpu-cycles:p ...    # use ibs op counting cycle count
  *  perf record -a -e r076:p ...          # same as -e cpu-cycles:p
@@ -199,25 +199,9 @@ static struct perf_ibs *get_ibs_pmu(int type)
  * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
  * MSRC001_1033) is used to select either cycle or micro-ops counting
  * mode.
- *
- * The rip of IBS samples has skid 0. Thus, IBS supports precise
- * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
- * rip is invalid when IBS was not able to record the rip correctly.
- * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
- *
  */
-static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
+static int core_pmu_ibs_config(struct perf_event *event, u64 *config)
 {
-	switch (event->attr.precise_ip) {
-	case 0:
-		return -ENOENT;
-	case 1:
-	case 2:
-		break;
-	default:
-		return -EOPNOTSUPP;
-	}
-
 	switch (event->attr.type) {
 	case PERF_TYPE_HARDWARE:
 		switch (event->attr.config) {
@@ -243,22 +227,37 @@ static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
 	return -EOPNOTSUPP;
 }
 
+/*
+ * The rip of IBS samples has skid 0. Thus, IBS supports precise
+ * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
+ * rip is invalid when IBS was not able to record the rip correctly.
+ * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
+ */
+int forward_event_to_ibs(struct perf_event *event)
+{
+	u64 config = 0;
+
+	if (!event->attr.precise_ip || event->attr.precise_ip > 2)
+		return -EOPNOTSUPP;
+
+	if (!core_pmu_ibs_config(event, &config)) {
+		event->attr.type = perf_ibs_op.pmu.type;
+		event->attr.config = config;
+	}
+	return -ENOENT;
+}
+
 static int perf_ibs_init(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	struct perf_ibs *perf_ibs;
 	u64 max_cnt, config;
-	int ret;
 
 	perf_ibs = get_ibs_pmu(event->attr.type);
-	if (perf_ibs) {
-		config = event->attr.config;
-	} else {
-		perf_ibs = &perf_ibs_op;
-		ret = perf_ibs_precise_event(event, &config);
-		if (ret)
-			return ret;
-	}
+	if (!perf_ibs)
+		return -ENOENT;
+
+	config = event->attr.config;
 
 	if (event->pmu != &perf_ibs->pmu)
 		return -ENOENT;
diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h
index ce9685fc78d8..2034cd556c07 100644
--- a/arch/x86/include/asm/cacheinfo.h
+++ b/arch/x86/include/asm/cacheinfo.h
@@ -7,6 +7,7 @@ extern unsigned int memory_caching_control;
 #define CACHE_MTRR 0x01
 #define CACHE_PAT  0x02
 
+void cacheinfo_topoext_init_l2c_id(struct cpuinfo_x86 *c, int cpu);
 void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu);
 void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu);
 
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index abf09882f58b..f1a46500a275 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -478,8 +478,10 @@ struct pebs_xmm {
 
 #ifdef CONFIG_X86_LOCAL_APIC
 extern u32 get_ibs_caps(void);
+extern int forward_event_to_ibs(struct perf_event *event);
 #else
 static inline u32 get_ibs_caps(void) { return 0; }
+static inline int forward_event_to_ibs(struct perf_event *event) { return -ENOENT; }
 #endif
 
 #ifdef CONFIG_PERF_EVENTS
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 0dac4ab5b55b..21b542a6866c 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1858,13 +1858,18 @@ early_param("acpi_sci", setup_acpi_sci);
 
 int __acpi_acquire_global_lock(unsigned int *lock)
 {
-	unsigned int old, new;
+	unsigned int old, new, val;
 
 	old = READ_ONCE(*lock);
 	do {
-		new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1));
+		val = (old >> 1) & 0x1;
+		new = (old & ~0x3) + 2 + val;
 	} while (!try_cmpxchg(lock, &old, new));
-	return ((new & 0x3) < 3) ? -1 : 0;
+
+	if (val)
+		return 0;
+
+	return -1;
 }
 
 int __acpi_release_global_lock(unsigned int *lock)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 95cdd08c4cbb..d6594727f924 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -358,6 +358,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
 		if (!err)
 			c->x86_coreid_bits = get_count_order(c->x86_max_cores);
 
+		cacheinfo_topoext_init_l2c_id(c, cpu);
 		cacheinfo_amd_init_llc_id(c, cpu);
 
 	} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index 4063e8991211..947a1f27278c 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -659,6 +659,42 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
 	return i;
 }
 
+void cacheinfo_topoext_init_l2c_id(struct cpuinfo_x86 *c, int cpu)
+{
+	u32 eax, ebx, ecx, edx, num_sharing_cache;
+	int i = 0, bits;
+
+	/* Check if L2 cache identifiers exists. */
+	if (!cpuid_ecx(0x80000006))
+		return;
+
+	while (true) {
+		u32 level;
+
+		cpuid_count(0x8000001d, i, &eax, &ebx, &ecx, &edx);
+		if (!eax)
+			return;
+
+		/*
+		 * Check if the current leaf is for L2 cache using
+		 * eax[7:5] used to describe the cache level.
+		 */
+		level = (eax >> 5) & 0x7;
+		if (level == 2)
+			break;
+
+		++i;
+	}
+
+	/*
+	 * L2 ID is calculated from the number of threads
+	 * sharing the L2 cache.
+	 */
+	num_sharing_cache = ((eax >> 14) & 0xfff) + 1;
+	bits = get_count_order(num_sharing_cache);
+	per_cpu(cpu_l2c_id, cpu) = c->apicid >> bits;
+}
+
 void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu)
 {
 	/*
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 5a2962c492d3..cb0025b4a2fd 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -89,6 +89,7 @@ static void hygon_get_topology(struct cpuinfo_x86 *c)
 		/* Socket ID is ApicId[6] for these processors. */
 		c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT;
 
+		cacheinfo_topoext_init_l2c_id(c, cpu);
 		cacheinfo_hygon_init_llc_id(c, cpu);
 	} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
 		u64 value;
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 1056bbf55b17..212bfd1517ec 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -343,9 +343,10 @@ static int emit_call(u8 **pprog, void *func, void *ip)
 
 static int emit_rsb_call(u8 **pprog, void *func, void *ip)
 {
+	void *adjusted_ip;
 	OPTIMIZER_HIDE_VAR(func);
-	x86_call_depth_emit_accounting(pprog, func);
-	return emit_patch(pprog, func, ip, 0xE8);
+	adjusted_ip = (u8 *)ip + x86_call_depth_emit_accounting(pprog, func);
+	return emit_patch(pprog, func, adjusted_ip, 0xE8);
 }
 
 static int emit_jump(u8 **pprog, void *func, void *ip)
diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 0923582299f3..d40de705b9ed 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -915,7 +915,7 @@ static void btusb_qca_cmd_timeout(struct hci_dev *hdev)
 		}
 
 		gpiod_set_value_cansleep(reset_gpio, 0);
-		msleep(200);
+		usleep_range(USEC_PER_SEC / 2, USEC_PER_SEC);
 		gpiod_set_value_cansleep(reset_gpio, 1);
 
 		return;
diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig
index dc6816d36d06..bda249068182 100644
--- a/drivers/leds/trigger/Kconfig
+++ b/drivers/leds/trigger/Kconfig
@@ -154,4 +154,13 @@ config LEDS_TRIGGER_TTY
 
 	  When build as a module this driver will be called ledtrig-tty.
 
+config LEDS_TRIGGER_BLKDEV
+	tristate "LED Trigger for block devices"
+	depends on BLOCK
+	help
+	  The blkdev LED trigger allows LEDs to be controlled by block device
+	  activity (reads and writes).
+
+	  See Documentation/leds/ledtrig-blkdev.rst.
+
 endif # LEDS_TRIGGERS
diff --git a/drivers/leds/trigger/Makefile b/drivers/leds/trigger/Makefile
index 25c4db97cdd4..d53bab5d93f1 100644
--- a/drivers/leds/trigger/Makefile
+++ b/drivers/leds/trigger/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_LEDS_TRIGGER_NETDEV)	+= ledtrig-netdev.o
 obj-$(CONFIG_LEDS_TRIGGER_PATTERN)	+= ledtrig-pattern.o
 obj-$(CONFIG_LEDS_TRIGGER_AUDIO)	+= ledtrig-audio.o
 obj-$(CONFIG_LEDS_TRIGGER_TTY)		+= ledtrig-tty.o
+obj-$(CONFIG_LEDS_TRIGGER_BLKDEV)	+= ledtrig-blkdev.o
diff --git a/drivers/leds/trigger/ledtrig-blkdev.c b/drivers/leds/trigger/ledtrig-blkdev.c
new file mode 100644
index 000000000000..067eedb003b5
--- /dev/null
+++ b/drivers/leds/trigger/ledtrig-blkdev.c
@@ -0,0 +1,1221 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ *	Block device LED trigger
+ *
+ *	Copyright 2021-2022 Ian Pilcher <arequipeno@gmail.com>
+ */
+
+#include <linux/blkdev.h>
+#include <linux/leds.h>
+#include <linux/module.h>
+#include <linux/part_stat.h>
+#include <linux/xarray.h>
+
+/**
+ * DOC: Overview
+ *
+ * The ``blkdev`` LED trigger works by periodically checking the activity
+ * counters of block devices that have been linked to one or more LEDs and
+ * blinking those LED(s) if the correct type of activity has occurred.  The
+ * periodic check is scheduled with the Linux kernel's deferred work facility.
+ *
+ * Trigger-specific data about block devices and LEDs is stored in two data
+ * structures --- &struct blkdev_trig_bdev (a "BTB") and &struct blkdev_trig_led
+ * (a "BTL").  Each structure contains a &struct xarray that holds links to any
+ * linked devices of the other type.  I.e. &blkdev_trig_bdev.linked_btls
+ * contains links to all BTLs whose LEDs have been linked to the BTB's block
+ * device, and &blkdev_trig_led.linked_btbs contains links to all BTBs whose
+ * block devices have been linked to the BTL's LED.  Thus, a block device can
+ * be linked to more than one LED, and an LED can be linked to more than one
+ * block device.
+ */
+
+/* Default, minimum & maximum blink duration (milliseconds) */
+#define BLKDEV_TRIG_BLINK_DEF	75
+#define BLKDEV_TRIG_BLINK_MIN	10
+#define BLKDEV_TRIG_BLINK_MAX	86400000  /* 24 hours */
+
+/* Default, minimum & maximum activity check interval (milliseconds) */
+#define BLKDEV_TRIG_CHECK_DEF	100
+#define BLKDEV_TRIG_CHECK_MIN	25
+#define BLKDEV_TRIG_CHECK_MAX	86400000  /* 24 hours */
+
+/*
+ * If blkdev_trig_check() can't lock the mutex, how long to wait before trying
+ * again (milliseconds)
+ */
+#define BLKDEV_TRIG_CHECK_RETRY	5
+
+/* Mode argument for calls to blkdev_get_by_path() and blkdev_put() */
+#define BLKDEV_TRIG_FMODE	0
+
+/**
+ * struct blkdev_trig_bdev - Trigger-specific data about a block device.
+ * @last_checked:	Time (in jiffies) at which the trigger last checked this
+ *			block device for activity.
+ * @last_activity:	Time (in jiffies) at which the trigger last detected
+ *			activity of each type.
+ * @ios:		Activity counter values for each type, corresponding to
+ *			the timestamps in &last_activity.
+ * @index:		&xarray index, so the BTB can be included in one or more
+ *			&blkdev_trig_led.linked_btbs.
+ * @bdev:		The block device.
+ * @linked_btls:	The BTLs that represent the LEDs linked to the BTB's
+ *			block device.
+ *
+ * Every block device linked to at least one LED gets a "BTB."  A BTB is created
+ * when a block device that is not currently linked to any LEDs is linked to an
+ * LED.
+ *
+ * A BTB is freed when one of the following occurs:
+ *
+ * * The number of LEDs linked to the block device becomes zero, because it has
+ *   been unlinked from its last LED using the trigger's &sysfs interface.
+ *
+ * * The number of LEDs linked to the block device becomes zero, because the
+ *   last LED to which it was linked has been disassociated from the trigger
+ *   (which happens automatically if the LED device is removed from the system).
+ *
+ * * The BTB's block device is removed from the system.  To accomodate this
+ *   scenario, BTB's are created as device resources, so that the release
+ *   function will be called by the driver core when the device is removed.
+ */
+struct blkdev_trig_bdev {
+	unsigned long		last_checked;
+	unsigned long		last_activity[NR_STAT_GROUPS];
+	unsigned long		ios[NR_STAT_GROUPS];
+	unsigned long		index;
+	struct block_device	*bdev;
+	struct xarray		linked_btls;
+};
+
+/**
+ * struct blkdev_trig_led - Trigger-specific data about an LED.
+ * @last_checked:	Time (in jiffies) at which the trigger last checked the
+ *			the block devices linked to this LED for activity.
+ * @index:		&xarray index, so the BTL can be included in one or more
+ *			&blkdev_trig_bdev.linked_btls.
+ * @mode:		Bitmask for types of block device activity that will
+ *			cause this LED to blink --- reads, writes, discards,
+ *			etc.
+ * @led:		The LED device.
+ * @blink_msec:		Duration of a blink (milliseconds).
+ * @check_jiffies:	Frequency with which block devices linked to this LED
+ *			should be checked for activity (jiffies).
+ * @linked_btbs:	The BTBs that represent the block devices linked to the
+ *			BTL's LED.
+ * @all_btls_node:	The BTL's node in the module's list of all BTLs.
+ *
+ * Every LED associated with the block device trigger gets a "BTL."  A BTL is
+ * created when the trigger is "activated" on an LED (usually by writing
+ * ``blkdev`` to the LED's &sysfs &trigger attribute).  A BTL is freed wnen its
+ * LED is disassociated from the trigger, either through the trigger's &sysfs
+ * interface or because the LED device is removed from the system.
+ */
+struct blkdev_trig_led {
+	unsigned long		last_checked;
+	unsigned long		index;
+	unsigned long		mode;  /* must be ulong for atomic bit ops */
+	struct led_classdev	*led;
+	unsigned int		blink_msec;
+	unsigned int		check_jiffies;
+	struct xarray		linked_btbs;
+	struct hlist_node	all_btls_node;
+};
+
+/* Protects everything except atomic LED attributes */
+static DEFINE_MUTEX(blkdev_trig_mutex);
+
+/* BTB device resource release function */
+static void blkdev_trig_btb_release(struct device *dev, void *res);
+
+/* Index for next BTB or BTL */
+static unsigned long blkdev_trig_next_index;
+
+/* All LEDs associated with the trigger */
+static HLIST_HEAD(blkdev_trig_all_btls);
+
+/* Delayed work to periodically check for activity & blink LEDs */
+static void blkdev_trig_check(struct work_struct *work);
+static DECLARE_DELAYED_WORK(blkdev_trig_work, blkdev_trig_check);
+
+/* When is the delayed work scheduled to run next (jiffies) */
+static unsigned long blkdev_trig_next_check;
+
+/* Total number of BTB-to-BTL links */
+static unsigned int blkdev_trig_link_count;
+
+/* Empty sysfs attribute list for next 2 declarations */
+static struct attribute *blkdev_trig_attrs_empty[] = { NULL };
+
+/* linked_leds sysfs directory for block devs linked to 1 or more LEDs */
+static const struct attribute_group blkdev_trig_linked_leds = {
+	.name	= "linked_leds",
+	.attrs	= blkdev_trig_attrs_empty,
+};
+
+/* linked_devices sysfs directory for each LED associated with the trigger */
+static const struct attribute_group blkdev_trig_linked_devs = {
+	.name	= "linked_devices",
+	.attrs	= blkdev_trig_attrs_empty,
+};
+
+
+/*
+ *
+ *	Delayed work to check for activity & blink LEDs
+ *
+ */
+
+/**
+ * blkdev_trig_blink() - Blink an LED, if the correct type of activity has
+ *	occurred on the block device.
+ * @btl:	The BTL that represents the LED
+ * @btb:	The BTB that represents the block device
+ *
+ * Context:	Process context.  Caller must hold &blkdev_trig_mutex.
+ * Return:	&true if the LED is blinked, &false if not.
+ */
+static bool blkdev_trig_blink(const struct blkdev_trig_led *btl,
+			      const struct blkdev_trig_bdev *btb)
+{
+	unsigned long mode, mask, delay_on, delay_off;
+	enum stat_group i;
+
+	mode = READ_ONCE(btl->mode);
+
+	for (i = STAT_READ, mask = 1; i <= STAT_FLUSH; ++i, mask <<= 1) {
+
+		if (!(mode & mask))
+			continue;
+
+		if (time_before_eq(btb->last_activity[i], btl->last_checked))
+			continue;
+
+		delay_on = READ_ONCE(btl->blink_msec);
+		delay_off = 1;	/* 0 leaves LED turned on */
+
+		led_blink_set_oneshot(btl->led, &delay_on, &delay_off, 0);
+		return true;
+	}
+
+	return false;
+}
+
+/**
+ * blkdev_trig_update_btb() - Update a BTB's activity counters and timestamps.
+ * @btb:	The BTB
+ * @now:	Timestamp (in jiffies)
+ *
+ * Context:	Process context.  Caller must hold &blkdev_trig_mutex.
+ */
+static void blkdev_trig_update_btb(struct blkdev_trig_bdev *btb,
+				   unsigned long now)
+{
+	unsigned long new_ios;
+	enum stat_group i;
+
+	for (i = STAT_READ; i <= STAT_FLUSH; ++i) {
+
+		new_ios = part_stat_read(btb->bdev, ios[i]);
+
+		if (new_ios != btb->ios[i]) {
+			btb->ios[i] = new_ios;
+			btb->last_activity[i] = now;
+		}
+	}
+
+	btb->last_checked = now;
+}
+
+/**
+ * blkdev_trig_check() - Check linked devices for activity and blink LEDs.
+ * @work:	Delayed work (&blkdev_trig_work)
+ *
+ * Context:	Process context.  Takes and releases &blkdev_trig_mutex.
+ */
+static void blkdev_trig_check(struct work_struct *work)
+{
+	struct blkdev_trig_led *btl;
+	struct blkdev_trig_bdev *btb;
+	unsigned long index, delay, now, led_check, led_delay;
+	bool blinked;
+
+	if (!mutex_trylock(&blkdev_trig_mutex)) {
+		delay = msecs_to_jiffies(BLKDEV_TRIG_CHECK_RETRY);
+		goto exit_reschedule;
+	}
+
+	now = jiffies;
+	delay = ULONG_MAX;
+
+	hlist_for_each_entry (btl, &blkdev_trig_all_btls, all_btls_node) {
+
+		led_check = btl->last_checked + btl->check_jiffies;
+
+		if (time_before_eq(led_check, now)) {
+
+			blinked = false;
+
+			xa_for_each (&btl->linked_btbs, index, btb) {
+
+				if (btb->last_checked != now)
+					blkdev_trig_update_btb(btb, now);
+				if (!blinked)
+					blinked = blkdev_trig_blink(btl, btb);
+			}
+
+			btl->last_checked = now;
+			led_delay = btl->check_jiffies;
+
+		} else {
+			led_delay = led_check - now;
+		}
+
+		if (led_delay < delay)
+			delay = led_delay;
+	}
+
+	mutex_unlock(&blkdev_trig_mutex);
+
+exit_reschedule:
+	WARN_ON_ONCE(delay == ULONG_MAX);
+	WARN_ON_ONCE(!schedule_delayed_work(&blkdev_trig_work, delay));
+}
+
+/**
+ * blkdev_trig_sched_led() - Set the schedule of the delayed work when a new
+ *	LED is added to the schedule.
+ * @btl:	The BTL that represents the LED
+ *
+ * Called when the number of block devices to which an LED is linked becomes
+ * non-zero.
+ *
+ * Context:	Process context.  Caller must hold &blkdev_trig_mutex.
+ */
+static void blkdev_trig_sched_led(const struct blkdev_trig_led *btl)
+{
+	unsigned long delay = READ_ONCE(btl->check_jiffies);
+	unsigned long check_by = jiffies + delay;
+
+	/*
+	 * If no other LED-to-block device links exist, simply schedule the
+	 * delayed work according to this LED's check_interval attribute
+	 * (check_jiffies).
+	 */
+	if (blkdev_trig_link_count == 0) {
+		WARN_ON(!schedule_delayed_work(&blkdev_trig_work, delay));
+		blkdev_trig_next_check = check_by;
+		return;
+	}
+
+	/*
+	 * If the next check is already scheduled to occur soon enough to
+	 * accomodate this LED's check_interval, the schedule doesn't need
+	 * to be changed.
+	 */
+	if (time_after_eq(check_by, blkdev_trig_next_check))
+		return;
+
+	/*
+	 * Modify the schedule, so that the delayed work runs soon enough for
+	 * this LED.
+	 */
+	WARN_ON(!mod_delayed_work(system_wq, &blkdev_trig_work, delay));
+	blkdev_trig_next_check = check_by;
+}
+
+
+/*
+ *
+ *	Linking and unlinking LEDs and block devices
+ *
+ */
+
+/**
+ * blkdev_trig_link() - Link a block device to an LED.
+ * @btl:	The BTL that represents the LED
+ * @btb:	The BTB that represents the block device
+ *
+ * Context:	Process context.  Caller must hold &blkdev_trig_mutex.
+ * Return:	&0 on success, negative &errno on error.
+ */
+static int blkdev_trig_link(struct blkdev_trig_led *btl,
+			    struct blkdev_trig_bdev *btb)
+{
+	bool led_first_link;
+	int err;
+
+	led_first_link = xa_empty(&btl->linked_btbs);
+
+	err = xa_insert(&btb->linked_btls, btl->index, btl, GFP_KERNEL);
+	if (err)
+		return err;
+
+	err = xa_insert(&btl->linked_btbs, btb->index, btb, GFP_KERNEL);
+	if (err)
+		goto error_erase_btl;
+
+	/* Create /sys/class/block/<bdev>/linked_leds/<led> symlink */
+	err = sysfs_add_link_to_group(bdev_kobj(btb->bdev),
+				      blkdev_trig_linked_leds.name,
+				      &btl->led->dev->kobj, btl->led->name);
+	if (err)
+		goto error_erase_btb;
+
+	/* Create /sys/class/leds/<led>/linked_devices/<bdev> symlink */
+	err = sysfs_add_link_to_group(&btl->led->dev->kobj,
+				      blkdev_trig_linked_devs.name,
+				      bdev_kobj(btb->bdev),
+				      dev_name(&btb->bdev->bd_device));
+	if (err)
+		goto error_remove_symlink;
+
+	/*
+	 * If this is the first block device linked to this LED, the delayed
+	 * work schedule may need to be changed.
+	 */
+	if (led_first_link)
+		blkdev_trig_sched_led(btl);
+
+	++blkdev_trig_link_count;
+
+	return 0;
+
+error_remove_symlink:
+	sysfs_remove_link_from_group(bdev_kobj(btb->bdev),
+				     blkdev_trig_linked_leds.name,
+				     btl->led->name);
+error_erase_btb:
+	xa_erase(&btl->linked_btbs, btb->index);
+error_erase_btl:
+	xa_erase(&btb->linked_btls, btl->index);
+	return err;
+}
+
+/**
+ * blkdev_trig_put_btb() - Remove and free a BTB, if it is no longer needed.
+ * @btb:	The BTB
+ *
+ * Does nothing if the BTB (block device) is still linked to at least one LED.
+ *
+ * Context:	Process context.  Caller must hold &blkdev_trig_mutex.
+ */
+static void blkdev_trig_put_btb(struct blkdev_trig_bdev *btb)
+{
+	struct block_device *bdev = btb->bdev;
+	int err;
+
+	if (xa_empty(&btb->linked_btls)) {
+
+		sysfs_remove_group(bdev_kobj(bdev), &blkdev_trig_linked_leds);
+		err = devres_destroy(&bdev->bd_device, blkdev_trig_btb_release,
+				     NULL, NULL);
+		WARN_ON(err);
+	}
+}
+
+/**
+ * _blkdev_trig_unlink_always() - Perform the unconditionally required steps of
+ *	unlinking a block device from an LED.
+ * @btl:	The BTL that represents the LED
+ * @btb:	The BTB that represents the block device
+ *
+ * When a block device is unlinked from an LED, certain steps must be performed
+ * only if the block device is **not** being released.  This function performs
+ * those steps that are **always** required, whether or not the block device is
+ * being released.
+ *
+ * Context:	Process context.  Caller must hold &blkdev_trig_mutex.
+ */
+static void _blkdev_trig_unlink_always(struct blkdev_trig_led *btl,
+				       struct blkdev_trig_bdev *btb)
+{
+	--blkdev_trig_link_count;
+
+	if (blkdev_trig_link_count == 0)
+		WARN_ON(!cancel_delayed_work_sync(&blkdev_trig_work));
+
+	xa_erase(&btb->linked_btls, btl->index);
+	xa_erase(&btl->linked_btbs, btb->index);
+
+	/* Remove /sys/class/leds/<led>/linked_devices/<bdev> symlink */
+	sysfs_remove_link_from_group(&btl->led->dev->kobj,
+				     blkdev_trig_linked_devs.name,
+				     dev_name(&btb->bdev->bd_device));
+}
+
+/**
+ * blkdev_trig_unlink_norelease() - Unlink an LED from a block device that is
+ *	**not** being released.
+ * @btl:	The BTL that represents the LED.
+ * @btb:	The BTB that represents the block device.
+ *
+ * Context:	Process context.  Caller must hold &blkdev_trig_mutex.
+ */
+static void blkdev_trig_unlink_norelease(struct blkdev_trig_led *btl,
+					 struct blkdev_trig_bdev *btb)
+{
+	_blkdev_trig_unlink_always(btl, btb);
+
+	/* Remove /sys/class/block/<bdev>/linked_leds/<led> symlink */
+	sysfs_remove_link_from_group(bdev_kobj(btb->bdev),
+				     blkdev_trig_linked_leds.name,
+				     btl->led->name);
+
+	blkdev_trig_put_btb(btb);
+}
+
+/**
+ * blkdev_trig_unlink_release() - Unlink an LED from a block device that is
+ *	being released.
+ * @btl:	The BTL that represents the LED
+ * @btb:	The BTB that represents the block device
+ *
+ * Context:	Process context.  Caller must hold &blkdev_trig_mutex.
+ */
+static void blkdev_trig_unlink_release(struct blkdev_trig_led *btl,
+				       struct blkdev_trig_bdev *btb)
+{
+	_blkdev_trig_unlink_always(btl, btb);
+
+	/*
+	 * If the BTB is being released, the driver core has already removed the
+	 * device's attribute groups, and the BTB will be freed automatically,
+	 * so there's nothing else to do.
+	 */
+}
+
+
+/*
+ *
+ *	BTB creation
+ *
+ */
+
+/**
+ * blkdev_trig_btb_release() - BTB device resource release function.
+ * @dev:	The block device
+ * @res:	The BTB
+ *
+ * Called by the driver core when a block device with a BTB is removed.
+ *
+ * Context:	Process context.  Takes and releases &blkdev_trig_mutex.
+ */
+static void blkdev_trig_btb_release(struct device *dev, void *res)
+{
+	struct blkdev_trig_bdev *btb = res;
+	struct blkdev_trig_led *btl;
+	unsigned long index;
+
+	mutex_lock(&blkdev_trig_mutex);
+
+	xa_for_each (&btb->linked_btls, index, btl)
+		blkdev_trig_unlink_release(btl, btb);
+
+	mutex_unlock(&blkdev_trig_mutex);
+}
+
+/**
+ * blkdev_trig_get_bdev() - Get a block device by path.
+ * @path:	The value written to an LED's &link_dev_by_path or
+ *		&unlink_dev_by_path attribute, which should be the path to a
+ *		special file that represents a block device
+ * @len:	The number of characters in &path (not including its
+ *		terminating null)
+ *
+ * The caller must call blkdev_put() when finished with the device.
+ *
+ * Context:	Process context.
+ * Return:	The block device, or an error pointer.
+ */
+static struct block_device *blkdev_trig_get_bdev(const char *path, size_t len)
+{
+	struct block_device *bdev;
+	char *buf;
+
+	buf = kmemdup(path, len + 1, GFP_KERNEL);  /* +1 to include null */
+	if (buf == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	bdev = blkdev_get_by_path(strim(buf), BLKDEV_TRIG_FMODE, THIS_MODULE);
+	kfree(buf);
+	return bdev;
+}
+
+/**
+ * blkdev_trig_get_btb() - Find or create the BTB for a block device.
+ * @path:	The value written to an LED's &link_dev_by_path attribute,
+ *		which should be the path to a special file that represents a
+ *		block device
+ * @len:	The number of characters in &path
+ *
+ * If a new BTB is created, because the block device was not previously linked
+ * to any LEDs, the block device's &linked_leds &sysfs directory is created.
+ *
+ * Context:	Process context.  Caller must hold &blkdev_trig_mutex.
+ * Return:	Pointer to the BTB, error pointer on error.
+ */
+static struct blkdev_trig_bdev *blkdev_trig_get_btb(const char *path,
+						    size_t len)
+{
+	struct block_device *bdev;
+	struct blkdev_trig_bdev *btb;
+	int err;
+
+	bdev = blkdev_trig_get_bdev(path, len);
+	if (IS_ERR(bdev))
+		return ERR_CAST(bdev);
+
+	btb = devres_find(&bdev->bd_device, blkdev_trig_btb_release,
+			  NULL, NULL);
+	if (btb != NULL) {
+		err = 0;
+		goto exit_put_bdev;
+	}
+
+	if (blkdev_trig_next_index == ULONG_MAX) {
+		err = -EOVERFLOW;
+		goto exit_put_bdev;
+	}
+
+	btb = devres_alloc(blkdev_trig_btb_release, sizeof(*btb), GFP_KERNEL);
+	if (btb == NULL) {
+		err = -ENOMEM;
+		goto exit_put_bdev;
+	}
+
+	err = sysfs_create_group(bdev_kobj(bdev), &blkdev_trig_linked_leds);
+	if (err)
+		goto exit_free_btb;
+
+	btb->index = blkdev_trig_next_index++;
+	btb->bdev = bdev;
+	xa_init(&btb->linked_btls);
+
+	/* Populate BTB activity counters */
+	blkdev_trig_update_btb(btb, jiffies);
+
+	devres_add(&bdev->bd_device, btb);
+
+exit_free_btb:
+	if (err)
+		devres_free(btb);
+exit_put_bdev:
+	blkdev_put(bdev, BLKDEV_TRIG_FMODE);
+	return err ? ERR_PTR(err) : btb;
+}
+
+
+/*
+ *
+ *	Activating and deactivating the trigger on an LED
+ *
+ */
+
+/**
+ * blkdev_trig_activate() - Called by the LEDs subsystem when an LED is
+ *	associated with the trigger.
+ * @led:	The LED
+ *
+ * Context:	Process context.  Takes and releases &blkdev_trig_mutex.
+ * Return:	&0 on success, negative &errno on error.
+ */
+static int blkdev_trig_activate(struct led_classdev *led)
+{
+	struct blkdev_trig_led *btl;
+	int err;
+
+	btl = kzalloc(sizeof(*btl), GFP_KERNEL);
+	if (btl == NULL)
+		return -ENOMEM;
+
+	err = mutex_lock_interruptible(&blkdev_trig_mutex);
+	if (err)
+		goto exit_free;
+
+	if (blkdev_trig_next_index == ULONG_MAX) {
+		err = -EOVERFLOW;
+		goto exit_unlock;
+	}
+
+	btl->index = blkdev_trig_next_index++;
+	btl->last_checked = jiffies;
+	btl->mode = -1;  /* set all bits */
+	btl->led = led;
+	btl->blink_msec = BLKDEV_TRIG_BLINK_DEF;
+	btl->check_jiffies = msecs_to_jiffies(BLKDEV_TRIG_CHECK_DEF);
+	xa_init(&btl->linked_btbs);
+
+	hlist_add_head(&btl->all_btls_node, &blkdev_trig_all_btls);
+	led_set_trigger_data(led, btl);
+
+exit_unlock:
+	mutex_unlock(&blkdev_trig_mutex);
+exit_free:
+	if (err)
+		kfree(btl);
+	return err;
+}
+
+/**
+ * blkdev_trig_deactivate() - Called by the the LEDs subsystem when an LED is
+ *	disassociated from the trigger.
+ * @led:	The LED
+ *
+ * The LEDs subsystem also calls this function when an LED associated with the
+ * trigger is removed or when the trigger is unregistered (if the module is
+ * unloaded).
+ *
+ * Context:	Process context.  Takes and releases &blkdev_trig_mutex.
+ */
+static void blkdev_trig_deactivate(struct led_classdev *led)
+{
+	struct blkdev_trig_led *btl = led_get_trigger_data(led);
+	struct blkdev_trig_bdev *btb;
+	unsigned long index;
+
+	mutex_lock(&blkdev_trig_mutex);
+
+	xa_for_each (&btl->linked_btbs, index, btb)
+		blkdev_trig_unlink_norelease(btl, btb);
+
+	hlist_del(&btl->all_btls_node);
+	kfree(btl);
+
+	mutex_unlock(&blkdev_trig_mutex);
+}
+
+
+/*
+ *
+ *	Link-related attribute store functions
+ *
+ */
+
+/**
+ * link_dev_by_path_store() - &link_dev_by_path device attribute store function.
+ * @dev:	The LED device
+ * @attr:	The &link_dev_by_path attribute (&dev_attr_link_dev_by_path)
+ * @buf:	The value written to the attribute, which should be the path to
+ *		a special file that represents a block device to be linked to
+ *		the LED (e.g. ``/dev/sda``)
+ * @count:	The number of characters in &buf
+ *
+ * Context:	Process context.  Takes and releases &blkdev_trig_mutex.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t link_dev_by_path_store(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t count)
+{
+	struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev);
+	struct blkdev_trig_bdev *btb;
+	int err;
+
+	err = mutex_lock_interruptible(&blkdev_trig_mutex);
+	if (err)
+		return err;
+
+	btb = blkdev_trig_get_btb(buf, count);
+	if (IS_ERR(btb)) {
+		err = PTR_ERR(btb);
+		goto exit_unlock;
+	}
+
+	if (xa_load(&btb->linked_btls, btl->index) != NULL) {
+		err = -EEXIST;
+		goto exit_put_btb;
+	}
+
+	err = blkdev_trig_link(btl, btb);
+
+exit_put_btb:
+	if (err)
+		blkdev_trig_put_btb(btb);
+exit_unlock:
+	mutex_unlock(&blkdev_trig_mutex);
+	return err ? : count;
+}
+
+/**
+ * unlink_dev_by_path_store() - &unlink_dev_by_path device attribute store
+ *	function.
+ * @dev:	The LED device
+ * @attr:	The &unlink_dev_by_path attribute (&dev_attr_unlink_dev_by_path)
+ * @buf:	The value written to the attribute, which should be the path to
+ *		a special file that represents a block device to be unlinked
+ *		from the LED (e.g. ``/dev/sda``)
+ * @count:	The number of characters in &buf
+ *
+ * Context:	Process context.  Takes and releases &blkdev_trig_mutex.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t unlink_dev_by_path_store(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t count)
+{
+	struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev);
+	struct block_device *bdev;
+	struct blkdev_trig_bdev *btb;
+	int err;
+
+	bdev = blkdev_trig_get_bdev(buf, count);
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);
+
+	err = mutex_lock_interruptible(&blkdev_trig_mutex);
+	if (err)
+		goto exit_put_bdev;
+
+	btb = devres_find(&bdev->bd_device, blkdev_trig_btb_release,
+			  NULL, NULL);
+	if (btb == NULL) {
+		err = -EUNATCH;  /* bdev isn't linked to any LED */
+		goto exit_unlock;
+	}
+
+	if (xa_load(&btb->linked_btls, btl->index) == NULL) {
+		err = -EUNATCH;  /* bdev isn't linked to this LED */
+		goto exit_unlock;
+	}
+
+	blkdev_trig_unlink_norelease(btl, btb);
+
+exit_unlock:
+	mutex_unlock(&blkdev_trig_mutex);
+exit_put_bdev:
+	blkdev_put(bdev, BLKDEV_TRIG_FMODE);
+	return err ? : count;
+}
+
+/**
+ * unlink_dev_by_name_store() - &unlink_dev_by_name device attribute store
+ *	function.
+ * @dev:	The LED device
+ * @attr:	The &unlink_dev_by_name attribute (&dev_attr_unlink_dev_by_name)
+ * @buf:	The value written to the attribute, which should be the kernel
+ *		name of a block device to be unlinked from the LED (e.g.
+ *		``sda``)
+ * @count:	The number of characters in &buf
+ *
+ * Context:	Process context.  Takes and releases &blkdev_trig_mutex.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t unlink_dev_by_name_store(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t count)
+{
+	struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev);
+	struct blkdev_trig_bdev *btb;
+	unsigned long index;
+	int err;
+
+	err = mutex_lock_interruptible(&blkdev_trig_mutex);
+	if (err)
+		return err;
+
+	err = -EUNATCH;
+
+	xa_for_each (&btl->linked_btbs, index, btb) {
+
+		if (sysfs_streq(dev_name(&btb->bdev->bd_device), buf)) {
+			blkdev_trig_unlink_norelease(btl, btb);
+			err = 0;
+			break;
+		}
+	}
+
+	mutex_unlock(&blkdev_trig_mutex);
+	return err ? : count;
+}
+
+
+/*
+ *
+ *	Atomic attribute show & store functions
+ *
+ */
+
+/**
+ * blink_time_show() - &blink_time device attribute show function.
+ * @dev:	The LED device
+ * @attr:	The &blink_time attribute (&dev_attr_blink_time)
+ * @buf:	Output buffer
+ *
+ * Writes the value of &blkdev_trig_led.blink_msec to &buf.
+ *
+ * Context:	Process context.
+ * Return:	The number of characters written to &buf.
+ */
+static ssize_t blink_time_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	const struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%u\n", READ_ONCE(btl->blink_msec));
+}
+
+/**
+ * blink_time_store() - &blink_time device attribute store function.
+ * @dev:	The LED device
+ * @attr:	The &blink_time attribute (&dev_attr_blink_time)
+ * @buf:	The new value (as written to the &sysfs attribute)
+ * @count:	The number of characters in &buf
+ *
+ * Sets &blkdev_trig_led.blink_msec to the value in &buf.
+ *
+ * Context:	Process context.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t blink_time_store(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev);
+	unsigned int value;
+	int err;
+
+	err = kstrtouint(buf, 0, &value);
+	if (err)
+		return err;
+
+	if (value < BLKDEV_TRIG_BLINK_MIN || value > BLKDEV_TRIG_BLINK_MAX)
+		return -ERANGE;
+
+	WRITE_ONCE(btl->blink_msec, value);
+	return count;
+}
+
+/**
+ * check_interval_show() - &check_interval device attribute show function.
+ * @dev:	The LED device
+ * @attr:	The &check_interval attribute (&dev_attr_check_interval)
+ * @buf:	Output buffer
+ *
+ * Writes the value of &blkdev_trig_led.check_jiffies (converted to
+ * milliseconds) to &buf.
+ *
+ * Context:	Process context.
+ * Return:	The number of characters written to &buf.
+ */
+static ssize_t check_interval_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%u\n",
+			  jiffies_to_msecs(READ_ONCE(btl->check_jiffies)));
+}
+
+/**
+ * check_interval_store() - &check_interval device attribute store function
+ * @dev:	The LED device
+ * @attr:	The &check_interval attribute (&dev_attr_check_interval)
+ * @buf:	The new value (as written to the &sysfs attribute)
+ * @count:	The number of characters in &buf
+ *
+ * Sets &blkdev_trig_led.check_jiffies to the value in &buf (after converting
+ * from milliseconds).
+ *
+ * Context:	Process context.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t check_interval_store(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t count)
+{
+	struct blkdev_trig_led *led = led_trigger_get_drvdata(dev);
+	unsigned int value;
+	int err;
+
+	err = kstrtouint(buf, 0, &value);
+	if (err)
+		return err;
+
+	if (value < BLKDEV_TRIG_CHECK_MIN || value > BLKDEV_TRIG_CHECK_MAX)
+		return -ERANGE;
+
+	WRITE_ONCE(led->check_jiffies, msecs_to_jiffies(value));
+
+	return count;
+}
+
+/**
+ * blkdev_trig_mode_show() - Helper for boolean attribute show functions.
+ * @led:	The LED
+ * @buf:	Output buffer
+ * @bit:	Which bit to show
+ *
+ * Context:	Process context.
+ * Return:	The number of characters written to &buf.
+ */
+static int blkdev_trig_mode_show(const struct blkdev_trig_led *led, char *buf,
+				 enum stat_group bit)
+{
+	return sysfs_emit(buf,
+			  READ_ONCE(led->mode) & (1 << bit) ? "Y\n" : "N\n");
+}
+
+/**
+ * blkdev_trig_mode_store() - Helper for boolean attribute store functions.
+ * @led:	The LED
+ * @buf:	The new value (as written to the &sysfs attribute)
+ * @count:	The number of characters in &buf
+ * @bit:	Which bit to set
+ *
+ * Context:	Process context.
+ * Return:	&count on success, negative &errno on error.
+ */
+static int blkdev_trig_mode_store(struct blkdev_trig_led *led,
+				  const char *buf, size_t count,
+				  enum stat_group bit)
+{
+	bool set;
+	int err;
+
+	err = kstrtobool(buf, &set);
+	if (err)
+		return err;
+
+	if (set)
+		set_bit(bit, &led->mode);
+	else
+		clear_bit(bit, &led->mode);
+
+	return count;
+}
+
+/**
+ * blink_on_read_show() - &blink_on_read device attribute show function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_read attribute (&dev_attr_blink_on_read)
+ * @buf:	Output buffer
+ *
+ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_READ bit in
+ * &blkdev_trig_led.mode is set or cleared.
+ *
+ * Context:	Process context.
+ * Return:	The number of characters written to &buf.
+ */
+static ssize_t blink_on_read_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	return blkdev_trig_mode_show(led_trigger_get_drvdata(dev),
+				     buf, STAT_READ);
+}
+
+/**
+ * blink_on_read_store() - &blink_on_read device attribute store function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_read attribute (&dev_attr_blink_on_read)
+ * @buf:	The new value (as written to the &sysfs attribute)
+ * @count:	The number of characters in &buf
+ *
+ * Sets the &STAT_READ bit in &blkdev_trig_led.mode to the value in &buf
+ * (interpretted as a boolean).
+ *
+ * Context:	Process context.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t blink_on_read_store(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t count)
+{
+	return blkdev_trig_mode_store(led_trigger_get_drvdata(dev),
+				      buf, count, STAT_READ);
+}
+
+/**
+ * blink_on_write_show() - &blink_on_write device attribute show function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_write attribute (&dev_attr_blink_on_write)
+ * @buf:	Output buffer
+ *
+ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_WRITE bit in
+ * in &blkdev_trig_led.mode is set or cleared.
+ *
+ * Context:	Process context.
+ * Return:	The number of characters written to &buf.
+ */
+static ssize_t blink_on_write_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	return blkdev_trig_mode_show(led_trigger_get_drvdata(dev),
+				     buf, STAT_WRITE);
+}
+
+/**
+ * blink_on_write_store() - &blink_on_write device attribute store function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_write attribute (&dev_attr_blink_on_write)
+ * @buf:	The new value (as written to the &sysfs attribute)
+ * @count:	The number of characters in &buf
+ *
+ * Sets the &STAT_WRITE bit in &blkdev_trig_led.mode to the value in &buf
+ * (interpretted as a boolean).
+ *
+ * Context:	Process context.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t blink_on_write_store(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t count)
+{
+	return blkdev_trig_mode_store(led_trigger_get_drvdata(dev),
+				      buf, count, STAT_WRITE);
+}
+
+/**
+ * blink_on_flush_show() - &blink_on_flush device attribute show function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_flush attribute (&dev_attr_blink_on_flush)
+ * @buf:	Output buffer
+ *
+ * Writes ``Y`` or ``N`` to &buf, depending whether the &STAT_FLUSH bit in
+ * &blkdev_trig_led.mode is set or cleared.
+ *
+ * Context:	Process context.
+ * Return:	The number of characters written to &buf.
+ */
+static ssize_t blink_on_flush_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	return blkdev_trig_mode_show(led_trigger_get_drvdata(dev),
+				     buf, STAT_FLUSH);
+}
+
+/**
+ * blink_on_flush_store() - &blink_on_flush device attribute store function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_flush attribute (&dev_attr_blink_on_flush)
+ * @buf:	The new value (as written to the &sysfs attribute)
+ * @count:	The number of characters in &buf
+ *
+ * Sets the &STAT_FLUSH bit in &blkdev_trig_led.mode to the value in &buf
+ * (interpretted as a boolean).
+ *
+ * Context:	Process context.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t blink_on_flush_store(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t count)
+{
+	return blkdev_trig_mode_store(led_trigger_get_drvdata(dev),
+				      buf, count, STAT_FLUSH);
+}
+
+/**
+ * blink_on_discard_show() - &blink_on_discard device attribute show function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_discard attribute (&dev_attr_blink_on_discard)
+ * @buf:	Output buffer
+ *
+ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_DISCARD bit in
+ * &blkdev_trig_led.mode is set or cleared.
+ *
+ * Context:	Process context.
+ * Return:	The number of characters written to &buf.
+ */
+static ssize_t blink_on_discard_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	return blkdev_trig_mode_show(led_trigger_get_drvdata(dev),
+				     buf, STAT_DISCARD);
+}
+
+/**
+ * blink_on_discard_store() - &blink_on_discard device attribute store function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_discard attribute (&dev_attr_blink_on_discard)
+ * @buf:	The new value (as written to the &sysfs attribute)
+ * @count:	The number of characters in &buf
+ *
+ * Sets the &STAT_DISCARD bit in &blkdev_trig_led.mode to the value in &buf
+ * (interpretted as a boolean).
+ *
+ * Context:	Process context.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t blink_on_discard_store(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t count)
+{
+	return blkdev_trig_mode_store(led_trigger_get_drvdata(dev),
+				      buf, count, STAT_DISCARD);
+}
+
+/* Device attributes */
+static DEVICE_ATTR_WO(link_dev_by_path);
+static DEVICE_ATTR_WO(unlink_dev_by_path);
+static DEVICE_ATTR_WO(unlink_dev_by_name);
+static DEVICE_ATTR_RW(blink_time);
+static DEVICE_ATTR_RW(check_interval);
+static DEVICE_ATTR_RW(blink_on_read);
+static DEVICE_ATTR_RW(blink_on_write);
+static DEVICE_ATTR_RW(blink_on_flush);
+static DEVICE_ATTR_RW(blink_on_discard);
+
+/* Device attributes in LED directory (/sys/class/leds/<led>/...) */
+static struct attribute *blkdev_trig_attrs[] = {
+	&dev_attr_link_dev_by_path.attr,
+	&dev_attr_unlink_dev_by_path.attr,
+	&dev_attr_unlink_dev_by_name.attr,
+	&dev_attr_blink_time.attr,
+	&dev_attr_check_interval.attr,
+	&dev_attr_blink_on_read.attr,
+	&dev_attr_blink_on_write.attr,
+	&dev_attr_blink_on_flush.attr,
+	&dev_attr_blink_on_discard.attr,
+	NULL
+};
+
+/* Unnamed attribute group == no subdirectory */
+static const struct attribute_group blkdev_trig_attr_group = {
+	.attrs	= blkdev_trig_attrs,
+};
+
+/* Attribute groups for the trigger */
+static const struct attribute_group *blkdev_trig_attr_groups[] = {
+	&blkdev_trig_attr_group,   /* /sys/class/leds/<led>/... */
+	&blkdev_trig_linked_devs,  /* /sys/class/leds/<led>/linked_devices/ */
+	NULL
+};
+
+/* Trigger registration data */
+static struct led_trigger blkdev_trig_trigger = {
+	.name		= "blkdev",
+	.activate	= blkdev_trig_activate,
+	.deactivate	= blkdev_trig_deactivate,
+	.groups		= blkdev_trig_attr_groups,
+};
+
+/**
+ * blkdev_trig_init() - Block device LED trigger initialization.
+ *
+ * Registers the ``blkdev`` LED trigger.
+ *
+ * Return:	&0 on success, negative &errno on failure.
+ */
+static int __init blkdev_trig_init(void)
+{
+	return led_trigger_register(&blkdev_trig_trigger);
+}
+module_init(blkdev_trig_init);
+
+/**
+ * blkdev_trig_exit() - Block device LED trigger module exit.
+ *
+ * Unregisters the ``blkdev`` LED trigger.
+ */
+static void __exit blkdev_trig_exit(void)
+{
+	led_trigger_unregister(&blkdev_trig_trigger);
+}
+module_exit(blkdev_trig_exit);
+
+MODULE_DESCRIPTION("Block device LED trigger");
+MODULE_AUTHOR("Ian Pilcher <arequipeno@gmail.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index eccecd3fac90..4cad490028ab 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -57,13 +57,7 @@
  * we need a lock that will allow us to sleep. This lock is a
  * mutex (ep->mtx). It is acquired during the event transfer loop,
  * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
- * Then we also need a global mutex to serialize eventpoll_release_file()
- * and ep_free().
- * This mutex is acquired by ep_free() during the epoll file
- * cleanup path and it is also acquired by eventpoll_release_file()
- * if a file has been pushed inside an epoll set and it is then
- * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
- * It is also acquired when inserting an epoll fd onto another epoll
+ * The epmutex is acquired when inserting an epoll fd onto another epoll
  * fd. We do this so that we walk the epoll tree and ensure that this
  * insertion does not create a cycle of epoll file descriptors, which
  * could lead to deadlock. We need a global mutex to prevent two
@@ -153,6 +147,13 @@ struct epitem {
 	/* The file descriptor information this item refers to */
 	struct epoll_filefd ffd;
 
+	/*
+	 * Protected by file->f_lock, true for to-be-released epitem already
+	 * removed from the "struct file" items list; together with
+	 * eventpoll->refcount orchestrates "struct eventpoll" disposal
+	 */
+	bool dying;
+
 	/* List containing poll wait queues */
 	struct eppoll_entry *pwqlist;
 
@@ -217,6 +218,12 @@ struct eventpoll {
 	u64 gen;
 	struct hlist_head refs;
 
+	/*
+	 * usage count, used together with epitem->dying to
+	 * orchestrate the disposal of this struct
+	 */
+	refcount_t refcount;
+
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	/* used to track busy poll napi_id */
 	unsigned int napi_id;
@@ -240,9 +247,7 @@ struct ep_pqueue {
 /* Maximum number of epoll watched descriptors, per user */
 static long max_user_watches __read_mostly;
 
-/*
- * This mutex is used to serialize ep_free() and eventpoll_release_file().
- */
+/* Used for cycles detection */
 static DEFINE_MUTEX(epmutex);
 
 static u64 loop_check_gen = 0;
@@ -557,8 +562,7 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq)
 
 /*
  * This function unregisters poll callbacks from the associated file
- * descriptor.  Must be called with "mtx" held (or "epmutex" if called from
- * ep_free).
+ * descriptor.  Must be called with "mtx" held.
  */
 static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
 {
@@ -681,11 +685,40 @@ static void epi_rcu_free(struct rcu_head *head)
 	kmem_cache_free(epi_cache, epi);
 }
 
+static void ep_get(struct eventpoll *ep)
+{
+	refcount_inc(&ep->refcount);
+}
+
+/*
+ * Returns true if the event poll can be disposed
+ */
+static bool ep_refcount_dec_and_test(struct eventpoll *ep)
+{
+	if (!refcount_dec_and_test(&ep->refcount))
+		return false;
+
+	WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root));
+	return true;
+}
+
+static void ep_free(struct eventpoll *ep)
+{
+	mutex_destroy(&ep->mtx);
+	free_uid(ep->user);
+	wakeup_source_unregister(ep->ws);
+	kfree(ep);
+}
+
 /*
  * Removes a "struct epitem" from the eventpoll RB tree and deallocates
  * all the associated resources. Must be called with "mtx" held.
+ * If the dying flag is set, do the removal only if force is true.
+ * This prevents ep_clear_and_put() from dropping all the ep references
+ * while running concurrently with eventpoll_release_file().
+ * Returns true if the eventpoll can be disposed.
  */
-static int ep_remove(struct eventpoll *ep, struct epitem *epi)
+static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
 {
 	struct file *file = epi->ffd.file;
 	struct epitems_head *to_free;
@@ -700,6 +733,11 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 
 	/* Remove the current item from the list of epoll hooks */
 	spin_lock(&file->f_lock);
+	if (epi->dying && !force) {
+		spin_unlock(&file->f_lock);
+		return false;
+	}
+
 	to_free = NULL;
 	head = file->f_ep;
 	if (head->first == &epi->fllink && !epi->fllink.next) {
@@ -733,28 +771,28 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	call_rcu(&epi->rcu, epi_rcu_free);
 
 	percpu_counter_dec(&ep->user->epoll_watches);
+	return ep_refcount_dec_and_test(ep);
+}
 
-	return 0;
+/*
+ * ep_remove variant for callers owing an additional reference to the ep
+ */
+static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
+{
+	WARN_ON_ONCE(__ep_remove(ep, epi, false));
 }
 
-static void ep_free(struct eventpoll *ep)
+static void ep_clear_and_put(struct eventpoll *ep)
 {
 	struct rb_node *rbp;
 	struct epitem *epi;
+	bool dispose;
 
 	/* We need to release all tasks waiting for these file */
 	if (waitqueue_active(&ep->poll_wait))
 		ep_poll_safewake(ep, NULL, 0);
 
-	/*
-	 * We need to lock this because we could be hit by
-	 * eventpoll_release_file() while we're freeing the "struct eventpoll".
-	 * We do not need to hold "ep->mtx" here because the epoll file
-	 * is on the way to be removed and no one has references to it
-	 * anymore. The only hit might come from eventpoll_release_file() but
-	 * holding "epmutex" is sufficient here.
-	 */
-	mutex_lock(&epmutex);
+	mutex_lock(&ep->mtx);
 
 	/*
 	 * Walks through the whole tree by unregistering poll callbacks.
@@ -768,25 +806,21 @@ static void ep_free(struct eventpoll *ep)
 
 	/*
 	 * Walks through the whole tree by freeing each "struct epitem". At this
-	 * point we are sure no poll callbacks will be lingering around, and also by
-	 * holding "epmutex" we can be sure that no file cleanup code will hit
-	 * us during this operation. So we can avoid the lock on "ep->lock".
-	 * We do not need to lock ep->mtx, either, we only do it to prevent
-	 * a lockdep warning.
+	 * point we are sure no poll callbacks will be lingering around.
+	 * Since we still own a reference to the eventpoll struct, the loop can't
+	 * dispose it.
 	 */
-	mutex_lock(&ep->mtx);
 	while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
 		epi = rb_entry(rbp, struct epitem, rbn);
-		ep_remove(ep, epi);
+		ep_remove_safe(ep, epi);
 		cond_resched();
 	}
+
+	dispose = ep_refcount_dec_and_test(ep);
 	mutex_unlock(&ep->mtx);
 
-	mutex_unlock(&epmutex);
-	mutex_destroy(&ep->mtx);
-	free_uid(ep->user);
-	wakeup_source_unregister(ep->ws);
-	kfree(ep);
+	if (dispose)
+		ep_free(ep);
 }
 
 static int ep_eventpoll_release(struct inode *inode, struct file *file)
@@ -794,7 +828,7 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
 	struct eventpoll *ep = file->private_data;
 
 	if (ep)
-		ep_free(ep);
+		ep_clear_and_put(ep);
 
 	return 0;
 }
@@ -906,33 +940,34 @@ void eventpoll_release_file(struct file *file)
 {
 	struct eventpoll *ep;
 	struct epitem *epi;
-	struct hlist_node *next;
+	bool dispose;
 
 	/*
-	 * We don't want to get "file->f_lock" because it is not
-	 * necessary. It is not necessary because we're in the "struct file"
-	 * cleanup path, and this means that no one is using this file anymore.
-	 * So, for example, epoll_ctl() cannot hit here since if we reach this
-	 * point, the file counter already went to zero and fget() would fail.
-	 * The only hit might come from ep_free() but by holding the mutex
-	 * will correctly serialize the operation. We do need to acquire
-	 * "ep->mtx" after "epmutex" because ep_remove() requires it when called
-	 * from anywhere but ep_free().
-	 *
-	 * Besides, ep_remove() acquires the lock, so we can't hold it here.
+	 * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from
+	 * touching the epitems list before eventpoll_release_file() can access
+	 * the ep->mtx.
 	 */
-	mutex_lock(&epmutex);
-	if (unlikely(!file->f_ep)) {
-		mutex_unlock(&epmutex);
-		return;
-	}
-	hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) {
+again:
+	spin_lock(&file->f_lock);
+	if (file->f_ep && file->f_ep->first) {
+		epi = hlist_entry(file->f_ep->first, struct epitem, fllink);
+		epi->dying = true;
+		spin_unlock(&file->f_lock);
+
+		/*
+		 * ep access is safe as we still own a reference to the ep
+		 * struct
+		 */
 		ep = epi->ep;
-		mutex_lock_nested(&ep->mtx, 0);
-		ep_remove(ep, epi);
+		mutex_lock(&ep->mtx);
+		dispose = __ep_remove(ep, epi, true);
 		mutex_unlock(&ep->mtx);
+
+		if (dispose)
+			ep_free(ep);
+		goto again;
 	}
-	mutex_unlock(&epmutex);
+	spin_unlock(&file->f_lock);
 }
 
 static int ep_alloc(struct eventpoll **pep)
@@ -955,6 +990,7 @@ static int ep_alloc(struct eventpoll **pep)
 	ep->rbr = RB_ROOT_CACHED;
 	ep->ovflist = EP_UNACTIVE_PTR;
 	ep->user = user;
+	refcount_set(&ep->refcount, 1);
 
 	*pep = ep;
 
@@ -1223,10 +1259,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 		 */
 		list_del_init(&wait->entry);
 		/*
-		 * ->whead != NULL protects us from the race with ep_free()
-		 * or ep_remove(), ep_remove_wait_queue() takes whead->lock
-		 * held by the caller. Once we nullify it, nothing protects
-		 * ep/epi or even wait.
+		 * ->whead != NULL protects us from the race with
+		 * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue()
+		 * takes whead->lock held by the caller. Once we nullify it,
+		 * nothing protects ep/epi or even wait.
 		 */
 		smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
 	}
@@ -1496,16 +1532,22 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 	if (tep)
 		mutex_unlock(&tep->mtx);
 
+	/*
+	 * ep_remove_safe() calls in the later error paths can't lead to
+	 * ep_free() as the ep file itself still holds an ep reference.
+	 */
+	ep_get(ep);
+
 	/* now check if we've created too many backpaths */
 	if (unlikely(full_check && reverse_path_check())) {
-		ep_remove(ep, epi);
+		ep_remove_safe(ep, epi);
 		return -EINVAL;
 	}
 
 	if (epi->event.events & EPOLLWAKEUP) {
 		error = ep_create_wakeup_source(epi);
 		if (error) {
-			ep_remove(ep, epi);
+			ep_remove_safe(ep, epi);
 			return error;
 		}
 	}
@@ -1529,7 +1571,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 	 * high memory pressure.
 	 */
 	if (unlikely(!epq.epi)) {
-		ep_remove(ep, epi);
+		ep_remove_safe(ep, epi);
 		return -ENOMEM;
 	}
 
@@ -1760,10 +1802,6 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
 {
 	int ret = default_wake_function(wq_entry, mode, sync, key);
 
-	/*
-	 * Pairs with list_empty_careful in ep_poll, and ensures future loop
-	 * iterations see the cause of this wakeup.
-	 */
 	list_del_init_careful(&wq_entry->entry);
 	return ret;
 }
@@ -2029,7 +2067,7 @@ static int do_epoll_create(int flags)
 out_free_fd:
 	put_unused_fd(fd);
 out_free_ep:
-	ep_free(ep);
+	ep_clear_and_put(ep);
 	return error;
 }
 
@@ -2171,10 +2209,16 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 			error = -EEXIST;
 		break;
 	case EPOLL_CTL_DEL:
-		if (epi)
-			error = ep_remove(ep, epi);
-		else
+		if (epi) {
+			/*
+			 * The eventpoll itself is still alive: the refcount
+			 * can't go to zero here.
+			 */
+			ep_remove_safe(ep, epi);
+			error = 0;
+		} else {
 			error = -ENOENT;
+		}
 		break;
 	case EPOLL_CTL_MOD:
 		if (epi) {
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 5f1ae07d724b..97cda629c9e9 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -48,7 +48,7 @@ extern unsigned int pageblock_order;
 #else /* CONFIG_HUGETLB_PAGE */
 
 /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */
-#define pageblock_order		(MAX_ORDER-1)
+#define pageblock_order		PAGE_ALLOC_COSTLY_ORDER
 
 #endif /* CONFIG_HUGETLB_PAGE */
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d5628a7b5eaa..40647d707fb3 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -322,6 +322,9 @@ struct pmu {
 	/* number of address filters this PMU can do */
 	unsigned int			nr_addr_filters;
 
+	/* Skip creating pmu device and sysfs interface. */
+	bool				skip_sysfs_dev;
+
 	/*
 	 * Fully disable/enable this PMU, can be used to protect from the PMI
 	 * as well as for lazy/batch writing of the MSRs.
@@ -827,6 +830,14 @@ struct perf_event {
 	void *security;
 #endif
 	struct list_head		sb_list;
+
+	/*
+	 * Certain events gets forwarded to another pmu internally by over-
+	 * writing kernel copy of event->attr.type without user being aware
+	 * of it. event->orig_type contains original 'type' requested by
+	 * user.
+	 */
+	__u32				orig_type;
 #endif /* CONFIG_PERF_EVENTS */
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index db016e418931..d3176f060eeb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6647,7 +6647,7 @@ static void perf_sigtrap(struct perf_event *event)
 		return;
 
 	send_sig_perf((void __user *)event->pending_addr,
-		      event->attr.type, event->attr.sig_data);
+		      event->orig_type, event->attr.sig_data);
 }
 
 /*
@@ -9951,6 +9951,9 @@ static void sw_perf_event_destroy(struct perf_event *event)
 	swevent_hlist_put();
 }
 
+static struct pmu perf_cpu_clock; /* fwd declaration */
+static struct pmu perf_task_clock;
+
 static int perf_swevent_init(struct perf_event *event)
 {
 	u64 event_id = event->attr.config;
@@ -9966,7 +9969,10 @@ static int perf_swevent_init(struct perf_event *event)
 
 	switch (event_id) {
 	case PERF_COUNT_SW_CPU_CLOCK:
+		event->attr.type = perf_cpu_clock.type;
+		return -ENOENT;
 	case PERF_COUNT_SW_TASK_CLOCK:
+		event->attr.type = perf_task_clock.type;
 		return -ENOENT;
 
 	default:
@@ -11098,7 +11104,7 @@ static void cpu_clock_event_read(struct perf_event *event)
 
 static int cpu_clock_event_init(struct perf_event *event)
 {
-	if (event->attr.type != PERF_TYPE_SOFTWARE)
+	if (event->attr.type != perf_cpu_clock.type)
 		return -ENOENT;
 
 	if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
@@ -11119,6 +11125,7 @@ static struct pmu perf_cpu_clock = {
 	.task_ctx_nr	= perf_sw_context,
 
 	.capabilities	= PERF_PMU_CAP_NO_NMI,
+	.skip_sysfs_dev	= true,
 
 	.event_init	= cpu_clock_event_init,
 	.add		= cpu_clock_event_add,
@@ -11179,7 +11186,7 @@ static void task_clock_event_read(struct perf_event *event)
 
 static int task_clock_event_init(struct perf_event *event)
 {
-	if (event->attr.type != PERF_TYPE_SOFTWARE)
+	if (event->attr.type != perf_task_clock.type)
 		return -ENOENT;
 
 	if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
@@ -11200,6 +11207,7 @@ static struct pmu perf_task_clock = {
 	.task_ctx_nr	= perf_sw_context,
 
 	.capabilities	= PERF_PMU_CAP_NO_NMI,
+	.skip_sysfs_dev	= true,
 
 	.event_init	= task_clock_event_init,
 	.add		= task_clock_event_add,
@@ -11427,31 +11435,31 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 		goto unlock;
 
 	pmu->type = -1;
-	if (!name)
-		goto skip_type;
+	if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) {
+		ret = -EINVAL;
+		goto free_pdc;
+	}
+
 	pmu->name = name;
 
-	if (type != PERF_TYPE_SOFTWARE) {
-		if (type >= 0)
-			max = type;
+	if (type >= 0)
+		max = type;
 
-		ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
-		if (ret < 0)
-			goto free_pdc;
+	ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
+	if (ret < 0)
+		goto free_pdc;
 
-		WARN_ON(type >= 0 && ret != type);
+	WARN_ON(type >= 0 && ret != type);
 
-		type = ret;
-	}
+	type = ret;
 	pmu->type = type;
 
-	if (pmu_bus_running) {
+	if (pmu_bus_running && !pmu->skip_sysfs_dev) {
 		ret = pmu_dev_alloc(pmu);
 		if (ret)
 			goto free_idr;
 	}
 
-skip_type:
 	ret = -ENOMEM;
 	pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
 	if (!pmu->cpu_pmu_context)
@@ -11493,16 +11501,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 	if (!pmu->event_idx)
 		pmu->event_idx = perf_event_idx_default;
 
-	/*
-	 * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
-	 * since these cannot be in the IDR. This way the linear search
-	 * is fast, provided a valid software event is provided.
-	 */
-	if (type == PERF_TYPE_SOFTWARE || !name)
-		list_add_rcu(&pmu->entry, &pmus);
-	else
-		list_add_tail_rcu(&pmu->entry, &pmus);
-
+	list_add_rcu(&pmu->entry, &pmus);
 	atomic_set(&pmu->exclusive_cnt, 0);
 	ret = 0;
 unlock:
@@ -11515,8 +11514,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 	put_device(pmu->dev);
 
 free_idr:
-	if (pmu->type != PERF_TYPE_SOFTWARE)
-		idr_remove(&pmu_idr, pmu->type);
+	idr_remove(&pmu_idr, pmu->type);
 
 free_pdc:
 	free_percpu(pmu->pmu_disable_count);
@@ -11537,8 +11535,7 @@ void perf_pmu_unregister(struct pmu *pmu)
 	synchronize_rcu();
 
 	free_percpu(pmu->pmu_disable_count);
-	if (pmu->type != PERF_TYPE_SOFTWARE)
-		idr_remove(&pmu_idr, pmu->type);
+	idr_remove(&pmu_idr, pmu->type);
 	if (pmu_bus_running) {
 		if (pmu->nr_addr_filters)
 			device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
@@ -11613,6 +11610,12 @@ static struct pmu *perf_init_event(struct perf_event *event)
 
 	idx = srcu_read_lock(&pmus_srcu);
 
+	/*
+	 * Save original type before calling pmu->event_init() since certain
+	 * pmus overwrites event->attr.type to forward event to another pmu.
+	 */
+	event->orig_type = event->attr.type;
+
 	/* Try parent's PMU first: */
 	if (event->parent && event->parent->pmu) {
 		pmu = event->parent->pmu;
@@ -11657,16 +11660,6 @@ static struct pmu *perf_init_event(struct perf_event *event)
 		goto unlock;
 	}
 
-	list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
-		ret = perf_try_init_event(pmu, event);
-		if (!ret)
-			goto unlock;
-
-		if (ret != -ENOENT) {
-			pmu = ERR_PTR(ret);
-			goto unlock;
-		}
-	}
 fail:
 	pmu = ERR_PTR(-ENOENT);
 unlock:
@@ -13652,8 +13645,8 @@ void __init perf_event_init(void)
 	perf_event_init_all_cpus();
 	init_srcu_struct(&pmus_srcu);
 	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
-	perf_pmu_register(&perf_cpu_clock, NULL, -1);
-	perf_pmu_register(&perf_task_clock, NULL, -1);
+	perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1);
+	perf_pmu_register(&perf_task_clock, "task_clock", -1);
 	perf_tp_register();
 	perf_event_init_cpu(smp_processor_id());
 	register_reboot_notifier(&perf_reboot_notifier);
@@ -13696,7 +13689,7 @@ static int __init perf_event_sysfs_init(void)
 		goto unlock;
 
 	list_for_each_entry(pmu, &pmus, entry) {
-		if (!pmu->name || pmu->type < 0)
+		if (pmu->skip_sysfs_dev)
 			continue;
 
 		ret = pmu_dev_alloc(pmu);
diff --git a/kernel/padata.c b/kernel/padata.c
index e007b8a4b738..7c80301ab084 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -45,7 +45,7 @@ struct padata_mt_job_state {
 };
 
 static void padata_free_pd(struct parallel_data *pd);
-static void __init padata_mt_helper(struct work_struct *work);
+static void padata_mt_helper(struct work_struct *work);
 
 static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
 {
@@ -438,7 +438,7 @@ static int padata_setup_cpumasks(struct padata_instance *pinst)
 	return err;
 }
 
-static void __init padata_mt_helper(struct work_struct *w)
+static void padata_mt_helper(struct work_struct *w)
 {
 	struct padata_work *pw = container_of(w, struct padata_work, pw_work);
 	struct padata_mt_job_state *ps = pw->pw_data;
diff --git a/lib/decompress_inflate.c b/lib/decompress_inflate.c
index 6130c42b8e59..e19199f4a684 100644
--- a/lib/decompress_inflate.c
+++ b/lib/decompress_inflate.c
@@ -39,7 +39,7 @@ static long INIT nofill(void *buffer, unsigned long len)
 }
 
 /* Included from initramfs et al code */
-STATIC int INIT __gunzip(unsigned char *buf, long len,
+static int INIT __gunzip(unsigned char *buf, long len,
 		       long (*fill)(void*, unsigned long),
 		       long (*flush)(void*, unsigned long),
 		       unsigned char *out_buf, long out_len,
diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c
index 9f4262ee33a5..353268b9f129 100644
--- a/lib/decompress_unxz.c
+++ b/lib/decompress_unxz.c
@@ -102,6 +102,8 @@
  */
 #ifdef STATIC
 #	define XZ_PREBOOT
+#else
+#include <linux/decompress/unxz.h>
 #endif
 #ifdef __KERNEL__
 #	include <linux/decompress/mm.h>
diff --git a/lib/decompress_unzstd.c b/lib/decompress_unzstd.c
index a512b99ae16a..bba2c0bb10cb 100644
--- a/lib/decompress_unzstd.c
+++ b/lib/decompress_unzstd.c
@@ -69,6 +69,8 @@
 # define UNZSTD_PREBOOT
 # include "xxhash.c"
 # include "zstd/decompress_sources.h"
+#else
+#include <linux/decompress/unzstd.h>
 #endif
 
 #include <linux/decompress/mm.h>
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index afcfb2a94e6e..c92b8df24d08 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3138,6 +3138,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 {
 	unsigned long flags;
 	int i, allocated = 0;
+	struct list_head *prev_tail = list->prev;
+	struct page *pos, *n;
 
 	spin_lock_irqsave(&zone->lock, flags);
 	for (i = 0; i < count; ++i) {
@@ -3146,9 +3148,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		if (unlikely(page == NULL))
 			break;
 
-		if (unlikely(check_pcp_refill(page, order)))
-			continue;
-
 		/*
 		 * Split buddy pages returned by expand() are received here in
 		 * physical page order. The page is added to the tail of
@@ -3160,7 +3159,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		 * pages are ordered properly.
 		 */
 		list_add_tail(&page->pcp_list, list);
-		allocated++;
 		if (is_migrate_cma(get_pcppage_migratetype(page)))
 			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
 					      -(1 << order));
@@ -3174,6 +3172,22 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 	 */
 	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
 	spin_unlock_irqrestore(&zone->lock, flags);
+
+	/*
+	 * Pages are appended to the pcp list without checking to reduce the
+	 * time holding the zone lock. Checking the appended pages happens right
+	 * after the critical section while still holding the pcp lock.
+	 */
+	pos = list_first_entry(prev_tail, struct page, pcp_list);
+	list_for_each_entry_safe_from(pos, n, list, pcp_list) {
+		if (unlikely(check_pcp_refill(pos, order))) {
+			list_del(&pos->pcp_list);
+			continue;
+		}
+
+		allocated++;
+	}
+
 	return allocated;
 }
 
diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o
index 0edfdb40364b..ae52d3b3f063 100644
--- a/scripts/Makefile.vmlinux_o
+++ b/scripts/Makefile.vmlinux_o
@@ -19,7 +19,7 @@ quiet_cmd_gen_initcalls_lds = GEN     $@
 
 .tmp_initcalls.lds: $(srctree)/scripts/generate_initcall_order.pl \
 		vmlinux.a $(KBUILD_VMLINUX_LIBS) FORCE
-	$(call if_changed,gen_initcalls_lds)
+	+$(call if_changed,gen_initcalls_lds)
 
 targets := .tmp_initcalls.lds
 
diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c
index 75020edd39e7..e4455220e9fd 100644
--- a/sound/pci/hda/cs35l41_hda.c
+++ b/sound/pci/hda/cs35l41_hda.c
@@ -1239,7 +1239,7 @@ static int cs35l41_no_acpi_dsd(struct cs35l41_hda *cs35l41, struct device *physd
 
 	if (strncmp(hid, "CLSA0100", 8) == 0) {
 		hw_cfg->bst_type = CS35L41_EXT_BOOST_NO_VSPK_SWITCH;
-	} else if (strncmp(hid, "CLSA0101", 8) == 0) {
+	} else if (strncmp(hid, "CLSA0101", 8) == 0 || strncmp(hid, "CSC3551", 7) == 0) {
 		hw_cfg->bst_type = CS35L41_EXT_BOOST;
 		hw_cfg->gpio1.func = CS35l41_VSPK_SWITCH;
 		hw_cfg->gpio1.valid = true;
diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build
index fb9ac5dc4079..ff7234653503 100644
--- a/tools/perf/tests/Build
+++ b/tools/perf/tests/Build
@@ -69,6 +69,7 @@ perf-y += dlfilter-test.o
 perf-y += sigtrap.o
 perf-y += event_groups.o
 perf-y += symbols.o
+perf-y += amd-ibs-via-core-pmu.o
 
 $(OUTPUT)tests/llvm-src-base.c: tests/bpf-script-example.c tests/Build
 	$(call rule_mkdir)
diff --git a/tools/perf/tests/amd-ibs-via-core-pmu.c b/tools/perf/tests/amd-ibs-via-core-pmu.c
new file mode 100644
index 000000000000..6f6eb2d84fde
--- /dev/null
+++ b/tools/perf/tests/amd-ibs-via-core-pmu.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "linux/perf_event.h"
+#include "tests.h"
+#include "pmu.h"
+#include "pmus.h"
+#include "../perf-sys.h"
+#include "debug.h"
+
+#define NR_SUB_TESTS 5
+
+static struct sub_tests {
+	int type;
+	unsigned long config;
+	bool valid;
+} sub_tests[NR_SUB_TESTS] = {
+	{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, true },
+	{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, false },
+	{ PERF_TYPE_RAW, 0x076, true },
+	{ PERF_TYPE_RAW, 0x0C1, true },
+	{ PERF_TYPE_RAW, 0x012, false },
+};
+
+static int event_open(int type, unsigned long config)
+{
+	struct perf_event_attr attr;
+
+	memset(&attr, 0, sizeof(struct perf_event_attr));
+	attr.type = type;
+	attr.size = sizeof(struct perf_event_attr);
+	attr.config = config;
+	attr.disabled = 1;
+	attr.precise_ip = 1;
+	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+	attr.sample_period = 100000;
+
+	return sys_perf_event_open(&attr, -1, 0, -1, 0);
+}
+
+static int test__amd_ibs_via_core_pmu(struct test_suite *text __maybe_unused,
+				      int subtest __maybe_unused)
+{
+	struct perf_pmu *ibs_pmu;
+	int ret = TEST_OK;
+	int fd, i;
+
+	if (list_empty(&pmus))
+		perf_pmu__scan(NULL);
+
+	ibs_pmu = perf_pmu__find("ibs_op");
+	if (!ibs_pmu)
+		return TEST_SKIP;
+
+	for (i = 0; i < NR_SUB_TESTS; i++) {
+		fd = event_open(sub_tests[i].type, sub_tests[i].config);
+		pr_debug("type: 0x%x, config: 0x%lx, fd: %d  -  ", sub_tests[i].type,
+			 sub_tests[i].config, fd);
+		if ((sub_tests[i].valid && fd == -1) ||
+		    (!sub_tests[i].valid && fd > 0)) {
+			pr_debug("Fail\n");
+			ret = TEST_FAIL;
+		} else {
+			pr_debug("Pass\n");
+		}
+
+		if (fd > 0)
+			close(fd);
+	}
+
+	return ret;
+}
+
+DEFINE_SUITE("AMD IBS via core pmu", amd_ibs_via_core_pmu);
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index 35cc3807cc9e..1805a4fae762 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -119,6 +119,7 @@ static struct test_suite *generic_tests[] = {
 	&suite__sigtrap,
 	&suite__event_groups,
 	&suite__symbols,
+	&suite__amd_ibs_via_core_pmu,
 	NULL,
 };
 
diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h
index 9a0f3904e53d..65589d40638d 100644
--- a/tools/perf/tests/tests.h
+++ b/tools/perf/tests/tests.h
@@ -149,6 +149,7 @@ DECLARE_SUITE(dlfilter);
 DECLARE_SUITE(sigtrap);
 DECLARE_SUITE(event_groups);
 DECLARE_SUITE(symbols);
+DECLARE_SUITE(amd_ibs_via_core_pmu);
 
 /*
  * PowerPC and S390 do not support creation of instruction breakpoints using the
-- 
2.41.0

From 5c50dca9c7cd9866e6539d020badc568980da445 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Thu, 22 Jun 2023 19:55:44 +0200
Subject: [PATCH 05/12] fs-patches

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 block/Kconfig                     |    3 +
 block/blk-cgroup.c                |   78 +-
 block/blk-cgroup.h                |   15 +-
 block/blk-core.c                  |    3 -
 fs/btrfs/Kconfig                  |    1 +
 fs/btrfs/bio.c                    |  211 +-
 fs/btrfs/bio.h                    |   22 +-
 fs/btrfs/block-group.c            |   40 +-
 fs/btrfs/block-group.h            |   13 +-
 fs/btrfs/block-rsv.c              |   21 +-
 fs/btrfs/block-rsv.h              |    2 +-
 fs/btrfs/btrfs_inode.h            |   35 +-
 fs/btrfs/compression.c            |  299 +--
 fs/btrfs/compression.h            |   20 +-
 fs/btrfs/ctree.c                  |   91 +-
 fs/btrfs/ctree.h                  |   17 +-
 fs/btrfs/delalloc-space.c         |    2 +-
 fs/btrfs/delayed-ref.c            |   49 +-
 fs/btrfs/delayed-ref.h            |   22 +-
 fs/btrfs/disk-io.c                |  147 +-
 fs/btrfs/extent-tree.c            |   37 +-
 fs/btrfs/extent_io.c              |  550 ++--
 fs/btrfs/file-item.c              |   93 +-
 fs/btrfs/file-item.h              |    3 +-
 fs/btrfs/fs.h                     |   53 +-
 fs/btrfs/inode-item.c             |   15 +-
 fs/btrfs/inode.c                  |  375 ++-
 fs/btrfs/locking.c                |   25 +-
 fs/btrfs/locking.h                |    5 +-
 fs/btrfs/lru_cache.h              |    5 -
 fs/btrfs/lzo.c                    |   17 +-
 fs/btrfs/messages.c               |    2 +-
 fs/btrfs/messages.h               |    2 +-
 fs/btrfs/ordered-data.c           |  120 +-
 fs/btrfs/ordered-data.h           |   10 +-
 fs/btrfs/raid56.c                 |  162 +-
 fs/btrfs/raid56.h                 |   12 +-
 fs/btrfs/relocation.c             |    6 +-
 fs/btrfs/scrub.c                  | 4142 +++++++++--------------------
 fs/btrfs/space-info.c             |   32 +-
 fs/btrfs/space-info.h             |    1 +
 fs/btrfs/super.c                  |    3 +-
 fs/btrfs/sysfs.c                  |    5 +
 fs/btrfs/tests/extent-map-tests.c |    1 -
 fs/btrfs/transaction.c            |   28 +-
 fs/btrfs/tree-checker.c           |   14 +
 fs/btrfs/tree-log.c               |  171 +-
 fs/btrfs/volumes.c                |  591 ++--
 fs/btrfs/volumes.h                |   85 +-
 fs/btrfs/zlib.c                   |    2 -
 fs/btrfs/zoned.c                  |    4 +-
 fs/btrfs/zstd.c                   |    1 -
 fs/xfs/libxfs/xfs_alloc.c         |   30 +-
 fs/xfs/libxfs/xfs_alloc.h         |    2 +
 fs/xfs/scrub/repair.c             |    4 +-
 fs/xfs/xfs_extent_busy.c          |   34 +-
 fs/xfs/xfs_extent_busy.h          |    6 +-
 fs/xfs/xfs_extfree_item.c         |   37 +-
 fs/xfs/xfs_filestream.c           |    1 -
 fs/xfs/xfs_icache.c               |   38 +-
 fs/xfs/xfs_log_recover.c          |   23 +-
 fs/xfs/xfs_trans_ail.c            |    2 +-
 fs/xfs/xfs_trans_priv.h           |    1 +
 fs/xfs/xfs_xattr.c                |    4 +-
 include/linux/bio.h               |    5 +
 include/linux/blk_types.h         |   18 +-
 include/linux/crc32c.h            |    1 -
 include/linux/writeback.h         |    5 -
 include/trace/events/btrfs.h      |    2 +-
 lib/libcrc32c.c                   |    6 -
 tools/objtool/check.c             |    1 +
 71 files changed, 2999 insertions(+), 4884 deletions(-)

diff --git a/block/Kconfig b/block/Kconfig
index 941b2dca70db..69ccf7457ae1 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -41,6 +41,9 @@ config BLK_RQ_ALLOC_TIME
 config BLK_CGROUP_RWSTAT
 	bool
 
+config BLK_CGROUP_PUNT_BIO
+	bool
+
 config BLK_DEV_BSG_COMMON
 	tristate
 
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index dd6d1c0117b1..38ddf13d7a7c 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -58,7 +58,6 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */
 
 bool blkcg_debug_stats = false;
-static struct workqueue_struct *blkcg_punt_bio_wq;
 
 static DEFINE_RAW_SPINLOCK(blkg_stat_lock);
 
@@ -172,7 +171,9 @@ static void __blkg_release(struct rcu_head *rcu)
 	struct blkcg *blkcg = blkg->blkcg;
 	int cpu;
 
+#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
 	WARN_ON(!bio_list_empty(&blkg->async_bios));
+#endif
 	/*
 	 * Flush all the non-empty percpu lockless lists before releasing
 	 * us, given these stat belongs to us.
@@ -202,6 +203,9 @@ static void blkg_release(struct percpu_ref *ref)
 	call_rcu(&blkg->rcu_head, __blkg_release);
 }
 
+#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
+static struct workqueue_struct *blkcg_punt_bio_wq;
+
 static void blkg_async_bio_workfn(struct work_struct *work)
 {
 	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
@@ -212,10 +216,10 @@ static void blkg_async_bio_workfn(struct work_struct *work)
 	bool need_plug = false;
 
 	/* as long as there are pending bios, @blkg can't go away */
-	spin_lock_bh(&blkg->async_bio_lock);
+	spin_lock(&blkg->async_bio_lock);
 	bio_list_merge(&bios, &blkg->async_bios);
 	bio_list_init(&blkg->async_bios);
-	spin_unlock_bh(&blkg->async_bio_lock);
+	spin_unlock(&blkg->async_bio_lock);
 
 	/* start plug only when bio_list contains at least 2 bios */
 	if (bios.head && bios.head->bi_next) {
@@ -228,6 +232,40 @@ static void blkg_async_bio_workfn(struct work_struct *work)
 		blk_finish_plug(&plug);
 }
 
+/*
+ * When a shared kthread issues a bio for a cgroup, doing so synchronously can
+ * lead to priority inversions as the kthread can be trapped waiting for that
+ * cgroup.  Use this helper instead of submit_bio to punt the actual issuing to
+ * a dedicated per-blkcg work item to avoid such priority inversions.
+ */
+void blkcg_punt_bio_submit(struct bio *bio)
+{
+	struct blkcg_gq *blkg = bio->bi_blkg;
+
+	if (blkg->parent) {
+		spin_lock(&blkg->async_bio_lock);
+		bio_list_add(&blkg->async_bios, bio);
+		spin_unlock(&blkg->async_bio_lock);
+		queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
+	} else {
+		/* never bounce for the root cgroup */
+		submit_bio(bio);
+	}
+}
+EXPORT_SYMBOL_GPL(blkcg_punt_bio_submit);
+
+static int __init blkcg_punt_bio_init(void)
+{
+	blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
+					    WQ_MEM_RECLAIM | WQ_FREEZABLE |
+					    WQ_UNBOUND | WQ_SYSFS, 0);
+	if (!blkcg_punt_bio_wq)
+		return -ENOMEM;
+	return 0;
+}
+subsys_initcall(blkcg_punt_bio_init);
+#endif /* CONFIG_BLK_CGROUP_PUNT_BIO */
+
 /**
  * bio_blkcg_css - return the blkcg CSS associated with a bio
  * @bio: target bio
@@ -283,10 +321,12 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
 
 	blkg->q = disk->queue;
 	INIT_LIST_HEAD(&blkg->q_node);
+	blkg->blkcg = blkcg;
+#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
 	spin_lock_init(&blkg->async_bio_lock);
 	bio_list_init(&blkg->async_bios);
 	INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
-	blkg->blkcg = blkcg;
+#endif
 
 	u64_stats_init(&blkg->iostat.sync);
 	for_each_possible_cpu(cpu) {
@@ -1714,25 +1754,6 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
 
-bool __blkcg_punt_bio_submit(struct bio *bio)
-{
-	struct blkcg_gq *blkg = bio->bi_blkg;
-
-	/* consume the flag first */
-	bio->bi_opf &= ~REQ_CGROUP_PUNT;
-
-	/* never bounce for the root cgroup */
-	if (!blkg->parent)
-		return false;
-
-	spin_lock_bh(&blkg->async_bio_lock);
-	bio_list_add(&blkg->async_bios, bio);
-	spin_unlock_bh(&blkg->async_bio_lock);
-
-	queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
-	return true;
-}
-
 /*
  * Scale the accumulated delay based on how long it has been since we updated
  * the delay.  We only call this when we are adding delay, in case it's been a
@@ -2110,16 +2131,5 @@ bool blk_cgroup_congested(void)
 	return ret;
 }
 
-static int __init blkcg_init(void)
-{
-	blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
-					    WQ_MEM_RECLAIM | WQ_FREEZABLE |
-					    WQ_UNBOUND | WQ_SYSFS, 0);
-	if (!blkcg_punt_bio_wq)
-		return -ENOMEM;
-	return 0;
-}
-subsys_initcall(blkcg_init);
-
 module_param(blkcg_debug_stats, bool, 0644);
 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 9c5078755e5e..e98d2c1be354 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -72,9 +72,10 @@ struct blkcg_gq {
 	struct blkg_iostat_set		iostat;
 
 	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];
-
+#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
 	spinlock_t			async_bio_lock;
 	struct bio_list			async_bios;
+#endif
 	union {
 		struct work_struct	async_bio_work;
 		struct work_struct	free_work;
@@ -375,16 +376,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
 		if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css),	\
 					    (p_blkg)->q)))
 
-bool __blkcg_punt_bio_submit(struct bio *bio);
-
-static inline bool blkcg_punt_bio_submit(struct bio *bio)
-{
-	if (bio->bi_opf & REQ_CGROUP_PUNT)
-		return __blkcg_punt_bio_submit(bio);
-	else
-		return false;
-}
-
 static inline void blkcg_bio_issue_init(struct bio *bio)
 {
 	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
@@ -506,8 +497,6 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return
 static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
 static inline void blkg_get(struct blkcg_gq *blkg) { }
 static inline void blkg_put(struct blkcg_gq *blkg) { }
-
-static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
 static inline void blkcg_bio_issue_init(struct bio *bio) { }
 static inline void blk_cgroup_bio_start(struct bio *bio) { }
 static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; }
diff --git a/block/blk-core.c b/block/blk-core.c
index 42926e6cb83c..478978dcb2bd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -830,9 +830,6 @@ EXPORT_SYMBOL(submit_bio_noacct);
  */
 void submit_bio(struct bio *bio)
 {
-	if (blkcg_punt_bio_submit(bio))
-		return;
-
 	if (bio_op(bio) == REQ_OP_READ) {
 		task_io_account_read(bio->bi_iter.bi_size);
 		count_vm_events(PGPGIN, bio_sectors(bio));
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 37b6bab90c83..66fa9ab2c046 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -2,6 +2,7 @@
 
 config BTRFS_FS
 	tristate "Btrfs filesystem support"
+	select BLK_CGROUP_PUNT_BIO
 	select CRYPTO
 	select CRYPTO_CRC32C
 	select LIBCRC32C
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index ada899613486..35d34c731d72 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -31,11 +31,11 @@ struct btrfs_failed_bio {
  * Initialize a btrfs_bio structure.  This skips the embedded bio itself as it
  * is already initialized by the block layer.
  */
-void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
+void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
 		    btrfs_bio_end_io_t end_io, void *private)
 {
 	memset(bbio, 0, offsetof(struct btrfs_bio, bio));
-	bbio->inode = inode;
+	bbio->fs_info = fs_info;
 	bbio->end_io = end_io;
 	bbio->private = private;
 	atomic_set(&bbio->pending_ios, 1);
@@ -48,41 +48,58 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
  * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
  * a mempool.
  */
-struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
-			    struct btrfs_inode *inode,
-			    btrfs_bio_end_io_t end_io, void *private)
+struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+				  struct btrfs_fs_info *fs_info,
+				  btrfs_bio_end_io_t end_io, void *private)
 {
+	struct btrfs_bio *bbio;
 	struct bio *bio;
 
 	bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
-	btrfs_bio_init(btrfs_bio(bio), inode, end_io, private);
-	return bio;
+	bbio = btrfs_bio(bio);
+	btrfs_bio_init(bbio, fs_info, end_io, private);
+	return bbio;
 }
 
-static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
-				   struct bio *orig, u64 map_length,
-				   bool use_append)
+static blk_status_t btrfs_bio_extract_ordered_extent(struct btrfs_bio *bbio)
 {
-	struct btrfs_bio *orig_bbio = btrfs_bio(orig);
+	struct btrfs_ordered_extent *ordered;
+	int ret;
+
+	ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
+	if (WARN_ON_ONCE(!ordered))
+		return BLK_STS_IOERR;
+	ret = btrfs_extract_ordered_extent(bbio, ordered);
+	btrfs_put_ordered_extent(ordered);
+
+	return errno_to_blk_status(ret);
+}
+
+static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
+					 struct btrfs_bio *orig_bbio,
+					 u64 map_length, bool use_append)
+{
+	struct btrfs_bio *bbio;
 	struct bio *bio;
 
 	if (use_append) {
 		unsigned int nr_segs;
 
-		bio = bio_split_rw(orig, &fs_info->limits, &nr_segs,
+		bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs,
 				   &btrfs_clone_bioset, map_length);
 	} else {
-		bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS,
-				&btrfs_clone_bioset);
+		bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT,
+				GFP_NOFS, &btrfs_clone_bioset);
 	}
-	btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio);
-
-	btrfs_bio(bio)->file_offset = orig_bbio->file_offset;
-	if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED))
+	bbio = btrfs_bio(bio);
+	btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
+	bbio->inode = orig_bbio->inode;
+	bbio->file_offset = orig_bbio->file_offset;
+	if (!(orig_bbio->bio.bi_opf & REQ_BTRFS_ONE_ORDERED))
 		orig_bbio->file_offset += map_length;
 
 	atomic_inc(&orig_bbio->pending_ios);
-	return bio;
+	return bbio;
 }
 
 static void btrfs_orig_write_end_io(struct bio *bio);
@@ -164,7 +181,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
 			goto done;
 		}
 
-		btrfs_submit_bio(&repair_bbio->bio, mirror);
+		btrfs_submit_bio(repair_bbio, mirror);
 		return;
 	}
 
@@ -224,15 +241,16 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
 	repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
 				      &btrfs_repair_bioset);
 	repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
-	bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);
+	__bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);
 
 	repair_bbio = btrfs_bio(repair_bio);
-	btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio);
+	btrfs_bio_init(repair_bbio, fs_info, NULL, fbio);
+	repair_bbio->inode = failed_bbio->inode;
 	repair_bbio->file_offset = failed_bbio->file_offset + bio_offset;
 
 	mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
 	btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
-	btrfs_submit_bio(repair_bio, mirror);
+	btrfs_submit_bio(repair_bbio, mirror);
 	return fbio;
 }
 
@@ -246,6 +264,9 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
 	struct btrfs_failed_bio *fbio = NULL;
 	u32 offset = 0;
 
+	/* Read-repair requires the inode field to be set by the submitter. */
+	ASSERT(inode);
+
 	/*
 	 * Hand off repair bios to the repair code as there is no upper level
 	 * submitter for them.
@@ -306,17 +327,17 @@ static void btrfs_end_bio_work(struct work_struct *work)
 	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
 
 	/* Metadata reads are checked and repaired by the submitter. */
-	if (bbio->bio.bi_opf & REQ_META)
-		btrfs_orig_bbio_end_io(bbio);
-	else
+	if (bbio->inode && !(bbio->bio.bi_opf & REQ_META))
 		btrfs_check_read_bio(bbio, bbio->bio.bi_private);
+	else
+		btrfs_orig_bbio_end_io(bbio);
 }
 
 static void btrfs_simple_end_io(struct bio *bio)
 {
 	struct btrfs_bio *bbio = btrfs_bio(bio);
 	struct btrfs_device *dev = bio->bi_private;
-	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+	struct btrfs_fs_info *fs_info = bbio->fs_info;
 
 	btrfs_bio_counter_dec(fs_info);
 
@@ -340,7 +361,8 @@ static void btrfs_raid56_end_io(struct bio *bio)
 
 	btrfs_bio_counter_dec(bioc->fs_info);
 	bbio->mirror_num = bioc->mirror_num;
-	if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META))
+	if (bio_op(bio) == REQ_OP_READ && bbio->inode &&
+	    !(bbio->bio.bi_opf & REQ_META))
 		btrfs_check_read_bio(bbio, NULL);
 	else
 		btrfs_orig_bbio_end_io(bbio);
@@ -418,7 +440,11 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
 		dev->devid, bio->bi_iter.bi_size);
 
 	btrfsic_check_bio(bio);
-	submit_bio(bio);
+
+	if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
+		blkcg_punt_bio_submit(bio);
+	else
+		submit_bio(bio);
 }
 
 static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
@@ -534,10 +560,10 @@ static void run_one_async_done(struct btrfs_work *work)
 
 	/*
 	 * All of the bios that pass through here are from async helpers.
-	 * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
-	 * This changes nothing when cgroups aren't in use.
+	 * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
+	 * context.  This changes nothing when cgroups aren't in use.
 	 */
-	bio->bi_opf |= REQ_CGROUP_PUNT;
+	bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
 	__btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
 }
 
@@ -562,7 +588,7 @@ static bool should_async_write(struct btrfs_bio *bbio)
 	 * in order.
 	 */
 	if (bbio->bio.bi_opf & REQ_META) {
-		struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+		struct btrfs_fs_info *fs_info = bbio->fs_info;
 
 		if (btrfs_is_zoned(fs_info))
 			return false;
@@ -582,7 +608,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
 				struct btrfs_io_context *bioc,
 				struct btrfs_io_stripe *smap, int mirror_num)
 {
-	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+	struct btrfs_fs_info *fs_info = bbio->fs_info;
 	struct async_submit_bio *async;
 
 	async = kmalloc(sizeof(*async), GFP_NOFS);
@@ -603,12 +629,12 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
 	return true;
 }
 
-static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
+static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 {
-	struct btrfs_bio *bbio = btrfs_bio(bio);
 	struct btrfs_inode *inode = bbio->inode;
-	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct btrfs_fs_info *fs_info = bbio->fs_info;
 	struct btrfs_bio *orig_bbio = bbio;
+	struct bio *bio = &bbio->bio;
 	u64 logical = bio->bi_iter.bi_sector << 9;
 	u64 length = bio->bi_iter.bi_size;
 	u64 map_length = length;
@@ -631,15 +657,15 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
 		map_length = min(map_length, fs_info->max_zone_append_size);
 
 	if (map_length < length) {
-		bio = btrfs_split_bio(fs_info, bio, map_length, use_append);
-		bbio = btrfs_bio(bio);
+		bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append);
+		bio = &bbio->bio;
 	}
 
 	/*
 	 * Save the iter for the end_io handler and preload the checksums for
 	 * data reads.
 	 */
-	if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) {
+	if (bio_op(bio) == REQ_OP_READ && inode && !(bio->bi_opf & REQ_META)) {
 		bbio->saved_iter = bio->bi_iter;
 		ret = btrfs_lookup_bio_sums(bbio);
 		if (ret)
@@ -650,7 +676,7 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
 		if (use_append) {
 			bio->bi_opf &= ~REQ_OP_WRITE;
 			bio->bi_opf |= REQ_OP_ZONE_APPEND;
-			ret = btrfs_extract_ordered_extent(btrfs_bio(bio));
+			ret = btrfs_bio_extract_ordered_extent(bbio);
 			if (ret)
 				goto fail_put_bio;
 		}
@@ -659,7 +685,7 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
 		 * Csum items for reloc roots have already been cloned at this
 		 * point, so they are handled as part of the no-checksum case.
 		 */
-		if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
+		if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
 		    !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
 		    !btrfs_is_data_reloc_root(inode->root)) {
 			if (should_async_write(bbio) &&
@@ -686,9 +712,12 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
 	return true;
 }
 
-void btrfs_submit_bio(struct bio *bio, int mirror_num)
+void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
 {
-	while (!btrfs_submit_chunk(bio, mirror_num))
+	/* If bbio->inode is not populated, its file_offset must be 0. */
+	ASSERT(bbio->inode || bbio->file_offset == 0);
+
+	while (!btrfs_submit_chunk(bbio, mirror_num))
 		;
 }
 
@@ -706,12 +735,9 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 			    u64 length, u64 logical, struct page *page,
 			    unsigned int pg_offset, int mirror_num)
 {
-	struct btrfs_device *dev;
+	struct btrfs_io_stripe smap = { 0 };
 	struct bio_vec bvec;
 	struct bio bio;
-	u64 map_length = 0;
-	u64 sector;
-	struct btrfs_io_context *bioc = NULL;
 	int ret = 0;
 
 	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
@@ -720,68 +746,38 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 	if (btrfs_repair_one_zone(fs_info, logical))
 		return 0;
 
-	map_length = length;
-
 	/*
 	 * Avoid races with device replace and make sure our bioc has devices
 	 * associated to its stripes that don't go away while we are doing the
 	 * read repair operation.
 	 */
 	btrfs_bio_counter_inc_blocked(fs_info);
-	if (btrfs_is_parity_mirror(fs_info, logical, length)) {
-		/*
-		 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
-		 * to update all raid stripes, but here we just want to correct
-		 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
-		 * stripe's dev and sector.
-		 */
-		ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
-				      &map_length, &bioc, 0);
-		if (ret)
-			goto out_counter_dec;
-		ASSERT(bioc->mirror_num == 1);
-	} else {
-		ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
-				      &map_length, &bioc, mirror_num);
-		if (ret)
-			goto out_counter_dec;
-		/*
-		 * This happens when dev-replace is also running, and the
-		 * mirror_num indicates the dev-replace target.
-		 *
-		 * In this case, we don't need to do anything, as the read
-		 * error just means the replace progress hasn't reached our
-		 * read range, and later replace routine would handle it well.
-		 */
-		if (mirror_num != bioc->mirror_num)
-			goto out_counter_dec;
-	}
-
-	sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
-	dev = bioc->stripes[bioc->mirror_num - 1].dev;
-	btrfs_put_bioc(bioc);
+	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
+	if (ret < 0)
+		goto out_counter_dec;
 
-	if (!dev || !dev->bdev ||
-	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
+	if (!smap.dev->bdev ||
+	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) {
 		ret = -EIO;
 		goto out_counter_dec;
 	}
 
-	bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
-	bio.bi_iter.bi_sector = sector;
+	bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
+	bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
 	__bio_add_page(&bio, page, length, pg_offset);
 
 	btrfsic_check_bio(&bio);
 	ret = submit_bio_wait(&bio);
 	if (ret) {
 		/* try to remap that extent elsewhere? */
-		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+		btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
 		goto out_bio_uninit;
 	}
 
 	btrfs_info_rl_in_rcu(fs_info,
 		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
-			     ino, start, btrfs_dev_name(dev), sector);
+			     ino, start, btrfs_dev_name(smap.dev),
+			     smap.physical >> SECTOR_SHIFT);
 	ret = 0;
 
 out_bio_uninit:
@@ -791,6 +787,45 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 	return ret;
 }
 
+/*
+ * Submit a btrfs_bio based repair write.
+ *
+ * If @dev_replace is true, the write would be submitted to dev-replace target.
+ */
+void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
+{
+	struct btrfs_fs_info *fs_info = bbio->fs_info;
+	u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
+	u64 length = bbio->bio.bi_iter.bi_size;
+	struct btrfs_io_stripe smap = { 0 };
+	int ret;
+
+	ASSERT(fs_info);
+	ASSERT(mirror_num > 0);
+	ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
+	ASSERT(!bbio->inode);
+
+	btrfs_bio_counter_inc_blocked(fs_info);
+	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
+	if (ret < 0)
+		goto fail;
+
+	if (dev_replace) {
+		if (btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE && btrfs_is_zoned(fs_info)) {
+			bbio->bio.bi_opf &= ~REQ_OP_WRITE;
+			bbio->bio.bi_opf |= REQ_OP_ZONE_APPEND;
+		}
+		ASSERT(smap.dev == fs_info->dev_replace.srcdev);
+		smap.dev = fs_info->dev_replace.tgtdev;
+	}
+	__btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
+	return;
+
+fail:
+	btrfs_bio_counter_dec(fs_info);
+	btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
+}
+
 int __init btrfs_bioset_init(void)
 {
 	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index 873ff85817f0..a8eca3a65673 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -30,7 +30,10 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
  * passed to btrfs_submit_bio for mapping to the physical devices.
  */
 struct btrfs_bio {
-	/* Inode and offset into it that this I/O operates on. */
+	/*
+	 * Inode and offset into it that this I/O operates on.
+	 * Only set for data I/O.
+	 */
 	struct btrfs_inode *inode;
 	u64 file_offset;
 
@@ -58,6 +61,9 @@ struct btrfs_bio {
 	atomic_t pending_ios;
 	struct work_struct end_io_work;
 
+	/* File system that this I/O operates on. */
+	struct btrfs_fs_info *fs_info;
+
 	/*
 	 * This member must come last, bio_alloc_bioset will allocate enough
 	 * bytes for entire btrfs_bio but relies on bio being last.
@@ -73,11 +79,11 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
 int __init btrfs_bioset_init(void);
 void __cold btrfs_bioset_exit(void);
 
-void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
+void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
 		    btrfs_bio_end_io_t end_io, void *private);
-struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
-			    struct btrfs_inode *inode,
-			    btrfs_bio_end_io_t end_io, void *private);
+struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+				  struct btrfs_fs_info *fs_info,
+				  btrfs_bio_end_io_t end_io, void *private);
 
 static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
 {
@@ -88,7 +94,11 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
 /* Bio only refers to one ordered extent. */
 #define REQ_BTRFS_ONE_ORDERED			REQ_DRV
 
-void btrfs_submit_bio(struct bio *bio, int mirror_num);
+/* Submit using blkcg_punt_bio_submit. */
+#define REQ_BTRFS_CGROUP_PUNT			REQ_FS_PRIVATE
+
+void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num);
+void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace);
 int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 			    u64 length, u64 logical, struct page *page,
 			    unsigned int pg_offset, int mirror_num);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 58ce5d44ce4d..590b03560265 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -160,15 +160,6 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
 			btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
 						  cache);
 
-		/*
-		 * If not empty, someone is still holding mutex of
-		 * full_stripe_lock, which can only be released by caller.
-		 * And it will definitely cause use-after-free when caller
-		 * tries to release full stripe lock.
-		 *
-		 * No better way to resolve, but only to warn.
-		 */
-		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
 		kfree(cache->free_space_ctl);
 		kfree(cache->physical_map);
 		kfree(cache);
@@ -1977,12 +1968,12 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
 
 	map = em->map_lookup;
 	data_stripe_length = em->orig_block_len;
-	io_stripe_size = map->stripe_len;
+	io_stripe_size = BTRFS_STRIPE_LEN;
 	chunk_start = em->start;
 
 	/* For RAID5/6 adjust to a full IO stripe length */
 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
-		io_stripe_size = map->stripe_len * nr_data_stripes(map);
+		io_stripe_size = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT;
 
 	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
 	if (!buf) {
@@ -1992,28 +1983,28 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
 
 	for (i = 0; i < map->num_stripes; i++) {
 		bool already_inserted = false;
-		u64 stripe_nr;
-		u64 offset;
+		u32 stripe_nr;
+		u32 offset;
 		int j;
 
 		if (!in_range(physical, map->stripes[i].physical,
 			      data_stripe_length))
 			continue;
 
-		stripe_nr = physical - map->stripes[i].physical;
-		stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);
+		stripe_nr = (physical - map->stripes[i].physical) >>
+			    BTRFS_STRIPE_LEN_SHIFT;
+		offset = (physical - map->stripes[i].physical) &
+			 BTRFS_STRIPE_LEN_MASK;
 
 		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
-				 BTRFS_BLOCK_GROUP_RAID10)) {
-			stripe_nr = stripe_nr * map->num_stripes + i;
-			stripe_nr = div_u64(stripe_nr, map->sub_stripes);
-		}
+				 BTRFS_BLOCK_GROUP_RAID10))
+			stripe_nr = div_u64(stripe_nr * map->num_stripes + i,
+					    map->sub_stripes);
 		/*
 		 * The remaining case would be for RAID56, multiply by
 		 * nr_data_stripes().  Alternatively, just use rmap_len below
 		 * instead of map->stripe_len
 		 */
-
 		bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
 
 		/* Ensure we don't add duplicate addresses */
@@ -2124,8 +2115,6 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
 	btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
 	atomic_set(&cache->frozen, 0);
 	mutex_init(&cache->free_space_lock);
-	cache->full_stripe_locks_root.root = RB_ROOT;
-	mutex_init(&cache->full_stripe_locks_root.lock);
 
 	return cache;
 }
@@ -2672,7 +2661,7 @@ static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
 }
 
 struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
-						 u64 bytes_used, u64 type,
+						 u64 type,
 						 u64 chunk_offset, u64 size)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -2687,7 +2676,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
 
 	cache->length = size;
 	set_free_space_tree_thresholds(cache);
-	cache->used = bytes_used;
 	cache->flags = type;
 	cache->cached = BTRFS_CACHE_FINISHED;
 	cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
@@ -2738,9 +2726,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
 
 #ifdef CONFIG_BTRFS_DEBUG
 	if (btrfs_should_fragment_free_space(cache)) {
-		u64 new_bytes_used = size - bytes_used;
-
-		cache->space_info->bytes_used += new_bytes_used >> 1;
+		cache->space_info->bytes_used += size >> 1;
 		fragment_free_space(cache);
 	}
 #endif
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 6e4a0b429ac3..cc0e4b37db2d 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -91,14 +91,6 @@ struct btrfs_caching_control {
 /* Once caching_thread() finds this much free space, it will wake up waiters. */
 #define CACHING_CTL_WAKE_UP SZ_2M
 
-/*
- * Tree to record all locked full stripes of a RAID5/6 block group
- */
-struct btrfs_full_stripe_locks_tree {
-	struct rb_root root;
-	struct mutex lock;
-};
-
 struct btrfs_block_group {
 	struct btrfs_fs_info *fs_info;
 	struct inode *inode;
@@ -229,9 +221,6 @@ struct btrfs_block_group {
 	 */
 	int swap_extents;
 
-	/* Record locked full stripes for RAID5/6 block group */
-	struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
-
 	/*
 	 * Allocation offset for the block group to implement sequential
 	 * allocation. This is used only on a zoned filesystem.
@@ -302,7 +291,7 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
 void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
 int btrfs_read_block_groups(struct btrfs_fs_info *info);
 struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
-						 u64 bytes_used, u64 type,
+						 u64 type,
 						 u64 chunk_offset, u64 size);
 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
 int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 4bb4a4875872..ac18c43fadad 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -233,9 +233,6 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent)
 	u64 num_bytes = 0;
 	int ret = -ENOSPC;
 
-	if (!block_rsv)
-		return 0;
-
 	spin_lock(&block_rsv->lock);
 	num_bytes = mult_perc(block_rsv->size, min_percent);
 	if (block_rsv->reserved >= num_bytes)
@@ -246,17 +243,15 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent)
 }
 
 int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
-			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+			   struct btrfs_block_rsv *block_rsv, u64 num_bytes,
 			   enum btrfs_reserve_flush_enum flush)
 {
-	u64 num_bytes = 0;
 	int ret = -ENOSPC;
 
 	if (!block_rsv)
 		return 0;
 
 	spin_lock(&block_rsv->lock);
-	num_bytes = min_reserved;
 	if (block_rsv->reserved >= num_bytes)
 		ret = 0;
 	else
@@ -356,17 +351,19 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
 
 	/*
 	 * But we also want to reserve enough space so we can do the fallback
-	 * global reserve for an unlink, which is an additional 5 items (see the
-	 * comment in __unlink_start_trans for what we're modifying.)
+	 * global reserve for an unlink, which is an additional
+	 * BTRFS_UNLINK_METADATA_UNITS items.
 	 *
 	 * But we also need space for the delayed ref updates from the unlink,
-	 * so its 10, 5 for the actual operation, and 5 for the delayed ref
-	 * updates.
+	 * so add BTRFS_UNLINK_METADATA_UNITS units for delayed refs, one for
+	 * each unlink metadata item.
 	 */
-	min_items += 10;
+	min_items += BTRFS_UNLINK_METADATA_UNITS;
 
 	num_bytes = max_t(u64, num_bytes,
-			  btrfs_calc_insert_metadata_size(fs_info, min_items));
+			  btrfs_calc_insert_metadata_size(fs_info, min_items) +
+			  btrfs_calc_delayed_ref_bytes(fs_info,
+					       BTRFS_UNLINK_METADATA_UNITS));
 
 	spin_lock(&sinfo->lock);
 	spin_lock(&block_rsv->lock);
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 4cc41c9aaa82..6dc781709aca 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -65,7 +65,7 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
 			enum btrfs_reserve_flush_enum flush);
 int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent);
 int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
-			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+			   struct btrfs_block_rsv *block_rsv, u64 num_bytes,
 			   enum btrfs_reserve_flush_enum flush);
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9dc21622806e..ec2ae4406c16 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -142,11 +142,22 @@ struct btrfs_inode {
 	/* a local copy of root's last_log_commit */
 	int last_log_commit;
 
-	/*
-	 * Total number of bytes pending delalloc, used by stat to calculate the
-	 * real block usage of the file. This is used only for files.
-	 */
-	u64 delalloc_bytes;
+	union {
+		/*
+		 * Total number of bytes pending delalloc, used by stat to
+		 * calculate the real block usage of the file. This is used
+		 * only for files.
+		 */
+		u64 delalloc_bytes;
+		/*
+		 * The lowest possible index of the next dir index key which
+		 * points to an inode that needs to be logged.
+		 * This is used only for directories.
+		 * Use the helpers btrfs_get_first_dir_index_to_log() and
+		 * btrfs_set_first_dir_index_to_log() to access this field.
+		 */
+		u64 first_dir_index_to_log;
+	};
 
 	union {
 		/*
@@ -247,6 +258,17 @@ struct btrfs_inode {
 	struct inode vfs_inode;
 };
 
+static inline u64 btrfs_get_first_dir_index_to_log(const struct btrfs_inode *inode)
+{
+	return READ_ONCE(inode->first_dir_index_to_log);
+}
+
+static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode,
+						    u64 index)
+{
+	WRITE_ONCE(inode->first_dir_index_to_log, index);
+}
+
 static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
 {
 	return container_of(inode, struct btrfs_inode, vfs_inode);
@@ -407,7 +429,8 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags,
 
 int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
 			    u32 pgoff, u8 *csum, const u8 * const csum_expected);
-blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio);
+int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
+				 struct btrfs_ordered_extent *ordered);
 bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
 			u32 bio_offset, struct bio_vec *bv);
 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index f42f31f22d13..2d0493f0a184 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -37,6 +37,8 @@
 #include "file-item.h"
 #include "super.h"
 
+struct bio_set btrfs_compressed_bioset;
+
 static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
 
 const char* btrfs_compress_type2str(enum btrfs_compression_type type)
@@ -54,6 +56,25 @@ const char* btrfs_compress_type2str(enum btrfs_compression_type type)
 	return NULL;
 }
 
+static inline struct compressed_bio *to_compressed_bio(struct btrfs_bio *bbio)
+{
+	return container_of(bbio, struct compressed_bio, bbio);
+}
+
+static struct compressed_bio *alloc_compressed_bio(struct btrfs_inode *inode,
+						   u64 start, blk_opf_t op,
+						   btrfs_bio_end_io_t end_io)
+{
+	struct btrfs_bio *bbio;
+
+	bbio = btrfs_bio(bio_alloc_bioset(NULL, BTRFS_MAX_COMPRESSED_PAGES, op,
+					  GFP_NOFS, &btrfs_compressed_bioset));
+	btrfs_bio_init(bbio, inode->root->fs_info, end_io, NULL);
+	bbio->inode = inode;
+	bbio->file_offset = start;
+	return to_compressed_bio(bbio);
+}
+
 bool btrfs_compress_is_valid_type(const char *str, size_t len)
 {
 	int i;
@@ -139,32 +160,25 @@ static int compression_decompress(int type, struct list_head *ws,
 	}
 }
 
+static void btrfs_free_compressed_pages(struct compressed_bio *cb)
+{
+	for (unsigned int i = 0; i < cb->nr_pages; i++)
+		put_page(cb->compressed_pages[i]);
+	kfree(cb->compressed_pages);
+}
+
 static int btrfs_decompress_bio(struct compressed_bio *cb);
 
 static void end_compressed_bio_read(struct btrfs_bio *bbio)
 {
-	struct compressed_bio *cb = bbio->private;
-	unsigned int index;
-	struct page *page;
+	struct compressed_bio *cb = to_compressed_bio(bbio);
+	blk_status_t status = bbio->bio.bi_status;
 
-	if (bbio->bio.bi_status)
-		cb->status = bbio->bio.bi_status;
-	else
-		cb->status = errno_to_blk_status(btrfs_decompress_bio(cb));
-
-	/* Release the compressed pages */
-	for (index = 0; index < cb->nr_pages; index++) {
-		page = cb->compressed_pages[index];
-		page->mapping = NULL;
-		put_page(page);
-	}
-
-	/* Do io completion on the original bio */
-	btrfs_bio_end_io(btrfs_bio(cb->orig_bio), cb->status);
+	if (!status)
+		status = errno_to_blk_status(btrfs_decompress_bio(cb));
 
-	/* Finally free the cb struct */
-	kfree(cb->compressed_pages);
-	kfree(cb);
+	btrfs_free_compressed_pages(cb);
+	btrfs_bio_end_io(cb->orig_bbio, status);
 	bio_put(&bbio->bio);
 }
 
@@ -172,14 +186,14 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio)
  * Clear the writeback bits on all of the file
  * pages for a compressed write
  */
-static noinline void end_compressed_writeback(struct inode *inode,
-					      const struct compressed_bio *cb)
+static noinline void end_compressed_writeback(const struct compressed_bio *cb)
 {
+	struct inode *inode = &cb->bbio.inode->vfs_inode;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	unsigned long index = cb->start >> PAGE_SHIFT;
 	unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
 	struct folio_batch fbatch;
-	const int errno = blk_status_to_errno(cb->status);
+	const int errno = blk_status_to_errno(cb->bbio.bio.bi_status);
 	int i;
 	int ret;
 
@@ -207,45 +221,25 @@ static noinline void end_compressed_writeback(struct inode *inode,
 	/* the inode may be gone now */
 }
 
-static void finish_compressed_bio_write(struct compressed_bio *cb)
+static void btrfs_finish_compressed_write_work(struct work_struct *work)
 {
-	struct inode *inode = cb->inode;
-	unsigned int index;
+	struct compressed_bio *cb =
+		container_of(work, struct compressed_bio, write_end_work);
 
 	/*
 	 * Ok, we're the last bio for this extent, step one is to call back
 	 * into the FS and do all the end_io operations.
 	 */
-	btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
+	btrfs_writepage_endio_finish_ordered(cb->bbio.inode, NULL,
 			cb->start, cb->start + cb->len - 1,
-			cb->status == BLK_STS_OK);
+			cb->bbio.bio.bi_status == BLK_STS_OK);
 
 	if (cb->writeback)
-		end_compressed_writeback(inode, cb);
+		end_compressed_writeback(cb);
 	/* Note, our inode could be gone now */
 
-	/*
-	 * Release the compressed pages, these came from alloc_page and
-	 * are not attached to the inode at all
-	 */
-	for (index = 0; index < cb->nr_pages; index++) {
-		struct page *page = cb->compressed_pages[index];
-
-		page->mapping = NULL;
-		put_page(page);
-	}
-
-	/* Finally free the cb struct */
-	kfree(cb->compressed_pages);
-	kfree(cb);
-}
-
-static void btrfs_finish_compressed_write_work(struct work_struct *work)
-{
-	struct compressed_bio *cb =
-		container_of(work, struct compressed_bio, write_end_work);
-
-	finish_compressed_bio_write(cb);
+	btrfs_free_compressed_pages(cb);
+	bio_put(&cb->bbio.bio);
 }
 
 /*
@@ -257,13 +251,25 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
  */
 static void end_compressed_bio_write(struct btrfs_bio *bbio)
 {
-	struct compressed_bio *cb = bbio->private;
-	struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+	struct compressed_bio *cb = to_compressed_bio(bbio);
+	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
 
-	cb->status = bbio->bio.bi_status;
 	queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
+}
 
-	bio_put(&bbio->bio);
+static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb)
+{
+	struct bio *bio = &cb->bbio.bio;
+	u32 offset = 0;
+
+	while (offset < cb->compressed_len) {
+		u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE);
+
+		/* Maximum compressed extent is smaller than bio size limit. */
+		__bio_add_page(bio, cb->compressed_pages[offset >> PAGE_SHIFT],
+			       len, 0);
+		offset += len;
+	}
 }
 
 /*
@@ -275,28 +281,24 @@ static void end_compressed_bio_write(struct btrfs_bio *bbio)
  * This also checksums the file bytes and gets things ready for
  * the end io hooks.
  */
-blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
+void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 				 unsigned int len, u64 disk_start,
 				 unsigned int compressed_len,
 				 struct page **compressed_pages,
 				 unsigned int nr_pages,
 				 blk_opf_t write_flags,
-				 struct cgroup_subsys_state *blkcg_css,
 				 bool writeback)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-	struct bio *bio = NULL;
 	struct compressed_bio *cb;
-	u64 cur_disk_bytenr = disk_start;
-	blk_status_t ret = BLK_STS_OK;
 
 	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
 	       IS_ALIGNED(len, fs_info->sectorsize));
-	cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
-	if (!cb)
-		return BLK_STS_RESOURCE;
-	cb->status = BLK_STS_OK;
-	cb->inode = &inode->vfs_inode;
+
+	write_flags |= REQ_BTRFS_ONE_ORDERED;
+
+	cb = alloc_compressed_bio(inode, start, REQ_OP_WRITE | write_flags,
+				  end_compressed_bio_write);
 	cb->start = start;
 	cb->len = len;
 	cb->compressed_pages = compressed_pages;
@@ -304,56 +306,10 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 	cb->writeback = writeback;
 	INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
 	cb->nr_pages = nr_pages;
+	cb->bbio.bio.bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
+	btrfs_add_compressed_bio_pages(cb);
 
-	if (blkcg_css) {
-		kthread_associate_blkcg(blkcg_css);
-		write_flags |= REQ_CGROUP_PUNT;
-	}
-
-	write_flags |= REQ_BTRFS_ONE_ORDERED;
-	bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_WRITE | write_flags,
-			      BTRFS_I(cb->inode), end_compressed_bio_write, cb);
-	bio->bi_iter.bi_sector = cur_disk_bytenr >> SECTOR_SHIFT;
-	btrfs_bio(bio)->file_offset = start;
-
-	while (cur_disk_bytenr < disk_start + compressed_len) {
-		u64 offset = cur_disk_bytenr - disk_start;
-		unsigned int index = offset >> PAGE_SHIFT;
-		unsigned int real_size;
-		unsigned int added;
-		struct page *page = compressed_pages[index];
-
-		/*
-		 * We have various limits on the real read size:
-		 * - page boundary
-		 * - compressed length boundary
-		 */
-		real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset));
-		real_size = min_t(u64, real_size, compressed_len - offset);
-		ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
-
-		added = bio_add_page(bio, page, real_size, offset_in_page(offset));
-		/*
-		 * Maximum compressed extent is smaller than bio size limit,
-		 * thus bio_add_page() should always success.
-		 */
-		ASSERT(added == real_size);
-		cur_disk_bytenr += added;
-	}
-
-	/* Finished the range. */
-	ASSERT(bio->bi_iter.bi_size);
-	btrfs_submit_bio(bio, 0);
-	if (blkcg_css)
-		kthread_associate_blkcg(NULL);
-	return ret;
-}
-
-static u64 bio_end_offset(struct bio *bio)
-{
-	struct bio_vec *last = bio_last_bvec_all(bio);
-
-	return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
+	btrfs_submit_bio(&cb->bbio, 0);
 }
 
 /*
@@ -374,7 +330,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	unsigned long end_index;
-	u64 cur = bio_end_offset(cb->orig_bio);
+	struct bio *orig_bio = &cb->orig_bbio->bio;
+	u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size;
 	u64 isize = i_size_read(inode);
 	int ret;
 	struct page *page;
@@ -464,7 +421,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 		 */
 		if (!em || cur < em->start ||
 		    (cur + fs_info->sectorsize > extent_map_end(em)) ||
-		    (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
+		    (em->block_start >> 9) != orig_bio->bi_iter.bi_sector) {
 			free_extent_map(em);
 			unlock_extent(tree, cur, page_end, NULL);
 			unlock_page(page);
@@ -484,7 +441,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 		}
 
 		add_size = min(em->start + em->len, page_end + 1) - cur;
-		ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
+		ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur));
 		if (ret != add_size) {
 			unlock_extent(tree, cur, page_end, NULL);
 			unlock_page(page);
@@ -515,17 +472,14 @@ static noinline int add_ra_bio_pages(struct inode *inode,
  * After the compressed pages are read, we copy the bytes into the
  * bio we were passed and then call the bio end_io calls
  */
-void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
-				  int mirror_num)
+void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	struct extent_map_tree *em_tree;
+	struct btrfs_inode *inode = bbio->inode;
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct extent_map_tree *em_tree = &inode->extent_tree;
 	struct compressed_bio *cb;
 	unsigned int compressed_len;
-	struct bio *comp_bio;
-	const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
-	u64 cur_disk_byte = disk_bytenr;
-	u64 file_offset;
+	u64 file_offset = bbio->file_offset;
 	u64 em_len;
 	u64 em_start;
 	struct extent_map *em;
@@ -533,12 +487,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	int memstall = 0;
 	blk_status_t ret;
 	int ret2;
-	int i;
-
-	em_tree = &BTRFS_I(inode)->extent_tree;
-
-	file_offset = bio_first_bvec_all(bio)->bv_offset +
-		      page_offset(bio_first_page_all(bio));
 
 	/* we need the actual starting offset of this extent in the file */
 	read_lock(&em_tree->lock);
@@ -551,102 +499,54 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
 	compressed_len = em->block_len;
-	cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
-	if (!cb) {
-		ret = BLK_STS_RESOURCE;
-		goto out;
-	}
 
-	cb->status = BLK_STS_OK;
-	cb->inode = inode;
+	cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ,
+				  end_compressed_bio_read);
 
 	cb->start = em->orig_start;
 	em_len = em->len;
 	em_start = em->start;
 
-	cb->len = bio->bi_iter.bi_size;
+	cb->len = bbio->bio.bi_iter.bi_size;
 	cb->compressed_len = compressed_len;
 	cb->compress_type = em->compress_type;
-	cb->orig_bio = bio;
+	cb->orig_bbio = bbio;
 
 	free_extent_map(em);
-	em = NULL;
 
 	cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
 	cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS);
 	if (!cb->compressed_pages) {
 		ret = BLK_STS_RESOURCE;
-		goto fail;
+		goto out_free_bio;
 	}
 
 	ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages);
 	if (ret2) {
 		ret = BLK_STS_RESOURCE;
-		goto fail;
+		goto out_free_compressed_pages;
 	}
 
-	add_ra_bio_pages(inode, em_start + em_len, cb, &memstall, &pflags);
+	add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall,
+			 &pflags);
 
 	/* include any pages we added in add_ra-bio_pages */
-	cb->len = bio->bi_iter.bi_size;
-
-	comp_bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, BTRFS_I(cb->inode),
-				   end_compressed_bio_read, cb);
-	comp_bio->bi_iter.bi_sector = (cur_disk_byte >> SECTOR_SHIFT);
-
-	while (cur_disk_byte < disk_bytenr + compressed_len) {
-		u64 offset = cur_disk_byte - disk_bytenr;
-		unsigned int index = offset >> PAGE_SHIFT;
-		unsigned int real_size;
-		unsigned int added;
-		struct page *page = cb->compressed_pages[index];
-
-		/*
-		 * We have various limit on the real read size:
-		 * - page boundary
-		 * - compressed length boundary
-		 */
-		real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset));
-		real_size = min_t(u64, real_size, compressed_len - offset);
-		ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
-
-		added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset));
-		/*
-		 * Maximum compressed extent is smaller than bio size limit,
-		 * thus bio_add_page() should always success.
-		 */
-		ASSERT(added == real_size);
-		cur_disk_byte += added;
-	}
+	cb->len = bbio->bio.bi_iter.bi_size;
+	cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector;
+	btrfs_add_compressed_bio_pages(cb);
 
 	if (memstall)
 		psi_memstall_leave(&pflags);
 
-	/*
-	 * Stash the initial offset of this chunk, as there is no direct
-	 * correlation between compressed pages and the original file offset.
-	 * The field is only used for printing error messages anyway.
-	 */
-	btrfs_bio(comp_bio)->file_offset = file_offset;
-
-	ASSERT(comp_bio->bi_iter.bi_size);
-	btrfs_submit_bio(comp_bio, mirror_num);
+	btrfs_submit_bio(&cb->bbio, mirror_num);
 	return;
 
-fail:
-	if (cb->compressed_pages) {
-		for (i = 0; i < cb->nr_pages; i++) {
-			if (cb->compressed_pages[i])
-				__free_page(cb->compressed_pages[i]);
-		}
-	}
-
+out_free_compressed_pages:
 	kfree(cb->compressed_pages);
-	kfree(cb);
+out_free_bio:
+	bio_put(&cb->bbio.bio);
 out:
-	free_extent_map(em);
-	btrfs_bio_end_io(btrfs_bio(bio), ret);
-	return;
+	btrfs_bio_end_io(bbio, ret);
 }
 
 /*
@@ -1038,6 +938,8 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
 	ret = compression_decompress_bio(workspace, cb);
 	put_workspace(type, workspace);
 
+	if (!ret)
+		zero_fill_bio(&cb->orig_bbio->bio);
 	return ret;
 }
 
@@ -1062,6 +964,10 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
 
 int __init btrfs_init_compress(void)
 {
+	if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE,
+			offsetof(struct compressed_bio, bbio.bio),
+			BIOSET_NEED_BVECS))
+		return -ENOMEM;
 	btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
 	btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
 	btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
@@ -1075,6 +981,7 @@ void __cold btrfs_exit_compress(void)
 	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
 	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
 	zstd_cleanup_workspace_manager();
+	bioset_exit(&btrfs_compressed_bioset);
 }
 
 /*
@@ -1110,7 +1017,7 @@ void __cold btrfs_exit_compress(void)
 int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
 			      struct compressed_bio *cb, u32 decompressed)
 {
-	struct bio *orig_bio = cb->orig_bio;
+	struct bio *orig_bio = &cb->orig_bbio->bio;
 	/* Offset inside the full decompressed extent */
 	u32 cur_offset;
 
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index a5e3377db9ad..19ab2abeddc0 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -6,8 +6,8 @@
 #ifndef BTRFS_COMPRESSION_H
 #define BTRFS_COMPRESSION_H
 
-#include <linux/blk_types.h>
 #include <linux/sizes.h>
+#include "bio.h"
 
 struct btrfs_inode;
 
@@ -23,6 +23,7 @@ struct btrfs_inode;
 
 /* Maximum length of compressed data stored on disk */
 #define BTRFS_MAX_COMPRESSED		(SZ_128K)
+#define BTRFS_MAX_COMPRESSED_PAGES	(BTRFS_MAX_COMPRESSED / PAGE_SIZE)
 static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
 
 /* Maximum size of data before compression */
@@ -37,9 +38,6 @@ struct compressed_bio {
 	/* the pages with the compressed data on them */
 	struct page **compressed_pages;
 
-	/* inode that owns this data */
-	struct inode *inode;
-
 	/* starting offset in the inode for our pages */
 	u64 start;
 
@@ -55,14 +53,14 @@ struct compressed_bio {
 	/* Whether this is a write for writeback. */
 	bool writeback;
 
-	/* IO errors */
-	blk_status_t status;
-
 	union {
 		/* For reads, this is the bio we are copying the data into */
-		struct bio *orig_bio;
+		struct btrfs_bio *orig_bbio;
 		struct work_struct write_end_work;
 	};
+
+	/* Must be last. */
+	struct btrfs_bio bbio;
 };
 
 static inline unsigned int btrfs_compress_type(unsigned int type_level)
@@ -88,16 +86,14 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
 int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
 			      struct compressed_bio *cb, u32 decompressed);
 
-blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
+void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 				  unsigned int len, u64 disk_start,
 				  unsigned int compressed_len,
 				  struct page **compressed_pages,
 				  unsigned int nr_pages,
 				  blk_opf_t write_flags,
-				  struct cgroup_subsys_state *blkcg_css,
 				  bool writeback);
-void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
-				  int mirror_num);
+void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num);
 
 unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
 
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 986827370d8e..adfc04bd362e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -854,7 +854,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
  * Search for a key in the given extent_buffer.
  *
  * The lower boundary for the search is specified by the slot number @first_slot.
- * Use a value of 0 to search over the whole extent buffer.
+ * Use a value of 0 to search over the whole extent buffer. Works for both
+ * leaves and nodes.
  *
  * The slot in the extent buffer is returned via @slot. If the key exists in the
  * extent buffer, then @slot will point to the slot where the key is, otherwise
@@ -863,8 +864,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
  * Slot may point to the total number of items (i.e. one position beyond the last
  * key) if the key is bigger than the last key in the extent buffer.
  */
-int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot,
-			     const struct btrfs_key *key, int *slot)
+int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
+		     const struct btrfs_key *key, int *slot)
 {
 	unsigned long p;
 	int item_size;
@@ -959,7 +960,7 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
 	if (slot < 0 || slot >= btrfs_header_nritems(parent))
 		return ERR_PTR(-ENOENT);
 
-	BUG_ON(level == 0);
+	ASSERT(level);
 
 	check.level = level - 1;
 	check.transid = btrfs_node_ptr_generation(parent, slot);
@@ -1064,11 +1065,14 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
 		return 0;
 
-	left = btrfs_read_node_slot(parent, pslot - 1);
-	if (IS_ERR(left))
-		left = NULL;
+	if (pslot) {
+		left = btrfs_read_node_slot(parent, pslot - 1);
+		if (IS_ERR(left)) {
+			ret = PTR_ERR(left);
+			left = NULL;
+			goto enospc;
+		}
 
-	if (left) {
 		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
 		wret = btrfs_cow_block(trans, root, left,
 				       parent, pslot - 1, &left,
@@ -1079,11 +1083,14 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		}
 	}
 
-	right = btrfs_read_node_slot(parent, pslot + 1);
-	if (IS_ERR(right))
-		right = NULL;
+	if (pslot + 1 < btrfs_header_nritems(parent)) {
+		right = btrfs_read_node_slot(parent, pslot + 1);
+		if (IS_ERR(right)) {
+			ret = PTR_ERR(right);
+			right = NULL;
+			goto enospc;
+		}
 
-	if (right) {
 		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
 		wret = btrfs_cow_block(trans, root, right,
 				       parent, pslot + 1, &right,
@@ -1240,14 +1247,14 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	if (!parent)
 		return 1;
 
-	left = btrfs_read_node_slot(parent, pslot - 1);
-	if (IS_ERR(left))
-		left = NULL;
-
 	/* first, try to make some room in the middle buffer */
-	if (left) {
+	if (pslot) {
 		u32 left_nr;
 
+		left = btrfs_read_node_slot(parent, pslot - 1);
+		if (IS_ERR(left))
+			return PTR_ERR(left);
+
 		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
 
 		left_nr = btrfs_header_nritems(left);
@@ -1292,16 +1299,17 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 		btrfs_tree_unlock(left);
 		free_extent_buffer(left);
 	}
-	right = btrfs_read_node_slot(parent, pslot + 1);
-	if (IS_ERR(right))
-		right = NULL;
 
 	/*
 	 * then try to empty the right most buffer into the middle
 	 */
-	if (right) {
+	if (pslot + 1 < btrfs_header_nritems(parent)) {
 		u32 right_nr;
 
+		right = btrfs_read_node_slot(parent, pslot + 1);
+		if (IS_ERR(right))
+			return PTR_ERR(right);
+
 		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
 
 		right_nr = btrfs_header_nritems(right);
@@ -1864,7 +1872,7 @@ static inline int search_for_key_slot(struct extent_buffer *eb,
 		return 0;
 	}
 
-	return btrfs_generic_bin_search(eb, search_low_slot, key, slot);
+	return btrfs_bin_search(eb, search_low_slot, key, slot);
 }
 
 static int search_leaf(struct btrfs_trans_handle *trans,
@@ -2321,7 +2329,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
 		 */
 		btrfs_unlock_up_safe(p, level + 1);
 
-		ret = btrfs_bin_search(b, key, &slot);
+		ret = btrfs_bin_search(b, 0, key, &slot);
 		if (ret < 0)
 			goto done;
 
@@ -2482,26 +2490,15 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
 int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
 			      struct btrfs_path *path)
 {
-	while (1) {
+	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
 		int ret;
-		const int slot = path->slots[0];
-		const struct extent_buffer *leaf = path->nodes[0];
 
-		/* This is where we start walking the path. */
-		if (slot >= btrfs_header_nritems(leaf)) {
-			/*
-			 * If we've reached the last slot in this leaf we need
-			 * to go to the next leaf and reset the path.
-			 */
-			ret = btrfs_next_leaf(root, path);
-			if (ret)
-				return ret;
-			continue;
-		}
-		/* Store the found, valid item in @key. */
-		btrfs_item_key_to_cpu(leaf, key, slot);
-		break;
+		ret = btrfs_next_leaf(root, path);
+		if (ret)
+			return ret;
 	}
+
+	btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
 	return 0;
 }
 
@@ -3198,12 +3195,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_assert_tree_write_locked(path->nodes[1]);
 
 	right = btrfs_read_node_slot(upper, slot + 1);
-	/*
-	 * slot + 1 is not valid or we fail to read the right node,
-	 * no big deal, just return.
-	 */
 	if (IS_ERR(right))
-		return 1;
+		return PTR_ERR(right);
 
 	__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
 
@@ -3418,12 +3411,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_assert_tree_write_locked(path->nodes[1]);
 
 	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
-	/*
-	 * slot - 1 is not valid or we fail to read the left node,
-	 * no big deal, just return.
-	 */
 	if (IS_ERR(left))
-		return 1;
+		return PTR_ERR(left);
 
 	__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
 
@@ -4608,7 +4597,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 	while (1) {
 		nritems = btrfs_header_nritems(cur);
 		level = btrfs_header_level(cur);
-		sret = btrfs_bin_search(cur, min_key, &slot);
+		sret = btrfs_bin_search(cur, 0, min_key, &slot);
 		if (sret < 0) {
 			ret = sret;
 			goto out;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 97897107fab5..4c1986cd5bed 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -508,22 +508,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
 int __init btrfs_ctree_init(void);
 void __cold btrfs_ctree_exit(void);
 
-int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot,
-			     const struct btrfs_key *key, int *slot);
+int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
+		     const struct btrfs_key *key, int *slot);
 
-/*
- * Simple binary search on an extent buffer. Works for both leaves and nodes, and
- * always searches over the whole range of keys (slot 0 to slot 'nritems - 1').
- */
-static inline int btrfs_bin_search(struct extent_buffer *eb,
-				   const struct btrfs_key *key,
-				   int *slot)
-{
-	return btrfs_generic_bin_search(eb, 0, key, slot);
-}
-
-int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
-		     int *slot);
 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 7ddb1d104e8e..427abaf608b8 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -358,8 +358,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
 	 * racing with an ordered completion or some such that would think it
 	 * needs to free the reservation we just made.
 	 */
-	spin_lock(&inode->lock);
 	nr_extents = count_max_extents(fs_info, num_bytes);
+	spin_lock(&inode->lock);
 	btrfs_mod_outstanding_extents(inode, nr_extents);
 	inode->csum_bytes += disk_num_bytes;
 	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 886ffb232eac..0b32432d7d56 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -53,24 +53,6 @@ bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
 	return ret;
 }
 
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
-{
-	u64 num_entries =
-		atomic_read(&trans->transaction->delayed_refs.num_entries);
-	u64 avg_runtime;
-	u64 val;
-
-	smp_mb();
-	avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
-	val = num_entries * avg_runtime;
-	if (val >= NSEC_PER_SEC)
-		return 1;
-	if (val >= NSEC_PER_SEC / 2)
-		return 2;
-
-	return btrfs_check_space_for_delayed_refs(trans->fs_info);
-}
-
 /*
  * Release a ref head's reservation.
  *
@@ -83,20 +65,9 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
 void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
 {
 	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
-	u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);
+	const u64 num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr);
 	u64 released = 0;
 
-	/*
-	 * We have to check the mount option here because we could be enabling
-	 * the free space tree for the first time and don't have the compat_ro
-	 * option set yet.
-	 *
-	 * We need extra reservations if we have the free space tree because
-	 * we'll have to modify that tree as well.
-	 */
-	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
-		num_bytes *= 2;
-
 	released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
 	if (released)
 		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
@@ -118,18 +89,8 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
 	if (!trans->delayed_ref_updates)
 		return;
 
-	num_bytes = btrfs_calc_insert_metadata_size(fs_info,
-						    trans->delayed_ref_updates);
-	/*
-	 * We have to check the mount option here because we could be enabling
-	 * the free space tree for the first time and don't have the compat_ro
-	 * option set yet.
-	 *
-	 * We need extra reservations if we have the free space tree because
-	 * we'll have to modify that tree as well.
-	 */
-	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
-		num_bytes *= 2;
+	num_bytes = btrfs_calc_delayed_ref_bytes(fs_info,
+						 trans->delayed_ref_updates);
 
 	spin_lock(&delayed_rsv->lock);
 	delayed_rsv->size += num_bytes;
@@ -200,7 +161,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
 				  enum btrfs_reserve_flush_enum flush)
 {
 	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
-	u64 limit = btrfs_calc_insert_metadata_size(fs_info, 1);
+	u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1);
 	u64 num_bytes = 0;
 	int ret = -ENOSPC;
 
@@ -217,7 +178,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
 	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
 	if (ret)
 		return ret;
-	btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0);
+	btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
 	trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
 				      0, num_bytes, 1);
 	return 0;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 2eb34abf700f..b54261fe509b 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -253,6 +253,27 @@ extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
 int __init btrfs_delayed_ref_init(void);
 void __cold btrfs_delayed_ref_exit(void);
 
+static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_info,
+					       int num_delayed_refs)
+{
+	u64 num_bytes;
+
+	num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_delayed_refs);
+
+	/*
+	 * We have to check the mount option here because we could be enabling
+	 * the free space tree for the first time and don't have the compat_ro
+	 * option set yet.
+	 *
+	 * We need extra reservations if we have the free space tree because
+	 * we'll have to modify that tree as well.
+	 */
+	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+		num_bytes *= 2;
+
+	return num_bytes;
+}
+
 static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
 				int action, u64 bytenr, u64 len, u64 parent)
 {
@@ -385,7 +406,6 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
 void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
 				       struct btrfs_block_rsv *src,
 				       u64 num_bytes);
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans);
 bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a47d8ad6abcb..dabc79c1af1b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1347,17 +1347,8 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
 static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
 {
 	int ret;
-	unsigned int nofs_flag;
 
-	/*
-	 * We might be called under a transaction (e.g. indirect backref
-	 * resolution) which could deadlock if it triggers memory reclaim
-	 */
-	nofs_flag = memalloc_nofs_save();
-	ret = btrfs_drew_lock_init(&root->snapshot_lock);
-	memalloc_nofs_restore(nofs_flag);
-	if (ret)
-		goto fail;
+	btrfs_drew_lock_init(&root->snapshot_lock);
 
 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
 	    !btrfs_is_data_reloc_root(root)) {
@@ -2071,7 +2062,6 @@ void btrfs_put_root(struct btrfs_root *root)
 		WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
 		if (root->anon_dev)
 			free_anon_bdev(root->anon_dev);
-		btrfs_drew_lock_destroy(&root->snapshot_lock);
 		free_root_extent_buffers(root);
 #ifdef CONFIG_BTRFS_DEBUG
 		spin_lock(&root->fs_info->fs_roots_radix_lock);
@@ -2131,11 +2121,16 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
 	atomic_set(&fs_info->reloc_cancel_req, 0);
 }
 
-static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
+static int btrfs_init_btree_inode(struct super_block *sb)
 {
-	struct inode *inode = fs_info->btree_inode;
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 	unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID,
 					      fs_info->tree_root);
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (!inode)
+		return -ENOMEM;
 
 	inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
 	set_nlink(inode, 1);
@@ -2146,6 +2141,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
 	 */
 	inode->i_size = OFFSET_MAX;
 	inode->i_mapping->a_ops = &btree_aops;
+	mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
 
 	RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
 	extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
@@ -2158,6 +2154,9 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
 	BTRFS_I(inode)->location.offset = 0;
 	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
 	__insert_inode_hash(inode, hash);
+	fs_info->btree_inode = inode;
+
+	return 0;
 }
 
 static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
@@ -2973,7 +2972,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	atomic64_set(&fs_info->free_chunk_space, 0);
 	fs_info->tree_mod_log = RB_ROOT;
 	fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
-	fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
 	btrfs_init_ref_verify(fs_info);
 
 	fs_info->thread_pool_size = min_t(unsigned long,
@@ -3362,14 +3360,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	struct btrfs_root *tree_root;
 	struct btrfs_root *chunk_root;
 	int ret;
-	int err = -EINVAL;
 	int level;
 
 	ret = init_mount_fs_info(fs_info, sb);
-	if (ret) {
-		err = ret;
+	if (ret)
 		goto fail;
-	}
 
 	/* These need to be init'ed before we start creating inodes and such. */
 	tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
@@ -3379,17 +3374,13 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 				      GFP_KERNEL);
 	fs_info->chunk_root = chunk_root;
 	if (!tree_root || !chunk_root) {
-		err = -ENOMEM;
+		ret = -ENOMEM;
 		goto fail;
 	}
 
-	fs_info->btree_inode = new_inode(sb);
-	if (!fs_info->btree_inode) {
-		err = -ENOMEM;
+	ret = btrfs_init_btree_inode(sb);
+	if (ret)
 		goto fail;
-	}
-	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
-	btrfs_init_btree_inode(fs_info);
 
 	invalidate_bdev(fs_devices->latest_dev->bdev);
 
@@ -3398,7 +3389,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	 */
 	disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
 	if (IS_ERR(disk_super)) {
-		err = PTR_ERR(disk_super);
+		ret = PTR_ERR(disk_super);
 		goto fail_alloc;
 	}
 
@@ -3410,7 +3401,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	if (!btrfs_supported_super_csum(csum_type)) {
 		btrfs_err(fs_info, "unsupported checksum algorithm: %u",
 			  csum_type);
-		err = -EINVAL;
+		ret = -EINVAL;
 		btrfs_release_disk_super(disk_super);
 		goto fail_alloc;
 	}
@@ -3419,7 +3410,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 
 	ret = btrfs_init_csum_hash(fs_info, csum_type);
 	if (ret) {
-		err = ret;
 		btrfs_release_disk_super(disk_super);
 		goto fail_alloc;
 	}
@@ -3430,7 +3420,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	 */
 	if (btrfs_check_super_csum(fs_info, disk_super)) {
 		btrfs_err(fs_info, "superblock checksum mismatch");
-		err = -EINVAL;
+		ret = -EINVAL;
 		btrfs_release_disk_super(disk_super);
 		goto fail_alloc;
 	}
@@ -3460,12 +3450,15 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	ret = btrfs_validate_mount_super(fs_info);
 	if (ret) {
 		btrfs_err(fs_info, "superblock contains fatal errors");
-		err = -EINVAL;
+		ret = -EINVAL;
 		goto fail_alloc;
 	}
 
-	if (!btrfs_super_root(disk_super))
+	if (!btrfs_super_root(disk_super)) {
+		btrfs_err(fs_info, "invalid superblock tree root bytenr");
+		ret = -EINVAL;
 		goto fail_alloc;
+	}
 
 	/* check FS state, whether FS is broken. */
 	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
@@ -3492,16 +3485,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	fs_info->stripesize = stripesize;
 
 	ret = btrfs_parse_options(fs_info, options, sb->s_flags);
-	if (ret) {
-		err = ret;
+	if (ret)
 		goto fail_alloc;
-	}
 
 	ret = btrfs_check_features(fs_info, !sb_rdonly(sb));
-	if (ret < 0) {
-		err = ret;
+	if (ret < 0)
 		goto fail_alloc;
-	}
 
 	if (sectorsize < PAGE_SIZE) {
 		struct btrfs_subpage_info *subpage_info;
@@ -3521,17 +3510,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 		"read-write for sector size %u with page size %lu is experimental",
 			   sectorsize, PAGE_SIZE);
 		subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
-		if (!subpage_info)
+		if (!subpage_info) {
+			ret = -ENOMEM;
 			goto fail_alloc;
+		}
 		btrfs_init_subpage_info(subpage_info, sectorsize);
 		fs_info->subpage_info = subpage_info;
 	}
 
 	ret = btrfs_init_workqueues(fs_info);
-	if (ret) {
-		err = ret;
+	if (ret)
 		goto fail_sb_buffer;
-	}
 
 	sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
 	sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
@@ -3577,6 +3566,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	btrfs_free_extra_devids(fs_devices);
 	if (!fs_devices->latest_dev->bdev) {
 		btrfs_err(fs_info, "failed to read devices");
+		ret = -EIO;
 		goto fail_tree_roots;
 	}
 
@@ -3592,8 +3582,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	ret = btrfs_get_dev_zone_info_all_devices(fs_info);
 	if (ret) {
 		btrfs_err(fs_info,
-			  "zoned: failed to read device zone info: %d",
-			  ret);
+			  "zoned: failed to read device zone info: %d", ret);
 		goto fail_block_groups;
 	}
 
@@ -3672,19 +3661,24 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	    !btrfs_check_rw_degradable(fs_info, NULL)) {
 		btrfs_warn(fs_info,
 		"writable mount is not allowed due to too many missing devices");
+		ret = -EINVAL;
 		goto fail_sysfs;
 	}
 
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info,
 					       "btrfs-cleaner");
-	if (IS_ERR(fs_info->cleaner_kthread))
+	if (IS_ERR(fs_info->cleaner_kthread)) {
+		ret = PTR_ERR(fs_info->cleaner_kthread);
 		goto fail_sysfs;
+	}
 
 	fs_info->transaction_kthread = kthread_run(transaction_kthread,
 						   tree_root,
 						   "btrfs-transaction");
-	if (IS_ERR(fs_info->transaction_kthread))
+	if (IS_ERR(fs_info->transaction_kthread)) {
+		ret = PTR_ERR(fs_info->transaction_kthread);
 		goto fail_cleaner;
+	}
 
 	if (!btrfs_test_opt(fs_info, NOSSD) &&
 	    !fs_info->fs_devices->rotating) {
@@ -3702,7 +3696,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	    fs_info->fs_devices->discardable) {
 		btrfs_set_and_info(fs_info, DISCARD_ASYNC,
 				   "auto enabling async discard");
-		btrfs_clear_opt(fs_info->mount_opt, NODISCARD);
 	}
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -3729,16 +3722,14 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	    !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
 		btrfs_info(fs_info, "start tree-log replay");
 		ret = btrfs_replay_log(fs_info, fs_devices);
-		if (ret) {
-			err = ret;
+		if (ret)
 			goto fail_qgroup;
-		}
 	}
 
 	fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
 	if (IS_ERR(fs_info->fs_root)) {
-		err = PTR_ERR(fs_info->fs_root);
-		btrfs_warn(fs_info, "failed to read fs tree: %d", err);
+		ret = PTR_ERR(fs_info->fs_root);
+		btrfs_warn(fs_info, "failed to read fs tree: %d", ret);
 		fs_info->fs_root = NULL;
 		goto fail_qgroup;
 	}
@@ -3815,7 +3806,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	iput(fs_info->btree_inode);
 fail:
 	btrfs_close_devices(fs_info->fs_devices);
-	return err;
+	ASSERT(ret < 0);
+	return ret;
 }
 ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
 
@@ -4112,6 +4104,8 @@ static void write_dev_flush(struct btrfs_device *device)
 {
 	struct bio *bio = &device->flush_bio;
 
+	device->last_flush_error = BLK_STS_OK;
+
 #ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	/*
 	 * When a disk has write caching disabled, we skip submission of a bio
@@ -4140,25 +4134,24 @@ static void write_dev_flush(struct btrfs_device *device)
 
 /*
  * If the flush bio has been submitted by write_dev_flush, wait for it.
+ * Return true for any error, and false otherwise.
  */
-static blk_status_t wait_dev_flush(struct btrfs_device *device)
+static bool wait_dev_flush(struct btrfs_device *device)
 {
 	struct bio *bio = &device->flush_bio;
 
-	if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
-		return BLK_STS_OK;
+	if (!test_and_clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
+		return false;
 
-	clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
 	wait_for_completion_io(&device->flush_wait);
 
-	return bio->bi_status;
-}
+	if (bio->bi_status) {
+		device->last_flush_error = bio->bi_status;
+		btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS);
+		return true;
+	}
 
-static int check_barrier_error(struct btrfs_fs_info *fs_info)
-{
-	if (!btrfs_check_rw_degradable(fs_info, NULL))
-		return -EIO;
-	return 0;
+	return false;
 }
 
 /*
@@ -4170,7 +4163,6 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 	struct list_head *head;
 	struct btrfs_device *dev;
 	int errors_wait = 0;
-	blk_status_t ret;
 
 	lockdep_assert_held(&info->fs_devices->device_list_mutex);
 	/* send down all the barriers */
@@ -4185,7 +4177,6 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 			continue;
 
 		write_dev_flush(dev);
-		dev->last_flush_error = BLK_STS_OK;
 	}
 
 	/* wait for all the barriers */
@@ -4200,23 +4191,17 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
 			continue;
 
-		ret = wait_dev_flush(dev);
-		if (ret) {
-			dev->last_flush_error = ret;
-			btrfs_dev_stat_inc_and_print(dev,
-					BTRFS_DEV_STAT_FLUSH_ERRS);
+		if (wait_dev_flush(dev))
 			errors_wait++;
-		}
 	}
 
-	if (errors_wait) {
-		/*
-		 * At some point we need the status of all disks
-		 * to arrive at the volume status. So error checking
-		 * is being pushed to a separate loop.
-		 */
-		return check_barrier_error(info);
-	}
+	/*
+	 * Checks last_flush_error of disks in order to determine the device
+	 * state.
+	 */
+	if (errors_wait && !btrfs_check_rw_degradable(info, NULL))
+		return -EIO;
+
 	return 0;
 }
 
@@ -4422,12 +4407,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
 			root_objectid = gang[i]->root_key.objectid;
 			err = btrfs_orphan_cleanup(gang[i]);
 			if (err)
-				break;
+				goto out;
 			btrfs_put_root(gang[i]);
 		}
 		root_objectid++;
 	}
-
+out:
 	/* release the uncleaned roots due to error */
 	for (; i < ret; i++) {
 		if (gang[i])
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 824c657f59e8..5cd289de4e92 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1894,8 +1894,7 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
 }
 
 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
-				    struct btrfs_delayed_ref_head *locked_ref,
-				    unsigned long *run_refs)
+					   struct btrfs_delayed_ref_head *locked_ref)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_delayed_ref_root *delayed_refs;
@@ -1917,7 +1916,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
 			return -EAGAIN;
 		}
 
-		(*run_refs)++;
 		ref->in_tree = 0;
 		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
 		RB_CLEAR_NODE(&ref->ref_node);
@@ -1981,10 +1979,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_delayed_ref_head *locked_ref = NULL;
-	ktime_t start = ktime_get();
 	int ret;
 	unsigned long count = 0;
-	unsigned long actual_count = 0;
 
 	delayed_refs = &trans->transaction->delayed_refs;
 	do {
@@ -2014,8 +2010,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 		spin_lock(&locked_ref->lock);
 		btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
 
-		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
-						      &actual_count);
+		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref);
 		if (ret < 0 && ret != -EAGAIN) {
 			/*
 			 * Error, btrfs_run_delayed_refs_for_head already
@@ -2046,24 +2041,6 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 		cond_resched();
 	} while ((nr != -1 && count < nr) || locked_ref);
 
-	/*
-	 * We don't want to include ref heads since we can have empty ref heads
-	 * and those will drastically skew our runtime down since we just do
-	 * accounting, no actual extent tree updates.
-	 */
-	if (actual_count > 0) {
-		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
-		u64 avg;
-
-		/*
-		 * We weigh the current average higher than our current runtime
-		 * to avoid large swings in the average.
-		 */
-		spin_lock(&delayed_refs->lock);
-		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
-		fs_info->avg_delayed_ref_runtime = avg >> 2;	/* div by 4 */
-		spin_unlock(&delayed_refs->lock);
-	}
 	return 0;
 }
 
@@ -5509,11 +5486,11 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 {
 	int level = wc->level;
 	int lookup_info = 1;
-	int ret;
+	int ret = 0;
 
 	while (level >= 0) {
 		ret = walk_down_proc(trans, root, path, wc, lookup_info);
-		if (ret > 0)
+		if (ret)
 			break;
 
 		if (level == 0)
@@ -5528,10 +5505,10 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 			path->slots[level]++;
 			continue;
 		} else if (ret < 0)
-			return ret;
+			break;
 		level = wc->level;
 	}
-	return 0;
+	return (ret == 1) ? 0 : ret;
 }
 
 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
@@ -5708,12 +5685,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
 
 		ret = walk_down_tree(trans, root, path, wc);
 		if (ret < 0) {
+			btrfs_abort_transaction(trans, ret);
 			err = ret;
 			break;
 		}
 
 		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
 		if (ret < 0) {
+			btrfs_abort_transaction(trans, ret);
 			err = ret;
 			break;
 		}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 40300e8e5f99..a1adadd5d25d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -97,11 +97,13 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
  * how many bytes are there before stripe/ordered extent boundary.
  */
 struct btrfs_bio_ctrl {
-	struct bio *bio;
+	struct btrfs_bio *bbio;
 	int mirror_num;
 	enum btrfs_compression_type compress_type;
 	u32 len_to_oe_boundary;
+	blk_opf_t opf;
 	btrfs_bio_end_io_t end_io_func;
+	struct writeback_control *wbc;
 
 	/*
 	 * This is for metadata read, to provide the extra needed verification
@@ -117,51 +119,41 @@ struct btrfs_bio_ctrl {
 	 * does the unlocking.
 	 */
 	bool extent_locked;
-
-	/* Tell the submit_bio code to use REQ_SYNC */
-	bool sync_io;
 };
 
 static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
 {
-	struct bio *bio;
-	struct bio_vec *bv;
-	struct inode *inode;
-	int mirror_num;
+	struct btrfs_bio *bbio = bio_ctrl->bbio;
+	int mirror_num = bio_ctrl->mirror_num;
 
-	if (!bio_ctrl->bio)
+	if (!bbio)
 		return;
 
-	bio = bio_ctrl->bio;
-	bv = bio_first_bvec_all(bio);
-	inode = bv->bv_page->mapping->host;
-	mirror_num = bio_ctrl->mirror_num;
-
 	/* Caller should ensure the bio has at least some range added */
-	ASSERT(bio->bi_iter.bi_size);
+	ASSERT(bbio->bio.bi_iter.bi_size);
 
-	if (!is_data_inode(inode)) {
-		if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
+	if (!is_data_inode(&bbio->inode->vfs_inode)) {
+		if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) {
 			/*
 			 * For metadata read, we should have the parent_check,
 			 * and copy it to bbio for metadata verification.
 			 */
 			ASSERT(bio_ctrl->parent_check);
-			memcpy(&btrfs_bio(bio)->parent_check,
+			memcpy(&bbio->parent_check,
 			       bio_ctrl->parent_check,
 			       sizeof(struct btrfs_tree_parent_check));
 		}
-		bio->bi_opf |= REQ_META;
+		bbio->bio.bi_opf |= REQ_META;
 	}
 
-	if (btrfs_op(bio) == BTRFS_MAP_READ &&
+	if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ &&
 	    bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
-		btrfs_submit_compressed_read(inode, bio, mirror_num);
+		btrfs_submit_compressed_read(bbio, mirror_num);
 	else
-		btrfs_submit_bio(bio, mirror_num);
+		btrfs_submit_bio(bbio, mirror_num);
 
-	/* The bio is owned by the end_io handler now */
-	bio_ctrl->bio = NULL;
+	/* The bbio is owned by the end_io handler now */
+	bio_ctrl->bbio = NULL;
 }
 
 /*
@@ -169,16 +161,16 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
  */
 static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret)
 {
-	struct bio *bio = bio_ctrl->bio;
+	struct btrfs_bio *bbio = bio_ctrl->bbio;
 
-	if (!bio)
+	if (!bbio)
 		return;
 
 	if (ret) {
 		ASSERT(ret < 0);
-		btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
+		btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
 		/* The bio is owned by the end_io handler now */
-		bio_ctrl->bio = NULL;
+		bio_ctrl->bbio = NULL;
 	} else {
 		submit_one_bio(bio_ctrl);
 	}
@@ -867,89 +859,52 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
 	return 0;
 }
 
-/*
- * Attempt to add a page to bio.
- *
- * @bio_ctrl:       record both the bio, and its bio_flags
- * @page:	    page to add to the bio
- * @disk_bytenr:    offset of the new bio or to check whether we are adding
- *                  a contiguous page to the previous one
- * @size:	    portion of page that we want to write
- * @pg_offset:	    starting offset in the page
- * @compress_type:  compression type of the current bio to see if we can merge them
- *
- * Attempt to add a page to bio considering stripe alignment etc.
- *
- * Return >= 0 for the number of bytes added to the bio.
- * Can return 0 if the current bio is already at stripe/zone boundary.
- * Return <0 for error.
- */
-static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
-			      struct page *page,
-			      u64 disk_bytenr, unsigned int size,
-			      unsigned int pg_offset,
-			      enum btrfs_compression_type compress_type)
+static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
+				struct page *page, u64 disk_bytenr,
+				unsigned int pg_offset)
 {
-	struct bio *bio = bio_ctrl->bio;
-	u32 bio_size = bio->bi_iter.bi_size;
-	u32 real_size;
+	struct bio *bio = &bio_ctrl->bbio->bio;
+	struct bio_vec *bvec = bio_last_bvec_all(bio);
 	const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
-	bool contig = false;
 
-	ASSERT(bio);
-	/* The limit should be calculated when bio_ctrl->bio is allocated */
-	ASSERT(bio_ctrl->len_to_oe_boundary);
-	if (bio_ctrl->compress_type != compress_type)
-		return 0;
-
-
-	if (bio->bi_iter.bi_size == 0) {
-		/* We can always add a page into an empty bio. */
-		contig = true;
-	} else if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE) {
-		struct bio_vec *bvec = bio_last_bvec_all(bio);
-
-		/*
-		 * The contig check requires the following conditions to be met:
-		 * 1) The pages are belonging to the same inode
-		 *    This is implied by the call chain.
-		 *
-		 * 2) The range has adjacent logical bytenr
-		 *
-		 * 3) The range has adjacent file offset
-		 *    This is required for the usage of btrfs_bio->file_offset.
-		 */
-		if (bio_end_sector(bio) == sector &&
-		    page_offset(bvec->bv_page) + bvec->bv_offset +
-		    bvec->bv_len == page_offset(page) + pg_offset)
-			contig = true;
-	} else {
+	if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
 		/*
-		 * For compression, all IO should have its logical bytenr
-		 * set to the starting bytenr of the compressed extent.
+		 * For compression, all IO should have its logical bytenr set
+		 * to the starting bytenr of the compressed extent.
 		 */
-		contig = bio->bi_iter.bi_sector == sector;
+		return bio->bi_iter.bi_sector == sector;
 	}
 
-	if (!contig)
-		return 0;
-
-	real_size = min(bio_ctrl->len_to_oe_boundary - bio_size, size);
-
 	/*
-	 * If real_size is 0, never call bio_add_*_page(), as even size is 0,
-	 * bio will still execute its endio function on the page!
+	 * The contig check requires the following conditions to be met:
+	 *
+	 * 1) The pages are belonging to the same inode
+	 *    This is implied by the call chain.
+	 *
+	 * 2) The range has adjacent logical bytenr
+	 *
+	 * 3) The range has adjacent file offset
+	 *    This is required for the usage of btrfs_bio->file_offset.
 	 */
-	if (real_size == 0)
-		return 0;
-
-	return bio_add_page(bio, page, real_size, pg_offset);
+	return bio_end_sector(bio) == sector &&
+		page_offset(bvec->bv_page) + bvec->bv_offset + bvec->bv_len ==
+		page_offset(page) + pg_offset;
 }
 
-static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
-				struct btrfs_inode *inode, u64 file_offset)
+static void alloc_new_bio(struct btrfs_inode *inode,
+			  struct btrfs_bio_ctrl *bio_ctrl,
+			  u64 disk_bytenr, u64 file_offset)
 {
-	struct btrfs_ordered_extent *ordered;
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct btrfs_bio *bbio;
+
+	bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info,
+			       bio_ctrl->end_io_func, NULL);
+	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+	bbio->inode = inode;
+	bbio->file_offset = file_offset;
+	bio_ctrl->bbio = bbio;
+	bio_ctrl->len_to_oe_boundary = U32_MAX;
 
 	/*
 	 * Limit the extent to the ordered boundary for Zone Append.
@@ -957,132 +912,89 @@ static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
 	 * them.
 	 */
 	if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE &&
-	    btrfs_use_zone_append(btrfs_bio(bio_ctrl->bio))) {
+	    btrfs_use_zone_append(bbio)) {
+		struct btrfs_ordered_extent *ordered;
+
 		ordered = btrfs_lookup_ordered_extent(inode, file_offset);
 		if (ordered) {
 			bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
 					ordered->file_offset +
 					ordered->disk_num_bytes - file_offset);
 			btrfs_put_ordered_extent(ordered);
-			return;
 		}
 	}
 
-	bio_ctrl->len_to_oe_boundary = U32_MAX;
-}
-
-static void alloc_new_bio(struct btrfs_inode *inode,
-			  struct btrfs_bio_ctrl *bio_ctrl,
-			  struct writeback_control *wbc, blk_opf_t opf,
-			  u64 disk_bytenr, u32 offset, u64 file_offset,
-			  enum btrfs_compression_type compress_type)
-{
-	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-	struct bio *bio;
-
-	bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, inode, bio_ctrl->end_io_func,
-			      NULL);
-	/*
-	 * For compressed page range, its disk_bytenr is always @disk_bytenr
-	 * passed in, no matter if we have added any range into previous bio.
-	 */
-	if (compress_type != BTRFS_COMPRESS_NONE)
-		bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
-	else
-		bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
-	btrfs_bio(bio)->file_offset = file_offset;
-	bio_ctrl->bio = bio;
-	bio_ctrl->compress_type = compress_type;
-	calc_bio_boundaries(bio_ctrl, inode, file_offset);
-
-	if (wbc) {
+	if (bio_ctrl->wbc) {
 		/*
 		 * Pick the last added device to support cgroup writeback.  For
 		 * multi-device file systems this means blk-cgroup policies have
 		 * to always be set on the last added/replaced device.
 		 * This is a bit odd but has been like that for a long time.
 		 */
-		bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
-		wbc_init_bio(wbc, bio);
+		bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
+		wbc_init_bio(bio_ctrl->wbc, &bbio->bio);
 	}
 }
 
 /*
- * @opf:	bio REQ_OP_* and REQ_* flags as one value
- * @wbc:	optional writeback control for io accounting
  * @disk_bytenr: logical bytenr where the write will be
  * @page:	page to add to the bio
  * @size:	portion of page that we want to write to
  * @pg_offset:	offset of the new bio or to check whether we are adding
  *              a contiguous page to the previous one
- * @compress_type:   compress type for current bio
  *
- * The will either add the page into the existing @bio_ctrl->bio, or allocate a
- * new one in @bio_ctrl->bio.
+ * The will either add the page into the existing @bio_ctrl->bbio, or allocate a
+ * new one in @bio_ctrl->bbio.
  * The mirror number for this IO should already be initizlied in
  * @bio_ctrl->mirror_num.
  */
-static int submit_extent_page(blk_opf_t opf,
-			      struct writeback_control *wbc,
-			      struct btrfs_bio_ctrl *bio_ctrl,
-			      u64 disk_bytenr, struct page *page,
-			      size_t size, unsigned long pg_offset,
-			      enum btrfs_compression_type compress_type,
-			      bool force_bio_submit)
+static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
+			       u64 disk_bytenr, struct page *page,
+			       size_t size, unsigned long pg_offset)
 {
 	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
-	unsigned int cur = pg_offset;
-
-	ASSERT(bio_ctrl);
-
-	ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
-	       pg_offset + size <= PAGE_SIZE);
 
+	ASSERT(pg_offset + size <= PAGE_SIZE);
 	ASSERT(bio_ctrl->end_io_func);
 
-	if (force_bio_submit)
+	if (bio_ctrl->bbio &&
+	    !btrfs_bio_is_contig(bio_ctrl, page, disk_bytenr, pg_offset))
 		submit_one_bio(bio_ctrl);
 
-	while (cur < pg_offset + size) {
-		u32 offset = cur - pg_offset;
-		int added;
+	do {
+		u32 len = size;
 
 		/* Allocate new bio if needed */
-		if (!bio_ctrl->bio) {
-			alloc_new_bio(inode, bio_ctrl, wbc, opf, disk_bytenr,
-				      offset, page_offset(page) + cur,
-				      compress_type);
+		if (!bio_ctrl->bbio) {
+			alloc_new_bio(inode, bio_ctrl, disk_bytenr,
+				      page_offset(page) + pg_offset);
 		}
-		/*
-		 * We must go through btrfs_bio_add_page() to ensure each
-		 * page range won't cross various boundaries.
-		 */
-		if (compress_type != BTRFS_COMPRESS_NONE)
-			added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
-					size - offset, pg_offset + offset,
-					compress_type);
-		else
-			added = btrfs_bio_add_page(bio_ctrl, page,
-					disk_bytenr + offset, size - offset,
-					pg_offset + offset, compress_type);
-
-		/* Metadata page range should never be split */
-		if (!is_data_inode(&inode->vfs_inode))
-			ASSERT(added == 0 || added == size - offset);
-
-		/* At least we added some page, update the account */
-		if (wbc && added)
-			wbc_account_cgroup_owner(wbc, page, added);
-
-		/* We have reached boundary, submit right now */
-		if (added < size - offset) {
-			/* The bio should contain some page(s) */
-			ASSERT(bio_ctrl->bio->bi_iter.bi_size);
+
+		/* Cap to the current ordered extent boundary if there is one. */
+		if (len > bio_ctrl->len_to_oe_boundary) {
+			ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE);
+			ASSERT(is_data_inode(&inode->vfs_inode));
+			len = bio_ctrl->len_to_oe_boundary;
+		}
+
+		if (bio_add_page(&bio_ctrl->bbio->bio, page, len, pg_offset) != len) {
+			/* bio full: move on to a new one */
 			submit_one_bio(bio_ctrl);
+			continue;
 		}
-		cur += added;
-	}
-	return 0;
+
+		if (bio_ctrl->wbc)
+			wbc_account_cgroup_owner(bio_ctrl->wbc, page, len);
+
+		size -= len;
+		pg_offset += len;
+		disk_bytenr += len;
+		bio_ctrl->len_to_oe_boundary -= len;
+
+		/* Ordered extent boundary: move on to a new bio. */
+		if (bio_ctrl->len_to_oe_boundary == 0)
+			submit_one_bio(bio_ctrl);
+	} while (size);
 }
 
 static int attach_extent_buffer_page(struct extent_buffer *eb,
@@ -1193,8 +1105,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
  * return 0 on success, otherwise return error
  */
 static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
-		      struct btrfs_bio_ctrl *bio_ctrl,
-		      blk_opf_t read_flags, u64 *prev_em_start)
+		      struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start)
 {
 	struct inode *inode = page->mapping->host;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -1216,7 +1127,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 		unlock_extent(tree, start, end, NULL);
 		btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
 		unlock_page(page);
-		goto out;
+		return ret;
 	}
 
 	if (page->index == last_byte >> PAGE_SHIFT) {
@@ -1230,7 +1141,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 	bio_ctrl->end_io_func = end_bio_extent_readpage;
 	begin_page_read(fs_info, page);
 	while (cur <= end) {
-		unsigned long this_bio_flag = 0;
+		enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE;
 		bool force_bio_submit = false;
 		u64 disk_bytenr;
 
@@ -1247,19 +1158,18 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 		if (IS_ERR(em)) {
 			unlock_extent(tree, cur, end, NULL);
 			end_page_read(page, false, cur, end + 1 - cur);
-			ret = PTR_ERR(em);
-			break;
+			return PTR_ERR(em);
 		}
 		extent_offset = cur - em->start;
 		BUG_ON(extent_map_end(em) <= cur);
 		BUG_ON(end < cur);
 
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
-			this_bio_flag = em->compress_type;
+			compress_type = em->compress_type;
 
 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
 		iosize = ALIGN(iosize, blocksize);
-		if (this_bio_flag != BTRFS_COMPRESS_NONE)
+		if (compress_type != BTRFS_COMPRESS_NONE)
 			disk_bytenr = em->block_start;
 		else
 			disk_bytenr = em->block_start + extent_offset;
@@ -1331,24 +1241,20 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 			continue;
 		}
 
-		ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
-					 bio_ctrl, disk_bytenr, page, iosize,
-					 pg_offset, this_bio_flag,
-					 force_bio_submit);
-		if (ret) {
-			/*
-			 * We have to unlock the remaining range, or the page
-			 * will never be unlocked.
-			 */
-			unlock_extent(tree, cur, end, NULL);
-			end_page_read(page, false, cur, end + 1 - cur);
-			goto out;
+		if (bio_ctrl->compress_type != compress_type) {
+			submit_one_bio(bio_ctrl);
+			bio_ctrl->compress_type = compress_type;
 		}
+
+		if (force_bio_submit)
+			submit_one_bio(bio_ctrl);
+		submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
+				   pg_offset);
 		cur = cur + iosize;
 		pg_offset += iosize;
 	}
-out:
-	return ret;
+
+	return 0;
 }
 
 int btrfs_read_folio(struct file *file, struct folio *folio)
@@ -1357,12 +1263,12 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
 	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
 	u64 start = page_offset(page);
 	u64 end = start + PAGE_SIZE - 1;
-	struct btrfs_bio_ctrl bio_ctrl = { 0 };
+	struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
 	int ret;
 
 	btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
 
-	ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
+	ret = btrfs_do_readpage(page, NULL, &bio_ctrl, NULL);
 	/*
 	 * If btrfs_do_readpage() failed we will want to submit the assembled
 	 * bio to do the cleanup.
@@ -1384,7 +1290,7 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages,
 
 	for (index = 0; index < nr_pages; index++) {
 		btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
-				  REQ_RAHEAD, prev_em_start);
+				  prev_em_start);
 		put_page(pages[index]);
 	}
 }
@@ -1520,7 +1426,6 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
  */
 static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 				 struct page *page,
-				 struct writeback_control *wbc,
 				 struct btrfs_bio_ctrl *bio_ctrl,
 				 loff_t i_size,
 				 int *nr_ret)
@@ -1531,18 +1436,14 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 	u64 extent_offset;
 	u64 block_start;
 	struct extent_map *em;
-	int saved_ret = 0;
 	int ret = 0;
 	int nr = 0;
-	enum req_op op = REQ_OP_WRITE;
-	const blk_opf_t write_flags = wbc_to_write_flags(wbc);
-	bool has_error = false;
 	bool compressed;
 
 	ret = btrfs_writepage_cow_fixup(page);
 	if (ret) {
 		/* Fixup worker will requeue */
-		redirty_page_for_writepage(wbc, page);
+		redirty_page_for_writepage(bio_ctrl->wbc, page);
 		unlock_page(page);
 		return 1;
 	}
@@ -1551,7 +1452,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 	 * we don't want to touch the inode after unlocking the page,
 	 * so we update the mapping writeback index now
 	 */
-	wbc->nr_to_write--;
+	bio_ctrl->wbc->nr_to_write--;
 
 	bio_ctrl->end_io_func = end_bio_extent_writepage;
 	while (cur <= end) {
@@ -1587,10 +1488,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 		if (IS_ERR(em)) {
 			btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
 			ret = PTR_ERR_OR_ZERO(em);
-			has_error = true;
-			if (!saved_ret)
-				saved_ret = ret;
-			break;
+			goto out_error;
 		}
 
 		extent_offset = cur - em->start;
@@ -1642,33 +1540,21 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 		 */
 		btrfs_page_clear_dirty(fs_info, page, cur, iosize);
 
-		ret = submit_extent_page(op | write_flags, wbc,
-					 bio_ctrl, disk_bytenr,
-					 page, iosize,
-					 cur - page_offset(page),
-					 0, false);
-		if (ret) {
-			has_error = true;
-			if (!saved_ret)
-				saved_ret = ret;
-
-			btrfs_page_set_error(fs_info, page, cur, iosize);
-			if (PageWriteback(page))
-				btrfs_page_clear_writeback(fs_info, page, cur,
-							   iosize);
-		}
-
+		submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
+				   cur - page_offset(page));
 		cur += iosize;
 		nr++;
 	}
+
+	btrfs_page_assert_not_dirty(fs_info, page);
+	*nr_ret = nr;
+	return 0;
+
+out_error:
 	/*
 	 * If we finish without problem, we should not only clear page dirty,
 	 * but also empty subpage dirty bits
 	 */
-	if (!has_error)
-		btrfs_page_assert_not_dirty(fs_info, page);
-	else
-		ret = saved_ret;
 	*nr_ret = nr;
 	return ret;
 }
@@ -1682,8 +1568,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
  * Return 0 if everything goes well.
  * Return <0 for error.
  */
-static int __extent_writepage(struct page *page, struct writeback_control *wbc,
-			      struct btrfs_bio_ctrl *bio_ctrl)
+static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl)
 {
 	struct folio *folio = page_folio(page);
 	struct inode *inode = page->mapping->host;
@@ -1696,7 +1581,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	loff_t i_size = i_size_read(inode);
 	unsigned long end_index = i_size >> PAGE_SHIFT;
 
-	trace___extent_writepage(page, inode, wbc);
+	trace___extent_writepage(page, inode, bio_ctrl->wbc);
 
 	WARN_ON(!PageLocked(page));
 
@@ -1721,15 +1606,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	}
 
 	if (!bio_ctrl->extent_locked) {
-		ret = writepage_delalloc(BTRFS_I(inode), page, wbc);
+		ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc);
 		if (ret == 1)
 			return 0;
 		if (ret)
 			goto done;
 	}
 
-	ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, bio_ctrl, i_size,
-				    &nr);
+	ret = __extent_writepage_io(BTRFS_I(inode), page, bio_ctrl, i_size, &nr);
 	if (ret == 1)
 		return 0;
 
@@ -1773,6 +1657,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	if (PageError(page))
 		end_extent_writepage(page, ret, page_start, page_end);
 	if (bio_ctrl->extent_locked) {
+		struct writeback_control *wbc = bio_ctrl->wbc;
+
 		/*
 		 * If bio_ctrl->extent_locked, it's from extent_write_locked_range(),
 		 * the page can either be locked by lock_page() or
@@ -1828,7 +1714,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
 
 	if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
 		btrfs_tree_unlock(eb);
-		if (!bio_ctrl->sync_io)
+		if (bio_ctrl->wbc->sync_mode != WB_SYNC_ALL)
 			return 0;
 		if (!flush) {
 			submit_write_bio(bio_ctrl, 0);
@@ -2113,15 +1999,12 @@ static void prepare_eb_write(struct extent_buffer *eb)
  * Unlike the work in write_one_eb(), we rely completely on extent locking.
  * Page locking is only utilized at minimum to keep the VMM code happy.
  */
-static int write_one_subpage_eb(struct extent_buffer *eb,
-				struct writeback_control *wbc,
-				struct btrfs_bio_ctrl *bio_ctrl)
+static void write_one_subpage_eb(struct extent_buffer *eb,
+				 struct btrfs_bio_ctrl *bio_ctrl)
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
 	struct page *page = eb->pages[0];
-	blk_opf_t write_flags = wbc_to_write_flags(wbc);
 	bool no_dirty_ebs = false;
-	int ret;
 
 	prepare_eb_write(eb);
 
@@ -2137,36 +2020,22 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
 
 	bio_ctrl->end_io_func = end_bio_subpage_eb_writepage;
 
-	ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
-			bio_ctrl, eb->start, page, eb->len,
-			eb->start - page_offset(page), 0, false);
-	if (ret) {
-		btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
-		set_btree_ioerr(page, eb);
-		unlock_page(page);
-
-		if (atomic_dec_and_test(&eb->io_pages))
-			end_extent_buffer_writeback(eb);
-		return -EIO;
-	}
+	submit_extent_page(bio_ctrl, eb->start, page, eb->len,
+			   eb->start - page_offset(page));
 	unlock_page(page);
 	/*
 	 * Submission finished without problem, if no range of the page is
 	 * dirty anymore, we have submitted a page.  Update nr_written in wbc.
 	 */
 	if (no_dirty_ebs)
-		wbc->nr_to_write--;
-	return ret;
+		bio_ctrl->wbc->nr_to_write--;
 }
 
-static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
-			struct writeback_control *wbc,
+static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
 			struct btrfs_bio_ctrl *bio_ctrl)
 {
 	u64 disk_bytenr = eb->start;
 	int i, num_pages;
-	blk_opf_t write_flags = wbc_to_write_flags(wbc);
-	int ret = 0;
 
 	prepare_eb_write(eb);
 
@@ -2178,32 +2047,11 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 
 		clear_page_dirty_for_io(p);
 		set_page_writeback(p);
-		ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
-					 bio_ctrl, disk_bytenr, p,
-					 PAGE_SIZE, 0, 0, false);
-		if (ret) {
-			set_btree_ioerr(p, eb);
-			if (PageWriteback(p))
-				end_page_writeback(p);
-			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
-				end_extent_buffer_writeback(eb);
-			ret = -EIO;
-			break;
-		}
+		submit_extent_page(bio_ctrl, disk_bytenr, p, PAGE_SIZE, 0);
 		disk_bytenr += PAGE_SIZE;
-		wbc->nr_to_write--;
+		bio_ctrl->wbc->nr_to_write--;
 		unlock_page(p);
 	}
-
-	if (unlikely(ret)) {
-		for (; i < num_pages; i++) {
-			struct page *p = eb->pages[i];
-			clear_page_dirty_for_io(p);
-			unlock_page(p);
-		}
-	}
-
-	return ret;
 }
 
 /*
@@ -2220,9 +2068,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
  * Return >=0 for the number of submitted extent buffers.
  * Return <0 for fatal error.
  */
-static int submit_eb_subpage(struct page *page,
-			     struct writeback_control *wbc,
-			     struct btrfs_bio_ctrl *bio_ctrl)
+static int submit_eb_subpage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
 	int submitted = 0;
@@ -2284,10 +2130,8 @@ static int submit_eb_subpage(struct page *page,
 			free_extent_buffer(eb);
 			goto cleanup;
 		}
-		ret = write_one_subpage_eb(eb, wbc, bio_ctrl);
+		write_one_subpage_eb(eb, bio_ctrl);
 		free_extent_buffer(eb);
-		if (ret < 0)
-			goto cleanup;
 		submitted++;
 	}
 	return submitted;
@@ -2318,8 +2162,7 @@ static int submit_eb_subpage(struct page *page,
  * previous call.
  * Return <0 for fatal error.
  */
-static int submit_eb_page(struct page *page, struct writeback_control *wbc,
-			  struct btrfs_bio_ctrl *bio_ctrl,
+static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl,
 			  struct extent_buffer **eb_context)
 {
 	struct address_space *mapping = page->mapping;
@@ -2331,7 +2174,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
 		return 0;
 
 	if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
-		return submit_eb_subpage(page, wbc, bio_ctrl);
+		return submit_eb_subpage(page, bio_ctrl);
 
 	spin_lock(&mapping->private_lock);
 	if (!PagePrivate(page)) {
@@ -2364,7 +2207,8 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
 		 * If for_sync, this hole will be filled with
 		 * trasnsaction commit.
 		 */
-		if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
+		if (bio_ctrl->wbc->sync_mode == WB_SYNC_ALL &&
+		    !bio_ctrl->wbc->for_sync)
 			ret = -EAGAIN;
 		else
 			ret = 0;
@@ -2389,10 +2233,8 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
 		btrfs_schedule_zone_finish_bg(cache, eb);
 		btrfs_put_block_group(cache);
 	}
-	ret = write_one_eb(eb, wbc, bio_ctrl);
+	write_one_eb(eb, bio_ctrl);
 	free_extent_buffer(eb);
-	if (ret < 0)
-		return ret;
 	return 1;
 }
 
@@ -2401,8 +2243,9 @@ int btree_write_cache_pages(struct address_space *mapping,
 {
 	struct extent_buffer *eb_context = NULL;
 	struct btrfs_bio_ctrl bio_ctrl = {
+		.wbc = wbc,
+		.opf = REQ_OP_WRITE | wbc_to_write_flags(wbc),
 		.extent_locked = 0,
-		.sync_io = (wbc->sync_mode == WB_SYNC_ALL),
 	};
 	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
 	int ret = 0;
@@ -2445,8 +2288,7 @@ int btree_write_cache_pages(struct address_space *mapping,
 		for (i = 0; i < nr_folios; i++) {
 			struct folio *folio = fbatch.folios[i];
 
-			ret = submit_eb_page(&folio->page, wbc, &bio_ctrl,
-					&eb_context);
+			ret = submit_eb_page(&folio->page, &bio_ctrl, &eb_context);
 			if (ret == 0)
 				continue;
 			if (ret < 0) {
@@ -2529,9 +2371,9 @@ int btree_write_cache_pages(struct address_space *mapping,
  * existing IO to complete.
  */
 static int extent_write_cache_pages(struct address_space *mapping,
-			     struct writeback_control *wbc,
 			     struct btrfs_bio_ctrl *bio_ctrl)
 {
+	struct writeback_control *wbc = bio_ctrl->wbc;
 	struct inode *inode = mapping->host;
 	int ret = 0;
 	int done = 0;
@@ -2632,7 +2474,7 @@ static int extent_write_cache_pages(struct address_space *mapping,
 				continue;
 			}
 
-			ret = __extent_writepage(&folio->page, wbc, bio_ctrl);
+			ret = __extent_writepage(&folio->page, bio_ctrl);
 			if (ret < 0) {
 				done = 1;
 				break;
@@ -2688,18 +2530,19 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
 	u64 cur = start;
 	unsigned long nr_pages;
 	const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
-	struct btrfs_bio_ctrl bio_ctrl = {
-		.extent_locked = 1,
-		.sync_io = 1,
-	};
 	struct writeback_control wbc_writepages = {
 		.sync_mode	= WB_SYNC_ALL,
 		.range_start	= start,
 		.range_end	= end + 1,
-		/* We're called from an async helper function */
-		.punt_to_cgroup	= 1,
 		.no_cgroup_owner = 1,
 	};
+	struct btrfs_bio_ctrl bio_ctrl = {
+		.wbc = &wbc_writepages,
+		/* We're called from an async helper function */
+		.opf = REQ_OP_WRITE | REQ_BTRFS_CGROUP_PUNT |
+			wbc_to_write_flags(&wbc_writepages),
+		.extent_locked = 1,
+	};
 
 	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
 	nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >>
@@ -2719,7 +2562,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
 		ASSERT(PageLocked(page));
 		ASSERT(PageDirty(page));
 		clear_page_dirty_for_io(page);
-		ret = __extent_writepage(page, &wbc_writepages, &bio_ctrl);
+		ret = __extent_writepage(page, &bio_ctrl);
 		ASSERT(ret <= 0);
 		if (ret < 0) {
 			found_error = true;
@@ -2743,8 +2586,9 @@ int extent_writepages(struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	int ret = 0;
 	struct btrfs_bio_ctrl bio_ctrl = {
+		.wbc = wbc,
+		.opf = REQ_OP_WRITE | wbc_to_write_flags(wbc),
 		.extent_locked = 0,
-		.sync_io = (wbc->sync_mode == WB_SYNC_ALL),
 	};
 
 	/*
@@ -2752,7 +2596,7 @@ int extent_writepages(struct address_space *mapping,
 	 * protect the write pointer updates.
 	 */
 	btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
-	ret = extent_write_cache_pages(mapping, wbc, &bio_ctrl);
+	ret = extent_write_cache_pages(mapping, &bio_ctrl);
 	submit_write_bio(&bio_ctrl, ret);
 	btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
 	return ret;
@@ -2760,7 +2604,7 @@ int extent_writepages(struct address_space *mapping,
 
 void extent_readahead(struct readahead_control *rac)
 {
-	struct btrfs_bio_ctrl bio_ctrl = { 0 };
+	struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD };
 	struct page *pagepool[16];
 	struct extent_map *em_cached = NULL;
 	u64 prev_em_start = (u64)-1;
@@ -4407,10 +4251,11 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
 	struct page *page = eb->pages[0];
 	struct extent_state *cached_state = NULL;
 	struct btrfs_bio_ctrl bio_ctrl = {
+		.opf = REQ_OP_READ,
 		.mirror_num = mirror_num,
 		.parent_check = check,
 	};
-	int ret = 0;
+	int ret;
 
 	ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
 	ASSERT(PagePrivate(page));
@@ -4428,14 +4273,13 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
 			return ret;
 	}
 
-	ret = 0;
 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
 	    PageUptodate(page) ||
 	    btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 		unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
 			      &cached_state);
-		return ret;
+		return 0;
 	}
 
 	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
@@ -4447,28 +4291,19 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
 	btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
 
 	btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
-	ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl,
-				 eb->start, page, eb->len,
-				 eb->start - page_offset(page), 0, true);
-	if (ret) {
-		/*
-		 * In the endio function, if we hit something wrong we will
-		 * increase the io_pages, so here we need to decrease it for
-		 * error path.
-		 */
-		atomic_dec(&eb->io_pages);
-	}
+	submit_extent_page(&bio_ctrl, eb->start, page, eb->len,
+			   eb->start - page_offset(page));
 	submit_one_bio(&bio_ctrl);
-	if (ret || wait != WAIT_COMPLETE) {
+	if (wait != WAIT_COMPLETE) {
 		free_extent_state(cached_state);
-		return ret;
+		return 0;
 	}
 
 	wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1,
 			EXTENT_LOCKED, &cached_state);
 	if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
-		ret = -EIO;
-	return ret;
+		return -EIO;
+	return 0;
 }
 
 int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
@@ -4476,13 +4311,12 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
 {
 	int i;
 	struct page *page;
-	int err;
-	int ret = 0;
 	int locked_pages = 0;
 	int all_uptodate = 1;
 	int num_pages;
 	unsigned long num_reads = 0;
 	struct btrfs_bio_ctrl bio_ctrl = {
+		.opf = REQ_OP_READ,
 		.mirror_num = mirror_num,
 		.parent_check = check,
 	};
@@ -4550,27 +4384,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
 		page = eb->pages[i];
 
 		if (!PageUptodate(page)) {
-			if (ret) {
-				atomic_dec(&eb->io_pages);
-				unlock_page(page);
-				continue;
-			}
-
 			ClearPageError(page);
-			err = submit_extent_page(REQ_OP_READ, NULL,
-					 &bio_ctrl, page_offset(page), page,
-					 PAGE_SIZE, 0, 0, false);
-			if (err) {
-				/*
-				 * We failed to submit the bio so it's the
-				 * caller's responsibility to perform cleanup
-				 * i.e unlock page/set error bit.
-				 */
-				ret = err;
-				SetPageError(page);
-				unlock_page(page);
-				atomic_dec(&eb->io_pages);
-			}
+			submit_extent_page(&bio_ctrl, page_offset(page), page,
+					   PAGE_SIZE, 0);
 		} else {
 			unlock_page(page);
 		}
@@ -4578,17 +4394,17 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
 
 	submit_one_bio(&bio_ctrl);
 
-	if (ret || wait != WAIT_COMPLETE)
-		return ret;
+	if (wait != WAIT_COMPLETE)
+		return 0;
 
 	for (i = 0; i < num_pages; i++) {
 		page = eb->pages[i];
 		wait_on_page_locked(page);
 		if (!PageUptodate(page))
-			ret = -EIO;
+			return -EIO;
 	}
 
-	return ret;
+	return 0;
 
 unlock_exit:
 	while (locked_pages > 0) {
@@ -4596,7 +4412,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
 		page = eb->pages[locked_pages];
 		unlock_page(page);
 	}
-	return ret;
+	return 0;
 }
 
 static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 9e45b416a9c8..d1cd0a692f8e 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -336,48 +336,6 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
-/*
- * Locate the file_offset of @cur_disk_bytenr of a @bio.
- *
- * Bio of btrfs represents read range of
- * [bi_sector << 9, bi_sector << 9 + bi_size).
- * Knowing this, we can iterate through each bvec to locate the page belong to
- * @cur_disk_bytenr and get the file offset.
- *
- * @inode is used to determine if the bvec page really belongs to @inode.
- *
- * Return 0 if we can't find the file offset
- * Return >0 if we find the file offset and restore it to @file_offset_ret
- */
-static int search_file_offset_in_bio(struct bio *bio, struct inode *inode,
-				     u64 disk_bytenr, u64 *file_offset_ret)
-{
-	struct bvec_iter iter;
-	struct bio_vec bvec;
-	u64 cur = bio->bi_iter.bi_sector << SECTOR_SHIFT;
-	int ret = 0;
-
-	bio_for_each_segment(bvec, bio, iter) {
-		struct page *page = bvec.bv_page;
-
-		if (cur > disk_bytenr)
-			break;
-		if (cur + bvec.bv_len <= disk_bytenr) {
-			cur += bvec.bv_len;
-			continue;
-		}
-		ASSERT(in_range(disk_bytenr, cur, bvec.bv_len));
-		if (page->mapping && page->mapping->host &&
-		    page->mapping->host == inode) {
-			ret = 1;
-			*file_offset_ret = page_offset(page) + bvec.bv_offset +
-					   disk_bytenr - cur;
-			break;
-		}
-	}
-	return ret;
-}
-
 /*
  * Lookup the checksum for the read bio in csum tree.
  *
@@ -387,17 +345,15 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
 {
 	struct btrfs_inode *inode = bbio->inode;
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-	struct extent_io_tree *io_tree = &inode->io_tree;
 	struct bio *bio = &bbio->bio;
 	struct btrfs_path *path;
 	const u32 sectorsize = fs_info->sectorsize;
 	const u32 csum_size = fs_info->csum_size;
 	u32 orig_len = bio->bi_iter.bi_size;
 	u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
-	u64 cur_disk_bytenr;
 	const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits;
-	int count = 0;
 	blk_status_t ret = BLK_STS_OK;
+	u32 bio_offset = 0;
 
 	if ((inode->flags & BTRFS_INODE_NODATASUM) ||
 	    test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
@@ -448,28 +404,14 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
 		path->skip_locking = 1;
 	}
 
-	for (cur_disk_bytenr = orig_disk_bytenr;
-	     cur_disk_bytenr < orig_disk_bytenr + orig_len;
-	     cur_disk_bytenr += (count * sectorsize)) {
-		u64 search_len = orig_disk_bytenr + orig_len - cur_disk_bytenr;
-		unsigned int sector_offset;
-		u8 *csum_dst;
-
-		/*
-		 * Although both cur_disk_bytenr and orig_disk_bytenr is u64,
-		 * we're calculating the offset to the bio start.
-		 *
-		 * Bio size is limited to UINT_MAX, thus unsigned int is large
-		 * enough to contain the raw result, not to mention the right
-		 * shifted result.
-		 */
-		ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX);
-		sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >>
-				fs_info->sectorsize_bits;
-		csum_dst = bbio->csum + sector_offset * csum_size;
+	while (bio_offset < orig_len) {
+		int count;
+		u64 cur_disk_bytenr = orig_disk_bytenr + bio_offset;
+		u8 *csum_dst = bbio->csum +
+			(bio_offset >> fs_info->sectorsize_bits) * csum_size;
 
 		count = search_csum_tree(fs_info, path, cur_disk_bytenr,
-					 search_len, csum_dst);
+					 orig_len - bio_offset, csum_dst);
 		if (count < 0) {
 			ret = errno_to_blk_status(count);
 			if (bbio->csum != bbio->csum_inline)
@@ -494,14 +436,9 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
 
 			if (inode->root->root_key.objectid ==
 			    BTRFS_DATA_RELOC_TREE_OBJECTID) {
-				u64 file_offset;
-				int ret;
-
-				ret = search_file_offset_in_bio(bio,
-						&inode->vfs_inode,
-						cur_disk_bytenr, &file_offset);
-				if (ret)
-					set_extent_bits(io_tree, file_offset,
+				u64 file_offset = bbio->file_offset + bio_offset;
+
+				set_extent_bits(&inode->io_tree, file_offset,
 						file_offset + sectorsize - 1,
 						EXTENT_NODATASUM);
 			} else {
@@ -510,6 +447,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
 				cur_disk_bytenr, cur_disk_bytenr + sectorsize);
 			}
 		}
+		bio_offset += count * sectorsize;
 	}
 
 	btrfs_free_path(path);
@@ -660,7 +598,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
  * in is large enough to contain all csums.
  */
 int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
-			      u8 *csum_buf, unsigned long *csum_bitmap)
+			      u8 *csum_buf, unsigned long *csum_bitmap,
+			      bool search_commit)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_key key;
@@ -677,6 +616,12 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
 	if (!path)
 		return -ENOMEM;
 
+	if (search_commit) {
+		path->skip_locking = 1;
+		path->reada = READA_FORWARD;
+		path->search_commit_root = 1;
+	}
+
 	key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
 	key.type = BTRFS_EXTENT_CSUM_KEY;
 	key.offset = start;
diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h
index cd7f2ae515c0..6be8725cd574 100644
--- a/fs/btrfs/file-item.h
+++ b/fs/btrfs/file-item.h
@@ -57,7 +57,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
 			    struct list_head *list, int search_commit,
 			    bool nowait);
 int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
-			      u8 *csum_buf, unsigned long *csum_bitmap);
+			      u8 *csum_buf, unsigned long *csum_bitmap,
+			      bool search_commit);
 void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
 				     const struct btrfs_path *path,
 				     struct btrfs_file_extent_item *fi,
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 24cd49229408..0d98fc5f6f44 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -24,6 +24,18 @@
 #define BTRFS_SUPER_INFO_SIZE			4096
 static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
 
+/*
+ * Number of metadata items necessary for an unlink operation:
+ *
+ * 1 for the possible orphan item
+ * 1 for the dir item
+ * 1 for the dir index
+ * 1 for the inode ref
+ * 1 for the inode
+ * 1 for the parent inode
+ */
+#define BTRFS_UNLINK_METADATA_UNITS		6
+
 /*
  * The reserved space at the beginning of each device.  It covers the primary
  * super block and leaves space for potential use by other tools like
@@ -193,11 +205,7 @@ enum {
 #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET	0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR	0ULL
 
-#ifdef CONFIG_BTRFS_DEBUG
-/*
- * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG
- */
-#define BTRFS_FEATURE_INCOMPAT_SUPP			\
+#define BTRFS_FEATURE_INCOMPAT_SUPP_STABLE		\
 	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |		\
 	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |	\
 	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
@@ -210,23 +218,22 @@ enum {
 	 BTRFS_FEATURE_INCOMPAT_NO_HOLES	|	\
 	 BTRFS_FEATURE_INCOMPAT_METADATA_UUID	|	\
 	 BTRFS_FEATURE_INCOMPAT_RAID1C34	|	\
-	 BTRFS_FEATURE_INCOMPAT_ZONED		|	\
+	 BTRFS_FEATURE_INCOMPAT_ZONED)
+
+#ifdef CONFIG_BTRFS_DEBUG
+	/*
+	 * Features under developmen like Extent tree v2 support is enabled
+	 * only under CONFIG_BTRFS_DEBUG.
+	 */
+#define BTRFS_FEATURE_INCOMPAT_SUPP		\
+	(BTRFS_FEATURE_INCOMPAT_SUPP_STABLE |	\
 	 BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
+
 #else
-#define BTRFS_FEATURE_INCOMPAT_SUPP			\
-	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |		\
-	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |	\
-	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
-	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\
-	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
-	 BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD |		\
-	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
-	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF |		\
-	 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA |	\
-	 BTRFS_FEATURE_INCOMPAT_NO_HOLES	|	\
-	 BTRFS_FEATURE_INCOMPAT_METADATA_UUID	|	\
-	 BTRFS_FEATURE_INCOMPAT_RAID1C34	|	\
-	 BTRFS_FEATURE_INCOMPAT_ZONED)
+
+#define BTRFS_FEATURE_INCOMPAT_SUPP		\
+	(BTRFS_FEATURE_INCOMPAT_SUPP_STABLE)
+
 #endif
 
 #define BTRFS_FEATURE_INCOMPAT_SAFE_SET			\
@@ -412,7 +419,6 @@ struct btrfs_fs_info {
 	 * Must be written and read while holding btrfs_fs_info::commit_root_sem.
 	 */
 	u64 last_reloc_trans;
-	u64 avg_delayed_ref_runtime;
 
 	/*
 	 * This is updated to the current trans every time a full commit is
@@ -638,7 +644,6 @@ struct btrfs_fs_info {
 	refcount_t scrub_workers_refcnt;
 	struct workqueue_struct *scrub_workers;
 	struct workqueue_struct *scrub_wr_completion_workers;
-	struct workqueue_struct *scrub_parity_workers;
 	struct btrfs_subpage_info *subpage_info;
 
 	struct btrfs_discard_ctl discard_ctl;
@@ -828,7 +833,7 @@ static inline u64 btrfs_csum_bytes_to_leaves(
  * Use this if we would be adding new items, as we could split nodes as we cow
  * down the tree.
  */
-static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
+static inline u64 btrfs_calc_insert_metadata_size(const struct btrfs_fs_info *fs_info,
 						  unsigned num_items)
 {
 	return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
@@ -838,7 +843,7 @@ static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
  * Doing a truncate or a modification won't result in new nodes or leaves, just
  * what we need for COW.
  */
-static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
+static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info,
 						 unsigned num_items)
 {
 	return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index b65c45b5d681..4c322b720a80 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -527,7 +527,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 
 	while (1) {
 		u64 clear_start = 0, clear_len = 0, extent_start = 0;
-		bool should_throttle = false;
+		bool refill_delayed_refs_rsv = false;
 
 		fi = NULL;
 		leaf = path->nodes[0];
@@ -660,8 +660,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 				/* No pending yet, add ourselves */
 				pending_del_slot = path->slots[0];
 				pending_del_nr = 1;
-			} else if (pending_del_nr &&
-				   path->slots[0] + 1 == pending_del_slot) {
+			} else if (path->slots[0] + 1 == pending_del_slot) {
 				/* Hop on the pending chunk */
 				pending_del_nr++;
 				pending_del_slot = path->slots[0];
@@ -686,10 +685,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 				btrfs_abort_transaction(trans, ret);
 				break;
 			}
-			if (be_nice) {
-				if (btrfs_should_throttle_delayed_refs(trans))
-					should_throttle = true;
-			}
+			if (be_nice && btrfs_check_space_for_delayed_refs(fs_info))
+				refill_delayed_refs_rsv = true;
 		}
 
 		if (found_type == BTRFS_INODE_ITEM_KEY)
@@ -697,7 +694,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 
 		if (path->slots[0] == 0 ||
 		    path->slots[0] != pending_del_slot ||
-		    should_throttle) {
+		    refill_delayed_refs_rsv) {
 			if (pending_del_nr) {
 				ret = btrfs_del_items(trans, root, path,
 						pending_del_slot,
@@ -720,7 +717,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			 * actually allocate, so just bail if we're short and
 			 * let the normal reservation dance happen higher up.
 			 */
-			if (should_throttle) {
+			if (refill_delayed_refs_rsv) {
 				ret = btrfs_delayed_refs_rsv_refill(fs_info,
 							BTRFS_RESERVE_NO_FLUSH);
 				if (ret) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 00ade644579f..7fcafcc5292c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -79,6 +79,7 @@ struct btrfs_iget_args {
 struct btrfs_dio_data {
 	ssize_t submitted;
 	struct extent_changeset *data_reserved;
+	struct btrfs_ordered_extent *ordered;
 	bool data_space_reserved;
 	bool nocow_done;
 };
@@ -669,8 +670,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
 again:
 	will_compress = 0;
 	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
-	nr_pages = min_t(unsigned long, nr_pages,
-			BTRFS_MAX_COMPRESSED / PAGE_SIZE);
+	nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES);
 
 	/*
 	 * we don't want to send crud past the end of i_size through
@@ -945,10 +945,9 @@ static int submit_uncompressed_range(struct btrfs_inode *inode,
 	ret = cow_file_range(inode, locked_page, start, end, &page_started,
 			     &nr_written, 0, NULL);
 	/* Inline extent inserted, page gets unlocked and everything is done */
-	if (page_started) {
-		ret = 0;
-		goto out;
-	}
+	if (page_started)
+		return 0;
+
 	if (ret < 0) {
 		btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
 		if (locked_page) {
@@ -962,14 +961,11 @@ static int submit_uncompressed_range(struct btrfs_inode *inode,
 			end_extent_writepage(locked_page, ret, page_start, page_end);
 			unlock_page(locked_page);
 		}
-		goto out;
+		return ret;
 	}
 
-	ret = extent_write_locked_range(&inode->vfs_inode, start, end);
 	/* All pages will be unlocked, including @locked_page */
-out:
-	kfree(async_extent);
-	return ret;
+	return extent_write_locked_range(&inode->vfs_inode, start, end);
 }
 
 static int submit_one_async_extent(struct btrfs_inode *inode,
@@ -987,6 +983,9 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
 	u64 start = async_extent->start;
 	u64 end = async_extent->start + async_extent->ram_size - 1;
 
+	if (async_chunk->blkcg_css)
+		kthread_associate_blkcg(async_chunk->blkcg_css);
+
 	/*
 	 * If async_chunk->locked_page is in the async_extent range, we need to
 	 * handle it.
@@ -1001,8 +1000,10 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
 	lock_extent(io_tree, start, end, NULL);
 
 	/* We have fall back to uncompressed write */
-	if (!async_extent->pages)
-		return submit_uncompressed_range(inode, async_extent, locked_page);
+	if (!async_extent->pages) {
+		ret = submit_uncompressed_range(inode, async_extent, locked_page);
+		goto done;
+	}
 
 	ret = btrfs_reserve_extent(root, async_extent->ram_size,
 				   async_extent->compressed_size,
@@ -1054,24 +1055,18 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
 	extent_clear_unlock_delalloc(inode, start, end,
 			NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
 			PAGE_UNLOCK | PAGE_START_WRITEBACK);
-	if (btrfs_submit_compressed_write(inode, start,	/* file_offset */
+
+	btrfs_submit_compressed_write(inode, start,	/* file_offset */
 			    async_extent->ram_size,	/* num_bytes */
 			    ins.objectid,		/* disk_bytenr */
 			    ins.offset,			/* compressed_len */
 			    async_extent->pages,	/* compressed_pages */
 			    async_extent->nr_pages,
-			    async_chunk->write_flags,
-			    async_chunk->blkcg_css, true)) {
-		const u64 start = async_extent->start;
-		const u64 end = start + async_extent->ram_size - 1;
-
-		btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0);
-
-		extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
-					     PAGE_END_WRITEBACK | PAGE_SET_ERROR);
-		free_async_extent_pages(async_extent);
-	}
+			    async_chunk->write_flags, true);
 	*alloc_hint = ins.objectid + ins.offset;
+done:
+	if (async_chunk->blkcg_css)
+		kthread_associate_blkcg(NULL);
 	kfree(async_extent);
 	return ret;
 
@@ -1086,8 +1081,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
 				     PAGE_UNLOCK | PAGE_START_WRITEBACK |
 				     PAGE_END_WRITEBACK | PAGE_SET_ERROR);
 	free_async_extent_pages(async_extent);
-	kfree(async_extent);
-	return ret;
+	goto done;
 }
 
 /*
@@ -1622,6 +1616,7 @@ static int cow_file_range_async(struct btrfs_inode *inode,
 		if (blkcg_css != blkcg_root_css) {
 			css_get(blkcg_css);
 			async_chunk[i].blkcg_css = blkcg_css;
+			async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT;
 		} else {
 			async_chunk[i].blkcg_css = NULL;
 		}
@@ -2521,37 +2516,31 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
 }
 
 /*
- * Split an extent_map at [start, start + len]
+ * Split off the first pre bytes from the extent_map at [start, start + len]
  *
  * This function is intended to be used only for extract_ordered_extent().
  */
-static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
-			  u64 pre, u64 post)
+static int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre)
 {
 	struct extent_map_tree *em_tree = &inode->extent_tree;
 	struct extent_map *em;
 	struct extent_map *split_pre = NULL;
 	struct extent_map *split_mid = NULL;
-	struct extent_map *split_post = NULL;
 	int ret = 0;
 	unsigned long flags;
 
-	/* Sanity check */
-	if (pre == 0 && post == 0)
-		return 0;
+	ASSERT(pre != 0);
+	ASSERT(pre < len);
 
 	split_pre = alloc_extent_map();
-	if (pre)
-		split_mid = alloc_extent_map();
-	if (post)
-		split_post = alloc_extent_map();
-	if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
+	if (!split_pre)
+		return -ENOMEM;
+	split_mid = alloc_extent_map();
+	if (!split_mid) {
 		ret = -ENOMEM;
-		goto out;
+		goto out_free_pre;
 	}
 
-	ASSERT(pre + post < len);
-
 	lock_extent(&inode->io_tree, start, start + len - 1, NULL);
 	write_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, start, len);
@@ -2572,7 +2561,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
 
 	/* First, replace the em with a new extent_map starting from * em->start */
 	split_pre->start = em->start;
-	split_pre->len = (pre ? pre : em->len - post);
+	split_pre->len = pre;
 	split_pre->orig_start = split_pre->start;
 	split_pre->block_start = em->block_start;
 	split_pre->block_len = split_pre->len;
@@ -2586,38 +2575,21 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
 
 	/*
 	 * Now we only have an extent_map at:
-	 *     [em->start, em->start + pre] if pre != 0
-	 *     [em->start, em->start + em->len - post] if pre == 0
-	 */
-
-	if (pre) {
-		/* Insert the middle extent_map */
-		split_mid->start = em->start + pre;
-		split_mid->len = em->len - pre - post;
-		split_mid->orig_start = split_mid->start;
-		split_mid->block_start = em->block_start + pre;
-		split_mid->block_len = split_mid->len;
-		split_mid->orig_block_len = split_mid->block_len;
-		split_mid->ram_bytes = split_mid->len;
-		split_mid->flags = flags;
-		split_mid->compress_type = em->compress_type;
-		split_mid->generation = em->generation;
-		add_extent_mapping(em_tree, split_mid, 1);
-	}
-
-	if (post) {
-		split_post->start = em->start + em->len - post;
-		split_post->len = post;
-		split_post->orig_start = split_post->start;
-		split_post->block_start = em->block_start + em->len - post;
-		split_post->block_len = split_post->len;
-		split_post->orig_block_len = split_post->block_len;
-		split_post->ram_bytes = split_post->len;
-		split_post->flags = flags;
-		split_post->compress_type = em->compress_type;
-		split_post->generation = em->generation;
-		add_extent_mapping(em_tree, split_post, 1);
-	}
+	 *     [em->start, em->start + pre]
+	 */
+
+	/* Insert the middle extent_map. */
+	split_mid->start = em->start + pre;
+	split_mid->len = em->len - pre;
+	split_mid->orig_start = split_mid->start;
+	split_mid->block_start = em->block_start + pre;
+	split_mid->block_len = split_mid->len;
+	split_mid->orig_block_len = split_mid->block_len;
+	split_mid->ram_bytes = split_mid->len;
+	split_mid->flags = flags;
+	split_mid->compress_type = em->compress_type;
+	split_mid->generation = em->generation;
+	add_extent_mapping(em_tree, split_mid, 1);
 
 	/* Once for us */
 	free_extent_map(em);
@@ -2627,72 +2599,41 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
 out_unlock:
 	write_unlock(&em_tree->lock);
 	unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
-out:
-	free_extent_map(split_pre);
 	free_extent_map(split_mid);
-	free_extent_map(split_post);
-
+out_free_pre:
+	free_extent_map(split_pre);
 	return ret;
 }
 
-blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio)
+int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
+				 struct btrfs_ordered_extent *ordered)
 {
 	u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
 	u64 len = bbio->bio.bi_iter.bi_size;
 	struct btrfs_inode *inode = bbio->inode;
-	struct btrfs_ordered_extent *ordered;
-	u64 file_len;
-	u64 end = start + len;
-	u64 ordered_end;
-	u64 pre, post;
+	u64 ordered_len = ordered->num_bytes;
 	int ret = 0;
 
-	ordered = btrfs_lookup_ordered_extent(inode, bbio->file_offset);
-	if (WARN_ON_ONCE(!ordered))
-		return BLK_STS_IOERR;
+	/* Must always be called for the beginning of an ordered extent. */
+	if (WARN_ON_ONCE(start != ordered->disk_bytenr))
+		return -EINVAL;
 
-	/* No need to split */
+	/* No need to split if the ordered extent covers the entire bio. */
 	if (ordered->disk_num_bytes == len)
-		goto out;
-
-	/* We cannot split once end_bio'd ordered extent */
-	if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	/* We cannot split a compressed ordered extent */
-	if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes;
-	/* bio must be in one ordered extent */
-	if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	/* Checksum list should be empty */
-	if (WARN_ON_ONCE(!list_empty(&ordered->list))) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	file_len = ordered->num_bytes;
-	pre = start - ordered->disk_bytenr;
-	post = ordered_end - end;
+		return 0;
 
-	ret = btrfs_split_ordered_extent(ordered, pre, post);
+	ret = btrfs_split_ordered_extent(ordered, len);
 	if (ret)
-		goto out;
-	ret = split_zoned_em(inode, bbio->file_offset, file_len, pre, post);
+		return ret;
 
-out:
-	btrfs_put_ordered_extent(ordered);
+	/*
+	 * Don't split the extent_map for NOCOW extents, as we're writing into
+	 * a pre-existing one.
+	 */
+	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+		return 0;
 
-	return errno_to_blk_status(ret);
+	return split_extent_map(inode, bbio->file_offset, ordered_len, len);
 }
 
 /*
@@ -3370,13 +3311,6 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
 	return 0;
 }
 
-static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 offset)
-{
-	u64 offset_in_sectors = offset >> fs_info->sectorsize_bits;
-
-	return csums + offset_in_sectors * fs_info->csum_size;
-}
-
 /*
  * Verify the checksum of a single data sector.
  *
@@ -3414,7 +3348,8 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
 		return true;
 	}
 
-	csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset);
+	csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
+				fs_info->csum_size;
 	if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum,
 				    csum_expected))
 		goto zeroit;
@@ -3694,6 +3629,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 			trans = btrfs_start_transaction(root, 1);
 			if (IS_ERR(trans)) {
 				ret = PTR_ERR(trans);
+				iput(inode);
 				goto out;
 			}
 			btrfs_debug(fs_info, "auto deleting %Lu",
@@ -3701,8 +3637,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 			ret = btrfs_del_orphan_item(trans, root,
 						    found_key.objectid);
 			btrfs_end_transaction(trans);
-			if (ret)
+			if (ret) {
+				iput(inode);
 				goto out;
+			}
 			continue;
 		}
 
@@ -4264,15 +4202,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
 {
 	struct btrfs_root *root = dir->root;
 
-	/*
-	 * 1 for the possible orphan item
-	 * 1 for the dir item
-	 * 1 for the dir index
-	 * 1 for the inode ref
-	 * 1 for the inode
-	 * 1 for the parent inode
-	 */
-	return btrfs_start_transaction_fallback_global_rsv(root, 6);
+	return btrfs_start_transaction_fallback_global_rsv(root,
+						   BTRFS_UNLINK_METADATA_UNITS);
 }
 
 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -5246,7 +5177,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_trans_handle *trans;
-	u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1);
+	u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1);
 	int ret;
 
 	/*
@@ -5284,7 +5215,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
 		trans->block_rsv = &fs_info->trans_block_rsv;
 		trans->bytes_reserved = delayed_refs_extra;
 		btrfs_block_rsv_migrate(rsv, trans->block_rsv,
-					delayed_refs_extra, 1);
+					delayed_refs_extra, true);
 	}
 	return trans;
 }
@@ -5294,7 +5225,7 @@ void btrfs_evict_inode(struct inode *inode)
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_block_rsv *rsv;
+	struct btrfs_block_rsv *rsv = NULL;
 	int ret;
 
 	trace_btrfs_inode_evict(inode);
@@ -5311,18 +5242,18 @@ void btrfs_evict_inode(struct inode *inode)
 	    ((btrfs_root_refs(&root->root_item) != 0 &&
 	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
 	     btrfs_is_free_space_inode(BTRFS_I(inode))))
-		goto no_delete;
+		goto out;
 
 	if (is_bad_inode(inode))
-		goto no_delete;
+		goto out;
 
 	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
-		goto no_delete;
+		goto out;
 
 	if (inode->i_nlink > 0) {
 		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
 		       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
-		goto no_delete;
+		goto out;
 	}
 
 	/*
@@ -5331,7 +5262,7 @@ void btrfs_evict_inode(struct inode *inode)
 	 */
 	ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
 	if (ret)
-		goto no_delete;
+		goto out;
 
 	/*
 	 * This drops any pending insert or delete operations we have for this
@@ -5343,7 +5274,7 @@ void btrfs_evict_inode(struct inode *inode)
 
 	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
 	if (!rsv)
-		goto no_delete;
+		goto out;
 	rsv->size = btrfs_calc_metadata_size(fs_info, 1);
 	rsv->failfast = true;
 
@@ -5359,16 +5290,21 @@ void btrfs_evict_inode(struct inode *inode)
 
 		trans = evict_refill_and_join(root, rsv);
 		if (IS_ERR(trans))
-			goto free_rsv;
+			goto out;
 
 		trans->block_rsv = rsv;
 
 		ret = btrfs_truncate_inode_items(trans, root, &control);
 		trans->block_rsv = &fs_info->trans_block_rsv;
 		btrfs_end_transaction(trans);
-		btrfs_btree_balance_dirty(fs_info);
+		/*
+		 * We have not added new delayed items for our inode after we
+		 * have flushed its delayed items, so no need to throttle on
+		 * delayed items. However we have modified extent buffers.
+		 */
+		btrfs_btree_balance_dirty_nodelay(fs_info);
 		if (ret && ret != -ENOSPC && ret != -EAGAIN)
-			goto free_rsv;
+			goto out;
 		else if (!ret)
 			break;
 	}
@@ -5390,9 +5326,8 @@ void btrfs_evict_inode(struct inode *inode)
 		btrfs_end_transaction(trans);
 	}
 
-free_rsv:
+out:
 	btrfs_free_block_rsv(fs_info, rsv);
-no_delete:
 	/*
 	 * If we didn't successfully delete, the orphan item will still be in
 	 * the tree and we'll retry on the next mount. Again, we might also want
@@ -6984,6 +6919,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
 }
 
 static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
+						  struct btrfs_dio_data *dio_data,
 						  const u64 start,
 						  const u64 len,
 						  const u64 orig_start,
@@ -6994,7 +6930,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
 						  const int type)
 {
 	struct extent_map *em = NULL;
-	int ret;
+	struct btrfs_ordered_extent *ordered;
 
 	if (type != BTRFS_ORDERED_NOCOW) {
 		em = create_io_em(inode, start, len, orig_start, block_start,
@@ -7004,18 +6940,21 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
 		if (IS_ERR(em))
 			goto out;
 	}
-	ret = btrfs_add_ordered_extent(inode, start, len, len, block_start,
-				       block_len, 0,
-				       (1 << type) |
-				       (1 << BTRFS_ORDERED_DIRECT),
-				       BTRFS_COMPRESS_NONE);
-	if (ret) {
+	ordered = btrfs_alloc_ordered_extent(inode, start, len, len,
+					     block_start, block_len, 0,
+					     (1 << type) |
+					     (1 << BTRFS_ORDERED_DIRECT),
+					     BTRFS_COMPRESS_NONE);
+	if (IS_ERR(ordered)) {
 		if (em) {
 			free_extent_map(em);
 			btrfs_drop_extent_map_range(inode, start,
 						    start + len - 1, false);
 		}
-		em = ERR_PTR(ret);
+		em = ERR_CAST(ordered);
+	} else {
+		ASSERT(!dio_data->ordered);
+		dio_data->ordered = ordered;
 	}
  out:
 
@@ -7023,6 +6962,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
 }
 
 static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
+						  struct btrfs_dio_data *dio_data,
 						  u64 start, u64 len)
 {
 	struct btrfs_root *root = inode->root;
@@ -7038,7 +6978,7 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
 	if (ret)
 		return ERR_PTR(ret);
 
-	em = btrfs_create_dio_extent(inode, start, ins.offset, start,
+	em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start,
 				     ins.objectid, ins.offset, ins.offset,
 				     ins.offset, BTRFS_ORDERED_REGULAR);
 	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
@@ -7384,7 +7324,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
 		}
 		space_reserved = true;
 
-		em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
+		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len,
 					      orig_start, block_start,
 					      len, orig_block_len,
 					      ram_bytes, type);
@@ -7430,7 +7370,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
 			goto out;
 		space_reserved = true;
 
-		em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
+		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
 		if (IS_ERR(em)) {
 			ret = PTR_ERR(em);
 			goto out;
@@ -7737,6 +7677,10 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
 				      pos + length - 1, NULL);
 		ret = -ENOTBLK;
 	}
+	if (write) {
+		btrfs_put_ordered_extent(dio_data->ordered);
+		dio_data->ordered = NULL;
+	}
 
 	if (write)
 		extent_changeset_free(dio_data->data_reserved);
@@ -7776,14 +7720,34 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
 		container_of(bbio, struct btrfs_dio_private, bbio);
 	struct btrfs_dio_data *dio_data = iter->private;
 
-	btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private);
+	btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
+		       btrfs_dio_end_io, bio->bi_private);
+	bbio->inode = BTRFS_I(iter->inode);
 	bbio->file_offset = file_offset;
 
 	dip->file_offset = file_offset;
 	dip->bytes = bio->bi_iter.bi_size;
 
 	dio_data->submitted += bio->bi_iter.bi_size;
-	btrfs_submit_bio(bio, 0);
+
+	/*
+	 * Check if we are doing a partial write.  If we are, we need to split
+	 * the ordered extent to match the submitted bio.  Hang on to the
+	 * remaining unfinishable ordered_extent in dio_data so that it can be
+	 * cancelled in iomap_end to avoid a deadlock wherein faulting the
+	 * remaining pages is blocked on the outstanding ordered extent.
+	 */
+	if (iter->flags & IOMAP_WRITE) {
+		int ret;
+
+		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
+		if (ret) {
+			btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
+			return;
+		}
+	}
+
+	btrfs_submit_bio(bbio, 0);
 }
 
 static const struct iomap_ops btrfs_dio_iomap_ops = {
@@ -7798,7 +7762,7 @@ static const struct iomap_dio_ops btrfs_dio_ops = {
 
 ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
 {
-	struct btrfs_dio_data data;
+	struct btrfs_dio_data data = { 0 };
 
 	return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
 			    IOMAP_DIO_PARTIAL, &data, done_before);
@@ -7807,7 +7771,7 @@ ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_be
 struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
 				  size_t done_before)
 {
-	struct btrfs_dio_data data;
+	struct btrfs_dio_data data = { 0 };
 
 	return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
 			    IOMAP_DIO_PARTIAL, &data, done_before);
@@ -9917,8 +9881,6 @@ static ssize_t btrfs_encoded_read_inline(
 }
 
 struct btrfs_encoded_read_private {
-	struct btrfs_inode *inode;
-	u64 file_offset;
 	wait_queue_head_t wait;
 	atomic_t pending;
 	blk_status_t status;
@@ -9948,45 +9910,41 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
 					  u64 file_offset, u64 disk_bytenr,
 					  u64 disk_io_size, struct page **pages)
 {
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	struct btrfs_encoded_read_private priv = {
-		.inode = inode,
-		.file_offset = file_offset,
 		.pending = ATOMIC_INIT(1),
 	};
 	unsigned long i = 0;
-	u64 cur = 0;
+	struct btrfs_bio *bbio;
 
 	init_waitqueue_head(&priv.wait);
-	/* Submit bios for the extent, splitting due to bio limits as necessary. */
-	while (cur < disk_io_size) {
-		struct bio *bio = NULL;
-		u64 remaining = disk_io_size - cur;
-
-		while (bio || remaining) {
-			size_t bytes = min_t(u64, remaining, PAGE_SIZE);
-
-			if (!bio) {
-				bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ,
-						      inode,
-						      btrfs_encoded_read_endio,
-						      &priv);
-				bio->bi_iter.bi_sector =
-					(disk_bytenr + cur) >> SECTOR_SHIFT;
-			}
 
-			if (!bytes ||
-			    bio_add_page(bio, pages[i], bytes, 0) < bytes) {
-				atomic_inc(&priv.pending);
-				btrfs_submit_bio(bio, 0);
-				bio = NULL;
-				continue;
-			}
+	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
+			       btrfs_encoded_read_endio, &priv);
+	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+	bbio->inode = inode;
 
-			i++;
-			cur += bytes;
-			remaining -= bytes;
+	do {
+		size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
+
+		if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
+			atomic_inc(&priv.pending);
+			btrfs_submit_bio(bbio, 0);
+
+			bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
+					       btrfs_encoded_read_endio, &priv);
+			bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+			bbio->inode = inode;
+			continue;
 		}
-	}
+
+		i++;
+		disk_bytenr += bytes;
+		disk_io_size -= bytes;
+	} while (disk_io_size);
+
+	atomic_inc(&priv.pending);
+	btrfs_submit_bio(bbio, 0);
 
 	if (atomic_dec_return(&priv.pending))
 		io_wait_event(priv.wait, !atomic_read(&priv.pending));
@@ -10407,13 +10365,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
 
 	btrfs_delalloc_release_extents(inode, num_bytes);
 
-	if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid,
-					  ins.offset, pages, nr_pages, 0, NULL,
-					  false)) {
-		btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0);
-		ret = -EIO;
-		goto out_pages;
-	}
+	btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid,
+					  ins.offset, pages, nr_pages, 0, false);
 	ret = orig_count;
 	goto out;
 
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 870528d87526..3a496b0d3d2b 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -325,24 +325,12 @@ struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root)
  * acquire the lock.
  */
 
-int btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
+void btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
 {
-	int ret;
-
-	ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL);
-	if (ret)
-		return ret;
-
 	atomic_set(&lock->readers, 0);
+	atomic_set(&lock->writers, 0);
 	init_waitqueue_head(&lock->pending_readers);
 	init_waitqueue_head(&lock->pending_writers);
-
-	return 0;
-}
-
-void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock)
-{
-	percpu_counter_destroy(&lock->writers);
 }
 
 /* Return true if acquisition is successful, false otherwise */
@@ -351,10 +339,10 @@ bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock)
 	if (atomic_read(&lock->readers))
 		return false;
 
-	percpu_counter_inc(&lock->writers);
+	atomic_inc(&lock->writers);
 
 	/* Ensure writers count is updated before we check for pending readers */
-	smp_mb();
+	smp_mb__after_atomic();
 	if (atomic_read(&lock->readers)) {
 		btrfs_drew_write_unlock(lock);
 		return false;
@@ -374,7 +362,7 @@ void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)
 
 void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
 {
-	percpu_counter_dec(&lock->writers);
+	atomic_dec(&lock->writers);
 	cond_wake_up(&lock->pending_readers);
 }
 
@@ -390,8 +378,7 @@ void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
 	 */
 	smp_mb__after_atomic();
 
-	wait_event(lock->pending_readers,
-		   percpu_counter_sum(&lock->writers) == 0);
+	wait_event(lock->pending_readers, atomic_read(&lock->writers) == 0);
 }
 
 void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock)
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 11c2269b4b6f..edb9b4a0dba1 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -195,13 +195,12 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
 
 struct btrfs_drew_lock {
 	atomic_t readers;
-	struct percpu_counter writers;
+	atomic_t writers;
 	wait_queue_head_t pending_writers;
 	wait_queue_head_t pending_readers;
 };
 
-int btrfs_drew_lock_init(struct btrfs_drew_lock *lock);
-void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock);
+void btrfs_drew_lock_init(struct btrfs_drew_lock *lock);
 void btrfs_drew_write_lock(struct btrfs_drew_lock *lock);
 bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock);
 void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock);
diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h
index de3e18bce24a..00328c856be6 100644
--- a/fs/btrfs/lru_cache.h
+++ b/fs/btrfs/lru_cache.h
@@ -55,11 +55,6 @@ static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *ca
 	return cache->size;
 }
 
-static inline bool btrfs_lru_cache_is_full(const struct btrfs_lru_cache *cache)
-{
-	return cache->size >= cache->max_size;
-}
-
 static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry(
 					      struct btrfs_lru_cache *cache)
 {
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 71f6d8302d50..3a095b9c6373 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -17,6 +17,7 @@
 #include "compression.h"
 #include "ctree.h"
 #include "super.h"
+#include "btrfs_inode.h"
 
 #define LZO_LEN	4
 
@@ -329,7 +330,7 @@ static void copy_compressed_segment(struct compressed_bio *cb,
 int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 {
 	struct workspace *workspace = list_entry(ws, struct workspace, list);
-	const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+	const struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info;
 	const u32 sectorsize = fs_info->sectorsize;
 	char *kaddr;
 	int ret;
@@ -388,8 +389,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 			 */
 			btrfs_err(fs_info, "unexpectedly large lzo segment len %u",
 					seg_len);
-			ret = -EIO;
-			goto out;
+			return -EIO;
 		}
 
 		/* Copy the compressed segment payload into workspace */
@@ -400,8 +400,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 					    workspace->buf, &out_len);
 		if (ret != LZO_E_OK) {
 			btrfs_err(fs_info, "failed to decompress");
-			ret = -EIO;
-			goto out;
+			return -EIO;
 		}
 
 		/* Copy the data into inode pages */
@@ -410,7 +409,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 
 		/* All data read, exit */
 		if (ret == 0)
-			goto out;
+			return 0;
 		ret = 0;
 
 		/* Check if the sector has enough space for a segment header */
@@ -421,10 +420,8 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 		/* Skip the padding zeros */
 		cur_in += sector_bytes_left;
 	}
-out:
-	if (!ret)
-		zero_fill_bio(cb->orig_bio);
-	return ret;
+
+	return 0;
 }
 
 int lzo_decompress(struct list_head *ws, const u8 *data_in,
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index fde5aaa6e7c9..310a05cf95ef 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -253,7 +253,7 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt,
 #endif
 
 #ifdef CONFIG_BTRFS_ASSERT
-void __cold btrfs_assertfail(const char *expr, const char *file, int line)
+void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line)
 {
 	pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);
 	BUG();
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index 8c516ee58ff9..ac2d1982ba3d 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -160,7 +160,7 @@ do {								\
 } while (0)
 
 #ifdef CONFIG_BTRFS_ASSERT
-void __cold btrfs_assertfail(const char *expr, const char *file, int line);
+void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line);
 
 #define ASSERT(expr)						\
 	(likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__))
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 6c24b69e2d0a..a9778a91511e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -160,14 +160,16 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
  * @compress_type:   Compression algorithm used for data.
  *
  * Most of these parameters correspond to &struct btrfs_file_extent_item. The
- * tree is given a single reference on the ordered extent that was inserted.
+ * tree is given a single reference on the ordered extent that was inserted, and
+ * the returned pointer is given a second reference.
  *
- * Return: 0 or -ENOMEM.
+ * Return: the new ordered extent or error pointer.
  */
-int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
-			     u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
-			     u64 disk_num_bytes, u64 offset, unsigned flags,
-			     int compress_type)
+struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
+			struct btrfs_inode *inode, u64 file_offset,
+			u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
+			u64 disk_num_bytes, u64 offset, unsigned long flags,
+			int compress_type)
 {
 	struct btrfs_root *root = inode->root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -181,7 +183,7 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
 		/* For nocow write, we can release the qgroup rsv right now */
 		ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
 		if (ret < 0)
-			return ret;
+			return ERR_PTR(ret);
 		ret = 0;
 	} else {
 		/*
@@ -190,11 +192,11 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
 		 */
 		ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
 		if (ret < 0)
-			return ret;
+			return ERR_PTR(ret);
 	}
 	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
 	if (!entry)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	entry->file_offset = file_offset;
 	entry->num_bytes = num_bytes;
@@ -256,6 +258,32 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
 	btrfs_mod_outstanding_extents(inode, 1);
 	spin_unlock(&inode->lock);
 
+	/* One ref for the returned entry to match semantics of lookup. */
+	refcount_inc(&entry->refs);
+
+	return entry;
+}
+
+/*
+ * Add a new btrfs_ordered_extent for the range, but drop the reference instead
+ * of returning it to the caller.
+ */
+int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
+			     u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
+			     u64 disk_num_bytes, u64 offset, unsigned long flags,
+			     int compress_type)
+{
+	struct btrfs_ordered_extent *ordered;
+
+	ordered = btrfs_alloc_ordered_extent(inode, file_offset, num_bytes,
+					     ram_bytes, disk_bytenr,
+					     disk_num_bytes, offset, flags,
+					     compress_type);
+
+	if (IS_ERR(ordered))
+		return PTR_ERR(ordered);
+	btrfs_put_ordered_extent(ordered);
+
 	return 0;
 }
 
@@ -1088,39 +1116,37 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
 	return false;
 }
 
-
-static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
-				u64 len)
-{
-	struct inode *inode = ordered->inode;
-	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
-	u64 file_offset = ordered->file_offset + pos;
-	u64 disk_bytenr = ordered->disk_bytenr + pos;
-	unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS;
-
-	/*
-	 * The splitting extent is already counted and will be added again in
-	 * btrfs_add_ordered_extent_*(). Subtract len to avoid double counting.
-	 */
-	percpu_counter_add_batch(&fs_info->ordered_bytes, -len,
-				 fs_info->delalloc_batch);
-	WARN_ON_ONCE(flags & (1 << BTRFS_ORDERED_COMPRESSED));
-	return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len,
-					disk_bytenr, len, 0, flags,
-					ordered->compress_type);
-}
-
-int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
-				u64 post)
+/* Split out a new ordered extent for this first @len bytes of @ordered. */
+int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len)
 {
 	struct inode *inode = ordered->inode;
 	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
-	struct rb_node *node;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	int ret = 0;
+	u64 file_offset = ordered->file_offset;
+	u64 disk_bytenr = ordered->disk_bytenr;
+	unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS;
+	struct rb_node *node;
 
 	trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered);
 
+	ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED)));
+
+	/*
+	 * The entire bio must be covered by the ordered extent, but we can't
+	 * reduce the original extent to a zero length either.
+	 */
+	if (WARN_ON_ONCE(len >= ordered->num_bytes))
+		return -EINVAL;
+	/* We cannot split once ordered extent is past end_bio. */
+	if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes))
+		return -EINVAL;
+	/* We cannot split a compressed ordered extent. */
+	if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes))
+		return -EINVAL;
+	/* Checksum list should be empty. */
+	if (WARN_ON_ONCE(!list_empty(&ordered->list)))
+		return -EINVAL;
+
 	spin_lock_irq(&tree->lock);
 	/* Remove from tree once */
 	node = &ordered->rb_node;
@@ -1129,11 +1155,11 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
 	if (tree->last == node)
 		tree->last = NULL;
 
-	ordered->file_offset += pre;
-	ordered->disk_bytenr += pre;
-	ordered->num_bytes -= (pre + post);
-	ordered->disk_num_bytes -= (pre + post);
-	ordered->bytes_left -= (pre + post);
+	ordered->file_offset += len;
+	ordered->disk_bytenr += len;
+	ordered->num_bytes -= len;
+	ordered->disk_num_bytes -= len;
+	ordered->bytes_left -= len;
 
 	/* Re-insert the node */
 	node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
@@ -1144,13 +1170,15 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
 
 	spin_unlock_irq(&tree->lock);
 
-	if (pre)
-		ret = clone_ordered_extent(ordered, 0, pre);
-	if (ret == 0 && post)
-		ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes,
-					   post);
+	/*
+	 * The splitting extent is already counted and will be added again in
+	 * btrfs_add_ordered_extent(). Subtract len to avoid double counting.
+	 */
+	percpu_counter_add_batch(&fs_info->ordered_bytes, -len, fs_info->delalloc_batch);
 
-	return ret;
+	return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len,
+					disk_bytenr, len, 0, flags,
+					ordered->compress_type);
 }
 
 int __init ordered_data_init(void)
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index eb40cb39f842..f0f1138d23c3 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -178,9 +178,14 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
 bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
 				    struct btrfs_ordered_extent **cached,
 				    u64 file_offset, u64 io_size);
+struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
+			struct btrfs_inode *inode, u64 file_offset,
+			u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
+			u64 disk_num_bytes, u64 offset, unsigned long flags,
+			int compress_type);
 int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
 			     u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
-			     u64 disk_num_bytes, u64 offset, unsigned flags,
+			     u64 disk_num_bytes, u64 offset, unsigned long flags,
 			     int compress_type);
 void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
 			   struct btrfs_ordered_sum *sum);
@@ -207,8 +212,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
 					struct extent_state **cached_state);
 bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
 				  struct extent_state **cached_state);
-int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
-			       u64 post);
+int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len);
 int __init ordered_data_init(void);
 void __cold ordered_data_exit(void);
 
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 642828c1b299..2fab37f062de 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -202,7 +202,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
  */
 static int rbio_bucket(struct btrfs_raid_bio *rbio)
 {
-	u64 num = rbio->bioc->raid_map[0];
+	u64 num = rbio->bioc->full_stripe_logical;
 
 	/*
 	 * we shift down quite a bit.  We're using byte
@@ -407,16 +407,15 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 {
 	struct btrfs_stripe_hash_table *table;
-	unsigned long flags;
 
 	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 		return;
 
 	table = rbio->bioc->fs_info->stripe_hash_table;
 
-	spin_lock_irqsave(&table->cache_lock, flags);
+	spin_lock(&table->cache_lock);
 	__remove_rbio_from_cache(rbio);
-	spin_unlock_irqrestore(&table->cache_lock, flags);
+	spin_unlock(&table->cache_lock);
 }
 
 /*
@@ -425,19 +424,18 @@ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
 {
 	struct btrfs_stripe_hash_table *table;
-	unsigned long flags;
 	struct btrfs_raid_bio *rbio;
 
 	table = info->stripe_hash_table;
 
-	spin_lock_irqsave(&table->cache_lock, flags);
+	spin_lock(&table->cache_lock);
 	while (!list_empty(&table->stripe_cache)) {
 		rbio = list_entry(table->stripe_cache.next,
 				  struct btrfs_raid_bio,
 				  stripe_cache);
 		__remove_rbio_from_cache(rbio);
 	}
-	spin_unlock_irqrestore(&table->cache_lock, flags);
+	spin_unlock(&table->cache_lock);
 }
 
 /*
@@ -467,14 +465,13 @@ void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
 static void cache_rbio(struct btrfs_raid_bio *rbio)
 {
 	struct btrfs_stripe_hash_table *table;
-	unsigned long flags;
 
 	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
 		return;
 
 	table = rbio->bioc->fs_info->stripe_hash_table;
 
-	spin_lock_irqsave(&table->cache_lock, flags);
+	spin_lock(&table->cache_lock);
 	spin_lock(&rbio->bio_list_lock);
 
 	/* bump our ref if we were not in the list before */
@@ -501,7 +498,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
 			__remove_rbio_from_cache(found);
 	}
 
-	spin_unlock_irqrestore(&table->cache_lock, flags);
+	spin_unlock(&table->cache_lock);
 }
 
 /*
@@ -530,15 +527,14 @@ static void run_xor(void **pages, int src_cnt, ssize_t len)
  */
 static int rbio_is_full(struct btrfs_raid_bio *rbio)
 {
-	unsigned long flags;
 	unsigned long size = rbio->bio_list_bytes;
 	int ret = 1;
 
-	spin_lock_irqsave(&rbio->bio_list_lock, flags);
+	spin_lock(&rbio->bio_list_lock);
 	if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
 		ret = 0;
 	BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
-	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
+	spin_unlock(&rbio->bio_list_lock);
 
 	return ret;
 }
@@ -571,7 +567,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
 	    test_bit(RBIO_CACHE_BIT, &cur->flags))
 		return 0;
 
-	if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
+	if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
 		return 0;
 
 	/* we can't merge with different operations */
@@ -657,16 +653,15 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 	struct btrfs_stripe_hash *h;
 	struct btrfs_raid_bio *cur;
 	struct btrfs_raid_bio *pending;
-	unsigned long flags;
 	struct btrfs_raid_bio *freeit = NULL;
 	struct btrfs_raid_bio *cache_drop = NULL;
 	int ret = 0;
 
 	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
 
-	spin_lock_irqsave(&h->lock, flags);
+	spin_lock(&h->lock);
 	list_for_each_entry(cur, &h->hash_list, hash_list) {
-		if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
+		if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
 			continue;
 
 		spin_lock(&cur->bio_list_lock);
@@ -724,7 +719,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 	refcount_inc(&rbio->refs);
 	list_add(&rbio->hash_list, &h->hash_list);
 out:
-	spin_unlock_irqrestore(&h->lock, flags);
+	spin_unlock(&h->lock);
 	if (cache_drop)
 		remove_rbio_from_cache(cache_drop);
 	if (freeit)
@@ -742,7 +737,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 {
 	int bucket;
 	struct btrfs_stripe_hash *h;
-	unsigned long flags;
 	int keep_cache = 0;
 
 	bucket = rbio_bucket(rbio);
@@ -751,7 +745,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 	if (list_empty(&rbio->plug_list))
 		cache_rbio(rbio);
 
-	spin_lock_irqsave(&h->lock, flags);
+	spin_lock(&h->lock);
 	spin_lock(&rbio->bio_list_lock);
 
 	if (!list_empty(&rbio->hash_list)) {
@@ -788,7 +782,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 			list_add(&next->hash_list, &h->hash_list);
 			refcount_inc(&next->refs);
 			spin_unlock(&rbio->bio_list_lock);
-			spin_unlock_irqrestore(&h->lock, flags);
+			spin_unlock(&h->lock);
 
 			if (next->operation == BTRFS_RBIO_READ_REBUILD)
 				start_async_work(next, recover_rbio_work_locked);
@@ -808,7 +802,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 	}
 done:
 	spin_unlock(&rbio->bio_list_lock);
-	spin_unlock_irqrestore(&h->lock, flags);
+	spin_unlock(&h->lock);
 
 done_nolock:
 	if (!keep_cache)
@@ -891,16 +885,16 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
 	index = stripe_nr * rbio->stripe_nsectors + sector_nr;
 	ASSERT(index >= 0 && index < rbio->nr_sectors);
 
-	spin_lock_irq(&rbio->bio_list_lock);
+	spin_lock(&rbio->bio_list_lock);
 	sector = &rbio->bio_sectors[index];
 	if (sector->page || bio_list_only) {
 		/* Don't return sector without a valid page pointer */
 		if (!sector->page)
 			sector = NULL;
-		spin_unlock_irq(&rbio->bio_list_lock);
+		spin_unlock(&rbio->bio_list_lock);
 		return sector;
 	}
-	spin_unlock_irq(&rbio->bio_list_lock);
+	spin_unlock(&rbio->bio_list_lock);
 
 	return &rbio->stripe_sectors[index];
 }
@@ -912,7 +906,7 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
 					 struct btrfs_io_context *bioc)
 {
-	const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
+	const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
 	const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
 	const unsigned int num_pages = stripe_npages * real_stripes;
 	const unsigned int stripe_nsectors =
@@ -1108,7 +1102,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
 	bio->bi_iter.bi_sector = disk_start >> 9;
 	bio->bi_private = rbio;
 
-	bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
+	__bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
 	bio_list_add(bio_list, bio);
 	return 0;
 }
@@ -1119,7 +1113,7 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
 	struct bio_vec bvec;
 	struct bvec_iter iter;
 	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
-		     rbio->bioc->raid_map[0];
+		     rbio->bioc->full_stripe_logical;
 
 	bio_for_each_segment(bvec, bio, iter) {
 		u32 bvec_offset;
@@ -1148,11 +1142,11 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
 {
 	struct bio *bio;
 
-	spin_lock_irq(&rbio->bio_list_lock);
+	spin_lock(&rbio->bio_list_lock);
 	bio_list_for_each(bio, &rbio->bio_list)
 		index_one_bio(rbio, bio);
 
-	spin_unlock_irq(&rbio->bio_list_lock);
+	spin_unlock(&rbio->bio_list_lock);
 }
 
 static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
@@ -1282,10 +1276,16 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
 			goto error;
 	}
 
-	if (likely(!rbio->bioc->num_tgtdevs))
+	if (likely(!rbio->bioc->replace_nr_stripes))
 		return 0;
 
-	/* Make a copy for the replace target device. */
+	/*
+	 * Make a copy for the replace target device.
+	 *
+	 * Thus the source stripe number (in replace_stripe_src) should be valid.
+	 */
+	ASSERT(rbio->bioc->replace_stripe_src >= 0);
+
 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
 	     total_sector_nr++) {
 		struct sector_ptr *sector;
@@ -1293,7 +1293,12 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
 		stripe = total_sector_nr / rbio->stripe_nsectors;
 		sectornr = total_sector_nr % rbio->stripe_nsectors;
 
-		if (!rbio->bioc->tgtdev_map[stripe]) {
+		/*
+		 * For RAID56, there is only one device that can be replaced,
+		 * and replace_stripe_src[0] indicates the stripe number we
+		 * need to copy from.
+		 */
+		if (stripe != rbio->bioc->replace_stripe_src) {
 			/*
 			 * We can skip the whole stripe completely, note
 			 * total_sector_nr will be increased by one anyway.
@@ -1316,7 +1321,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
 		}
 
 		ret = rbio_add_io_sector(rbio, bio_list, sector,
-					 rbio->bioc->tgtdev_map[stripe],
+					 rbio->real_stripes,
 					 sectornr, REQ_OP_WRITE);
 		if (ret)
 			goto error;
@@ -1332,7 +1337,7 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
 {
 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
 	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
-		     rbio->bioc->raid_map[0];
+		     rbio->bioc->full_stripe_logical;
 	int total_nr_sector = offset >> fs_info->sectorsize_bits;
 
 	ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);
@@ -1609,7 +1614,7 @@ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
 {
 	const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
 	const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
-	const u64 full_stripe_start = rbio->bioc->raid_map[0];
+	const u64 full_stripe_start = rbio->bioc->full_stripe_logical;
 	const u32 orig_len = orig_bio->bi_iter.bi_size;
 	const u32 sectorsize = fs_info->sectorsize;
 	u64 cur_logical;
@@ -1796,9 +1801,8 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
 		 * here due to a crc mismatch and we can't give them the
 		 * data they want.
 		 */
-		if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
-			if (rbio->bioc->raid_map[faila] ==
-			    RAID5_P_STRIPE)
+		if (failb == rbio->real_stripes - 1) {
+			if (faila == rbio->real_stripes - 2)
 				/*
 				 * Only P and Q are corrupted.
 				 * We only care about data stripes recovery,
@@ -1812,7 +1816,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
 			goto pstripe;
 		}
 
-		if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
+		if (failb == rbio->real_stripes - 2) {
 			raid6_datap_recov(rbio->real_stripes, sectorsize,
 					  faila, pointers);
 		} else {
@@ -1895,9 +1899,9 @@ static int recover_sectors(struct btrfs_raid_bio *rbio)
 
 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
 	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
-		spin_lock_irq(&rbio->bio_list_lock);
+		spin_lock(&rbio->bio_list_lock);
 		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
-		spin_unlock_irq(&rbio->bio_list_lock);
+		spin_unlock(&rbio->bio_list_lock);
 	}
 
 	index_rbio_pages(rbio);
@@ -2075,8 +2079,8 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio)
 {
 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
 	struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
-						       rbio->bioc->raid_map[0]);
-	const u64 start = rbio->bioc->raid_map[0];
+						       rbio->bioc->full_stripe_logical);
+	const u64 start = rbio->bioc->full_stripe_logical;
 	const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
 			fs_info->sectorsize_bits;
 	int ret;
@@ -2109,7 +2113,7 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio)
 	}
 
 	ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1,
-					rbio->csum_buf, rbio->csum_bitmap);
+					rbio->csum_buf, rbio->csum_bitmap, false);
 	if (ret < 0)
 		goto error;
 	if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
@@ -2124,7 +2128,7 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio)
 	 */
 	btrfs_warn_rl(fs_info,
 "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
-			rbio->bioc->raid_map[0], ret);
+			rbio->bioc->full_stripe_logical, ret);
 no_csum:
 	kfree(rbio->csum_buf);
 	bitmap_free(rbio->csum_bitmap);
@@ -2265,9 +2269,9 @@ static void rmw_rbio(struct btrfs_raid_bio *rbio)
 	 * bio list any more, anyone else that wants to change this stripe
 	 * needs to do their own rmw.
 	 */
-	spin_lock_irq(&rbio->bio_list_lock);
+	spin_lock(&rbio->bio_list_lock);
 	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
-	spin_unlock_irq(&rbio->bio_list_lock);
+	spin_unlock(&rbio->bio_list_lock);
 
 	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
 
@@ -2372,23 +2376,6 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
 	return rbio;
 }
 
-/* Used for both parity scrub and missing. */
-void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
-			    unsigned int pgoff, u64 logical)
-{
-	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
-	int stripe_offset;
-	int index;
-
-	ASSERT(logical >= rbio->bioc->raid_map[0]);
-	ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
-				       BTRFS_STRIPE_LEN * rbio->nr_data);
-	stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
-	index = stripe_offset / sectorsize;
-	rbio->bio_sectors[index].page = page;
-	rbio->bio_sectors[index].pgoff = pgoff;
-}
-
 /*
  * We just scrub the parity that we have correct data on the same horizontal,
  * so we needn't allocate all pages for all the stripes.
@@ -2442,7 +2429,11 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check)
 	else
 		BUG();
 
-	if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
+	/*
+	 * Replace is running and our P/Q stripe is being replaced, then we
+	 * need to duplicate the final write to replace target.
+	 */
+	if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) {
 		is_replace = 1;
 		bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
 	}
@@ -2544,13 +2535,18 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check)
 	if (!is_replace)
 		goto submit_write;
 
+	/*
+	 * Replace is running and our parity stripe needs to be duplicated to
+	 * the target device.  Check we have a valid source stripe number.
+	 */
+	ASSERT(rbio->bioc->replace_stripe_src >= 0);
 	for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
 		struct sector_ptr *sector;
 
 		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
 		ret = rbio_add_io_sector(rbio, &bio_list, sector,
-				       bioc->tgtdev_map[rbio->scrubp],
-				       sectornr, REQ_OP_WRITE);
+					 rbio->real_stripes,
+					 sectornr, REQ_OP_WRITE);
 		if (ret)
 			goto cleanup;
 	}
@@ -2751,33 +2747,3 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
 	if (!lock_stripe_add(rbio))
 		start_async_work(rbio, scrub_rbio_work_locked);
 }
-
-/* The following code is used for dev replace of a missing RAID 5/6 device. */
-
-struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
-{
-	struct btrfs_fs_info *fs_info = bioc->fs_info;
-	struct btrfs_raid_bio *rbio;
-
-	rbio = alloc_rbio(fs_info, bioc);
-	if (IS_ERR(rbio))
-		return NULL;
-
-	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
-	bio_list_add(&rbio->bio_list, bio);
-	/*
-	 * This is a special bio which is used to hold the completion handler
-	 * and make the scrub rbio is similar to the other types
-	 */
-	ASSERT(!bio->bi_iter.bi_size);
-
-	set_rbio_range_error(rbio, bio);
-
-	return rbio;
-}
-
-void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
-{
-	start_async_work(rbio, recover_rbio_work);
-}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index df0e0abdeb1f..0f7f31c8cb98 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -170,6 +170,11 @@ static inline int nr_data_stripes(const struct map_lookup *map)
 	return map->num_stripes - btrfs_nr_parity_stripes(map->type);
 }
 
+static inline int nr_bioc_data_stripes(const struct btrfs_io_context *bioc)
+{
+	return bioc->num_stripes - btrfs_nr_parity_stripes(bioc->map_type);
+}
+
 #define RAID5_P_STRIPE ((u64)-2)
 #define RAID6_Q_STRIPE ((u64)-1)
 
@@ -182,19 +187,12 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
 			   int mirror_num);
 void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc);
 
-void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
-			    unsigned int pgoff, u64 logical);
-
 struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
 				struct btrfs_io_context *bioc,
 				struct btrfs_device *scrub_dev,
 				unsigned long *dbitmap, int stripe_nsectors);
 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
 
-struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc);
-void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
-
 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
 
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index c10670915f96..59a06499c647 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1266,7 +1266,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
 		level = btrfs_header_level(parent);
 		ASSERT(level >= lowest_level);
 
-		ret = btrfs_bin_search(parent, &key, &slot);
+		ret = btrfs_bin_search(parent, 0, &key, &slot);
 		if (ret < 0)
 			break;
 		if (ret && slot > 0)
@@ -2407,7 +2407,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 
 		if (upper->eb && !upper->locked) {
 			if (!lowest) {
-				ret = btrfs_bin_search(upper->eb, key, &slot);
+				ret = btrfs_bin_search(upper->eb, 0, key, &slot);
 				if (ret < 0)
 					goto next;
 				BUG_ON(ret);
@@ -2441,7 +2441,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 			slot = path->slots[upper->level];
 			btrfs_release_path(path);
 		} else {
-			ret = btrfs_bin_search(upper->eb, key, &slot);
+			ret = btrfs_bin_search(upper->eb, 0, key, &slot);
 			if (ret < 0)
 				goto next;
 			BUG_ON(ret);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3720fd1f593d..dd37cba58022 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -38,18 +38,14 @@
  *  - add a mode to also read unallocated space
  */
 
-struct scrub_block;
 struct scrub_ctx;
 
 /*
- * The following three values only influence the performance.
+ * The following value only influences the performance.
  *
- * The last one configures the number of parallel and outstanding I/O
- * operations. The first one configures an upper limit for the number
- * of (dynamically allocated) pages that are added to a bio.
+ * This determines the batch size for stripe submitted in one go.
  */
-#define SCRUB_SECTORS_PER_BIO	32	/* 128KiB per bio for 4KiB pages */
-#define SCRUB_BIOS_PER_SCTX	64	/* 8MiB per device in flight for 4KiB pages */
+#define SCRUB_STRIPES_PER_SCTX	8	/* That would be 8 64K stripe per-device. */
 
 /*
  * The following value times PAGE_SIZE needs to be large enough to match the
@@ -57,128 +53,124 @@ struct scrub_ctx;
  */
 #define SCRUB_MAX_SECTORS_PER_BLOCK	(BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
 
-#define SCRUB_MAX_PAGES			(DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE))
+/* Represent one sector and its needed info to verify the content. */
+struct scrub_sector_verification {
+	bool is_metadata;
 
-/*
- * Maximum number of mirrors that can be available for all profiles counting
- * the target device of dev-replace as one. During an active device replace
- * procedure, the target device of the copy operation is a mirror for the
- * filesystem data as well that can be used to read data in order to repair
- * read errors on other disks.
- *
- * Current value is derived from RAID1C4 with 4 copies.
- */
-#define BTRFS_MAX_MIRRORS (4 + 1)
+	union {
+		/*
+		 * Csum pointer for data csum verification.  Should point to a
+		 * sector csum inside scrub_stripe::csums.
+		 *
+		 * NULL if this data sector has no csum.
+		 */
+		u8 *csum;
 
-struct scrub_recover {
-	refcount_t		refs;
-	struct btrfs_io_context	*bioc;
-	u64			map_length;
+		/*
+		 * Extra info for metadata verification.  All sectors inside a
+		 * tree block share the same generation.
+		 */
+		u64 generation;
+	};
 };
 
-struct scrub_sector {
-	struct scrub_block	*sblock;
-	struct list_head	list;
-	u64			flags;  /* extent flags */
-	u64			generation;
-	/* Offset in bytes to @sblock. */
-	u32			offset;
-	atomic_t		refs;
-	unsigned int		have_csum:1;
-	unsigned int		io_error:1;
-	u8			csum[BTRFS_CSUM_SIZE];
-
-	struct scrub_recover	*recover;
-};
+enum scrub_stripe_flags {
+	/* Set when @mirror_num, @dev, @physical and @logical are set. */
+	SCRUB_STRIPE_FLAG_INITIALIZED,
 
-struct scrub_bio {
-	int			index;
-	struct scrub_ctx	*sctx;
-	struct btrfs_device	*dev;
-	struct bio		*bio;
-	blk_status_t		status;
-	u64			logical;
-	u64			physical;
-	struct scrub_sector	*sectors[SCRUB_SECTORS_PER_BIO];
-	int			sector_count;
-	int			next_free;
-	struct work_struct	work;
-};
+	/* Set when the read-repair is finished. */
+	SCRUB_STRIPE_FLAG_REPAIR_DONE,
 
-struct scrub_block {
 	/*
-	 * Each page will have its page::private used to record the logical
-	 * bytenr.
+	 * Set for data stripes if it's triggered from P/Q stripe.
+	 * During such scrub, we should not report errors in data stripes, nor
+	 * update the accounting.
 	 */
-	struct page		*pages[SCRUB_MAX_PAGES];
-	struct scrub_sector	*sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
-	struct btrfs_device	*dev;
-	/* Logical bytenr of the sblock */
-	u64			logical;
-	u64			physical;
-	u64			physical_for_dev_replace;
-	/* Length of sblock in bytes */
-	u32			len;
-	int			sector_count;
-	int			mirror_num;
-
-	atomic_t		outstanding_sectors;
-	refcount_t		refs; /* free mem on transition to zero */
-	struct scrub_ctx	*sctx;
-	struct scrub_parity	*sparity;
-	struct {
-		unsigned int	header_error:1;
-		unsigned int	checksum_error:1;
-		unsigned int	no_io_error_seen:1;
-		unsigned int	generation_error:1; /* also sets header_error */
-
-		/* The following is for the data used to check parity */
-		/* It is for the data with checksum */
-		unsigned int	data_corrected:1;
-	};
-	struct work_struct	work;
+	SCRUB_STRIPE_FLAG_NO_REPORT,
 };
 
-/* Used for the chunks with parity stripe such RAID5/6 */
-struct scrub_parity {
-	struct scrub_ctx	*sctx;
+#define SCRUB_STRIPE_PAGES		(BTRFS_STRIPE_LEN / PAGE_SIZE)
+
+/*
+ * Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
+ */
+struct scrub_stripe {
+	struct scrub_ctx *sctx;
+	struct btrfs_block_group *bg;
+
+	struct page *pages[SCRUB_STRIPE_PAGES];
+	struct scrub_sector_verification *sectors;
+
+	struct btrfs_device *dev;
+	u64 logical;
+	u64 physical;
 
-	struct btrfs_device	*scrub_dev;
+	u16 mirror_num;
 
-	u64			logic_start;
+	/* Should be BTRFS_STRIPE_LEN / sectorsize. */
+	u16 nr_sectors;
+
+	/*
+	 * How many data/meta extents are in this stripe.  Only for scrub status
+	 * reporting purposes.
+	 */
+	u16 nr_data_extents;
+	u16 nr_meta_extents;
 
-	u64			logic_end;
+	atomic_t pending_io;
+	wait_queue_head_t io_wait;
+	wait_queue_head_t repair_wait;
 
-	int			nsectors;
+	/*
+	 * Indicate the states of the stripe.  Bits are defined in
+	 * scrub_stripe_flags enum.
+	 */
+	unsigned long state;
 
-	u32			stripe_len;
+	/* Indicate which sectors are covered by extent items. */
+	unsigned long extent_sector_bitmap;
 
-	refcount_t		refs;
+	/*
+	 * The errors hit during the initial read of the stripe.
+	 *
+	 * Would be utilized for error reporting and repair.
+	 */
+	unsigned long init_error_bitmap;
 
-	struct list_head	sectors_list;
+	/*
+	 * The following error bitmaps are all for the current status.
+	 * Every time we submit a new read, these bitmaps may be updated.
+	 *
+	 * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap;
+	 *
+	 * IO and csum errors can happen for both metadata and data.
+	 */
+	unsigned long error_bitmap;
+	unsigned long io_error_bitmap;
+	unsigned long csum_error_bitmap;
+	unsigned long meta_error_bitmap;
 
-	/* Work of parity check and repair */
-	struct work_struct	work;
+	/* For writeback (repair or replace) error reporting. */
+	unsigned long write_error_bitmap;
 
-	/* Mark the parity blocks which have data */
-	unsigned long		dbitmap;
+	/* Writeback can be concurrent, thus we need to protect the bitmap. */
+	spinlock_t write_error_lock;
 
 	/*
-	 * Mark the parity blocks which have data, but errors happen when
-	 * read data or check data
+	 * Checksum for the whole stripe if this stripe is inside a data block
+	 * group.
 	 */
-	unsigned long		ebitmap;
+	u8 *csums;
+
+	struct work_struct work;
 };
 
 struct scrub_ctx {
-	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
+	struct scrub_stripe	stripes[SCRUB_STRIPES_PER_SCTX];
+	struct scrub_stripe	*raid56_data_stripes;
 	struct btrfs_fs_info	*fs_info;
 	int			first_free;
-	int			curr;
-	atomic_t		bios_in_flight;
-	atomic_t		workers_pending;
-	spinlock_t		list_lock;
-	wait_queue_head_t	list_wait;
+	int			cur_stripe;
 	struct list_head	csum_list;
 	atomic_t		cancel_req;
 	int			readonly;
@@ -191,10 +183,8 @@ struct scrub_ctx {
 	int			is_dev_replace;
 	u64			write_pointer;
 
-	struct scrub_bio        *wr_curr_bio;
 	struct mutex            wr_lock;
 	struct btrfs_device     *wr_tgtdev;
-	bool                    flush_all_writes;
 
 	/*
 	 * statistics
@@ -221,239 +211,66 @@ struct scrub_warning {
 	struct btrfs_device	*dev;
 };
 
-struct full_stripe_lock {
-	struct rb_node node;
-	u64 logical;
-	u64 refs;
-	struct mutex mutex;
-};
-
-#ifndef CONFIG_64BIT
-/* This structure is for architectures whose (void *) is smaller than u64 */
-struct scrub_page_private {
-	u64 logical;
-};
-#endif
-
-static int attach_scrub_page_private(struct page *page, u64 logical)
-{
-#ifdef CONFIG_64BIT
-	attach_page_private(page, (void *)logical);
-	return 0;
-#else
-	struct scrub_page_private *spp;
-
-	spp = kmalloc(sizeof(*spp), GFP_KERNEL);
-	if (!spp)
-		return -ENOMEM;
-	spp->logical = logical;
-	attach_page_private(page, (void *)spp);
-	return 0;
-#endif
-}
-
-static void detach_scrub_page_private(struct page *page)
-{
-#ifdef CONFIG_64BIT
-	detach_page_private(page);
-	return;
-#else
-	struct scrub_page_private *spp;
-
-	spp = detach_page_private(page);
-	kfree(spp);
-	return;
-#endif
-}
-
-static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
-					     struct btrfs_device *dev,
-					     u64 logical, u64 physical,
-					     u64 physical_for_dev_replace,
-					     int mirror_num)
-{
-	struct scrub_block *sblock;
-
-	sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
-	if (!sblock)
-		return NULL;
-	refcount_set(&sblock->refs, 1);
-	sblock->sctx = sctx;
-	sblock->logical = logical;
-	sblock->physical = physical;
-	sblock->physical_for_dev_replace = physical_for_dev_replace;
-	sblock->dev = dev;
-	sblock->mirror_num = mirror_num;
-	sblock->no_io_error_seen = 1;
-	/*
-	 * Scrub_block::pages will be allocated at alloc_scrub_sector() when
-	 * the corresponding page is not allocated.
-	 */
-	return sblock;
-}
-
-/*
- * Allocate a new scrub sector and attach it to @sblock.
- *
- * Will also allocate new pages for @sblock if needed.
- */
-static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock,
-					       u64 logical)
+static void release_scrub_stripe(struct scrub_stripe *stripe)
 {
-	const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT;
-	struct scrub_sector *ssector;
-
-	/* We must never have scrub_block exceed U32_MAX in size. */
-	ASSERT(logical - sblock->logical < U32_MAX);
-
-	ssector = kzalloc(sizeof(*ssector), GFP_KERNEL);
-	if (!ssector)
-		return NULL;
-
-	/* Allocate a new page if the slot is not allocated */
-	if (!sblock->pages[page_index]) {
-		int ret;
+	if (!stripe)
+		return;
 
-		sblock->pages[page_index] = alloc_page(GFP_KERNEL);
-		if (!sblock->pages[page_index]) {
-			kfree(ssector);
-			return NULL;
-		}
-		ret = attach_scrub_page_private(sblock->pages[page_index],
-				sblock->logical + (page_index << PAGE_SHIFT));
-		if (ret < 0) {
-			kfree(ssector);
-			__free_page(sblock->pages[page_index]);
-			sblock->pages[page_index] = NULL;
-			return NULL;
-		}
+	for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) {
+		if (stripe->pages[i])
+			__free_page(stripe->pages[i]);
+		stripe->pages[i] = NULL;
 	}
-
-	atomic_set(&ssector->refs, 1);
-	ssector->sblock = sblock;
-	/* The sector to be added should not be used */
-	ASSERT(sblock->sectors[sblock->sector_count] == NULL);
-	ssector->offset = logical - sblock->logical;
-
-	/* The sector count must be smaller than the limit */
-	ASSERT(sblock->sector_count < SCRUB_MAX_SECTORS_PER_BLOCK);
-
-	sblock->sectors[sblock->sector_count] = ssector;
-	sblock->sector_count++;
-	sblock->len += sblock->sctx->fs_info->sectorsize;
-
-	return ssector;
-}
-
-static struct page *scrub_sector_get_page(struct scrub_sector *ssector)
-{
-	struct scrub_block *sblock = ssector->sblock;
-	pgoff_t index;
-	/*
-	 * When calling this function, ssector must be alreaday attached to the
-	 * parent sblock.
-	 */
-	ASSERT(sblock);
-
-	/* The range should be inside the sblock range */
-	ASSERT(ssector->offset < sblock->len);
-
-	index = ssector->offset >> PAGE_SHIFT;
-	ASSERT(index < SCRUB_MAX_PAGES);
-	ASSERT(sblock->pages[index]);
-	ASSERT(PagePrivate(sblock->pages[index]));
-	return sblock->pages[index];
+	kfree(stripe->sectors);
+	kfree(stripe->csums);
+	stripe->sectors = NULL;
+	stripe->csums = NULL;
+	stripe->sctx = NULL;
+	stripe->state = 0;
 }
 
-static unsigned int scrub_sector_get_page_offset(struct scrub_sector *ssector)
+static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
+			     struct scrub_stripe *stripe)
 {
-	struct scrub_block *sblock = ssector->sblock;
+	int ret;
 
-	/*
-	 * When calling this function, ssector must be already attached to the
-	 * parent sblock.
-	 */
-	ASSERT(sblock);
+	memset(stripe, 0, sizeof(*stripe));
 
-	/* The range should be inside the sblock range */
-	ASSERT(ssector->offset < sblock->len);
+	stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
+	stripe->state = 0;
 
-	return offset_in_page(ssector->offset);
-}
+	init_waitqueue_head(&stripe->io_wait);
+	init_waitqueue_head(&stripe->repair_wait);
+	atomic_set(&stripe->pending_io, 0);
+	spin_lock_init(&stripe->write_error_lock);
 
-static char *scrub_sector_get_kaddr(struct scrub_sector *ssector)
-{
-	return page_address(scrub_sector_get_page(ssector)) +
-	       scrub_sector_get_page_offset(ssector);
+	ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages);
+	if (ret < 0)
+		goto error;
+
+	stripe->sectors = kcalloc(stripe->nr_sectors,
+				  sizeof(struct scrub_sector_verification),
+				  GFP_KERNEL);
+	if (!stripe->sectors)
+		goto error;
+
+	stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits,
+				fs_info->csum_size, GFP_KERNEL);
+	if (!stripe->csums)
+		goto error;
+	return 0;
+error:
+	release_scrub_stripe(stripe);
+	return -ENOMEM;
 }
 
-static int bio_add_scrub_sector(struct bio *bio, struct scrub_sector *ssector,
-				unsigned int len)
+static void wait_scrub_stripe_io(struct scrub_stripe *stripe)
 {
-	return bio_add_page(bio, scrub_sector_get_page(ssector), len,
-			    scrub_sector_get_page_offset(ssector));
+	wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0);
 }
 
-static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
-				     struct scrub_block *sblocks_for_recheck[]);
-static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
-				struct scrub_block *sblock,
-				int retry_failed_mirror);
-static void scrub_recheck_block_checksum(struct scrub_block *sblock);
-static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
-					     struct scrub_block *sblock_good);
-static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
-					    struct scrub_block *sblock_good,
-					    int sector_num, int force_write);
-static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
-static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
-					     int sector_num);
-static int scrub_checksum_data(struct scrub_block *sblock);
-static int scrub_checksum_tree_block(struct scrub_block *sblock);
-static int scrub_checksum_super(struct scrub_block *sblock);
-static void scrub_block_put(struct scrub_block *sblock);
-static void scrub_sector_get(struct scrub_sector *sector);
-static void scrub_sector_put(struct scrub_sector *sector);
-static void scrub_parity_get(struct scrub_parity *sparity);
-static void scrub_parity_put(struct scrub_parity *sparity);
-static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
-			 u64 physical, struct btrfs_device *dev, u64 flags,
-			 u64 gen, int mirror_num, u8 *csum,
-			 u64 physical_for_dev_replace);
-static void scrub_bio_end_io(struct bio *bio);
-static void scrub_bio_end_io_worker(struct work_struct *work);
-static void scrub_block_complete(struct scrub_block *sblock);
-static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
-				 u64 extent_logical, u32 extent_len,
-				 u64 *extent_physical,
-				 struct btrfs_device **extent_dev,
-				 int *extent_mirror_num);
-static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
-				      struct scrub_sector *sector);
-static void scrub_wr_submit(struct scrub_ctx *sctx);
-static void scrub_wr_bio_end_io(struct bio *bio);
-static void scrub_wr_bio_end_io_worker(struct work_struct *work);
 static void scrub_put_ctx(struct scrub_ctx *sctx);
 
-static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
-{
-	return sector->recover &&
-	       (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
-}
-
-static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
-{
-	refcount_inc(&sctx->refs);
-	atomic_inc(&sctx->bios_in_flight);
-}
-
-static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
-{
-	atomic_dec(&sctx->bios_in_flight);
-	wake_up(&sctx->list_wait);
-	scrub_put_ctx(sctx);
-}
-
 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 {
 	while (atomic_read(&fs_info->scrub_pause_req)) {
@@ -486,223 +303,6 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 	scrub_pause_off(fs_info);
 }
 
-/*
- * Insert new full stripe lock into full stripe locks tree
- *
- * Return pointer to existing or newly inserted full_stripe_lock structure if
- * everything works well.
- * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
- *
- * NOTE: caller must hold full_stripe_locks_root->lock before calling this
- * function
- */
-static struct full_stripe_lock *insert_full_stripe_lock(
-		struct btrfs_full_stripe_locks_tree *locks_root,
-		u64 fstripe_logical)
-{
-	struct rb_node **p;
-	struct rb_node *parent = NULL;
-	struct full_stripe_lock *entry;
-	struct full_stripe_lock *ret;
-
-	lockdep_assert_held(&locks_root->lock);
-
-	p = &locks_root->root.rb_node;
-	while (*p) {
-		parent = *p;
-		entry = rb_entry(parent, struct full_stripe_lock, node);
-		if (fstripe_logical < entry->logical) {
-			p = &(*p)->rb_left;
-		} else if (fstripe_logical > entry->logical) {
-			p = &(*p)->rb_right;
-		} else {
-			entry->refs++;
-			return entry;
-		}
-	}
-
-	/*
-	 * Insert new lock.
-	 */
-	ret = kmalloc(sizeof(*ret), GFP_KERNEL);
-	if (!ret)
-		return ERR_PTR(-ENOMEM);
-	ret->logical = fstripe_logical;
-	ret->refs = 1;
-	mutex_init(&ret->mutex);
-
-	rb_link_node(&ret->node, parent, p);
-	rb_insert_color(&ret->node, &locks_root->root);
-	return ret;
-}
-
-/*
- * Search for a full stripe lock of a block group
- *
- * Return pointer to existing full stripe lock if found
- * Return NULL if not found
- */
-static struct full_stripe_lock *search_full_stripe_lock(
-		struct btrfs_full_stripe_locks_tree *locks_root,
-		u64 fstripe_logical)
-{
-	struct rb_node *node;
-	struct full_stripe_lock *entry;
-
-	lockdep_assert_held(&locks_root->lock);
-
-	node = locks_root->root.rb_node;
-	while (node) {
-		entry = rb_entry(node, struct full_stripe_lock, node);
-		if (fstripe_logical < entry->logical)
-			node = node->rb_left;
-		else if (fstripe_logical > entry->logical)
-			node = node->rb_right;
-		else
-			return entry;
-	}
-	return NULL;
-}
-
-/*
- * Helper to get full stripe logical from a normal bytenr.
- *
- * Caller must ensure @cache is a RAID56 block group.
- */
-static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
-{
-	u64 ret;
-
-	/*
-	 * Due to chunk item size limit, full stripe length should not be
-	 * larger than U32_MAX. Just a sanity check here.
-	 */
-	WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
-
-	/*
-	 * round_down() can only handle power of 2, while RAID56 full
-	 * stripe length can be 64KiB * n, so we need to manually round down.
-	 */
-	ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
-			cache->full_stripe_len + cache->start;
-	return ret;
-}
-
-/*
- * Lock a full stripe to avoid concurrency of recovery and read
- *
- * It's only used for profiles with parities (RAID5/6), for other profiles it
- * does nothing.
- *
- * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
- * So caller must call unlock_full_stripe() at the same context.
- *
- * Return <0 if encounters error.
- */
-static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
-			    bool *locked_ret)
-{
-	struct btrfs_block_group *bg_cache;
-	struct btrfs_full_stripe_locks_tree *locks_root;
-	struct full_stripe_lock *existing;
-	u64 fstripe_start;
-	int ret = 0;
-
-	*locked_ret = false;
-	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
-	if (!bg_cache) {
-		ASSERT(0);
-		return -ENOENT;
-	}
-
-	/* Profiles not based on parity don't need full stripe lock */
-	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
-		goto out;
-	locks_root = &bg_cache->full_stripe_locks_root;
-
-	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
-
-	/* Now insert the full stripe lock */
-	mutex_lock(&locks_root->lock);
-	existing = insert_full_stripe_lock(locks_root, fstripe_start);
-	mutex_unlock(&locks_root->lock);
-	if (IS_ERR(existing)) {
-		ret = PTR_ERR(existing);
-		goto out;
-	}
-	mutex_lock(&existing->mutex);
-	*locked_ret = true;
-out:
-	btrfs_put_block_group(bg_cache);
-	return ret;
-}
-
-/*
- * Unlock a full stripe.
- *
- * NOTE: Caller must ensure it's the same context calling corresponding
- * lock_full_stripe().
- *
- * Return 0 if we unlock full stripe without problem.
- * Return <0 for error
- */
-static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
-			      bool locked)
-{
-	struct btrfs_block_group *bg_cache;
-	struct btrfs_full_stripe_locks_tree *locks_root;
-	struct full_stripe_lock *fstripe_lock;
-	u64 fstripe_start;
-	bool freeit = false;
-	int ret = 0;
-
-	/* If we didn't acquire full stripe lock, no need to continue */
-	if (!locked)
-		return 0;
-
-	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
-	if (!bg_cache) {
-		ASSERT(0);
-		return -ENOENT;
-	}
-	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
-		goto out;
-
-	locks_root = &bg_cache->full_stripe_locks_root;
-	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
-
-	mutex_lock(&locks_root->lock);
-	fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
-	/* Unpaired unlock_full_stripe() detected */
-	if (!fstripe_lock) {
-		WARN_ON(1);
-		ret = -ENOENT;
-		mutex_unlock(&locks_root->lock);
-		goto out;
-	}
-
-	if (fstripe_lock->refs == 0) {
-		WARN_ON(1);
-		btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
-			fstripe_lock->logical);
-	} else {
-		fstripe_lock->refs--;
-	}
-
-	if (fstripe_lock->refs == 0) {
-		rb_erase(&fstripe_lock->node, &locks_root->root);
-		freeit = true;
-	}
-	mutex_unlock(&locks_root->lock);
-
-	mutex_unlock(&fstripe_lock->mutex);
-	if (freeit)
-		kfree(fstripe_lock);
-out:
-	btrfs_put_block_group(bg_cache);
-	return ret;
-}
-
 static void scrub_free_csums(struct scrub_ctx *sctx)
 {
 	while (!list_empty(&sctx->csum_list)) {
@@ -721,24 +321,9 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 	if (!sctx)
 		return;
 
-	/* this can happen when scrub is cancelled */
-	if (sctx->curr != -1) {
-		struct scrub_bio *sbio = sctx->bios[sctx->curr];
-
-		for (i = 0; i < sbio->sector_count; i++)
-			scrub_block_put(sbio->sectors[i]->sblock);
-		bio_put(sbio->bio);
-	}
-
-	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
-		struct scrub_bio *sbio = sctx->bios[i];
-
-		if (!sbio)
-			break;
-		kfree(sbio);
-	}
+	for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++)
+		release_scrub_stripe(&sctx->stripes[i]);
 
-	kfree(sctx->wr_curr_bio);
 	scrub_free_csums(sctx);
 	kfree(sctx);
 }
@@ -760,45 +345,26 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
 		goto nomem;
 	refcount_set(&sctx->refs, 1);
 	sctx->is_dev_replace = is_dev_replace;
-	sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
-	sctx->curr = -1;
 	sctx->fs_info = fs_info;
 	INIT_LIST_HEAD(&sctx->csum_list);
-	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
-		struct scrub_bio *sbio;
+	for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) {
+		int ret;
 
-		sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
-		if (!sbio)
+		ret = init_scrub_stripe(fs_info, &sctx->stripes[i]);
+		if (ret < 0)
 			goto nomem;
-		sctx->bios[i] = sbio;
-
-		sbio->index = i;
-		sbio->sctx = sctx;
-		sbio->sector_count = 0;
-		INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
-
-		if (i != SCRUB_BIOS_PER_SCTX - 1)
-			sctx->bios[i]->next_free = i + 1;
-		else
-			sctx->bios[i]->next_free = -1;
+		sctx->stripes[i].sctx = sctx;
 	}
 	sctx->first_free = 0;
-	atomic_set(&sctx->bios_in_flight, 0);
-	atomic_set(&sctx->workers_pending, 0);
 	atomic_set(&sctx->cancel_req, 0);
 
-	spin_lock_init(&sctx->list_lock);
 	spin_lock_init(&sctx->stat_lock);
-	init_waitqueue_head(&sctx->list_wait);
 	sctx->throttle_deadline = 0;
 
-	WARN_ON(sctx->wr_curr_bio != NULL);
 	mutex_init(&sctx->wr_lock);
-	sctx->wr_curr_bio = NULL;
 	if (is_dev_replace) {
 		WARN_ON(!fs_info->dev_replace.tgtdev);
 		sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
-		sctx->flush_all_writes = false;
 	}
 
 	return sctx;
@@ -898,10 +464,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
 	return 0;
 }
 
-static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
+static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev,
+				       bool is_super, u64 logical, u64 physical)
 {
-	struct btrfs_device *dev;
-	struct btrfs_fs_info *fs_info;
+	struct btrfs_fs_info *fs_info = dev->fs_info;
 	struct btrfs_path *path;
 	struct btrfs_key found_key;
 	struct extent_buffer *eb;
@@ -914,22 +480,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 	u8 ref_level = 0;
 	int ret;
 
-	WARN_ON(sblock->sector_count < 1);
-	dev = sblock->dev;
-	fs_info = sblock->sctx->fs_info;
-
 	/* Super block error, no need to search extent tree. */
-	if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+	if (is_super) {
 		btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
-			errstr, btrfs_dev_name(dev), sblock->physical);
+				  errstr, btrfs_dev_name(dev), physical);
 		return;
 	}
 	path = btrfs_alloc_path();
 	if (!path)
 		return;
 
-	swarn.physical = sblock->physical;
-	swarn.logical = sblock->logical;
+	swarn.physical = physical;
+	swarn.logical = logical;
 	swarn.errstr = errstr;
 	swarn.dev = NULL;
 
@@ -978,1921 +540,717 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 	btrfs_free_path(path);
 }
 
-static inline void scrub_get_recover(struct scrub_recover *recover)
+static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
 {
-	refcount_inc(&recover->refs);
+	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
+		return 2;
+	else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
+		return 3;
+	else
+		return (int)bioc->num_stripes;
 }
 
-static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
-				     struct scrub_recover *recover)
+static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
+						 u64 full_stripe_logical,
+						 int nstripes, int mirror,
+						 int *stripe_index,
+						 u64 *stripe_offset)
 {
-	if (refcount_dec_and_test(&recover->refs)) {
-		btrfs_bio_counter_dec(fs_info);
-		btrfs_put_bioc(recover->bioc);
-		kfree(recover);
+	int i;
+
+	if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+		const int nr_data_stripes = (map_type & BTRFS_BLOCK_GROUP_RAID5) ?
+					    nstripes - 1 : nstripes - 2;
+
+		/* RAID5/6 */
+		for (i = 0; i < nr_data_stripes; i++) {
+			const u64 data_stripe_start = full_stripe_logical +
+						(i * BTRFS_STRIPE_LEN);
+
+			if (logical >= data_stripe_start &&
+			    logical < data_stripe_start + BTRFS_STRIPE_LEN)
+				break;
+		}
+
+		*stripe_index = i;
+		*stripe_offset = (logical - full_stripe_logical) &
+				 BTRFS_STRIPE_LEN_MASK;
+	} else {
+		/* The other RAID type */
+		*stripe_index = mirror;
+		*stripe_offset = 0;
 	}
 }
 
-/*
- * scrub_handle_errored_block gets called when either verification of the
- * sectors failed or the bio failed to read, e.g. with EIO. In the latter
- * case, this function handles all sectors in the bio, even though only one
- * may be bad.
- * The goal of this function is to repair the errored block by using the
- * contents of one of the mirrors.
- */
-static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
+static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
 {
-	struct scrub_ctx *sctx = sblock_to_check->sctx;
-	struct btrfs_device *dev = sblock_to_check->dev;
-	struct btrfs_fs_info *fs_info;
-	u64 logical;
-	unsigned int failed_mirror_index;
-	unsigned int is_metadata;
-	unsigned int have_csum;
-	/* One scrub_block for each mirror */
-	struct scrub_block *sblocks_for_recheck[BTRFS_MAX_MIRRORS] = { 0 };
-	struct scrub_block *sblock_bad;
-	int ret;
-	int mirror_index;
-	int sector_num;
-	int success;
-	bool full_stripe_locked;
-	unsigned int nofs_flag;
-	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
-				      DEFAULT_RATELIMIT_BURST);
+	int ret = 0;
+	u64 length;
 
-	BUG_ON(sblock_to_check->sector_count < 1);
-	fs_info = sctx->fs_info;
-	if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
-		/*
-		 * If we find an error in a super block, we just report it.
-		 * They will get written with the next transaction commit
-		 * anyway
-		 */
-		scrub_print_warning("super block error", sblock_to_check);
-		spin_lock(&sctx->stat_lock);
-		++sctx->stat.super_errors;
-		spin_unlock(&sctx->stat_lock);
-		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
+	if (!btrfs_is_zoned(sctx->fs_info))
 		return 0;
-	}
-	logical = sblock_to_check->logical;
-	ASSERT(sblock_to_check->mirror_num);
-	failed_mirror_index = sblock_to_check->mirror_num - 1;
-	is_metadata = !(sblock_to_check->sectors[0]->flags &
-			BTRFS_EXTENT_FLAG_DATA);
-	have_csum = sblock_to_check->sectors[0]->have_csum;
-
-	if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
+
+	if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
 		return 0;
 
-	/*
-	 * We must use GFP_NOFS because the scrub task might be waiting for a
-	 * worker task executing this function and in turn a transaction commit
-	 * might be waiting the scrub task to pause (which needs to wait for all
-	 * the worker tasks to complete before pausing).
-	 * We do allocations in the workers through insert_full_stripe_lock()
-	 * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
-	 * this function.
-	 */
-	nofs_flag = memalloc_nofs_save();
-	/*
-	 * For RAID5/6, race can happen for a different device scrub thread.
-	 * For data corruption, Parity and Data threads will both try
-	 * to recovery the data.
-	 * Race can lead to doubly added csum error, or even unrecoverable
-	 * error.
-	 */
-	ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
-	if (ret < 0) {
-		memalloc_nofs_restore(nofs_flag);
-		spin_lock(&sctx->stat_lock);
-		if (ret == -ENOMEM)
-			sctx->stat.malloc_errors++;
-		sctx->stat.read_errors++;
-		sctx->stat.uncorrectable_errors++;
-		spin_unlock(&sctx->stat_lock);
-		return ret;
+	if (sctx->write_pointer < physical) {
+		length = physical - sctx->write_pointer;
+
+		ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
+						sctx->write_pointer, length);
+		if (!ret)
+			sctx->write_pointer = physical;
 	}
+	return ret;
+}
 
-	/*
-	 * read all mirrors one after the other. This includes to
-	 * re-read the extent or metadata block that failed (that was
-	 * the cause that this fixup code is called) another time,
-	 * sector by sector this time in order to know which sectors
-	 * caused I/O errors and which ones are good (for all mirrors).
-	 * It is the goal to handle the situation when more than one
-	 * mirror contains I/O errors, but the errors do not
-	 * overlap, i.e. the data can be repaired by selecting the
-	 * sectors from those mirrors without I/O error on the
-	 * particular sectors. One example (with blocks >= 2 * sectorsize)
-	 * would be that mirror #1 has an I/O error on the first sector,
-	 * the second sector is good, and mirror #2 has an I/O error on
-	 * the second sector, but the first sector is good.
-	 * Then the first sector of the first mirror can be repaired by
-	 * taking the first sector of the second mirror, and the
-	 * second sector of the second mirror can be repaired by
-	 * copying the contents of the 2nd sector of the 1st mirror.
-	 * One more note: if the sectors of one mirror contain I/O
-	 * errors, the checksum cannot be verified. In order to get
-	 * the best data for repairing, the first attempt is to find
-	 * a mirror without I/O errors and with a validated checksum.
-	 * Only if this is not possible, the sectors are picked from
-	 * mirrors with I/O errors without considering the checksum.
-	 * If the latter is the case, at the end, the checksum of the
-	 * repaired area is verified in order to correctly maintain
-	 * the statistics.
-	 */
-	for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
-		/*
-		 * Note: the two members refs and outstanding_sectors are not
-		 * used in the blocks that are used for the recheck procedure.
-		 *
-		 * But alloc_scrub_block() will initialize sblock::ref anyway,
-		 * so we can use scrub_block_put() to clean them up.
-		 *
-		 * And here we don't setup the physical/dev for the sblock yet,
-		 * they will be correctly initialized in scrub_setup_recheck_block().
-		 */
-		sblocks_for_recheck[mirror_index] = alloc_scrub_block(sctx, NULL,
-							logical, 0, 0, mirror_index);
-		if (!sblocks_for_recheck[mirror_index]) {
-			spin_lock(&sctx->stat_lock);
-			sctx->stat.malloc_errors++;
-			sctx->stat.read_errors++;
-			sctx->stat.uncorrectable_errors++;
-			spin_unlock(&sctx->stat_lock);
-			btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
-			goto out;
-		}
-	}
-
-	/* Setup the context, map the logical blocks and alloc the sectors */
-	ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
-	if (ret) {
-		spin_lock(&sctx->stat_lock);
-		sctx->stat.read_errors++;
-		sctx->stat.uncorrectable_errors++;
-		spin_unlock(&sctx->stat_lock);
-		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
-		goto out;
-	}
-	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
-	sblock_bad = sblocks_for_recheck[failed_mirror_index];
-
-	/* build and submit the bios for the failed mirror, check checksums */
-	scrub_recheck_block(fs_info, sblock_bad, 1);
-
-	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
-	    sblock_bad->no_io_error_seen) {
-		/*
-		 * The error disappeared after reading sector by sector, or
-		 * the area was part of a huge bio and other parts of the
-		 * bio caused I/O errors, or the block layer merged several
-		 * read requests into one and the error is caused by a
-		 * different bio (usually one of the two latter cases is
-		 * the cause)
-		 */
-		spin_lock(&sctx->stat_lock);
-		sctx->stat.unverified_errors++;
-		sblock_to_check->data_corrected = 1;
-		spin_unlock(&sctx->stat_lock);
-
-		if (sctx->is_dev_replace)
-			scrub_write_block_to_dev_replace(sblock_bad);
-		goto out;
-	}
-
-	if (!sblock_bad->no_io_error_seen) {
-		spin_lock(&sctx->stat_lock);
-		sctx->stat.read_errors++;
-		spin_unlock(&sctx->stat_lock);
-		if (__ratelimit(&rs))
-			scrub_print_warning("i/o error", sblock_to_check);
-		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
-	} else if (sblock_bad->checksum_error) {
-		spin_lock(&sctx->stat_lock);
-		sctx->stat.csum_errors++;
-		spin_unlock(&sctx->stat_lock);
-		if (__ratelimit(&rs))
-			scrub_print_warning("checksum error", sblock_to_check);
-		btrfs_dev_stat_inc_and_print(dev,
-					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
-	} else if (sblock_bad->header_error) {
-		spin_lock(&sctx->stat_lock);
-		sctx->stat.verify_errors++;
-		spin_unlock(&sctx->stat_lock);
-		if (__ratelimit(&rs))
-			scrub_print_warning("checksum/header error",
-					    sblock_to_check);
-		if (sblock_bad->generation_error)
-			btrfs_dev_stat_inc_and_print(dev,
-				BTRFS_DEV_STAT_GENERATION_ERRS);
-		else
-			btrfs_dev_stat_inc_and_print(dev,
-				BTRFS_DEV_STAT_CORRUPTION_ERRS);
-	}
-
-	if (sctx->readonly) {
-		ASSERT(!sctx->is_dev_replace);
-		goto out;
-	}
-
-	/*
-	 * now build and submit the bios for the other mirrors, check
-	 * checksums.
-	 * First try to pick the mirror which is completely without I/O
-	 * errors and also does not have a checksum error.
-	 * If one is found, and if a checksum is present, the full block
-	 * that is known to contain an error is rewritten. Afterwards
-	 * the block is known to be corrected.
-	 * If a mirror is found which is completely correct, and no
-	 * checksum is present, only those sectors are rewritten that had
-	 * an I/O error in the block to be repaired, since it cannot be
-	 * determined, which copy of the other sectors is better (and it
-	 * could happen otherwise that a correct sector would be
-	 * overwritten by a bad one).
-	 */
-	for (mirror_index = 0; ;mirror_index++) {
-		struct scrub_block *sblock_other;
-
-		if (mirror_index == failed_mirror_index)
-			continue;
-
-		/* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
-		if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
-			if (mirror_index >= BTRFS_MAX_MIRRORS)
-				break;
-			if (!sblocks_for_recheck[mirror_index]->sector_count)
-				break;
-
-			sblock_other = sblocks_for_recheck[mirror_index];
-		} else {
-			struct scrub_recover *r = sblock_bad->sectors[0]->recover;
-			int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
-
-			if (mirror_index >= max_allowed)
-				break;
-			if (!sblocks_for_recheck[1]->sector_count)
-				break;
-
-			ASSERT(failed_mirror_index == 0);
-			sblock_other = sblocks_for_recheck[1];
-			sblock_other->mirror_num = 1 + mirror_index;
-		}
-
-		/* build and submit the bios, check checksums */
-		scrub_recheck_block(fs_info, sblock_other, 0);
-
-		if (!sblock_other->header_error &&
-		    !sblock_other->checksum_error &&
-		    sblock_other->no_io_error_seen) {
-			if (sctx->is_dev_replace) {
-				scrub_write_block_to_dev_replace(sblock_other);
-				goto corrected_error;
-			} else {
-				ret = scrub_repair_block_from_good_copy(
-						sblock_bad, sblock_other);
-				if (!ret)
-					goto corrected_error;
-			}
-		}
-	}
-
-	if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
-		goto did_not_correct_error;
-
-	/*
-	 * In case of I/O errors in the area that is supposed to be
-	 * repaired, continue by picking good copies of those sectors.
-	 * Select the good sectors from mirrors to rewrite bad sectors from
-	 * the area to fix. Afterwards verify the checksum of the block
-	 * that is supposed to be repaired. This verification step is
-	 * only done for the purpose of statistic counting and for the
-	 * final scrub report, whether errors remain.
-	 * A perfect algorithm could make use of the checksum and try
-	 * all possible combinations of sectors from the different mirrors
-	 * until the checksum verification succeeds. For example, when
-	 * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
-	 * of mirror #2 is readable but the final checksum test fails,
-	 * then the 2nd sector of mirror #3 could be tried, whether now
-	 * the final checksum succeeds. But this would be a rare
-	 * exception and is therefore not implemented. At least it is
-	 * avoided that the good copy is overwritten.
-	 * A more useful improvement would be to pick the sectors
-	 * without I/O error based on sector sizes (512 bytes on legacy
-	 * disks) instead of on sectorsize. Then maybe 512 byte of one
-	 * mirror could be repaired by taking 512 byte of a different
-	 * mirror, even if other 512 byte sectors in the same sectorsize
-	 * area are unreadable.
-	 */
-	success = 1;
-	for (sector_num = 0; sector_num < sblock_bad->sector_count;
-	     sector_num++) {
-		struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
-		struct scrub_block *sblock_other = NULL;
-
-		/* Skip no-io-error sectors in scrub */
-		if (!sector_bad->io_error && !sctx->is_dev_replace)
-			continue;
-
-		if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
-			/*
-			 * In case of dev replace, if raid56 rebuild process
-			 * didn't work out correct data, then copy the content
-			 * in sblock_bad to make sure target device is identical
-			 * to source device, instead of writing garbage data in
-			 * sblock_for_recheck array to target device.
-			 */
-			sblock_other = NULL;
-		} else if (sector_bad->io_error) {
-			/* Try to find no-io-error sector in mirrors */
-			for (mirror_index = 0;
-			     mirror_index < BTRFS_MAX_MIRRORS &&
-			     sblocks_for_recheck[mirror_index]->sector_count > 0;
-			     mirror_index++) {
-				if (!sblocks_for_recheck[mirror_index]->
-				    sectors[sector_num]->io_error) {
-					sblock_other = sblocks_for_recheck[mirror_index];
-					break;
-				}
-			}
-			if (!sblock_other)
-				success = 0;
-		}
-
-		if (sctx->is_dev_replace) {
-			/*
-			 * Did not find a mirror to fetch the sector from.
-			 * scrub_write_sector_to_dev_replace() handles this
-			 * case (sector->io_error), by filling the block with
-			 * zeros before submitting the write request
-			 */
-			if (!sblock_other)
-				sblock_other = sblock_bad;
-
-			if (scrub_write_sector_to_dev_replace(sblock_other,
-							      sector_num) != 0) {
-				atomic64_inc(
-					&fs_info->dev_replace.num_write_errors);
-				success = 0;
-			}
-		} else if (sblock_other) {
-			ret = scrub_repair_sector_from_good_copy(sblock_bad,
-								 sblock_other,
-								 sector_num, 0);
-			if (0 == ret)
-				sector_bad->io_error = 0;
-			else
-				success = 0;
-		}
-	}
-
-	if (success && !sctx->is_dev_replace) {
-		if (is_metadata || have_csum) {
-			/*
-			 * need to verify the checksum now that all
-			 * sectors on disk are repaired (the write
-			 * request for data to be repaired is on its way).
-			 * Just be lazy and use scrub_recheck_block()
-			 * which re-reads the data before the checksum
-			 * is verified, but most likely the data comes out
-			 * of the page cache.
-			 */
-			scrub_recheck_block(fs_info, sblock_bad, 1);
-			if (!sblock_bad->header_error &&
-			    !sblock_bad->checksum_error &&
-			    sblock_bad->no_io_error_seen)
-				goto corrected_error;
-			else
-				goto did_not_correct_error;
-		} else {
-corrected_error:
-			spin_lock(&sctx->stat_lock);
-			sctx->stat.corrected_errors++;
-			sblock_to_check->data_corrected = 1;
-			spin_unlock(&sctx->stat_lock);
-			btrfs_err_rl_in_rcu(fs_info,
-				"fixed up error at logical %llu on dev %s",
-				logical, btrfs_dev_name(dev));
-		}
-	} else {
-did_not_correct_error:
-		spin_lock(&sctx->stat_lock);
-		sctx->stat.uncorrectable_errors++;
-		spin_unlock(&sctx->stat_lock);
-		btrfs_err_rl_in_rcu(fs_info,
-			"unable to fixup (regular) error at logical %llu on dev %s",
-			logical, btrfs_dev_name(dev));
-	}
-
-out:
-	for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
-		struct scrub_block *sblock = sblocks_for_recheck[mirror_index];
-		struct scrub_recover *recover;
-		int sector_index;
-
-		/* Not allocated, continue checking the next mirror */
-		if (!sblock)
-			continue;
-
-		for (sector_index = 0; sector_index < sblock->sector_count;
-		     sector_index++) {
-			/*
-			 * Here we just cleanup the recover, each sector will be
-			 * properly cleaned up by later scrub_block_put()
-			 */
-			recover = sblock->sectors[sector_index]->recover;
-			if (recover) {
-				scrub_put_recover(fs_info, recover);
-				sblock->sectors[sector_index]->recover = NULL;
-			}
-		}
-		scrub_block_put(sblock);
-	}
-
-	ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
-	memalloc_nofs_restore(nofs_flag);
-	if (ret < 0)
-		return ret;
-	return 0;
-}
-
-static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
+static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr)
 {
-	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
-		return 2;
-	else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
-		return 3;
-	else
-		return (int)bioc->num_stripes;
-}
+	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+	int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT;
 
-static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
-						 u64 *raid_map,
-						 int nstripes, int mirror,
-						 int *stripe_index,
-						 u64 *stripe_offset)
-{
-	int i;
-
-	if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-		/* RAID5/6 */
-		for (i = 0; i < nstripes; i++) {
-			if (raid_map[i] == RAID6_Q_STRIPE ||
-			    raid_map[i] == RAID5_P_STRIPE)
-				continue;
-
-			if (logical >= raid_map[i] &&
-			    logical < raid_map[i] + BTRFS_STRIPE_LEN)
-				break;
-		}
-
-		*stripe_index = i;
-		*stripe_offset = logical - raid_map[i];
-	} else {
-		/* The other RAID type */
-		*stripe_index = mirror;
-		*stripe_offset = 0;
-	}
+	return stripe->pages[page_index];
 }
 
-static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
-				     struct scrub_block *sblocks_for_recheck[])
+static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe,
+						 int sector_nr)
 {
-	struct scrub_ctx *sctx = original_sblock->sctx;
-	struct btrfs_fs_info *fs_info = sctx->fs_info;
-	u64 logical = original_sblock->logical;
-	u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
-	u64 generation = original_sblock->sectors[0]->generation;
-	u64 flags = original_sblock->sectors[0]->flags;
-	u64 have_csum = original_sblock->sectors[0]->have_csum;
-	struct scrub_recover *recover;
-	struct btrfs_io_context *bioc;
-	u64 sublen;
-	u64 mapped_length;
-	u64 stripe_offset;
-	int stripe_index;
-	int sector_index = 0;
-	int mirror_index;
-	int nmirrors;
-	int ret;
-
-	while (length > 0) {
-		sublen = min_t(u64, length, fs_info->sectorsize);
-		mapped_length = sublen;
-		bioc = NULL;
+	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
 
-		/*
-		 * With a length of sectorsize, each returned stripe represents
-		 * one mirror
-		 */
-		btrfs_bio_counter_inc_blocked(fs_info);
-		ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
-				       logical, &mapped_length, &bioc);
-		if (ret || !bioc || mapped_length < sublen) {
-			btrfs_put_bioc(bioc);
-			btrfs_bio_counter_dec(fs_info);
-			return -EIO;
-		}
-
-		recover = kzalloc(sizeof(struct scrub_recover), GFP_KERNEL);
-		if (!recover) {
-			btrfs_put_bioc(bioc);
-			btrfs_bio_counter_dec(fs_info);
-			return -ENOMEM;
-		}
-
-		refcount_set(&recover->refs, 1);
-		recover->bioc = bioc;
-		recover->map_length = mapped_length;
-
-		ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
-
-		nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
-
-		for (mirror_index = 0; mirror_index < nmirrors;
-		     mirror_index++) {
-			struct scrub_block *sblock;
-			struct scrub_sector *sector;
-
-			sblock = sblocks_for_recheck[mirror_index];
-			sblock->sctx = sctx;
-
-			sector = alloc_scrub_sector(sblock, logical);
-			if (!sector) {
-				spin_lock(&sctx->stat_lock);
-				sctx->stat.malloc_errors++;
-				spin_unlock(&sctx->stat_lock);
-				scrub_put_recover(fs_info, recover);
-				return -ENOMEM;
-			}
-			sector->flags = flags;
-			sector->generation = generation;
-			sector->have_csum = have_csum;
-			if (have_csum)
-				memcpy(sector->csum,
-				       original_sblock->sectors[0]->csum,
-				       sctx->fs_info->csum_size);
-
-			scrub_stripe_index_and_offset(logical,
-						      bioc->map_type,
-						      bioc->raid_map,
-						      bioc->num_stripes -
-						      bioc->num_tgtdevs,
-						      mirror_index,
-						      &stripe_index,
-						      &stripe_offset);
-			/*
-			 * We're at the first sector, also populate @sblock
-			 * physical and dev.
-			 */
-			if (sector_index == 0) {
-				sblock->physical =
-					bioc->stripes[stripe_index].physical +
-					stripe_offset;
-				sblock->dev = bioc->stripes[stripe_index].dev;
-				sblock->physical_for_dev_replace =
-					original_sblock->physical_for_dev_replace;
-			}
-
-			BUG_ON(sector_index >= original_sblock->sector_count);
-			scrub_get_recover(recover);
-			sector->recover = recover;
-		}
-		scrub_put_recover(fs_info, recover);
-		length -= sublen;
-		logical += sublen;
-		sector_index++;
-	}
-
-	return 0;
-}
-
-static void scrub_bio_wait_endio(struct bio *bio)
-{
-	complete(bio->bi_private);
+	return offset_in_page(sector_nr << fs_info->sectorsize_bits);
 }
 
-static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
-					struct bio *bio,
-					struct scrub_sector *sector)
+static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
 {
-	DECLARE_COMPLETION_ONSTACK(done);
-
-	bio->bi_iter.bi_sector = (sector->offset + sector->sblock->logical) >>
-				 SECTOR_SHIFT;
-	bio->bi_private = &done;
-	bio->bi_end_io = scrub_bio_wait_endio;
-	raid56_parity_recover(bio, sector->recover->bioc, sector->sblock->mirror_num);
-
-	wait_for_completion_io(&done);
-	return blk_status_to_errno(bio->bi_status);
-}
-
-static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
-					  struct scrub_block *sblock)
-{
-	struct scrub_sector *first_sector = sblock->sectors[0];
-	struct bio *bio;
-	int i;
-
-	/* All sectors in sblock belong to the same stripe on the same device. */
-	ASSERT(sblock->dev);
-	if (!sblock->dev->bdev)
-		goto out;
-
-	bio = bio_alloc(sblock->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
-
-	for (i = 0; i < sblock->sector_count; i++) {
-		struct scrub_sector *sector = sblock->sectors[i];
-
-		bio_add_scrub_sector(bio, sector, fs_info->sectorsize);
-	}
-
-	if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
-		bio_put(bio);
-		goto out;
-	}
-
-	bio_put(bio);
-
-	scrub_recheck_block_checksum(sblock);
-
-	return;
-out:
-	for (i = 0; i < sblock->sector_count; i++)
-		sblock->sectors[i]->io_error = 1;
-
-	sblock->no_io_error_seen = 0;
-}
-
-/*
- * This function will check the on disk data for checksum errors, header errors
- * and read I/O errors. If any I/O errors happen, the exact sectors which are
- * errored are marked as being bad. The goal is to enable scrub to take those
- * sectors that are not errored from all the mirrors so that the sectors that
- * are errored in the just handled mirror can be repaired.
- */
-static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
-				struct scrub_block *sblock,
-				int retry_failed_mirror)
-{
-	int i;
-
-	sblock->no_io_error_seen = 1;
-
-	/* short cut for raid56 */
-	if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
-		return scrub_recheck_block_on_raid56(fs_info, sblock);
-
-	for (i = 0; i < sblock->sector_count; i++) {
-		struct scrub_sector *sector = sblock->sectors[i];
-		struct bio bio;
-		struct bio_vec bvec;
-
-		if (sblock->dev->bdev == NULL) {
-			sector->io_error = 1;
-			sblock->no_io_error_seen = 0;
-			continue;
-		}
-
-		bio_init(&bio, sblock->dev->bdev, &bvec, 1, REQ_OP_READ);
-		bio_add_scrub_sector(&bio, sector, fs_info->sectorsize);
-		bio.bi_iter.bi_sector = (sblock->physical + sector->offset) >>
-					SECTOR_SHIFT;
-
-		btrfsic_check_bio(&bio);
-		if (submit_bio_wait(&bio)) {
-			sector->io_error = 1;
-			sblock->no_io_error_seen = 0;
-		}
-
-		bio_uninit(&bio);
-	}
-
-	if (sblock->no_io_error_seen)
-		scrub_recheck_block_checksum(sblock);
-}
-
-static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
-{
-	struct btrfs_fs_devices *fs_devices = sector->sblock->dev->fs_devices;
-	int ret;
-
-	ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
-	return !ret;
-}
-
-static void scrub_recheck_block_checksum(struct scrub_block *sblock)
-{
-	sblock->header_error = 0;
-	sblock->checksum_error = 0;
-	sblock->generation_error = 0;
-
-	if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
-		scrub_checksum_data(sblock);
-	else
-		scrub_checksum_tree_block(sblock);
-}
-
-static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
-					     struct scrub_block *sblock_good)
-{
-	int i;
-	int ret = 0;
-
-	for (i = 0; i < sblock_bad->sector_count; i++) {
-		int ret_sub;
-
-		ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
-							     sblock_good, i, 1);
-		if (ret_sub)
-			ret = ret_sub;
-	}
-
-	return ret;
-}
-
-static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
-					      struct scrub_block *sblock_good,
-					      int sector_num, int force_write)
-{
-	struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
-	struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
-	struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
-	const u32 sectorsize = fs_info->sectorsize;
-
-	if (force_write || sblock_bad->header_error ||
-	    sblock_bad->checksum_error || sector_bad->io_error) {
-		struct bio bio;
-		struct bio_vec bvec;
-		int ret;
-
-		if (!sblock_bad->dev->bdev) {
-			btrfs_warn_rl(fs_info,
-				"scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
-			return -EIO;
-		}
-
-		bio_init(&bio, sblock_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
-		bio.bi_iter.bi_sector = (sblock_bad->physical +
-					 sector_bad->offset) >> SECTOR_SHIFT;
-		ret = bio_add_scrub_sector(&bio, sector_good, sectorsize);
-
-		btrfsic_check_bio(&bio);
-		ret = submit_bio_wait(&bio);
-		bio_uninit(&bio);
-
-		if (ret) {
-			btrfs_dev_stat_inc_and_print(sblock_bad->dev,
-				BTRFS_DEV_STAT_WRITE_ERRS);
-			atomic64_inc(&fs_info->dev_replace.num_write_errors);
-			return -EIO;
-		}
-	}
-
-	return 0;
-}
-
-static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
-{
-	struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
-	int i;
-
-	/*
-	 * This block is used for the check of the parity on the source device,
-	 * so the data needn't be written into the destination device.
-	 */
-	if (sblock->sparity)
-		return;
-
-	for (i = 0; i < sblock->sector_count; i++) {
-		int ret;
-
-		ret = scrub_write_sector_to_dev_replace(sblock, i);
-		if (ret)
-			atomic64_inc(&fs_info->dev_replace.num_write_errors);
-	}
-}
-
-static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
-{
-	const u32 sectorsize = sblock->sctx->fs_info->sectorsize;
-	struct scrub_sector *sector = sblock->sectors[sector_num];
-
-	if (sector->io_error)
-		memset(scrub_sector_get_kaddr(sector), 0, sectorsize);
-
-	return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
-}
-
-static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
-{
-	int ret = 0;
-	u64 length;
-
-	if (!btrfs_is_zoned(sctx->fs_info))
-		return 0;
-
-	if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
-		return 0;
-
-	if (sctx->write_pointer < physical) {
-		length = physical - sctx->write_pointer;
-
-		ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
-						sctx->write_pointer, length);
-		if (!ret)
-			sctx->write_pointer = physical;
-	}
-	return ret;
-}
-
-static void scrub_block_get(struct scrub_block *sblock)
-{
-	refcount_inc(&sblock->refs);
-}
-
-static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
-				      struct scrub_sector *sector)
-{
-	struct scrub_block *sblock = sector->sblock;
-	struct scrub_bio *sbio;
-	int ret;
-	const u32 sectorsize = sctx->fs_info->sectorsize;
-
-	mutex_lock(&sctx->wr_lock);
-again:
-	if (!sctx->wr_curr_bio) {
-		sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
-					      GFP_KERNEL);
-		if (!sctx->wr_curr_bio) {
-			mutex_unlock(&sctx->wr_lock);
-			return -ENOMEM;
-		}
-		sctx->wr_curr_bio->sctx = sctx;
-		sctx->wr_curr_bio->sector_count = 0;
-	}
-	sbio = sctx->wr_curr_bio;
-	if (sbio->sector_count == 0) {
-		ret = fill_writer_pointer_gap(sctx, sector->offset +
-					      sblock->physical_for_dev_replace);
-		if (ret) {
-			mutex_unlock(&sctx->wr_lock);
-			return ret;
-		}
-
-		sbio->physical = sblock->physical_for_dev_replace + sector->offset;
-		sbio->logical = sblock->logical + sector->offset;
-		sbio->dev = sctx->wr_tgtdev;
-		if (!sbio->bio) {
-			sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
-					      REQ_OP_WRITE, GFP_NOFS);
-		}
-		sbio->bio->bi_private = sbio;
-		sbio->bio->bi_end_io = scrub_wr_bio_end_io;
-		sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
-		sbio->status = 0;
-	} else if (sbio->physical + sbio->sector_count * sectorsize !=
-		   sblock->physical_for_dev_replace + sector->offset ||
-		   sbio->logical + sbio->sector_count * sectorsize !=
-		   sblock->logical + sector->offset) {
-		scrub_wr_submit(sctx);
-		goto again;
-	}
-
-	ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
-	if (ret != sectorsize) {
-		if (sbio->sector_count < 1) {
-			bio_put(sbio->bio);
-			sbio->bio = NULL;
-			mutex_unlock(&sctx->wr_lock);
-			return -EIO;
-		}
-		scrub_wr_submit(sctx);
-		goto again;
-	}
-
-	sbio->sectors[sbio->sector_count] = sector;
-	scrub_sector_get(sector);
-	/*
-	 * Since ssector no longer holds a page, but uses sblock::pages, we
-	 * have to ensure the sblock had not been freed before our write bio
-	 * finished.
-	 */
-	scrub_block_get(sector->sblock);
-
-	sbio->sector_count++;
-	if (sbio->sector_count == sctx->sectors_per_bio)
-		scrub_wr_submit(sctx);
-	mutex_unlock(&sctx->wr_lock);
-
-	return 0;
-}
-
-static void scrub_wr_submit(struct scrub_ctx *sctx)
-{
-	struct scrub_bio *sbio;
-
-	if (!sctx->wr_curr_bio)
-		return;
-
-	sbio = sctx->wr_curr_bio;
-	sctx->wr_curr_bio = NULL;
-	scrub_pending_bio_inc(sctx);
-	/* process all writes in a single worker thread. Then the block layer
-	 * orders the requests before sending them to the driver which
-	 * doubled the write performance on spinning disks when measured
-	 * with Linux 3.5 */
-	btrfsic_check_bio(sbio->bio);
-	submit_bio(sbio->bio);
-
-	if (btrfs_is_zoned(sctx->fs_info))
-		sctx->write_pointer = sbio->physical + sbio->sector_count *
-			sctx->fs_info->sectorsize;
-}
-
-static void scrub_wr_bio_end_io(struct bio *bio)
-{
-	struct scrub_bio *sbio = bio->bi_private;
-	struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
-
-	sbio->status = bio->bi_status;
-	sbio->bio = bio;
-
-	INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
-	queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
-}
-
-static void scrub_wr_bio_end_io_worker(struct work_struct *work)
-{
-	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
-	struct scrub_ctx *sctx = sbio->sctx;
-	int i;
-
-	ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
-	if (sbio->status) {
-		struct btrfs_dev_replace *dev_replace =
-			&sbio->sctx->fs_info->dev_replace;
-
-		for (i = 0; i < sbio->sector_count; i++) {
-			struct scrub_sector *sector = sbio->sectors[i];
-
-			sector->io_error = 1;
-			atomic64_inc(&dev_replace->num_write_errors);
-		}
-	}
-
-	/*
-	 * In scrub_add_sector_to_wr_bio() we grab extra ref for sblock, now in
-	 * endio we should put the sblock.
-	 */
-	for (i = 0; i < sbio->sector_count; i++) {
-		scrub_block_put(sbio->sectors[i]->sblock);
-		scrub_sector_put(sbio->sectors[i]);
-	}
-
-	bio_put(sbio->bio);
-	kfree(sbio);
-	scrub_pending_bio_dec(sctx);
-}
-
-static int scrub_checksum(struct scrub_block *sblock)
-{
-	u64 flags;
-	int ret;
-
-	/*
-	 * No need to initialize these stats currently,
-	 * because this function only use return value
-	 * instead of these stats value.
-	 *
-	 * Todo:
-	 * always use stats
-	 */
-	sblock->header_error = 0;
-	sblock->generation_error = 0;
-	sblock->checksum_error = 0;
-
-	WARN_ON(sblock->sector_count < 1);
-	flags = sblock->sectors[0]->flags;
-	ret = 0;
-	if (flags & BTRFS_EXTENT_FLAG_DATA)
-		ret = scrub_checksum_data(sblock);
-	else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
-		ret = scrub_checksum_tree_block(sblock);
-	else if (flags & BTRFS_EXTENT_FLAG_SUPER)
-		ret = scrub_checksum_super(sblock);
-	else
-		WARN_ON(1);
-	if (ret)
-		scrub_handle_errored_block(sblock);
-
-	return ret;
-}
-
-static int scrub_checksum_data(struct scrub_block *sblock)
-{
-	struct scrub_ctx *sctx = sblock->sctx;
-	struct btrfs_fs_info *fs_info = sctx->fs_info;
+	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
+	const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits);
+	const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr);
+	const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr);
 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
-	u8 csum[BTRFS_CSUM_SIZE];
-	struct scrub_sector *sector;
-	char *kaddr;
-
-	BUG_ON(sblock->sector_count < 1);
-	sector = sblock->sectors[0];
-	if (!sector->have_csum)
-		return 0;
-
-	kaddr = scrub_sector_get_kaddr(sector);
-
-	shash->tfm = fs_info->csum_shash;
-	crypto_shash_init(shash);
-
-	crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
-
-	if (memcmp(csum, sector->csum, fs_info->csum_size))
-		sblock->checksum_error = 1;
-	return sblock->checksum_error;
-}
-
-static int scrub_checksum_tree_block(struct scrub_block *sblock)
-{
-	struct scrub_ctx *sctx = sblock->sctx;
-	struct btrfs_header *h;
-	struct btrfs_fs_info *fs_info = sctx->fs_info;
-	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
-	u8 calculated_csum[BTRFS_CSUM_SIZE];
 	u8 on_disk_csum[BTRFS_CSUM_SIZE];
-	/*
-	 * This is done in sectorsize steps even for metadata as there's a
-	 * constraint for nodesize to be aligned to sectorsize. This will need
-	 * to change so we don't misuse data and metadata units like that.
-	 */
-	const u32 sectorsize = sctx->fs_info->sectorsize;
-	const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
-	int i;
-	struct scrub_sector *sector;
-	char *kaddr;
-
-	BUG_ON(sblock->sector_count < 1);
-
-	/* Each member in sectors is just one sector */
-	ASSERT(sblock->sector_count == num_sectors);
-
-	sector = sblock->sectors[0];
-	kaddr = scrub_sector_get_kaddr(sector);
-	h = (struct btrfs_header *)kaddr;
-	memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
+	u8 calculated_csum[BTRFS_CSUM_SIZE];
+	struct btrfs_header *header;
 
 	/*
-	 * we don't use the getter functions here, as we
-	 * a) don't have an extent buffer and
-	 * b) the page is already kmapped
+	 * Here we don't have a good way to attach the pages (and subpages)
+	 * to a dummy extent buffer, thus we have to directly grab the members
+	 * from pages.
 	 */
-	if (sblock->logical != btrfs_stack_header_bytenr(h)) {
-		sblock->header_error = 1;
+	header = (struct btrfs_header *)(page_address(first_page) + first_off);
+	memcpy(on_disk_csum, header->csum, fs_info->csum_size);
+
+	if (logical != btrfs_stack_header_bytenr(header)) {
+		bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
+		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
 		btrfs_warn_rl(fs_info,
 		"tree block %llu mirror %u has bad bytenr, has %llu want %llu",
-			      sblock->logical, sblock->mirror_num,
-			      btrfs_stack_header_bytenr(h),
-			      sblock->logical);
-		goto out;
+			      logical, stripe->mirror_num,
+			      btrfs_stack_header_bytenr(header), logical);
+		return;
 	}
-
-	if (!scrub_check_fsid(h->fsid, sector)) {
-		sblock->header_error = 1;
+	if (memcmp(header->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE) != 0) {
+		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
 		btrfs_warn_rl(fs_info,
 		"tree block %llu mirror %u has bad fsid, has %pU want %pU",
-			      sblock->logical, sblock->mirror_num,
-			      h->fsid, sblock->dev->fs_devices->fsid);
-		goto out;
+			      logical, stripe->mirror_num,
+			      header->fsid, fs_info->fs_devices->fsid);
+		return;
 	}
-
-	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) {
-		sblock->header_error = 1;
+	if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid,
+		   BTRFS_UUID_SIZE) != 0) {
+		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
 		btrfs_warn_rl(fs_info,
 		"tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
-			      sblock->logical, sblock->mirror_num,
-			      h->chunk_tree_uuid, fs_info->chunk_tree_uuid);
-		goto out;
+			      logical, stripe->mirror_num,
+			      header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
+		return;
 	}
 
+	/* Now check tree block csum. */
 	shash->tfm = fs_info->csum_shash;
 	crypto_shash_init(shash);
-	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
-			    sectorsize - BTRFS_CSUM_SIZE);
+	crypto_shash_update(shash, page_address(first_page) + first_off +
+			    BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE);
+
+	for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
+		struct page *page = scrub_stripe_get_page(stripe, i);
+		unsigned int page_off = scrub_stripe_get_page_offset(stripe, i);
 
-	for (i = 1; i < num_sectors; i++) {
-		kaddr = scrub_sector_get_kaddr(sblock->sectors[i]);
-		crypto_shash_update(shash, kaddr, sectorsize);
+		crypto_shash_update(shash, page_address(page) + page_off,
+				    fs_info->sectorsize);
 	}
 
 	crypto_shash_final(shash, calculated_csum);
-	if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) {
-		sblock->checksum_error = 1;
+	if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
+		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
 		btrfs_warn_rl(fs_info,
 		"tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
-			      sblock->logical, sblock->mirror_num,
+			      logical, stripe->mirror_num,
 			      CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
 			      CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
-		goto out;
+		return;
 	}
-
-	if (sector->generation != btrfs_stack_header_generation(h)) {
-		sblock->header_error = 1;
-		sblock->generation_error = 1;
+	if (stripe->sectors[sector_nr].generation !=
+	    btrfs_stack_header_generation(header)) {
+		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
 		btrfs_warn_rl(fs_info,
 		"tree block %llu mirror %u has bad generation, has %llu want %llu",
-			      sblock->logical, sblock->mirror_num,
-			      btrfs_stack_header_generation(h),
-			      sector->generation);
-	}
-
-out:
-	return sblock->header_error || sblock->checksum_error;
-}
-
-static int scrub_checksum_super(struct scrub_block *sblock)
-{
-	struct btrfs_super_block *s;
-	struct scrub_ctx *sctx = sblock->sctx;
-	struct btrfs_fs_info *fs_info = sctx->fs_info;
-	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
-	u8 calculated_csum[BTRFS_CSUM_SIZE];
-	struct scrub_sector *sector;
-	char *kaddr;
-	int fail_gen = 0;
-	int fail_cor = 0;
-
-	BUG_ON(sblock->sector_count < 1);
-	sector = sblock->sectors[0];
-	kaddr = scrub_sector_get_kaddr(sector);
-	s = (struct btrfs_super_block *)kaddr;
-
-	if (sblock->logical != btrfs_super_bytenr(s))
-		++fail_cor;
-
-	if (sector->generation != btrfs_super_generation(s))
-		++fail_gen;
-
-	if (!scrub_check_fsid(s->fsid, sector))
-		++fail_cor;
-
-	shash->tfm = fs_info->csum_shash;
-	crypto_shash_init(shash);
-	crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
-			BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
-
-	if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
-		++fail_cor;
-
-	return fail_cor + fail_gen;
-}
-
-static void scrub_block_put(struct scrub_block *sblock)
-{
-	if (refcount_dec_and_test(&sblock->refs)) {
-		int i;
-
-		if (sblock->sparity)
-			scrub_parity_put(sblock->sparity);
-
-		for (i = 0; i < sblock->sector_count; i++)
-			scrub_sector_put(sblock->sectors[i]);
-		for (i = 0; i < DIV_ROUND_UP(sblock->len, PAGE_SIZE); i++) {
-			if (sblock->pages[i]) {
-				detach_scrub_page_private(sblock->pages[i]);
-				__free_page(sblock->pages[i]);
-			}
-		}
-		kfree(sblock);
+			      logical, stripe->mirror_num,
+			      btrfs_stack_header_generation(header),
+			      stripe->sectors[sector_nr].generation);
+		return;
 	}
+	bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+	bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
+	bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
 }
 
-static void scrub_sector_get(struct scrub_sector *sector)
-{
-	atomic_inc(&sector->refs);
-}
-
-static void scrub_sector_put(struct scrub_sector *sector)
+static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
 {
-	if (atomic_dec_and_test(&sector->refs))
-		kfree(sector);
-}
+	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+	struct scrub_sector_verification *sector = &stripe->sectors[sector_nr];
+	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
+	struct page *page = scrub_stripe_get_page(stripe, sector_nr);
+	unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
+	u8 csum_buf[BTRFS_CSUM_SIZE];
+	int ret;
 
-/*
- * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
- * second.  Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
- */
-static void scrub_throttle(struct scrub_ctx *sctx)
-{
-	const int time_slice = 1000;
-	struct scrub_bio *sbio;
-	struct btrfs_device *device;
-	s64 delta;
-	ktime_t now;
-	u32 div;
-	u64 bwlimit;
+	ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors);
 
-	sbio = sctx->bios[sctx->curr];
-	device = sbio->dev;
-	bwlimit = READ_ONCE(device->scrub_speed_max);
-	if (bwlimit == 0)
+	/* Sector not utilized, skip it. */
+	if (!test_bit(sector_nr, &stripe->extent_sector_bitmap))
 		return;
 
-	/*
-	 * Slice is divided into intervals when the IO is submitted, adjust by
-	 * bwlimit and maximum of 64 intervals.
-	 */
-	div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
-	div = min_t(u32, 64, div);
-
-	/* Start new epoch, set deadline */
-	now = ktime_get();
-	if (sctx->throttle_deadline == 0) {
-		sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
-		sctx->throttle_sent = 0;
-	}
+	/* IO error, no need to check. */
+	if (test_bit(sector_nr, &stripe->io_error_bitmap))
+		return;
 
-	/* Still in the time to send? */
-	if (ktime_before(now, sctx->throttle_deadline)) {
-		/* If current bio is within the limit, send it */
-		sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
-		if (sctx->throttle_sent <= div_u64(bwlimit, div))
+	/* Metadata, verify the full tree block. */
+	if (sector->is_metadata) {
+		/*
+		 * Check if the tree block crosses the stripe boudary.  If
+		 * crossed the boundary, we cannot verify it but only give a
+		 * warning.
+		 *
+		 * This can only happen on a very old filesystem where chunks
+		 * are not ensured to be stripe aligned.
+		 */
+		if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) {
+			btrfs_warn_rl(fs_info,
+			"tree block at %llu crosses stripe boundary %llu",
+				      stripe->logical +
+				      (sector_nr << fs_info->sectorsize_bits),
+				      stripe->logical);
 			return;
-
-		/* We're over the limit, sleep until the rest of the slice */
-		delta = ktime_ms_delta(sctx->throttle_deadline, now);
-	} else {
-		/* New request after deadline, start new epoch */
-		delta = 0;
-	}
-
-	if (delta) {
-		long timeout;
-
-		timeout = div_u64(delta * HZ, 1000);
-		schedule_timeout_interruptible(timeout);
-	}
-
-	/* Next call will start the deadline period */
-	sctx->throttle_deadline = 0;
-}
-
-static void scrub_submit(struct scrub_ctx *sctx)
-{
-	struct scrub_bio *sbio;
-
-	if (sctx->curr == -1)
+		}
+		scrub_verify_one_metadata(stripe, sector_nr);
 		return;
+	}
 
-	scrub_throttle(sctx);
-
-	sbio = sctx->bios[sctx->curr];
-	sctx->curr = -1;
-	scrub_pending_bio_inc(sctx);
-	btrfsic_check_bio(sbio->bio);
-	submit_bio(sbio->bio);
-}
-
-static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
-				      struct scrub_sector *sector)
-{
-	struct scrub_block *sblock = sector->sblock;
-	struct scrub_bio *sbio;
-	const u32 sectorsize = sctx->fs_info->sectorsize;
-	int ret;
-
-again:
 	/*
-	 * grab a fresh bio or wait for one to become available
+	 * Data is easier, we just verify the data csum (if we have it).  For
+	 * cases without csum, we have no other choice but to trust it.
 	 */
-	while (sctx->curr == -1) {
-		spin_lock(&sctx->list_lock);
-		sctx->curr = sctx->first_free;
-		if (sctx->curr != -1) {
-			sctx->first_free = sctx->bios[sctx->curr]->next_free;
-			sctx->bios[sctx->curr]->next_free = -1;
-			sctx->bios[sctx->curr]->sector_count = 0;
-			spin_unlock(&sctx->list_lock);
-		} else {
-			spin_unlock(&sctx->list_lock);
-			wait_event(sctx->list_wait, sctx->first_free != -1);
-		}
-	}
-	sbio = sctx->bios[sctx->curr];
-	if (sbio->sector_count == 0) {
-		sbio->physical = sblock->physical + sector->offset;
-		sbio->logical = sblock->logical + sector->offset;
-		sbio->dev = sblock->dev;
-		if (!sbio->bio) {
-			sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
-					      REQ_OP_READ, GFP_NOFS);
-		}
-		sbio->bio->bi_private = sbio;
-		sbio->bio->bi_end_io = scrub_bio_end_io;
-		sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
-		sbio->status = 0;
-	} else if (sbio->physical + sbio->sector_count * sectorsize !=
-		   sblock->physical + sector->offset ||
-		   sbio->logical + sbio->sector_count * sectorsize !=
-		   sblock->logical + sector->offset ||
-		   sbio->dev != sblock->dev) {
-		scrub_submit(sctx);
-		goto again;
+	if (!sector->csum) {
+		clear_bit(sector_nr, &stripe->error_bitmap);
+		return;
 	}
 
-	sbio->sectors[sbio->sector_count] = sector;
-	ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
-	if (ret != sectorsize) {
-		if (sbio->sector_count < 1) {
-			bio_put(sbio->bio);
-			sbio->bio = NULL;
-			return -EIO;
-		}
-		scrub_submit(sctx);
-		goto again;
+	ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum);
+	if (ret < 0) {
+		set_bit(sector_nr, &stripe->csum_error_bitmap);
+		set_bit(sector_nr, &stripe->error_bitmap);
+	} else {
+		clear_bit(sector_nr, &stripe->csum_error_bitmap);
+		clear_bit(sector_nr, &stripe->error_bitmap);
 	}
-
-	scrub_block_get(sblock); /* one for the page added to the bio */
-	atomic_inc(&sblock->outstanding_sectors);
-	sbio->sector_count++;
-	if (sbio->sector_count == sctx->sectors_per_bio)
-		scrub_submit(sctx);
-
-	return 0;
-}
-
-static void scrub_missing_raid56_end_io(struct bio *bio)
-{
-	struct scrub_block *sblock = bio->bi_private;
-	struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
-
-	btrfs_bio_counter_dec(fs_info);
-	if (bio->bi_status)
-		sblock->no_io_error_seen = 0;
-
-	bio_put(bio);
-
-	queue_work(fs_info->scrub_workers, &sblock->work);
 }
 
-static void scrub_missing_raid56_worker(struct work_struct *work)
+/* Verify specified sectors of a stripe. */
+static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap)
 {
-	struct scrub_block *sblock = container_of(work, struct scrub_block, work);
-	struct scrub_ctx *sctx = sblock->sctx;
-	struct btrfs_fs_info *fs_info = sctx->fs_info;
-	u64 logical;
-	struct btrfs_device *dev;
-
-	logical = sblock->logical;
-	dev = sblock->dev;
-
-	if (sblock->no_io_error_seen)
-		scrub_recheck_block_checksum(sblock);
-
-	if (!sblock->no_io_error_seen) {
-		spin_lock(&sctx->stat_lock);
-		sctx->stat.read_errors++;
-		spin_unlock(&sctx->stat_lock);
-		btrfs_err_rl_in_rcu(fs_info,
-			"IO error rebuilding logical %llu for dev %s",
-			logical, btrfs_dev_name(dev));
-	} else if (sblock->header_error || sblock->checksum_error) {
-		spin_lock(&sctx->stat_lock);
-		sctx->stat.uncorrectable_errors++;
-		spin_unlock(&sctx->stat_lock);
-		btrfs_err_rl_in_rcu(fs_info,
-			"failed to rebuild valid logical %llu for dev %s",
-			logical, btrfs_dev_name(dev));
-	} else {
-		scrub_write_block_to_dev_replace(sblock);
-	}
+	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
+	int sector_nr;
 
-	if (sctx->is_dev_replace && sctx->flush_all_writes) {
-		mutex_lock(&sctx->wr_lock);
-		scrub_wr_submit(sctx);
-		mutex_unlock(&sctx->wr_lock);
+	for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) {
+		scrub_verify_one_sector(stripe, sector_nr);
+		if (stripe->sectors[sector_nr].is_metadata)
+			sector_nr += sectors_per_tree - 1;
 	}
-
-	scrub_block_put(sblock);
-	scrub_pending_bio_dec(sctx);
 }
 
-static void scrub_missing_raid56_pages(struct scrub_block *sblock)
+static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec)
 {
-	struct scrub_ctx *sctx = sblock->sctx;
-	struct btrfs_fs_info *fs_info = sctx->fs_info;
-	u64 length = sblock->sector_count << fs_info->sectorsize_bits;
-	u64 logical = sblock->logical;
-	struct btrfs_io_context *bioc = NULL;
-	struct bio *bio;
-	struct btrfs_raid_bio *rbio;
-	int ret;
 	int i;
 
-	btrfs_bio_counter_inc_blocked(fs_info);
-	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
-			       &length, &bioc);
-	if (ret || !bioc || !bioc->raid_map)
-		goto bioc_out;
-
-	if (WARN_ON(!sctx->is_dev_replace ||
-		    !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
-		/*
-		 * We shouldn't be scrubbing a missing device. Even for dev
-		 * replace, we should only get here for RAID 5/6. We either
-		 * managed to mount something with no mirrors remaining or
-		 * there's a bug in scrub_find_good_copy()/btrfs_map_block().
-		 */
-		goto bioc_out;
-	}
-
-	bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
-	bio->bi_iter.bi_sector = logical >> 9;
-	bio->bi_private = sblock;
-	bio->bi_end_io = scrub_missing_raid56_end_io;
-
-	rbio = raid56_alloc_missing_rbio(bio, bioc);
-	if (!rbio)
-		goto rbio_out;
-
-	for (i = 0; i < sblock->sector_count; i++) {
-		struct scrub_sector *sector = sblock->sectors[i];
-
-		raid56_add_scrub_pages(rbio, scrub_sector_get_page(sector),
-				       scrub_sector_get_page_offset(sector),
-				       sector->offset + sector->sblock->logical);
+	for (i = 0; i < stripe->nr_sectors; i++) {
+		if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page &&
+		    scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset)
+			break;
 	}
-
-	INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
-	scrub_block_get(sblock);
-	scrub_pending_bio_inc(sctx);
-	raid56_submit_missing_rbio(rbio);
-	btrfs_put_bioc(bioc);
-	return;
-
-rbio_out:
-	bio_put(bio);
-bioc_out:
-	btrfs_bio_counter_dec(fs_info);
-	btrfs_put_bioc(bioc);
-	spin_lock(&sctx->stat_lock);
-	sctx->stat.malloc_errors++;
-	spin_unlock(&sctx->stat_lock);
+	ASSERT(i < stripe->nr_sectors);
+	return i;
 }
 
-static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
-		       u64 physical, struct btrfs_device *dev, u64 flags,
-		       u64 gen, int mirror_num, u8 *csum,
-		       u64 physical_for_dev_replace)
+/*
+ * Repair read is different to the regular read:
+ *
+ * - Only reads the failed sectors
+ * - May have extra blocksize limits
+ */
+static void scrub_repair_read_endio(struct btrfs_bio *bbio)
 {
-	struct scrub_block *sblock;
-	const u32 sectorsize = sctx->fs_info->sectorsize;
-	int index;
-
-	sblock = alloc_scrub_block(sctx, dev, logical, physical,
-				   physical_for_dev_replace, mirror_num);
-	if (!sblock) {
-		spin_lock(&sctx->stat_lock);
-		sctx->stat.malloc_errors++;
-		spin_unlock(&sctx->stat_lock);
-		return -ENOMEM;
-	}
+	struct scrub_stripe *stripe = bbio->private;
+	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+	struct bio_vec *bvec;
+	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
+	u32 bio_size = 0;
+	int i;
 
-	for (index = 0; len > 0; index++) {
-		struct scrub_sector *sector;
-		/*
-		 * Here we will allocate one page for one sector to scrub.
-		 * This is fine if PAGE_SIZE == sectorsize, but will cost
-		 * more memory for PAGE_SIZE > sectorsize case.
-		 */
-		u32 l = min(sectorsize, len);
+	ASSERT(sector_nr < stripe->nr_sectors);
 
-		sector = alloc_scrub_sector(sblock, logical);
-		if (!sector) {
-			spin_lock(&sctx->stat_lock);
-			sctx->stat.malloc_errors++;
-			spin_unlock(&sctx->stat_lock);
-			scrub_block_put(sblock);
-			return -ENOMEM;
-		}
-		sector->flags = flags;
-		sector->generation = gen;
-		if (csum) {
-			sector->have_csum = 1;
-			memcpy(sector->csum, csum, sctx->fs_info->csum_size);
-		} else {
-			sector->have_csum = 0;
-		}
-		len -= l;
-		logical += l;
-		physical += l;
-		physical_for_dev_replace += l;
-	}
+	bio_for_each_bvec_all(bvec, &bbio->bio, i)
+		bio_size += bvec->bv_len;
 
-	WARN_ON(sblock->sector_count == 0);
-	if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
-		/*
-		 * This case should only be hit for RAID 5/6 device replace. See
-		 * the comment in scrub_missing_raid56_pages() for details.
-		 */
-		scrub_missing_raid56_pages(sblock);
+	if (bbio->bio.bi_status) {
+		bitmap_set(&stripe->io_error_bitmap, sector_nr,
+			   bio_size >> fs_info->sectorsize_bits);
+		bitmap_set(&stripe->error_bitmap, sector_nr,
+			   bio_size >> fs_info->sectorsize_bits);
 	} else {
-		for (index = 0; index < sblock->sector_count; index++) {
-			struct scrub_sector *sector = sblock->sectors[index];
-			int ret;
-
-			ret = scrub_add_sector_to_rd_bio(sctx, sector);
-			if (ret) {
-				scrub_block_put(sblock);
-				return ret;
-			}
-		}
-
-		if (flags & BTRFS_EXTENT_FLAG_SUPER)
-			scrub_submit(sctx);
+		bitmap_clear(&stripe->io_error_bitmap, sector_nr,
+			     bio_size >> fs_info->sectorsize_bits);
 	}
-
-	/* last one frees, either here or in bio completion for last page */
-	scrub_block_put(sblock);
-	return 0;
+	bio_put(&bbio->bio);
+	if (atomic_dec_and_test(&stripe->pending_io))
+		wake_up(&stripe->io_wait);
 }
 
-static void scrub_bio_end_io(struct bio *bio)
+static int calc_next_mirror(int mirror, int num_copies)
 {
-	struct scrub_bio *sbio = bio->bi_private;
-	struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
-
-	sbio->status = bio->bi_status;
-	sbio->bio = bio;
-
-	queue_work(fs_info->scrub_workers, &sbio->work);
+	ASSERT(mirror <= num_copies);
+	return (mirror + 1 > num_copies) ? 1 : mirror + 1;
 }
 
-static void scrub_bio_end_io_worker(struct work_struct *work)
+static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
+					    int mirror, int blocksize, bool wait)
 {
-	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
-	struct scrub_ctx *sctx = sbio->sctx;
+	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+	struct btrfs_bio *bbio = NULL;
+	const unsigned long old_error_bitmap = stripe->error_bitmap;
 	int i;
 
-	ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
-	if (sbio->status) {
-		for (i = 0; i < sbio->sector_count; i++) {
-			struct scrub_sector *sector = sbio->sectors[i];
+	ASSERT(stripe->mirror_num >= 1);
+	ASSERT(atomic_read(&stripe->pending_io) == 0);
+
+	for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) {
+		struct page *page;
+		int pgoff;
+		int ret;
 
-			sector->io_error = 1;
-			sector->sblock->no_io_error_seen = 0;
+		page = scrub_stripe_get_page(stripe, i);
+		pgoff = scrub_stripe_get_page_offset(stripe, i);
+
+		/* The current sector cannot be merged, submit the bio. */
+		if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->error_bitmap)) ||
+			     bbio->bio.bi_iter.bi_size >= blocksize)) {
+			ASSERT(bbio->bio.bi_iter.bi_size);
+			atomic_inc(&stripe->pending_io);
+			btrfs_submit_bio(bbio, mirror);
+			if (wait)
+				wait_scrub_stripe_io(stripe);
+			bbio = NULL;
 		}
-	}
 
-	/* Now complete the scrub_block items that have all pages completed */
-	for (i = 0; i < sbio->sector_count; i++) {
-		struct scrub_sector *sector = sbio->sectors[i];
-		struct scrub_block *sblock = sector->sblock;
+		if (!bbio) {
+			bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
+				fs_info, scrub_repair_read_endio, stripe);
+			bbio->bio.bi_iter.bi_sector = (stripe->logical +
+				(i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT;
+		}
 
-		if (atomic_dec_and_test(&sblock->outstanding_sectors))
-			scrub_block_complete(sblock);
-		scrub_block_put(sblock);
+		ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
+		ASSERT(ret == fs_info->sectorsize);
 	}
-
-	bio_put(sbio->bio);
-	sbio->bio = NULL;
-	spin_lock(&sctx->list_lock);
-	sbio->next_free = sctx->first_free;
-	sctx->first_free = sbio->index;
-	spin_unlock(&sctx->list_lock);
-
-	if (sctx->is_dev_replace && sctx->flush_all_writes) {
-		mutex_lock(&sctx->wr_lock);
-		scrub_wr_submit(sctx);
-		mutex_unlock(&sctx->wr_lock);
+	if (bbio) {
+		ASSERT(bbio->bio.bi_iter.bi_size);
+		atomic_inc(&stripe->pending_io);
+		btrfs_submit_bio(bbio, mirror);
+		if (wait)
+			wait_scrub_stripe_io(stripe);
 	}
-
-	scrub_pending_bio_dec(sctx);
 }
 
-static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
-				       unsigned long *bitmap,
-				       u64 start, u32 len)
+static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
+				       struct scrub_stripe *stripe)
 {
-	u64 offset;
-	u32 nsectors;
-	u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
-
-	if (len >= sparity->stripe_len) {
-		bitmap_set(bitmap, 0, sparity->nsectors);
+	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+	struct btrfs_fs_info *fs_info = sctx->fs_info;
+	struct btrfs_device *dev = NULL;
+	u64 physical = 0;
+	int nr_data_sectors = 0;
+	int nr_meta_sectors = 0;
+	int nr_nodatacsum_sectors = 0;
+	int nr_repaired_sectors = 0;
+	int sector_nr;
+
+	if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state))
 		return;
-	}
 
-	start -= sparity->logic_start;
-	start = div64_u64_rem(start, sparity->stripe_len, &offset);
-	offset = offset >> sectorsize_bits;
-	nsectors = len >> sectorsize_bits;
+	/*
+	 * Init needed infos for error reporting.
+	 *
+	 * Although our scrub_stripe infrastucture is mostly based on btrfs_submit_bio()
+	 * thus no need for dev/physical, error reporting still needs dev and physical.
+	 */
+	if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) {
+		u64 mapped_len = fs_info->sectorsize;
+		struct btrfs_io_context *bioc = NULL;
+		int stripe_index = stripe->mirror_num - 1;
+		int ret;
 
-	if (offset + nsectors <= sparity->nsectors) {
-		bitmap_set(bitmap, offset, nsectors);
-		return;
+		/* For scrub, our mirror_num should always start at 1. */
+		ASSERT(stripe->mirror_num >= 1);
+		ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
+				       stripe->logical, &mapped_len, &bioc);
+		/*
+		 * If we failed, dev will be NULL, and later detailed reports
+		 * will just be skipped.
+		 */
+		if (ret < 0)
+			goto skip;
+		physical = bioc->stripes[stripe_index].physical;
+		dev = bioc->stripes[stripe_index].dev;
+		btrfs_put_bioc(bioc);
 	}
 
-	bitmap_set(bitmap, offset, sparity->nsectors - offset);
-	bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
-}
+skip:
+	for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
+		bool repaired = false;
 
-static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
-						   u64 start, u32 len)
-{
-	__scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len);
-}
+		if (stripe->sectors[sector_nr].is_metadata) {
+			nr_meta_sectors++;
+		} else {
+			nr_data_sectors++;
+			if (!stripe->sectors[sector_nr].csum)
+				nr_nodatacsum_sectors++;
+		}
 
-static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
-						  u64 start, u32 len)
-{
-	__scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len);
-}
+		if (test_bit(sector_nr, &stripe->init_error_bitmap) &&
+		    !test_bit(sector_nr, &stripe->error_bitmap)) {
+			nr_repaired_sectors++;
+			repaired = true;
+		}
 
-static void scrub_block_complete(struct scrub_block *sblock)
-{
-	int corrupted = 0;
+		/* Good sector from the beginning, nothing need to be done. */
+		if (!test_bit(sector_nr, &stripe->init_error_bitmap))
+			continue;
 
-	if (!sblock->no_io_error_seen) {
-		corrupted = 1;
-		scrub_handle_errored_block(sblock);
-	} else {
 		/*
-		 * if has checksum error, write via repair mechanism in
-		 * dev replace case, otherwise write here in dev replace
-		 * case.
+		 * Report error for the corrupted sectors.  If repaired, just
+		 * output the message of repaired message.
 		 */
-		corrupted = scrub_checksum(sblock);
-		if (!corrupted && sblock->sctx->is_dev_replace)
-			scrub_write_block_to_dev_replace(sblock);
-	}
+		if (repaired) {
+			if (dev) {
+				btrfs_err_rl_in_rcu(fs_info,
+			"fixed up error at logical %llu on dev %s physical %llu",
+					    stripe->logical, btrfs_dev_name(dev),
+					    physical);
+			} else {
+				btrfs_err_rl_in_rcu(fs_info,
+			"fixed up error at logical %llu on mirror %u",
+					    stripe->logical, stripe->mirror_num);
+			}
+			continue;
+		}
 
-	if (sblock->sparity && corrupted && !sblock->data_corrected) {
-		u64 start = sblock->logical;
-		u64 end = sblock->logical +
-			  sblock->sectors[sblock->sector_count - 1]->offset +
-			  sblock->sctx->fs_info->sectorsize;
+		/* The remaining are all for unrepaired. */
+		if (dev) {
+			btrfs_err_rl_in_rcu(fs_info,
+	"unable to fixup (regular) error at logical %llu on dev %s physical %llu",
+					    stripe->logical, btrfs_dev_name(dev),
+					    physical);
+		} else {
+			btrfs_err_rl_in_rcu(fs_info,
+	"unable to fixup (regular) error at logical %llu on mirror %u",
+					    stripe->logical, stripe->mirror_num);
+		}
 
-		ASSERT(end - start <= U32_MAX);
-		scrub_parity_mark_sectors_error(sblock->sparity,
-						start, end - start);
+		if (test_bit(sector_nr, &stripe->io_error_bitmap))
+			if (__ratelimit(&rs) && dev)
+				scrub_print_common_warning("i/o error", dev, false,
+						     stripe->logical, physical);
+		if (test_bit(sector_nr, &stripe->csum_error_bitmap))
+			if (__ratelimit(&rs) && dev)
+				scrub_print_common_warning("checksum error", dev, false,
+						     stripe->logical, physical);
+		if (test_bit(sector_nr, &stripe->meta_error_bitmap))
+			if (__ratelimit(&rs) && dev)
+				scrub_print_common_warning("header error", dev, false,
+						     stripe->logical, physical);
 	}
-}
 
-static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
-{
-	sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
-	list_del(&sum->list);
-	kfree(sum);
+	spin_lock(&sctx->stat_lock);
+	sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
+	sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
+	sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits;
+	sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits;
+	sctx->stat.no_csum += nr_nodatacsum_sectors;
+	sctx->stat.read_errors +=
+		bitmap_weight(&stripe->io_error_bitmap, stripe->nr_sectors);
+	sctx->stat.csum_errors +=
+		bitmap_weight(&stripe->csum_error_bitmap, stripe->nr_sectors);
+	sctx->stat.verify_errors +=
+		bitmap_weight(&stripe->meta_error_bitmap, stripe->nr_sectors);
+	sctx->stat.uncorrectable_errors +=
+		bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors);
+	sctx->stat.corrected_errors += nr_repaired_sectors;
+	spin_unlock(&sctx->stat_lock);
 }
 
 /*
- * Find the desired csum for range [logical, logical + sectorsize), and store
- * the csum into @csum.
+ * The main entrance for all read related scrub work, including:
  *
- * The search source is sctx->csum_list, which is a pre-populated list
- * storing bytenr ordered csum ranges.  We're responsible to cleanup any range
- * that is before @logical.
+ * - Wait for the initial read to finish
+ * - Verify and locate any bad sectors
+ * - Go through the remaining mirrors and try to read as large blocksize as
+ *   possible
+ * - Go through all mirrors (including the failed mirror) sector-by-sector
  *
- * Return 0 if there is no csum for the range.
- * Return 1 if there is csum for the range and copied to @csum.
+ * Writeback does not happen here, it needs extra synchronization.
  */
-static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
+static void scrub_stripe_read_repair_worker(struct work_struct *work)
 {
-	bool found = false;
-
-	while (!list_empty(&sctx->csum_list)) {
-		struct btrfs_ordered_sum *sum = NULL;
-		unsigned long index;
-		unsigned long num_sectors;
-
-		sum = list_first_entry(&sctx->csum_list,
-				       struct btrfs_ordered_sum, list);
-		/* The current csum range is beyond our range, no csum found */
-		if (sum->bytenr > logical)
-			break;
-
-		/*
-		 * The current sum is before our bytenr, since scrub is always
-		 * done in bytenr order, the csum will never be used anymore,
-		 * clean it up so that later calls won't bother with the range,
-		 * and continue search the next range.
-		 */
-		if (sum->bytenr + sum->len <= logical) {
-			drop_csum_range(sctx, sum);
-			continue;
-		}
-
-		/* Now the csum range covers our bytenr, copy the csum */
-		found = true;
-		index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
-		num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
+	struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work);
+	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+	int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
+					  stripe->bg->length);
+	int mirror;
+	int i;
 
-		memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
-		       sctx->fs_info->csum_size);
+	ASSERT(stripe->mirror_num > 0);
 
-		/* Cleanup the range if we're at the end of the csum range */
-		if (index == num_sectors - 1)
-			drop_csum_range(sctx, sum);
-		break;
-	}
-	if (!found)
-		return 0;
-	return 1;
-}
+	wait_scrub_stripe_io(stripe);
+	scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap);
+	/* Save the initial failed bitmap for later repair and report usage. */
+	stripe->init_error_bitmap = stripe->error_bitmap;
 
-/* scrub extent tries to collect up to 64 kB for each bio */
-static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
-			u64 logical, u32 len,
-			u64 physical, struct btrfs_device *dev, u64 flags,
-			u64 gen, int mirror_num)
-{
-	struct btrfs_device *src_dev = dev;
-	u64 src_physical = physical;
-	int src_mirror = mirror_num;
-	int ret;
-	u8 csum[BTRFS_CSUM_SIZE];
-	u32 blocksize;
+	if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors))
+		goto out;
 
-	if (flags & BTRFS_EXTENT_FLAG_DATA) {
-		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
-			blocksize = map->stripe_len;
-		else
-			blocksize = sctx->fs_info->sectorsize;
-		spin_lock(&sctx->stat_lock);
-		sctx->stat.data_extents_scrubbed++;
-		sctx->stat.data_bytes_scrubbed += len;
-		spin_unlock(&sctx->stat_lock);
-	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
-			blocksize = map->stripe_len;
-		else
-			blocksize = sctx->fs_info->nodesize;
-		spin_lock(&sctx->stat_lock);
-		sctx->stat.tree_extents_scrubbed++;
-		sctx->stat.tree_bytes_scrubbed += len;
-		spin_unlock(&sctx->stat_lock);
-	} else {
-		blocksize = sctx->fs_info->sectorsize;
-		WARN_ON(1);
+	/*
+	 * Try all remaining mirrors.
+	 *
+	 * Here we still try to read as large block as possible, as this is
+	 * faster and we have extra safety nets to rely on.
+	 */
+	for (mirror = calc_next_mirror(stripe->mirror_num, num_copies);
+	     mirror != stripe->mirror_num;
+	     mirror = calc_next_mirror(mirror, num_copies)) {
+		const unsigned long old_error_bitmap = stripe->error_bitmap;
+
+		scrub_stripe_submit_repair_read(stripe, mirror,
+						BTRFS_STRIPE_LEN, false);
+		wait_scrub_stripe_io(stripe);
+		scrub_verify_one_stripe(stripe, old_error_bitmap);
+		if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
+			goto out;
 	}
 
 	/*
-	 * For dev-replace case, we can have @dev being a missing device.
-	 * Regular scrub will avoid its execution on missing device at all,
-	 * as that would trigger tons of read error.
+	 * Last safety net, try re-checking all mirrors, including the failed
+	 * one, sector-by-sector.
 	 *
-	 * Reading from missing device will cause read error counts to
-	 * increase unnecessarily.
-	 * So here we change the read source to a good mirror.
+	 * As if one sector failed the drive's internal csum, the whole read
+	 * containing the offending sector would be marked as error.
+	 * Thus here we do sector-by-sector read.
+	 *
+	 * This can be slow, thus we only try it as the last resort.
 	 */
-	if (sctx->is_dev_replace && !dev->bdev)
-		scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
-				     &src_dev, &src_mirror);
-	while (len) {
-		u32 l = min(len, blocksize);
-		int have_csum = 0;
-
-		if (flags & BTRFS_EXTENT_FLAG_DATA) {
-			/* push csums to sbio */
-			have_csum = scrub_find_csum(sctx, logical, csum);
-			if (have_csum == 0)
-				++sctx->stat.no_csum;
-		}
-		ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
-				    flags, gen, src_mirror,
-				    have_csum ? csum : NULL, physical);
-		if (ret)
-			return ret;
-		len -= l;
-		logical += l;
-		physical += l;
-		src_physical += l;
+
+	for (i = 0, mirror = stripe->mirror_num;
+	     i < num_copies;
+	     i++, mirror = calc_next_mirror(mirror, num_copies)) {
+		const unsigned long old_error_bitmap = stripe->error_bitmap;
+
+		scrub_stripe_submit_repair_read(stripe, mirror,
+						fs_info->sectorsize, true);
+		wait_scrub_stripe_io(stripe);
+		scrub_verify_one_stripe(stripe, old_error_bitmap);
+		if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
+			goto out;
 	}
-	return 0;
+out:
+	scrub_stripe_report_errors(stripe->sctx, stripe);
+	set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state);
+	wake_up(&stripe->repair_wait);
 }
 
-static int scrub_sectors_for_parity(struct scrub_parity *sparity,
-				  u64 logical, u32 len,
-				  u64 physical, struct btrfs_device *dev,
-				  u64 flags, u64 gen, int mirror_num, u8 *csum)
+static void scrub_read_endio(struct btrfs_bio *bbio)
 {
-	struct scrub_ctx *sctx = sparity->sctx;
-	struct scrub_block *sblock;
-	const u32 sectorsize = sctx->fs_info->sectorsize;
-	int index;
-
-	ASSERT(IS_ALIGNED(len, sectorsize));
+	struct scrub_stripe *stripe = bbio->private;
 
-	sblock = alloc_scrub_block(sctx, dev, logical, physical, physical, mirror_num);
-	if (!sblock) {
-		spin_lock(&sctx->stat_lock);
-		sctx->stat.malloc_errors++;
-		spin_unlock(&sctx->stat_lock);
-		return -ENOMEM;
+	if (bbio->bio.bi_status) {
+		bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
+		bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors);
+	} else {
+		bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
+	}
+	bio_put(&bbio->bio);
+	if (atomic_dec_and_test(&stripe->pending_io)) {
+		wake_up(&stripe->io_wait);
+		INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
+		queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
 	}
+}
 
-	sblock->sparity = sparity;
-	scrub_parity_get(sparity);
+static void scrub_write_endio(struct btrfs_bio *bbio)
+{
+	struct scrub_stripe *stripe = bbio->private;
+	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+	struct bio_vec *bvec;
+	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
+	u32 bio_size = 0;
+	int i;
 
-	for (index = 0; len > 0; index++) {
-		struct scrub_sector *sector;
+	bio_for_each_bvec_all(bvec, &bbio->bio, i)
+		bio_size += bvec->bv_len;
 
-		sector = alloc_scrub_sector(sblock, logical);
-		if (!sector) {
-			spin_lock(&sctx->stat_lock);
-			sctx->stat.malloc_errors++;
-			spin_unlock(&sctx->stat_lock);
-			scrub_block_put(sblock);
-			return -ENOMEM;
-		}
-		sblock->sectors[index] = sector;
-		/* For scrub parity */
-		scrub_sector_get(sector);
-		list_add_tail(&sector->list, &sparity->sectors_list);
-		sector->flags = flags;
-		sector->generation = gen;
-		if (csum) {
-			sector->have_csum = 1;
-			memcpy(sector->csum, csum, sctx->fs_info->csum_size);
-		} else {
-			sector->have_csum = 0;
-		}
+	if (bbio->bio.bi_status) {
+		unsigned long flags;
 
-		/* Iterate over the stripe range in sectorsize steps */
-		len -= sectorsize;
-		logical += sectorsize;
-		physical += sectorsize;
+		spin_lock_irqsave(&stripe->write_error_lock, flags);
+		bitmap_set(&stripe->write_error_bitmap, sector_nr,
+			   bio_size >> fs_info->sectorsize_bits);
+		spin_unlock_irqrestore(&stripe->write_error_lock, flags);
 	}
+	bio_put(&bbio->bio);
+
+	if (atomic_dec_and_test(&stripe->pending_io))
+		wake_up(&stripe->io_wait);
+}
+
+/*
+ * Submit the write bio(s) for the sectors specified by @write_bitmap.
+ *
+ * Here we utilize btrfs_submit_repair_write(), which has some extra benefits:
+ *
+ * - Only needs logical bytenr and mirror_num
+ *   Just like the scrub read path
+ *
+ * - Would only result in writes to the specified mirror
+ *   Unlike the regular writeback path, which would write back to all stripes
+ *
+ * - Handle dev-replace and read-repair writeback differently
+ */
+static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
+				unsigned long write_bitmap, bool dev_replace)
+{
+	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+	struct btrfs_bio *bbio = NULL;
+	const bool zoned = btrfs_is_zoned(fs_info);
+	int sector_nr;
 
-	WARN_ON(sblock->sector_count == 0);
-	for (index = 0; index < sblock->sector_count; index++) {
-		struct scrub_sector *sector = sblock->sectors[index];
+	for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) {
+		struct page *page = scrub_stripe_get_page(stripe, sector_nr);
+		unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
 		int ret;
 
-		ret = scrub_add_sector_to_rd_bio(sctx, sector);
-		if (ret) {
-			scrub_block_put(sblock);
-			return ret;
+		/* We should only writeback sectors covered by an extent. */
+		ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap));
+
+		/* Cannot merge with previous sector, submit the current one. */
+		if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) {
+			fill_writer_pointer_gap(sctx, stripe->physical +
+					(sector_nr << fs_info->sectorsize_bits));
+			atomic_inc(&stripe->pending_io);
+			btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
+			/* For zoned writeback, queue depth must be 1. */
+			if (zoned)
+				wait_scrub_stripe_io(stripe);
+			bbio = NULL;
 		}
+		if (!bbio) {
+			bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_WRITE,
+					       fs_info, scrub_write_endio, stripe);
+			bbio->bio.bi_iter.bi_sector = (stripe->logical +
+				(sector_nr << fs_info->sectorsize_bits)) >>
+				SECTOR_SHIFT;
+		}
+		ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
+		ASSERT(ret == fs_info->sectorsize);
+	}
+	if (bbio) {
+		fill_writer_pointer_gap(sctx, bbio->bio.bi_iter.bi_sector <<
+					SECTOR_SHIFT);
+		atomic_inc(&stripe->pending_io);
+		btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
+		if (zoned)
+			wait_scrub_stripe_io(stripe);
 	}
-
-	/* Last one frees, either here or in bio completion for last sector */
-	scrub_block_put(sblock);
-	return 0;
 }
 
-static int scrub_extent_for_parity(struct scrub_parity *sparity,
-				   u64 logical, u32 len,
-				   u64 physical, struct btrfs_device *dev,
-				   u64 flags, u64 gen, int mirror_num)
+/*
+ * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
+ * second.  Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
+ */
+static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device,
+				  unsigned int bio_size)
 {
-	struct scrub_ctx *sctx = sparity->sctx;
-	int ret;
-	u8 csum[BTRFS_CSUM_SIZE];
-	u32 blocksize;
+	const int time_slice = 1000;
+	s64 delta;
+	ktime_t now;
+	u32 div;
+	u64 bwlimit;
 
-	if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
-		scrub_parity_mark_sectors_error(sparity, logical, len);
-		return 0;
+	bwlimit = READ_ONCE(device->scrub_speed_max);
+	if (bwlimit == 0)
+		return;
+
+	/*
+	 * Slice is divided into intervals when the IO is submitted, adjust by
+	 * bwlimit and maximum of 64 intervals.
+	 */
+	div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
+	div = min_t(u32, 64, div);
+
+	/* Start new epoch, set deadline */
+	now = ktime_get();
+	if (sctx->throttle_deadline == 0) {
+		sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
+		sctx->throttle_sent = 0;
 	}
 
-	if (flags & BTRFS_EXTENT_FLAG_DATA) {
-		blocksize = sparity->stripe_len;
-	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-		blocksize = sparity->stripe_len;
+	/* Still in the time to send? */
+	if (ktime_before(now, sctx->throttle_deadline)) {
+		/* If current bio is within the limit, send it */
+		sctx->throttle_sent += bio_size;
+		if (sctx->throttle_sent <= div_u64(bwlimit, div))
+			return;
+
+		/* We're over the limit, sleep until the rest of the slice */
+		delta = ktime_ms_delta(sctx->throttle_deadline, now);
 	} else {
-		blocksize = sctx->fs_info->sectorsize;
-		WARN_ON(1);
+		/* New request after deadline, start new epoch */
+		delta = 0;
 	}
 
-	while (len) {
-		u32 l = min(len, blocksize);
-		int have_csum = 0;
+	if (delta) {
+		long timeout;
 
-		if (flags & BTRFS_EXTENT_FLAG_DATA) {
-			/* push csums to sbio */
-			have_csum = scrub_find_csum(sctx, logical, csum);
-			if (have_csum == 0)
-				goto skip;
-		}
-		ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
-					     flags, gen, mirror_num,
-					     have_csum ? csum : NULL);
-		if (ret)
-			return ret;
-skip:
-		len -= l;
-		logical += l;
-		physical += l;
+		timeout = div_u64(delta * HZ, 1000);
+		schedule_timeout_interruptible(timeout);
 	}
-	return 0;
+
+	/* Next call will start the deadline period */
+	sctx->throttle_deadline = 0;
 }
 
 /*
@@ -2908,10 +1266,7 @@ static int get_raid56_logic_offset(u64 physical, int num,
 {
 	int i;
 	int j = 0;
-	u64 stripe_nr;
 	u64 last_offset;
-	u32 stripe_index;
-	u32 rot;
 	const int data_stripes = nr_data_stripes(map);
 
 	last_offset = (physical - map->stripes[num].physical) * data_stripes;
@@ -2920,13 +1275,17 @@ static int get_raid56_logic_offset(u64 physical, int num,
 
 	*offset = last_offset;
 	for (i = 0; i < data_stripes; i++) {
-		*offset = last_offset + i * map->stripe_len;
+		u32 stripe_nr;
+		u32 stripe_index;
+		u32 rot;
 
-		stripe_nr = div64_u64(*offset, map->stripe_len);
-		stripe_nr = div_u64(stripe_nr, data_stripes);
+		*offset = last_offset + (i << BTRFS_STRIPE_LEN_SHIFT);
+
+		stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes;
 
 		/* Work out the disk rotation on this stripe-set */
-		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
+		rot = stripe_nr % map->num_stripes;
+		stripe_nr /= map->num_stripes;
 		/* calculate which stripe this data locates */
 		rot += i;
 		stripe_index = rot % map->num_stripes;
@@ -2935,123 +1294,10 @@ static int get_raid56_logic_offset(u64 physical, int num,
 		if (stripe_index < num)
 			j++;
 	}
-	*offset = last_offset + j * map->stripe_len;
+	*offset = last_offset + (j << BTRFS_STRIPE_LEN_SHIFT);
 	return 1;
 }
 
-static void scrub_free_parity(struct scrub_parity *sparity)
-{
-	struct scrub_ctx *sctx = sparity->sctx;
-	struct scrub_sector *curr, *next;
-	int nbits;
-
-	nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors);
-	if (nbits) {
-		spin_lock(&sctx->stat_lock);
-		sctx->stat.read_errors += nbits;
-		sctx->stat.uncorrectable_errors += nbits;
-		spin_unlock(&sctx->stat_lock);
-	}
-
-	list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
-		list_del_init(&curr->list);
-		scrub_sector_put(curr);
-	}
-
-	kfree(sparity);
-}
-
-static void scrub_parity_bio_endio_worker(struct work_struct *work)
-{
-	struct scrub_parity *sparity = container_of(work, struct scrub_parity,
-						    work);
-	struct scrub_ctx *sctx = sparity->sctx;
-
-	btrfs_bio_counter_dec(sctx->fs_info);
-	scrub_free_parity(sparity);
-	scrub_pending_bio_dec(sctx);
-}
-
-static void scrub_parity_bio_endio(struct bio *bio)
-{
-	struct scrub_parity *sparity = bio->bi_private;
-	struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
-
-	if (bio->bi_status)
-		bitmap_or(&sparity->ebitmap, &sparity->ebitmap,
-			  &sparity->dbitmap, sparity->nsectors);
-
-	bio_put(bio);
-
-	INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
-	queue_work(fs_info->scrub_parity_workers, &sparity->work);
-}
-
-static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
-{
-	struct scrub_ctx *sctx = sparity->sctx;
-	struct btrfs_fs_info *fs_info = sctx->fs_info;
-	struct bio *bio;
-	struct btrfs_raid_bio *rbio;
-	struct btrfs_io_context *bioc = NULL;
-	u64 length;
-	int ret;
-
-	if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap,
-			   &sparity->ebitmap, sparity->nsectors))
-		goto out;
-
-	length = sparity->logic_end - sparity->logic_start;
-
-	btrfs_bio_counter_inc_blocked(fs_info);
-	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
-			       &length, &bioc);
-	if (ret || !bioc || !bioc->raid_map)
-		goto bioc_out;
-
-	bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
-	bio->bi_iter.bi_sector = sparity->logic_start >> 9;
-	bio->bi_private = sparity;
-	bio->bi_end_io = scrub_parity_bio_endio;
-
-	rbio = raid56_parity_alloc_scrub_rbio(bio, bioc,
-					      sparity->scrub_dev,
-					      &sparity->dbitmap,
-					      sparity->nsectors);
-	btrfs_put_bioc(bioc);
-	if (!rbio)
-		goto rbio_out;
-
-	scrub_pending_bio_inc(sctx);
-	raid56_parity_submit_scrub_rbio(rbio);
-	return;
-
-rbio_out:
-	bio_put(bio);
-bioc_out:
-	btrfs_bio_counter_dec(fs_info);
-	bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
-		  sparity->nsectors);
-	spin_lock(&sctx->stat_lock);
-	sctx->stat.malloc_errors++;
-	spin_unlock(&sctx->stat_lock);
-out:
-	scrub_free_parity(sparity);
-}
-
-static void scrub_parity_get(struct scrub_parity *sparity)
-{
-	refcount_inc(&sparity->refs);
-}
-
-static void scrub_parity_put(struct scrub_parity *sparity)
-{
-	if (!refcount_dec_and_test(&sparity->refs))
-		return;
-
-	scrub_parity_check_and_repair(sparity);
-}
-
 /*
  * Return 0 if the extent item range covers any byte of the range.
  * Return <0 if the extent item is before @search_start.
@@ -3178,226 +1424,533 @@ static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
 	*generation_ret = btrfs_extent_generation(path->nodes[0], ei);
 }
 
-static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
-				      u64 boundary_start, u64 boudary_len)
+static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
+					u64 physical, u64 physical_end)
+{
+	struct btrfs_fs_info *fs_info = sctx->fs_info;
+	int ret = 0;
+
+	if (!btrfs_is_zoned(fs_info))
+		return 0;
+
+	mutex_lock(&sctx->wr_lock);
+	if (sctx->write_pointer < physical_end) {
+		ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
+						    physical,
+						    sctx->write_pointer);
+		if (ret)
+			btrfs_err(fs_info,
+				  "zoned: failed to recover write pointer");
+	}
+	mutex_unlock(&sctx->wr_lock);
+	btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
+
+	return ret;
+}
+
+static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
+				 struct scrub_stripe *stripe,
+				 u64 extent_start, u64 extent_len,
+				 u64 extent_flags, u64 extent_gen)
+{
+	for (u64 cur_logical = max(stripe->logical, extent_start);
+	     cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN,
+			       extent_start + extent_len);
+	     cur_logical += fs_info->sectorsize) {
+		const int nr_sector = (cur_logical - stripe->logical) >>
+				      fs_info->sectorsize_bits;
+		struct scrub_sector_verification *sector =
+						&stripe->sectors[nr_sector];
+
+		set_bit(nr_sector, &stripe->extent_sector_bitmap);
+		if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+			sector->is_metadata = true;
+			sector->generation = extent_gen;
+		}
+	}
+}
+
+static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
+{
+	stripe->extent_sector_bitmap = 0;
+	stripe->init_error_bitmap = 0;
+	stripe->error_bitmap = 0;
+	stripe->io_error_bitmap = 0;
+	stripe->csum_error_bitmap = 0;
+	stripe->meta_error_bitmap = 0;
+}
+
+/*
+ * Locate one stripe which has at least one extent in its range.
+ *
+ * Return 0 if found such stripe, and store its info into @stripe.
+ * Return >0 if there is no such stripe in the specified range.
+ * Return <0 for error.
+ */
+static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
+					struct btrfs_device *dev, u64 physical,
+					int mirror_num, u64 logical_start,
+					u32 logical_len,
+					struct scrub_stripe *stripe)
+{
+	struct btrfs_fs_info *fs_info = bg->fs_info;
+	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
+	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
+	const u64 logical_end = logical_start + logical_len;
+	struct btrfs_path path = { 0 };
+	u64 cur_logical = logical_start;
+	u64 stripe_end;
+	u64 extent_start;
+	u64 extent_len;
+	u64 extent_flags;
+	u64 extent_gen;
+	int ret;
+
+	memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
+				   stripe->nr_sectors);
+	scrub_stripe_reset_bitmaps(stripe);
+
+	/* The range must be inside the bg. */
+	ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
+
+	path.search_commit_root = 1;
+	path.skip_locking = 1;
+
+	ret = find_first_extent_item(extent_root, &path, logical_start, logical_len);
+	/* Either error or not found. */
+	if (ret)
+		goto out;
+	get_extent_info(&path, &extent_start, &extent_len, &extent_flags, &extent_gen);
+	if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+		stripe->nr_meta_extents++;
+	if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
+		stripe->nr_data_extents++;
+	cur_logical = max(extent_start, cur_logical);
+
+	/*
+	 * Round down to stripe boundary.
+	 *
+	 * The extra calculation against bg->start is to handle block groups
+	 * whose logical bytenr is not BTRFS_STRIPE_LEN aligned.
+	 */
+	stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) +
+			  bg->start;
+	stripe->physical = physical + stripe->logical - logical_start;
+	stripe->dev = dev;
+	stripe->bg = bg;
+	stripe->mirror_num = mirror_num;
+	stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1;
+
+	/* Fill the first extent info into stripe->sectors[] array. */
+	fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
+			     extent_flags, extent_gen);
+	cur_logical = extent_start + extent_len;
+
+	/* Fill the extent info for the remaining sectors. */
+	while (cur_logical <= stripe_end) {
+		ret = find_first_extent_item(extent_root, &path, cur_logical,
+					     stripe_end - cur_logical + 1);
+		if (ret < 0)
+			goto out;
+		if (ret > 0) {
+			ret = 0;
+			break;
+		}
+		get_extent_info(&path, &extent_start, &extent_len,
+				&extent_flags, &extent_gen);
+		if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+			stripe->nr_meta_extents++;
+		if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
+			stripe->nr_data_extents++;
+		fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
+				     extent_flags, extent_gen);
+		cur_logical = extent_start + extent_len;
+	}
+
+	/* Now fill the data csum. */
+	if (bg->flags & BTRFS_BLOCK_GROUP_DATA) {
+		int sector_nr;
+		unsigned long csum_bitmap = 0;
+
+		/* Csum space should have already been allocated. */
+		ASSERT(stripe->csums);
+
+		/*
+		 * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN
+		 * should contain at most 16 sectors.
+		 */
+		ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
+
+		ret = btrfs_lookup_csums_bitmap(csum_root, stripe->logical,
+						stripe_end, stripe->csums,
+						&csum_bitmap, true);
+		if (ret < 0)
+			goto out;
+		if (ret > 0)
+			ret = 0;
+
+		for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) {
+			stripe->sectors[sector_nr].csum = stripe->csums +
+				sector_nr * fs_info->csum_size;
+		}
+	}
+	set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
+out:
+	btrfs_release_path(&path);
+	return ret;
+}
+
+static void scrub_reset_stripe(struct scrub_stripe *stripe)
+{
+	scrub_stripe_reset_bitmaps(stripe);
+
+	stripe->nr_meta_extents = 0;
+	stripe->nr_data_extents = 0;
+	stripe->state = 0;
+
+	for (int i = 0; i < stripe->nr_sectors; i++) {
+		stripe->sectors[i].is_metadata = false;
+		stripe->sectors[i].csum = NULL;
+		stripe->sectors[i].generation = 0;
+	}
+}
+
+static void scrub_submit_initial_read(struct scrub_ctx *sctx,
+				      struct scrub_stripe *stripe)
+{
+	struct btrfs_fs_info *fs_info = sctx->fs_info;
+	struct btrfs_bio *bbio;
+	int mirror = stripe->mirror_num;
+
+	ASSERT(stripe->bg);
+	ASSERT(stripe->mirror_num > 0);
+	ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
+
+	bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
+			       scrub_read_endio, stripe);
+
+	/* Read the whole stripe. */
+	bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
+	for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) {
+		int ret;
+
+		ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0);
+		/* We should have allocated enough bio vectors. */
+		ASSERT(ret == PAGE_SIZE);
+	}
+	atomic_inc(&stripe->pending_io);
+
+	/*
+	 * For dev-replace, either user asks to avoid the source dev, or
+	 * the device is missing, we try the next mirror instead.
+	 */
+	if (sctx->is_dev_replace &&
+	    (fs_info->dev_replace.cont_reading_from_srcdev_mode ==
+	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID ||
+	     !stripe->dev->bdev)) {
+		int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
+						  stripe->bg->length);
+
+		mirror = calc_next_mirror(mirror, num_copies);
+	}
+	btrfs_submit_bio(bbio, mirror);
+}
+
+static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
 {
-	return (extent_start < boundary_start &&
-		extent_start + extent_len > boundary_start) ||
-	       (extent_start < boundary_start + boudary_len &&
-		extent_start + extent_len > boundary_start + boudary_len);
+	int i;
+
+	for_each_set_bit(i, &stripe->error_bitmap, stripe->nr_sectors) {
+		if (stripe->sectors[i].is_metadata) {
+			struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+
+			btrfs_err(fs_info,
+			"stripe %llu has unrepaired metadata sector at %llu",
+				  stripe->logical,
+				  stripe->logical + (i << fs_info->sectorsize_bits));
+			return true;
+		}
+	}
+	return false;
 }
 
-static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
-					       struct scrub_parity *sparity,
-					       struct map_lookup *map,
-					       struct btrfs_device *sdev,
-					       struct btrfs_path *path,
-					       u64 logical)
+static int flush_scrub_stripes(struct scrub_ctx *sctx)
 {
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
-	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
-	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
-	u64 cur_logical = logical;
-	int ret;
+	struct scrub_stripe *stripe;
+	const int nr_stripes = sctx->cur_stripe;
+	int ret = 0;
 
-	ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+	if (!nr_stripes)
+		return 0;
 
-	/* Path must not be populated */
-	ASSERT(!path->nodes[0]);
+	ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state));
 
-	while (cur_logical < logical + map->stripe_len) {
-		struct btrfs_io_context *bioc = NULL;
-		struct btrfs_device *extent_dev;
-		u64 extent_start;
-		u64 extent_size;
-		u64 mapped_length;
-		u64 extent_flags;
-		u64 extent_gen;
-		u64 extent_physical;
-		u64 extent_mirror_num;
-
-		ret = find_first_extent_item(extent_root, path, cur_logical,
-					     logical + map->stripe_len - cur_logical);
-		/* No more extent item in this data stripe */
-		if (ret > 0) {
-			ret = 0;
-			break;
-		}
-		if (ret < 0)
-			break;
-		get_extent_info(path, &extent_start, &extent_size, &extent_flags,
-				&extent_gen);
+	scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
+			      nr_stripes << BTRFS_STRIPE_LEN_SHIFT);
+	for (int i = 0; i < nr_stripes; i++) {
+		stripe = &sctx->stripes[i];
+		scrub_submit_initial_read(sctx, stripe);
+	}
 
-		/* Metadata should not cross stripe boundaries */
-		if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
-		    does_range_cross_boundary(extent_start, extent_size,
-					      logical, map->stripe_len)) {
-			btrfs_err(fs_info,
-	"scrub: tree block %llu spanning stripes, ignored. logical=%llu",
-				  extent_start, logical);
-			spin_lock(&sctx->stat_lock);
-			sctx->stat.uncorrectable_errors++;
-			spin_unlock(&sctx->stat_lock);
-			cur_logical += extent_size;
-			continue;
-		}
+	for (int i = 0; i < nr_stripes; i++) {
+		stripe = &sctx->stripes[i];
+
+		wait_event(stripe->repair_wait,
+			   test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
+	}
 
-		/* Skip hole range which doesn't have any extent */
-		cur_logical = max(extent_start, cur_logical);
+	/*
+	 * Submit the repaired sectors.  For zoned case, we cannot do repair
+	 * in-place, but queue the bg to be relocated.
+	 */
+	if (btrfs_is_zoned(fs_info)) {
+		for (int i = 0; i < nr_stripes; i++) {
+			stripe = &sctx->stripes[i];
 
-		/* Truncate the range inside this data stripe */
-		extent_size = min(extent_start + extent_size,
-				  logical + map->stripe_len) - cur_logical;
-		extent_start = cur_logical;
-		ASSERT(extent_size <= U32_MAX);
+			if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) {
+				btrfs_repair_one_zone(fs_info,
+						      sctx->stripes[0].bg->start);
+				break;
+			}
+		}
+	} else {
+		for (int i = 0; i < nr_stripes; i++) {
+			unsigned long repaired;
 
-		scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
+			stripe = &sctx->stripes[i];
 
-		mapped_length = extent_size;
-		ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
-				      &mapped_length, &bioc, 0);
-		if (!ret && (!bioc || mapped_length < extent_size))
-			ret = -EIO;
-		if (ret) {
-			btrfs_put_bioc(bioc);
-			scrub_parity_mark_sectors_error(sparity, extent_start,
-							extent_size);
-			break;
+			bitmap_andnot(&repaired, &stripe->init_error_bitmap,
+				      &stripe->error_bitmap, stripe->nr_sectors);
+			scrub_write_sectors(sctx, stripe, repaired, false);
 		}
-		extent_physical = bioc->stripes[0].physical;
-		extent_mirror_num = bioc->mirror_num;
-		extent_dev = bioc->stripes[0].dev;
-		btrfs_put_bioc(bioc);
+	}
 
-		ret = btrfs_lookup_csums_list(csum_root, extent_start,
-					      extent_start + extent_size - 1,
-					      &sctx->csum_list, 1, false);
-		if (ret) {
-			scrub_parity_mark_sectors_error(sparity, extent_start,
-							extent_size);
-			break;
+	/* Submit for dev-replace. */
+	if (sctx->is_dev_replace) {
+		/*
+		 * For dev-replace, if we know there is something wrong with
+		 * metadata, we should immedately abort.
+		 */
+		for (int i = 0; i < nr_stripes; i++) {
+			if (stripe_has_metadata_error(&sctx->stripes[i])) {
+				ret = -EIO;
+				goto out;
+			}
 		}
+		for (int i = 0; i < nr_stripes; i++) {
+			unsigned long good;
 
-		ret = scrub_extent_for_parity(sparity, extent_start,
-					      extent_size, extent_physical,
-					      extent_dev, extent_flags,
-					      extent_gen, extent_mirror_num);
-		scrub_free_csums(sctx);
+			stripe = &sctx->stripes[i];
 
-		if (ret) {
-			scrub_parity_mark_sectors_error(sparity, extent_start,
-							extent_size);
-			break;
+			ASSERT(stripe->dev == fs_info->dev_replace.srcdev);
+
+			bitmap_andnot(&good, &stripe->extent_sector_bitmap,
+				      &stripe->error_bitmap, stripe->nr_sectors);
+			scrub_write_sectors(sctx, stripe, good, true);
 		}
+	}
 
-		cond_resched();
-		cur_logical += extent_size;
+	/* Wait for the above writebacks to finish. */
+	for (int i = 0; i < nr_stripes; i++) {
+		stripe = &sctx->stripes[i];
+
+		wait_scrub_stripe_io(stripe);
+		scrub_reset_stripe(stripe);
 	}
-	btrfs_release_path(path);
+out:
+	sctx->cur_stripe = 0;
 	return ret;
 }
 
-static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
-						  struct map_lookup *map,
-						  struct btrfs_device *sdev,
-						  u64 logic_start,
-						  u64 logic_end)
+static void raid56_scrub_wait_endio(struct bio *bio)
 {
-	struct btrfs_fs_info *fs_info = sctx->fs_info;
-	struct btrfs_path *path;
-	u64 cur_logical;
+	complete(bio->bi_private);
+}
+
+static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg,
+			      struct btrfs_device *dev, int mirror_num,
+			      u64 logical, u32 length, u64 physical)
+{
+	struct scrub_stripe *stripe;
 	int ret;
-	struct scrub_parity *sparity;
-	int nsectors;
 
-	path = btrfs_alloc_path();
-	if (!path) {
-		spin_lock(&sctx->stat_lock);
-		sctx->stat.malloc_errors++;
-		spin_unlock(&sctx->stat_lock);
-		return -ENOMEM;
+	/* No available slot, submit all stripes and wait for them. */
+	if (sctx->cur_stripe >= SCRUB_STRIPES_PER_SCTX) {
+		ret = flush_scrub_stripes(sctx);
+		if (ret < 0)
+			return ret;
 	}
-	path->search_commit_root = 1;
-	path->skip_locking = 1;
 
-	ASSERT(map->stripe_len <= U32_MAX);
-	nsectors = map->stripe_len >> fs_info->sectorsize_bits;
-	ASSERT(nsectors <= BITS_PER_LONG);
-	sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
-	if (!sparity) {
-		spin_lock(&sctx->stat_lock);
-		sctx->stat.malloc_errors++;
-		spin_unlock(&sctx->stat_lock);
-		btrfs_free_path(path);
-		return -ENOMEM;
-	}
+	stripe = &sctx->stripes[sctx->cur_stripe];
+
+	/* We can queue one stripe using the remaining slot. */
+	scrub_reset_stripe(stripe);
+	ret = scrub_find_fill_first_stripe(bg, dev, physical, mirror_num,
+					   logical, length, stripe);
+	/* Either >0 as no more extents or <0 for error. */
+	if (ret)
+		return ret;
+	sctx->cur_stripe++;
+	return 0;
+}
 
-	ASSERT(map->stripe_len <= U32_MAX);
-	sparity->stripe_len = map->stripe_len;
-	sparity->nsectors = nsectors;
-	sparity->sctx = sctx;
-	sparity->scrub_dev = sdev;
-	sparity->logic_start = logic_start;
-	sparity->logic_end = logic_end;
-	refcount_set(&sparity->refs, 1);
-	INIT_LIST_HEAD(&sparity->sectors_list);
+static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
+				      struct btrfs_device *scrub_dev,
+				      struct btrfs_block_group *bg,
+				      struct map_lookup *map,
+				      u64 full_stripe_start)
+{
+	DECLARE_COMPLETION_ONSTACK(io_done);
+	struct btrfs_fs_info *fs_info = sctx->fs_info;
+	struct btrfs_raid_bio *rbio;
+	struct btrfs_io_context *bioc = NULL;
+	struct bio *bio;
+	struct scrub_stripe *stripe;
+	bool all_empty = true;
+	const int data_stripes = nr_data_stripes(map);
+	unsigned long extent_bitmap = 0;
+	u64 length = data_stripes << BTRFS_STRIPE_LEN_SHIFT;
+	int ret;
 
-	ret = 0;
-	for (cur_logical = logic_start; cur_logical < logic_end;
-	     cur_logical += map->stripe_len) {
-		ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
-							  sdev, path, cur_logical);
+	ASSERT(sctx->raid56_data_stripes);
+
+	for (int i = 0; i < data_stripes; i++) {
+		int stripe_index;
+		int rot;
+		u64 physical;
+
+		stripe = &sctx->raid56_data_stripes[i];
+		rot = div_u64(full_stripe_start - bg->start,
+			      data_stripes) >> BTRFS_STRIPE_LEN_SHIFT;
+		stripe_index = (i + rot) % map->num_stripes;
+		physical = map->stripes[stripe_index].physical +
+			   (rot << BTRFS_STRIPE_LEN_SHIFT);
+
+		scrub_reset_stripe(stripe);
+		set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state);
+		ret = scrub_find_fill_first_stripe(bg,
+				map->stripes[stripe_index].dev, physical, 1,
+				full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT),
+				BTRFS_STRIPE_LEN, stripe);
 		if (ret < 0)
+			goto out;
+		/*
+		 * No extent in this data stripe, need to manually mark them
+		 * initialized to make later read submission happy.
+		 */
+		if (ret > 0) {
+			stripe->logical = full_stripe_start +
+					  (i << BTRFS_STRIPE_LEN_SHIFT);
+			stripe->dev = map->stripes[stripe_index].dev;
+			stripe->mirror_num = 1;
+			set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
+		}
+	}
+
+	/* Check if all data stripes are empty. */
+	for (int i = 0; i < data_stripes; i++) {
+		stripe = &sctx->raid56_data_stripes[i];
+		if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) {
+			all_empty = false;
 			break;
+		}
+	}
+	if (all_empty) {
+		ret = 0;
+		goto out;
 	}
 
-	scrub_parity_put(sparity);
-	scrub_submit(sctx);
-	mutex_lock(&sctx->wr_lock);
-	scrub_wr_submit(sctx);
-	mutex_unlock(&sctx->wr_lock);
+	for (int i = 0; i < data_stripes; i++) {
+		stripe = &sctx->raid56_data_stripes[i];
+		scrub_submit_initial_read(sctx, stripe);
+	}
+	for (int i = 0; i < data_stripes; i++) {
+		stripe = &sctx->raid56_data_stripes[i];
 
-	btrfs_free_path(path);
-	return ret < 0 ? ret : 0;
-}
+		wait_event(stripe->repair_wait,
+			   test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
+	}
+	/* For now, no zoned support for RAID56. */
+	ASSERT(!btrfs_is_zoned(sctx->fs_info));
 
-static void sync_replace_for_zoned(struct scrub_ctx *sctx)
-{
-	if (!btrfs_is_zoned(sctx->fs_info))
-		return;
+	/* Writeback for the repaired sectors. */
+	for (int i = 0; i < data_stripes; i++) {
+		unsigned long repaired;
 
-	sctx->flush_all_writes = true;
-	scrub_submit(sctx);
-	mutex_lock(&sctx->wr_lock);
-	scrub_wr_submit(sctx);
-	mutex_unlock(&sctx->wr_lock);
+		stripe = &sctx->raid56_data_stripes[i];
 
-	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
-}
+		bitmap_andnot(&repaired, &stripe->init_error_bitmap,
+			      &stripe->error_bitmap, stripe->nr_sectors);
+		scrub_write_sectors(sctx, stripe, repaired, false);
+	}
 
-static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
-					u64 physical, u64 physical_end)
-{
-	struct btrfs_fs_info *fs_info = sctx->fs_info;
-	int ret = 0;
+	/* Wait for the above writebacks to finish. */
+	for (int i = 0; i < data_stripes; i++) {
+		stripe = &sctx->raid56_data_stripes[i];
 
-	if (!btrfs_is_zoned(fs_info))
-		return 0;
+		wait_scrub_stripe_io(stripe);
+	}
 
-	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
+	/*
+	 * Now all data stripes are properly verified. Check if we have any
+	 * unrepaired, if so abort immediately or we could further corrupt the
+	 * P/Q stripes.
+	 *
+	 * During the loop, also populate extent_bitmap.
+	 */
+	for (int i = 0; i < data_stripes; i++) {
+		unsigned long error;
 
-	mutex_lock(&sctx->wr_lock);
-	if (sctx->write_pointer < physical_end) {
-		ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
-						    physical,
-						    sctx->write_pointer);
-		if (ret)
+		stripe = &sctx->raid56_data_stripes[i];
+
+		/*
+		 * We should only check the errors where there is an extent.
+		 * As we may hit an empty data stripe while it's missing.
+		 */
+		bitmap_and(&error, &stripe->error_bitmap,
+			   &stripe->extent_sector_bitmap, stripe->nr_sectors);
+		if (!bitmap_empty(&error, stripe->nr_sectors)) {
 			btrfs_err(fs_info,
-				  "zoned: failed to recover write pointer");
+"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
+				  full_stripe_start, i, stripe->nr_sectors,
+				  &error);
+			ret = -EIO;
+			goto out;
+		}
+		bitmap_or(&extent_bitmap, &extent_bitmap,
+			  &stripe->extent_sector_bitmap, stripe->nr_sectors);
 	}
-	mutex_unlock(&sctx->wr_lock);
-	btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
 
+	/* Now we can check and regenerate the P/Q stripe. */
+	bio = bio_alloc(NULL, 1, REQ_OP_READ, GFP_NOFS);
+	bio->bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT;
+	bio->bi_private = &io_done;
+	bio->bi_end_io = raid56_scrub_wait_endio;
+
+	btrfs_bio_counter_inc_blocked(fs_info);
+	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
+			       &length, &bioc);
+	if (ret < 0) {
+		btrfs_put_bioc(bioc);
+		btrfs_bio_counter_dec(fs_info);
+		goto out;
+	}
+	rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev, &extent_bitmap,
+				BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
+	btrfs_put_bioc(bioc);
+	if (!rbio) {
+		ret = -ENOMEM;
+		btrfs_bio_counter_dec(fs_info);
+		goto out;
+	}
+	raid56_parity_submit_scrub_rbio(rbio);
+	wait_for_completion_io(&io_done);
+	ret = blk_status_to_errno(bio->bi_status);
+	bio_put(bio);
+	btrfs_bio_counter_dec(fs_info);
+
+out:
 	return ret;
 }
 
@@ -3410,8 +1963,6 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
  * and @logical_length parameter.
  */
 static int scrub_simple_mirror(struct scrub_ctx *sctx,
-			       struct btrfs_root *extent_root,
-			       struct btrfs_root *csum_root,
 			       struct btrfs_block_group *bg,
 			       struct map_lookup *map,
 			       u64 logical_start, u64 logical_length,
@@ -3421,7 +1972,6 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
 	const u64 logical_end = logical_start + logical_length;
 	/* An artificial limit, inherit from old scrub behavior */
-	const u32 max_length = SZ_64K;
 	struct btrfs_path path = { 0 };
 	u64 cur_logical = logical_start;
 	int ret;
@@ -3433,11 +1983,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
 	path.skip_locking = 1;
 	/* Go through each extent items inside the logical range */
 	while (cur_logical < logical_end) {
-		u64 extent_start;
-		u64 extent_len;
-		u64 extent_flags;
-		u64 extent_gen;
-		u64 scrub_len;
+		u64 cur_physical = physical + cur_logical - logical_start;
 
 		/* Canceled? */
 		if (atomic_read(&fs_info->scrub_cancel_req) ||
@@ -3448,14 +1994,6 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
 		/* Paused? */
 		if (atomic_read(&fs_info->scrub_pause_req)) {
 			/* Push queued extents */
-			sctx->flush_all_writes = true;
-			scrub_submit(sctx);
-			mutex_lock(&sctx->wr_lock);
-			scrub_wr_submit(sctx);
-			mutex_unlock(&sctx->wr_lock);
-			wait_event(sctx->list_wait,
-				   atomic_read(&sctx->bios_in_flight) == 0);
-			sctx->flush_all_writes = false;
 			scrub_blocked_if_needed(fs_info);
 		}
 		/* Block group removed? */
@@ -3467,8 +2005,9 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
 		}
 		spin_unlock(&bg->lock);
 
-		ret = find_first_extent_item(extent_root, &path, cur_logical,
-					     logical_end - cur_logical);
+		ret = queue_scrub_stripe(sctx, bg, device, mirror_num,
+					 cur_logical, logical_end - cur_logical,
+					 cur_physical);
 		if (ret > 0) {
 			/* No more extent, just update the accounting */
 			sctx->stat.last_physical = physical + logical_length;
@@ -3477,52 +2016,11 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
 		}
 		if (ret < 0)
 			break;
-		get_extent_info(&path, &extent_start, &extent_len,
-				&extent_flags, &extent_gen);
-		/* Skip hole range which doesn't have any extent */
-		cur_logical = max(extent_start, cur_logical);
 
-		/*
-		 * Scrub len has three limits:
-		 * - Extent size limit
-		 * - Scrub range limit
-		 *   This is especially imporatant for RAID0/RAID10 to reuse
-		 *   this function
-		 * - Max scrub size limit
-		 */
-		scrub_len = min(min(extent_start + extent_len,
-				    logical_end), cur_logical + max_length) -
-			    cur_logical;
-
-		if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
-			ret = btrfs_lookup_csums_list(csum_root, cur_logical,
-					cur_logical + scrub_len - 1,
-					&sctx->csum_list, 1, false);
-			if (ret)
-				break;
-		}
-		if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
-		    does_range_cross_boundary(extent_start, extent_len,
-					      logical_start, logical_length)) {
-			btrfs_err(fs_info,
-"scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
-				  extent_start, logical_start, logical_end);
-			spin_lock(&sctx->stat_lock);
-			sctx->stat.uncorrectable_errors++;
-			spin_unlock(&sctx->stat_lock);
-			cur_logical += scrub_len;
-			continue;
-		}
-		ret = scrub_extent(sctx, map, cur_logical, scrub_len,
-				   cur_logical - logical_start + physical,
-				   device, extent_flags, extent_gen,
-				   mirror_num);
-		scrub_free_csums(sctx);
-		if (ret)
-			break;
-		if (sctx->is_dev_replace)
-			sync_replace_for_zoned(sctx);
-		cur_logical += scrub_len;
+		ASSERT(sctx->cur_stripe > 0);
+		cur_logical = sctx->stripes[sctx->cur_stripe - 1].logical
+			      + BTRFS_STRIPE_LEN;
+
 		/* Don't hold CPU for too long time */
 		cond_resched();
 	}
@@ -3536,7 +2034,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
 	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
 			    BTRFS_BLOCK_GROUP_RAID10));
 
-	return map->num_stripes / map->sub_stripes * map->stripe_len;
+	return (map->num_stripes / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT;
 }
 
 /* Get the logical bytenr for the stripe */
@@ -3552,7 +2050,8 @@ static u64 simple_stripe_get_logical(struct map_lookup *map,
 	 * (stripe_index / sub_stripes) gives how many data stripes we need to
 	 * skip.
 	 */
-	return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
+	return ((stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT) +
+	       bg->start;
 }
 
 /* Get the mirror number for the stripe */
@@ -3567,8 +2066,6 @@ static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
 }
 
 static int scrub_simple_stripe(struct scrub_ctx *sctx,
-			       struct btrfs_root *extent_root,
-			       struct btrfs_root *csum_root,
 			       struct btrfs_block_group *bg,
 			       struct map_lookup *map,
 			       struct btrfs_device *device,
@@ -3588,15 +2085,15 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx,
 		 * just RAID1, so we can reuse scrub_simple_mirror() to scrub
 		 * this stripe.
 		 */
-		ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
-					  cur_logical, map->stripe_len, device,
-					  cur_physical, mirror_num);
+		ret = scrub_simple_mirror(sctx, bg, map, cur_logical,
+					  BTRFS_STRIPE_LEN, device, cur_physical,
+					  mirror_num);
 		if (ret)
 			return ret;
 		/* Skip to next stripe which belongs to the target device */
 		cur_logical += logical_increment;
 		/* For physical offset, we just go to next stripe */
-		cur_physical += map->stripe_len;
+		cur_physical += BTRFS_STRIPE_LEN;
 	}
 	return ret;
 }
@@ -3607,15 +2104,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 					   struct btrfs_device *scrub_dev,
 					   int stripe_index)
 {
-	struct btrfs_path *path;
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
-	struct btrfs_root *root;
-	struct btrfs_root *csum_root;
-	struct blk_plug plug;
 	struct map_lookup *map = em->map_lookup;
 	const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
 	const u64 chunk_logical = bg->start;
 	int ret;
+	int ret2;
 	u64 physical = map->stripes[stripe_index].physical;
 	const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
 	const u64 physical_end = physical + dev_stripe_len;
@@ -3626,43 +2120,37 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	/* Offset inside the chunk */
 	u64 offset;
 	u64 stripe_logical;
-	u64 stripe_end;
 	int stop_loop = 0;
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	/*
-	 * work on commit root. The related disk blocks are static as
-	 * long as COW is applied. This means, it is save to rewrite
-	 * them to repair disk errors without any race conditions
-	 */
-	path->search_commit_root = 1;
-	path->skip_locking = 1;
-	path->reada = READA_FORWARD;
-
-	wait_event(sctx->list_wait,
-		   atomic_read(&sctx->bios_in_flight) == 0);
 	scrub_blocked_if_needed(fs_info);
 
-	root = btrfs_extent_root(fs_info, bg->start);
-	csum_root = btrfs_csum_root(fs_info, bg->start);
-
-	/*
-	 * collect all data csums for the stripe to avoid seeking during
-	 * the scrub. This might currently (crc32) end up to be about 1MB
-	 */
-	blk_start_plug(&plug);
-
 	if (sctx->is_dev_replace &&
 	    btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
 		mutex_lock(&sctx->wr_lock);
 		sctx->write_pointer = physical;
 		mutex_unlock(&sctx->wr_lock);
-		sctx->flush_all_writes = true;
 	}
 
+	/* Prepare the extra data stripes used by RAID56. */
+	if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+		ASSERT(sctx->raid56_data_stripes == NULL);
+
+		sctx->raid56_data_stripes = kcalloc(nr_data_stripes(map),
+						    sizeof(struct scrub_stripe),
+						    GFP_KERNEL);
+		if (!sctx->raid56_data_stripes) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		for (int i = 0; i < nr_data_stripes(map); i++) {
+			ret = init_scrub_stripe(fs_info,
+						&sctx->raid56_data_stripes[i]);
+			if (ret < 0)
+				goto out;
+			sctx->raid56_data_stripes[i].bg = bg;
+			sctx->raid56_data_stripes[i].sctx = sctx;
+		}
+	}
 	/*
 	 * There used to be a big double loop to handle all profiles using the
 	 * same routine, which grows larger and more gross over time.
@@ -3680,17 +2168,15 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 		 * Only @physical and @mirror_num needs to calculated using
 		 * @stripe_index.
 		 */
-		ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
-				bg->start, bg->length, scrub_dev,
-				map->stripes[stripe_index].physical,
+		ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length,
+				scrub_dev, map->stripes[stripe_index].physical,
 				stripe_index + 1);
 		offset = 0;
 		goto out;
 	}
 	if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
-		ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
-					  scrub_dev, stripe_index);
-		offset = map->stripe_len * (stripe_index / map->sub_stripes);
+		ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index);
+		offset = (stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT;
 		goto out;
 	}
 
@@ -3705,7 +2191,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 
 	/* Initialize @offset in case we need to go to out: label */
 	get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
-	increment = map->stripe_len * nr_data_stripes(map);
+	increment = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT;
 
 	/*
 	 * Due to the rotation, for RAID56 it's better to iterate each stripe
@@ -3718,10 +2204,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 		if (ret) {
 			/* it is parity strip */
 			stripe_logical += chunk_logical;
-			stripe_end = stripe_logical + increment;
-			ret = scrub_raid56_parity(sctx, map, scrub_dev,
-						  stripe_logical,
-						  stripe_end);
+			ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg,
+							 map, stripe_logical);
 			if (ret)
 				goto out;
 			goto next;
@@ -3735,14 +2219,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 		 * We can reuse scrub_simple_mirror() here, as the repair part
 		 * is still based on @mirror_num.
 		 */
-		ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
-					  logical, map->stripe_len,
+		ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN,
 					  scrub_dev, physical, 1);
 		if (ret < 0)
 			goto out;
 next:
 		logical += increment;
-		physical += map->stripe_len;
+		physical += BTRFS_STRIPE_LEN;
 		spin_lock(&sctx->stat_lock);
 		if (stop_loop)
 			sctx->stat.last_physical =
@@ -3754,14 +2237,15 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 			break;
 	}
 out:
-	/* push queued extents */
-	scrub_submit(sctx);
-	mutex_lock(&sctx->wr_lock);
-	scrub_wr_submit(sctx);
-	mutex_unlock(&sctx->wr_lock);
-
-	blk_finish_plug(&plug);
-	btrfs_free_path(path);
+	ret2 = flush_scrub_stripes(sctx);
+	if (!ret2)
+		ret = ret2;
+	if (sctx->raid56_data_stripes) {
+		for (int i = 0; i < nr_data_stripes(map); i++)
+			release_scrub_stripe(&sctx->raid56_data_stripes[i]);
+		kfree(sctx->raid56_data_stripes);
+		sctx->raid56_data_stripes = NULL;
+	}
 
 	if (sctx->is_dev_replace && ret >= 0) {
 		int ret2;
@@ -4086,39 +2570,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 
 		ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
 				  dev_extent_len);
-
-		/*
-		 * flush, submit all pending read and write bios, afterwards
-		 * wait for them.
-		 * Note that in the dev replace case, a read request causes
-		 * write requests that are submitted in the read completion
-		 * worker. Therefore in the current situation, it is required
-		 * that all write requests are flushed, so that all read and
-		 * write requests are really completed when bios_in_flight
-		 * changes to 0.
-		 */
-		sctx->flush_all_writes = true;
-		scrub_submit(sctx);
-		mutex_lock(&sctx->wr_lock);
-		scrub_wr_submit(sctx);
-		mutex_unlock(&sctx->wr_lock);
-
-		wait_event(sctx->list_wait,
-			   atomic_read(&sctx->bios_in_flight) == 0);
-
-		scrub_pause_on(fs_info);
-
-		/*
-		 * must be called before we decrease @scrub_paused.
-		 * make sure we don't block transaction commit while
-		 * we are waiting pending workers finished.
-		 */
-		wait_event(sctx->list_wait,
-			   atomic_read(&sctx->workers_pending) == 0);
-		sctx->flush_all_writes = false;
-
-		scrub_pause_off(fs_info);
-
 		if (sctx->is_dev_replace &&
 		    !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
 						      cache, found_key.offset))
@@ -4175,18 +2626,62 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 	return ret;
 }
 
+static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
+			   struct page *page, u64 physical, u64 generation)
+{
+	struct btrfs_fs_info *fs_info = sctx->fs_info;
+	struct bio_vec bvec;
+	struct bio bio;
+	struct btrfs_super_block *sb = page_address(page);
+	int ret;
+
+	bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ);
+	bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT;
+	__bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0);
+	ret = submit_bio_wait(&bio);
+	bio_uninit(&bio);
+
+	if (ret < 0)
+		return ret;
+	ret = btrfs_check_super_csum(fs_info, sb);
+	if (ret != 0) {
+		btrfs_err_rl(fs_info,
+			"super block at physical %llu devid %llu has bad csum",
+			physical, dev->devid);
+		return -EIO;
+	}
+	if (btrfs_super_generation(sb) != generation) {
+		btrfs_err_rl(fs_info,
+"super block at physical %llu devid %llu has bad generation %llu expect %llu",
+			     physical, dev->devid,
+			     btrfs_super_generation(sb), generation);
+		return -EUCLEAN;
+	}
+
+	return btrfs_validate_super(fs_info, sb, -1);
+}
+
 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
 					   struct btrfs_device *scrub_dev)
 {
 	int	i;
 	u64	bytenr;
 	u64	gen;
-	int	ret;
+	int ret = 0;
+	struct page *page;
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
 
 	if (BTRFS_FS_ERROR(fs_info))
 		return -EROFS;
 
+	page = alloc_page(GFP_KERNEL);
+	if (!page) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		spin_unlock(&sctx->stat_lock);
+		return -ENOMEM;
+	}
+
 	/* Seed devices of a new filesystem has their own generation. */
 	if (scrub_dev->fs_devices != fs_info->fs_devices)
 		gen = scrub_dev->generation;
@@ -4201,14 +2696,14 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
 		if (!btrfs_check_super_location(scrub_dev, bytenr))
 			continue;
 
-		ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
-				    scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
-				    NULL, bytenr);
-		if (ret)
-			return ret;
+		ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen);
+		if (ret) {
+			spin_lock(&sctx->stat_lock);
+			sctx->stat.super_errors++;
+			spin_unlock(&sctx->stat_lock);
+		}
 	}
-	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
-
+	__free_page(page);
 	return 0;
 }
 
@@ -4219,20 +2714,15 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info)
 		struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
 		struct workqueue_struct *scrub_wr_comp =
 						fs_info->scrub_wr_completion_workers;
-		struct workqueue_struct *scrub_parity =
-						fs_info->scrub_parity_workers;
 
 		fs_info->scrub_workers = NULL;
 		fs_info->scrub_wr_completion_workers = NULL;
-		fs_info->scrub_parity_workers = NULL;
 		mutex_unlock(&fs_info->scrub_lock);
 
 		if (scrub_workers)
 			destroy_workqueue(scrub_workers);
 		if (scrub_wr_comp)
 			destroy_workqueue(scrub_wr_comp);
-		if (scrub_parity)
-			destroy_workqueue(scrub_parity);
 	}
 }
 
@@ -4244,7 +2734,6 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 {
 	struct workqueue_struct *scrub_workers = NULL;
 	struct workqueue_struct *scrub_wr_comp = NULL;
-	struct workqueue_struct *scrub_parity = NULL;
 	unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
 	int max_active = fs_info->thread_pool_size;
 	int ret = -ENOMEM;
@@ -4261,18 +2750,12 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 	if (!scrub_wr_comp)
 		goto fail_scrub_wr_completion_workers;
 
-	scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
-	if (!scrub_parity)
-		goto fail_scrub_parity_workers;
-
 	mutex_lock(&fs_info->scrub_lock);
 	if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
 		ASSERT(fs_info->scrub_workers == NULL &&
-		       fs_info->scrub_wr_completion_workers == NULL &&
-		       fs_info->scrub_parity_workers == NULL);
+		       fs_info->scrub_wr_completion_workers == NULL);
 		fs_info->scrub_workers = scrub_workers;
 		fs_info->scrub_wr_completion_workers = scrub_wr_comp;
-		fs_info->scrub_parity_workers = scrub_parity;
 		refcount_set(&fs_info->scrub_workers_refcnt, 1);
 		mutex_unlock(&fs_info->scrub_lock);
 		return 0;
@@ -4282,8 +2765,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 	mutex_unlock(&fs_info->scrub_lock);
 
 	ret = 0;
-	destroy_workqueue(scrub_parity);
-fail_scrub_parity_workers:
+
 	destroy_workqueue(scrub_wr_comp);
 fail_scrub_wr_completion_workers:
 	destroy_workqueue(scrub_workers);
@@ -4418,12 +2900,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		ret = scrub_enumerate_chunks(sctx, dev, start, end);
 	memalloc_nofs_restore(nofs_flag);
 
-	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
 	atomic_dec(&fs_info->scrubs_running);
 	wake_up(&fs_info->scrub_pause_wait);
 
-	wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
-
 	if (progress)
 		memcpy(progress, &sctx->stat, sizeof(*progress));
 
@@ -4548,28 +3027,3 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
 
 	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
 }
-
-static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
-				 u64 extent_logical, u32 extent_len,
-				 u64 *extent_physical,
-				 struct btrfs_device **extent_dev,
-				 int *extent_mirror_num)
-{
-	u64 mapped_length;
-	struct btrfs_io_context *bioc = NULL;
-	int ret;
-
-	mapped_length = extent_len;
-	ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
-			      &mapped_length, &bioc, 0);
-	if (ret || !bioc || mapped_length < extent_len ||
-	    !bioc->stripes[0].dev->bdev) {
-		btrfs_put_bioc(bioc);
-		return;
-	}
-
-	*extent_physical = bioc->stripes[0].physical;
-	*extent_mirror_num = bioc->mirror_num;
-	*extent_dev = bioc->stripes[0].dev;
-	btrfs_put_bioc(bioc);
-}
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 3eecce86f63f..75e7fa337e66 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -537,7 +537,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
 	up_read(&info->groups_sem);
 }
 
-static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
+static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info,
 					u64 to_reclaim)
 {
 	u64 bytes;
@@ -550,6 +550,18 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
 	return nr;
 }
 
+static inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info,
+				       u64 to_reclaim)
+{
+	const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1);
+	u64 nr;
+
+	nr = div64_u64(to_reclaim, bytes);
+	if (!nr)
+		nr = 1;
+	return nr;
+}
+
 #define EXTENT_SIZE_PER_ITEM	SZ_256K
 
 /*
@@ -727,7 +739,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
 			break;
 		}
 		if (state == FLUSH_DELAYED_REFS_NR)
-			nr = calc_reclaim_items_nr(fs_info, num_bytes);
+			nr = calc_delayed_refs_nr(fs_info, num_bytes);
 		else
 			nr = 0;
 		btrfs_run_delayed_refs(trans, nr);
@@ -1599,11 +1611,22 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
 	struct reserve_ticket ticket;
 	u64 start_ns = 0;
 	u64 used;
-	int ret = 0;
+	int ret = -ENOSPC;
 	bool pending_tickets;
 
 	ASSERT(orig_bytes);
-	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
+	/*
+	 * If have a transaction handle (current->journal_info != NULL), then
+	 * the flush method can not be neither BTRFS_RESERVE_FLUSH_ALL* nor
+	 * BTRFS_RESERVE_FLUSH_EVICT, as we could deadlock because those
+	 * flushing methods can trigger transaction commits.
+	 */
+	if (current->journal_info) {
+		/* One assert per line for easier debugging. */
+		ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL);
+		ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL);
+		ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT);
+	}
 
 	if (flush == BTRFS_RESERVE_FLUSH_DATA)
 		async_work = &fs_info->async_data_reclaim_work;
@@ -1611,7 +1634,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
 		async_work = &fs_info->async_reclaim_work;
 
 	spin_lock(&space_info->lock);
-	ret = -ENOSPC;
 	used = btrfs_space_info_used(space_info, true);
 
 	/*
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 2033b71b18ce..0bb9d14e60a8 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -27,6 +27,7 @@ enum btrfs_reserve_flush_enum {
 	 * - Running delayed refs
 	 * - Running delalloc and waiting for ordered extents
 	 * - Allocating a new chunk
+	 * - Committing transaction
 	 */
 	BTRFS_RESERVE_FLUSH_EVICT,
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 06e91c02be9e..efeb1a9d040a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1162,6 +1162,7 @@ static int btrfs_fill_super(struct super_block *sb,
 	inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
+		btrfs_handle_fs_error(fs_info, err, NULL);
 		goto fail_close;
 	}
 
@@ -2422,7 +2423,7 @@ static int __init btrfs_print_mod_info(void)
 			", fsverity=no"
 #endif
 			;
-	pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options);
+	pr_info("Btrfs loaded%s\n", options);
 	return 0;
 }
 
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 37fc58a7f27e..25294e624851 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1262,8 +1262,13 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
 	if (ret)
 		return ret;
 
+#ifdef CONFIG_BTRFS_DEBUG
+	if (thresh != 0 && (thresh > 100))
+		return -EINVAL;
+#else
 	if (thresh != 0 && (thresh <= 50 || thresh > 100))
 		return -EINVAL;
+#endif
 
 	WRITE_ONCE(fs_info->bg_reclaim_threshold, thresh);
 
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index f2f2e11dac4c..ed0f36ae5346 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -486,7 +486,6 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
 	em->map_lookup = map;
 
 	map->num_stripes = test->num_stripes;
-	map->stripe_len = BTRFS_STRIPE_LEN;
 	map->type = test->raid_type;
 
 	for (i = 0; i < map->num_stripes; i++) {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b8d5b1fa9a03..8b6a99b8d7f6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -601,15 +601,16 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
 		/*
 		 * We want to reserve all the bytes we may need all at once, so
 		 * we only do 1 enospc flushing cycle per transaction start.  We
-		 * accomplish this by simply assuming we'll do 2 x num_items
-		 * worth of delayed refs updates in this trans handle, and
-		 * refill that amount for whatever is missing in the reserve.
+		 * accomplish this by simply assuming we'll do num_items worth
+		 * of delayed refs updates in this trans handle, and refill that
+		 * amount for whatever is missing in the reserve.
 		 */
 		num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
 		if (flush == BTRFS_RESERVE_FLUSH_ALL &&
-		    btrfs_block_rsv_full(delayed_refs_rsv) == 0) {
-			delayed_refs_bytes = num_bytes;
-			num_bytes <<= 1;
+		    !btrfs_block_rsv_full(delayed_refs_rsv)) {
+			delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info,
+									  num_items);
+			num_bytes += delayed_refs_bytes;
 		}
 
 		/*
@@ -942,16 +943,6 @@ void btrfs_throttle(struct btrfs_fs_info *fs_info)
 	wait_current_trans(fs_info);
 }
 
-static bool should_end_transaction(struct btrfs_trans_handle *trans)
-{
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-
-	if (btrfs_check_space_for_delayed_refs(fs_info))
-		return true;
-
-	return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 50);
-}
-
 bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
 {
 	struct btrfs_transaction *cur_trans = trans->transaction;
@@ -960,7 +951,10 @@ bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
 	    test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags))
 		return true;
 
-	return should_end_transaction(trans);
+	if (btrfs_check_space_for_delayed_refs(trans->fs_info))
+		return true;
+
+	return !!btrfs_block_rsv_check(&trans->fs_info->global_block_rsv, 50);
 }
 
 static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index baad1ed7e111..e2b54793bf0c 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -849,6 +849,20 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
 			  stripe_len);
 		return -EUCLEAN;
 	}
+	/*
+	 * We artificially limit the chunk size, so that the number of stripes
+	 * inside a chunk can be fit into a U32.  The current limit (256G) is
+	 * way too large for real world usage anyway, and it's also much larger
+	 * than our existing limit (10G).
+	 *
+	 * Thus it should be a good way to catch obvious bitflips.
+	 */
+	if (unlikely(length >= ((u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT))) {
+		chunk_err(leaf, chunk, logical,
+			  "chunk length too large: have %llu limit %llu",
+			  length, (u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT);
+		return -EUCLEAN;
+	}
 	if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
 			      BTRFS_BLOCK_GROUP_PROFILE_MASK))) {
 		chunk_err(leaf, chunk, logical,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 200cea6e49e5..9b212e8c70cc 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2563,6 +2563,28 @@ static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
 	btrfs_put_block_group(cache);
 }
 
+static int clean_log_buffer(struct btrfs_trans_handle *trans,
+			    struct extent_buffer *eb)
+{
+	int ret;
+
+	btrfs_tree_lock(eb);
+	btrfs_clear_buffer_dirty(trans, eb);
+	wait_on_extent_buffer_writeback(eb);
+	btrfs_tree_unlock(eb);
+
+	if (trans) {
+		ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len);
+		if (ret)
+			return ret;
+		btrfs_redirty_list_add(trans->transaction, eb);
+	} else {
+		unaccount_log_buffer(eb->fs_info, eb->start);
+	}
+
+	return 0;
+}
+
 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct btrfs_path *path, int *level,
@@ -2573,7 +2595,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 	u64 ptr_gen;
 	struct extent_buffer *next;
 	struct extent_buffer *cur;
-	u32 blocksize;
 	int ret = 0;
 
 	while (*level > 0) {
@@ -2593,7 +2614,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 		check.level = *level - 1;
 		check.has_first_key = true;
 		btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]);
-		blocksize = fs_info->nodesize;
 
 		next = btrfs_find_create_tree_block(fs_info, bytenr,
 						    btrfs_header_owner(cur),
@@ -2617,22 +2637,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 					return ret;
 				}
 
-				btrfs_tree_lock(next);
-				btrfs_clear_buffer_dirty(trans, next);
-				wait_on_extent_buffer_writeback(next);
-				btrfs_tree_unlock(next);
-
-				if (trans) {
-					ret = btrfs_pin_reserved_extent(trans,
-							bytenr, blocksize);
-					if (ret) {
-						free_extent_buffer(next);
-						return ret;
-					}
-					btrfs_redirty_list_add(
-						trans->transaction, next);
-				} else {
-					unaccount_log_buffer(fs_info, bytenr);
+				ret = clean_log_buffer(trans, next);
+				if (ret) {
+					free_extent_buffer(next);
+					return ret;
 				}
 			}
 			free_extent_buffer(next);
@@ -2662,7 +2670,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 				 struct btrfs_path *path, int *level,
 				 struct walk_control *wc)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	int i;
 	int slot;
 	int ret;
@@ -2682,27 +2689,9 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 				return ret;
 
 			if (wc->free) {
-				struct extent_buffer *next;
-
-				next = path->nodes[*level];
-
-				btrfs_tree_lock(next);
-				btrfs_clear_buffer_dirty(trans, next);
-				wait_on_extent_buffer_writeback(next);
-				btrfs_tree_unlock(next);
-
-				if (trans) {
-					ret = btrfs_pin_reserved_extent(trans,
-						     path->nodes[*level]->start,
-						     path->nodes[*level]->len);
-					if (ret)
-						return ret;
-					btrfs_redirty_list_add(trans->transaction,
-							       next);
-				} else {
-					unaccount_log_buffer(fs_info,
-						path->nodes[*level]->start);
-				}
+				ret = clean_log_buffer(trans, path->nodes[*level]);
+				if (ret)
+					return ret;
 			}
 			free_extent_buffer(path->nodes[*level]);
 			path->nodes[*level] = NULL;
@@ -2720,7 +2709,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 static int walk_log_tree(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *log, struct walk_control *wc)
 {
-	struct btrfs_fs_info *fs_info = log->fs_info;
 	int ret = 0;
 	int wret;
 	int level;
@@ -2762,26 +2750,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 			 orig_level);
 		if (ret)
 			goto out;
-		if (wc->free) {
-			struct extent_buffer *next;
-
-			next = path->nodes[orig_level];
-
-			btrfs_tree_lock(next);
-			btrfs_clear_buffer_dirty(trans, next);
-			wait_on_extent_buffer_writeback(next);
-			btrfs_tree_unlock(next);
-
-			if (trans) {
-				ret = btrfs_pin_reserved_extent(trans,
-						next->start, next->len);
-				if (ret)
-					goto out;
-				btrfs_redirty_list_add(trans->transaction, next);
-			} else {
-				unaccount_log_buffer(fs_info, next->start);
-			}
-		}
+		if (wc->free)
+			ret = clean_log_buffer(trans, path->nodes[orig_level]);
 	}
 
 out:
@@ -3648,6 +3618,9 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
 		ret = BTRFS_LOG_FORCE_COMMIT;
 	else
 		inode->last_dir_index_offset = last_index;
+
+	if (btrfs_get_first_dir_index_to_log(inode) == 0)
+		btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset);
 out:
 	kfree(ins_data);
 
@@ -4099,7 +4072,7 @@ static int drop_inode_items(struct btrfs_trans_handle *trans,
 
 		found_key.offset = 0;
 		found_key.type = 0;
-		ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
+		ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot);
 		if (ret < 0)
 			break;
 
@@ -5406,6 +5379,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
 	LIST_HEAD(dir_list);
 	struct btrfs_dir_list *dir_elem;
 	u64 ino = btrfs_ino(start_inode);
+	struct btrfs_inode *curr_inode = start_inode;
 	int ret = 0;
 
 	/*
@@ -5420,43 +5394,39 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
+	/* Pairs with btrfs_add_delayed_iput below. */
+	ihold(&curr_inode->vfs_inode);
+
 	while (true) {
-		struct extent_buffer *leaf;
-		struct btrfs_key min_key;
+		struct inode *vfs_inode;
+		struct btrfs_key key;
+		struct btrfs_key found_key;
+		u64 next_index;
 		bool continue_curr_inode = true;
-		int nritems;
-		int i;
+		int iter_ret;
 
-		min_key.objectid = ino;
-		min_key.type = BTRFS_DIR_INDEX_KEY;
-		min_key.offset = 0;
+		key.objectid = ino;
+		key.type = BTRFS_DIR_INDEX_KEY;
+		key.offset = btrfs_get_first_dir_index_to_log(curr_inode);
+		next_index = key.offset;
 again:
-		btrfs_release_path(path);
-		ret = btrfs_search_forward(root, &min_key, path, trans->transid);
-		if (ret < 0) {
-			break;
-		} else if (ret > 0) {
-			ret = 0;
-			goto next;
-		}
-
-		leaf = path->nodes[0];
-		nritems = btrfs_header_nritems(leaf);
-		for (i = path->slots[0]; i < nritems; i++) {
+		btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) {
+			struct extent_buffer *leaf = path->nodes[0];
 			struct btrfs_dir_item *di;
 			struct btrfs_key di_key;
 			struct inode *di_inode;
 			int log_mode = LOG_INODE_EXISTS;
 			int type;
 
-			btrfs_item_key_to_cpu(leaf, &min_key, i);
-			if (min_key.objectid != ino ||
-			    min_key.type != BTRFS_DIR_INDEX_KEY) {
+			if (found_key.objectid != ino ||
+			    found_key.type != BTRFS_DIR_INDEX_KEY) {
 				continue_curr_inode = false;
 				break;
 			}
 
-			di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
+			next_index = found_key.offset + 1;
+
+			di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
 			type = btrfs_dir_ftype(leaf, di);
 			if (btrfs_dir_transid(leaf, di) < trans->transid)
 				continue;
@@ -5496,12 +5466,24 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
 			break;
 		}
 
-		if (continue_curr_inode && min_key.offset < (u64)-1) {
-			min_key.offset++;
+		btrfs_release_path(path);
+
+		if (iter_ret < 0) {
+			ret = iter_ret;
+			goto out;
+		} else if (iter_ret > 0) {
+			continue_curr_inode = false;
+		} else {
+			key = found_key;
+		}
+
+		if (continue_curr_inode && key.offset < (u64)-1) {
+			key.offset++;
 			goto again;
 		}
 
-next:
+		btrfs_set_first_dir_index_to_log(curr_inode, next_index);
+
 		if (list_empty(&dir_list))
 			break;
 
@@ -5509,9 +5491,22 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
 		ino = dir_elem->ino;
 		list_del(&dir_elem->list);
 		kfree(dir_elem);
+
+		btrfs_add_delayed_iput(curr_inode);
+		curr_inode = NULL;
+
+		vfs_inode = btrfs_iget(fs_info->sb, ino, root);
+		if (IS_ERR(vfs_inode)) {
+			ret = PTR_ERR(vfs_inode);
+			break;
+		}
+		curr_inode = BTRFS_I(vfs_inode);
 	}
 out:
 	btrfs_free_path(path);
+	if (curr_inode)
+		btrfs_add_delayed_iput(curr_inode);
+
 	if (ret) {
 		struct btrfs_dir_list *next;
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index db63f9da787f..03f52e4a20aa 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -395,7 +395,6 @@ void btrfs_free_device(struct btrfs_device *device)
 {
 	WARN_ON(!list_empty(&device->post_commit_list));
 	rcu_string_free(device->name);
-	extent_io_tree_release(&device->alloc_state);
 	btrfs_destroy_dev_zone_info(device);
 	kfree(device);
 }
@@ -1150,10 +1149,10 @@ static void btrfs_close_one_device(struct btrfs_device *device)
 	device->last_flush_error = 0;
 
 	/* Verify the device is back in a pristine state  */
-	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
-	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
-	ASSERT(list_empty(&device->dev_alloc_list));
-	ASSERT(list_empty(&device->post_commit_list));
+	WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
+	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
+	WARN_ON(!list_empty(&device->dev_alloc_list));
+	WARN_ON(!list_empty(&device->post_commit_list));
 }
 
 static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
@@ -5125,7 +5124,7 @@ static void init_alloc_chunk_ctl_policy_regular(
 	/* We don't want a chunk larger than 10% of writable space */
 	ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10),
 				  ctl->max_chunk_size);
-	ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
+	ctl->dev_extent_min = ctl->dev_stripes << BTRFS_STRIPE_LEN_SHIFT;
 }
 
 static void init_alloc_chunk_ctl_policy_zoned(
@@ -5407,7 +5406,6 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
 						   j * ctl->stripe_size;
 		}
 	}
-	map->stripe_len = BTRFS_STRIPE_LEN;
 	map->io_align = BTRFS_STRIPE_LEN;
 	map->io_width = BTRFS_STRIPE_LEN;
 	map->type = type;
@@ -5438,7 +5436,7 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
 	}
 	write_unlock(&em_tree->lock);
 
-	block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
+	block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size);
 	if (IS_ERR(block_group))
 		goto error_del_extent;
 
@@ -5615,11 +5613,11 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
 
 	btrfs_set_stack_chunk_length(chunk, bg->length);
 	btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
-	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
+	btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN);
 	btrfs_set_stack_chunk_type(chunk, map->type);
 	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
-	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
-	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
+	btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN);
+	btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN);
 	btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
 	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
 
@@ -5784,13 +5782,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 		 */
 		ret = map->num_stripes;
 	free_extent_map(em);
-
-	down_read(&fs_info->dev_replace.rwsem);
-	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
-	    fs_info->dev_replace.tgtdev)
-		ret++;
-	up_read(&fs_info->dev_replace.rwsem);
-
 	return ret;
 }
 
@@ -5809,7 +5800,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
 	if (!WARN_ON(IS_ERR(em))) {
 		map = em->map_lookup;
 		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
-			len = map->stripe_len * nr_data_stripes(map);
+			len = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT;
 		free_extent_map(em);
 	}
 	return len;
@@ -5895,41 +5886,16 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
 	return preferred_mirror;
 }
 
-/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
-static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
-{
-	int i;
-	int again = 1;
-
-	while (again) {
-		again = 0;
-		for (i = 0; i < num_stripes - 1; i++) {
-			/* Swap if parity is on a smaller index */
-			if (bioc->raid_map[i] > bioc->raid_map[i + 1]) {
-				swap(bioc->stripes[i], bioc->stripes[i + 1]);
-				swap(bioc->raid_map[i], bioc->raid_map[i + 1]);
-				again = 1;
-			}
-		}
-	}
-}
-
 static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
-						       int total_stripes,
-						       int real_stripes)
+						       u16 total_stripes)
 {
-	struct btrfs_io_context *bioc = kzalloc(
+	struct btrfs_io_context *bioc;
+
+	bioc = kzalloc(
 		 /* The size of btrfs_io_context */
 		sizeof(struct btrfs_io_context) +
 		/* Plus the variable array for the stripes */
-		sizeof(struct btrfs_io_stripe) * (total_stripes) +
-		/* Plus the variable array for the tgt dev */
-		sizeof(int) * (real_stripes) +
-		/*
-		 * Plus the raid_map, which includes both the tgt dev
-		 * and the stripes.
-		 */
-		sizeof(u64) * (total_stripes),
+		sizeof(struct btrfs_io_stripe) * (total_stripes),
 		GFP_NOFS);
 
 	if (!bioc)
@@ -5938,8 +5904,8 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
 	refcount_set(&bioc->refs, 1);
 
 	bioc->fs_info = fs_info;
-	bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
-	bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
+	bioc->replace_stripe_src = -1;
+	bioc->full_stripe_logical = (u64)-1;
 
 	return bioc;
 }
@@ -5971,16 +5937,15 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
 	struct btrfs_discard_stripe *stripes;
 	u64 length = *length_ret;
 	u64 offset;
-	u64 stripe_nr;
-	u64 stripe_nr_end;
+	u32 stripe_nr;
+	u32 stripe_nr_end;
+	u32 stripe_cnt;
 	u64 stripe_end_offset;
-	u64 stripe_cnt;
-	u64 stripe_len;
 	u64 stripe_offset;
 	u32 stripe_index;
 	u32 factor = 0;
 	u32 sub_stripes = 0;
-	u64 stripes_per_dev = 0;
+	u32 stripes_per_dev = 0;
 	u32 remaining_stripes = 0;
 	u32 last_stripe = 0;
 	int ret;
@@ -5996,26 +5961,25 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
 		ret = -EOPNOTSUPP;
 		goto out_free_map;
-}
+	}
 
 	offset = logical - em->start;
 	length = min_t(u64, em->start + em->len - logical, length);
 	*length_ret = length;
 
-	stripe_len = map->stripe_len;
 	/*
 	 * stripe_nr counts the total number of stripes we have to stride
 	 * to get to this block
 	 */
-	stripe_nr = div64_u64(offset, stripe_len);
+	stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
 
 	/* stripe_offset is the offset of this block in its stripe */
-	stripe_offset = offset - stripe_nr * stripe_len;
+	stripe_offset = offset - (stripe_nr << BTRFS_STRIPE_LEN_SHIFT);
 
-	stripe_nr_end = round_up(offset + length, map->stripe_len);
-	stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
+	stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >>
+			BTRFS_STRIPE_LEN_SHIFT;
 	stripe_cnt = stripe_nr_end - stripe_nr;
-	stripe_end_offset = stripe_nr_end * map->stripe_len -
+	stripe_end_offset = (stripe_nr_end << BTRFS_STRIPE_LEN_SHIFT) -
 			    (offset + length);
 	/*
 	 * after this, stripe_nr is the number of stripes on this
@@ -6034,18 +5998,19 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
 		factor = map->num_stripes / sub_stripes;
 		*num_stripes = min_t(u64, map->num_stripes,
 				    sub_stripes * stripe_cnt);
-		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+		stripe_index = stripe_nr % factor;
+		stripe_nr /= factor;
 		stripe_index *= sub_stripes;
-		stripes_per_dev = div_u64_rem(stripe_cnt, factor,
-					      &remaining_stripes);
-		div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
-		last_stripe *= sub_stripes;
+
+		remaining_stripes = stripe_cnt % factor;
+		stripes_per_dev = stripe_cnt / factor;
+		last_stripe = ((stripe_nr_end - 1) % factor) * sub_stripes;
 	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
 				BTRFS_BLOCK_GROUP_DUP)) {
 		*num_stripes = map->num_stripes;
 	} else {
-		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
-					&stripe_index);
+		stripe_index = stripe_nr % map->num_stripes;
+		stripe_nr /= map->num_stripes;
 	}
 
 	stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS);
@@ -6057,15 +6022,15 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
 	for (i = 0; i < *num_stripes; i++) {
 		stripes[i].physical =
 			map->stripes[stripe_index].physical +
-			stripe_offset + stripe_nr * map->stripe_len;
+			stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT);
 		stripes[i].dev = map->stripes[stripe_index].dev;
 
 		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
 				 BTRFS_BLOCK_GROUP_RAID10)) {
-			stripes[i].length = stripes_per_dev * map->stripe_len;
+			stripes[i].length = stripes_per_dev << BTRFS_STRIPE_LEN_SHIFT;
 
 			if (i / sub_stripes < remaining_stripes)
-				stripes[i].length += map->stripe_len;
+				stripes[i].length += BTRFS_STRIPE_LEN;
 
 			/*
 			 * Special for the first stripe and
@@ -6103,83 +6068,6 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
 	return ERR_PTR(ret);
 }
 
-/*
- * In dev-replace case, for repair case (that's the only case where the mirror
- * is selected explicitly when calling btrfs_map_block), blocks left of the
- * left cursor can also be read from the target drive.
- *
- * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
- * array of stripes.
- * For READ, it also needs to be supported using the same mirror number.
- *
- * If the requested block is not left of the left cursor, EIO is returned. This
- * can happen because btrfs_num_copies() returns one more in the dev-replace
- * case.
- */
-static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
-					 u64 logical, u64 length,
-					 u64 srcdev_devid, int *mirror_num,
-					 u64 *physical)
-{
-	struct btrfs_io_context *bioc = NULL;
-	int num_stripes;
-	int index_srcdev = 0;
-	int found = 0;
-	u64 physical_of_found = 0;
-	int i;
-	int ret = 0;
-
-	ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
-				logical, &length, &bioc, NULL, NULL, 0);
-	if (ret) {
-		ASSERT(bioc == NULL);
-		return ret;
-	}
-
-	num_stripes = bioc->num_stripes;
-	if (*mirror_num > num_stripes) {
-		/*
-		 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
-		 * that means that the requested area is not left of the left
-		 * cursor
-		 */
-		btrfs_put_bioc(bioc);
-		return -EIO;
-	}
-
-	/*
-	 * process the rest of the function using the mirror_num of the source
-	 * drive. Therefore look it up first.  At the end, patch the device
-	 * pointer to the one of the target drive.
-	 */
-	for (i = 0; i < num_stripes; i++) {
-		if (bioc->stripes[i].dev->devid != srcdev_devid)
-			continue;
-
-		/*
-		 * In case of DUP, in order to keep it simple, only add the
-		 * mirror with the lowest physical address
-		 */
-		if (found &&
-		    physical_of_found <= bioc->stripes[i].physical)
-			continue;
-
-		index_srcdev = i;
-		found = 1;
-		physical_of_found = bioc->stripes[i].physical;
-	}
-
-	btrfs_put_bioc(bioc);
-
-	ASSERT(found);
-	if (!found)
-		return -EIO;
-
-	*mirror_num = index_srcdev + 1;
-	*physical = physical_of_found;
-	return ret;
-}
-
 static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
 {
 	struct btrfs_block_group *cache;
@@ -6198,101 +6086,80 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
 }
 
 static void handle_ops_on_dev_replace(enum btrfs_map_op op,
-				      struct btrfs_io_context **bioc_ret,
+				      struct btrfs_io_context *bioc,
 				      struct btrfs_dev_replace *dev_replace,
 				      u64 logical,
 				      int *num_stripes_ret, int *max_errors_ret)
 {
-	struct btrfs_io_context *bioc = *bioc_ret;
 	u64 srcdev_devid = dev_replace->srcdev->devid;
-	int tgtdev_indexes = 0;
+	/*
+	 * At this stage, num_stripes is still the real number of stripes,
+	 * excluding the duplicated stripes.
+	 */
 	int num_stripes = *num_stripes_ret;
+	int nr_extra_stripes = 0;
 	int max_errors = *max_errors_ret;
 	int i;
 
-	if (op == BTRFS_MAP_WRITE) {
-		int index_where_to_add;
+	/*
+	 * A block group which has "to_copy" set will eventually be copied by
+	 * the dev-replace process. We can avoid cloning IO here.
+	 */
+	if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
+		return;
 
-		/*
-		 * A block group which have "to_copy" set will eventually
-		 * copied by dev-replace process. We can avoid cloning IO here.
-		 */
-		if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
-			return;
+	/*
+	 * Duplicate the write operations while the dev-replace procedure is
+	 * running. Since the copying of the old disk to the new disk takes
+	 * place at run time while the filesystem is mounted writable, the
+	 * regular write operations to the old disk have to be duplicated to go
+	 * to the new disk as well.
+	 *
+	 * Note that device->missing is handled by the caller, and that the
+	 * write to the old disk is already set up in the stripes array.
+	 */
+	for (i = 0; i < num_stripes; i++) {
+		struct btrfs_io_stripe *old = &bioc->stripes[i];
+		struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes];
 
-		/*
-		 * duplicate the write operations while the dev replace
-		 * procedure is running. Since the copying of the old disk to
-		 * the new disk takes place at run time while the filesystem is
-		 * mounted writable, the regular write operations to the old
-		 * disk have to be duplicated to go to the new disk as well.
-		 *
-		 * Note that device->missing is handled by the caller, and that
-		 * the write to the old disk is already set up in the stripes
-		 * array.
-		 */
-		index_where_to_add = num_stripes;
-		for (i = 0; i < num_stripes; i++) {
-			if (bioc->stripes[i].dev->devid == srcdev_devid) {
-				/* write to new disk, too */
-				struct btrfs_io_stripe *new =
-					bioc->stripes + index_where_to_add;
-				struct btrfs_io_stripe *old =
-					bioc->stripes + i;
-
-				new->physical = old->physical;
-				new->dev = dev_replace->tgtdev;
-				bioc->tgtdev_map[i] = index_where_to_add;
-				index_where_to_add++;
-				max_errors++;
-				tgtdev_indexes++;
-			}
-		}
-		num_stripes = index_where_to_add;
-	} else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
-		int index_srcdev = 0;
-		int found = 0;
-		u64 physical_of_found = 0;
+		if (old->dev->devid != srcdev_devid)
+			continue;
 
-		/*
-		 * During the dev-replace procedure, the target drive can also
-		 * be used to read data in case it is needed to repair a corrupt
-		 * block elsewhere. This is possible if the requested area is
-		 * left of the left cursor. In this area, the target drive is a
-		 * full copy of the source drive.
-		 */
-		for (i = 0; i < num_stripes; i++) {
-			if (bioc->stripes[i].dev->devid == srcdev_devid) {
-				/*
-				 * In case of DUP, in order to keep it simple,
-				 * only add the mirror with the lowest physical
-				 * address
-				 */
-				if (found &&
-				    physical_of_found <= bioc->stripes[i].physical)
-					continue;
-				index_srcdev = i;
-				found = 1;
-				physical_of_found = bioc->stripes[i].physical;
-			}
-		}
-		if (found) {
-			struct btrfs_io_stripe *tgtdev_stripe =
-				bioc->stripes + num_stripes;
+		new->physical = old->physical;
+		new->dev = dev_replace->tgtdev;
+		if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+			bioc->replace_stripe_src = i;
+		nr_extra_stripes++;
+	}
+
+	/* We can only have at most 2 extra nr_stripes (for DUP). */
+	ASSERT(nr_extra_stripes <= 2);
+	/*
+	 * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for
+	 * replace.
+	 * If we have 2 extra stripes, only choose the one with smaller physical.
+	 */
+	if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) {
+		struct btrfs_io_stripe *first = &bioc->stripes[num_stripes];
+		struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1];
 
-			tgtdev_stripe->physical = physical_of_found;
-			tgtdev_stripe->dev = dev_replace->tgtdev;
-			bioc->tgtdev_map[index_srcdev] = num_stripes;
+		/* Only DUP can have two extra stripes. */
+		ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP);
 
-			tgtdev_indexes++;
-			num_stripes++;
+		/*
+		 * Swap the last stripe stripes and reduce @nr_extra_stripes.
+		 * The extra stripe would still be there, but won't be accessed.
+		 */
+		if (first->physical > second->physical) {
+			swap(second->physical, first->physical);
+			swap(second->dev, first->dev);
+			nr_extra_stripes--;
 		}
 	}
 
-	*num_stripes_ret = num_stripes;
-	*max_errors_ret = max_errors;
-	bioc->num_tgtdevs = tgtdev_indexes;
-	*bioc_ret = bioc;
+	*num_stripes_ret = num_stripes + nr_extra_stripes;
+	*max_errors_ret = max_errors + nr_extra_stripes;
+	bioc->replace_nr_stripes = nr_extra_stripes;
 }
 
 static bool need_full_stripe(enum btrfs_map_op op)
@@ -6301,25 +6168,35 @@ static bool need_full_stripe(enum btrfs_map_op op)
 }
 
 static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
-			    u64 offset, u64 *stripe_nr, u64 *stripe_offset,
+			    u64 offset, u32 *stripe_nr, u64 *stripe_offset,
 			    u64 *full_stripe_start)
 {
-	u32 stripe_len = map->stripe_len;
-
 	ASSERT(op != BTRFS_MAP_DISCARD);
 
 	/*
 	 * Stripe_nr is the stripe where this block falls.  stripe_offset is
 	 * the offset of this block in its stripe.
 	 */
-	*stripe_nr = div64_u64_rem(offset, stripe_len, stripe_offset);
+	*stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
+	*stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
 	ASSERT(*stripe_offset < U32_MAX);
 
 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-		unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
+		unsigned long full_stripe_len = nr_data_stripes(map) <<
+						BTRFS_STRIPE_LEN_SHIFT;
 
+		/*
+		 * For full stripe start, we use previously calculated
+		 * @stripe_nr. Align it to nr_data_stripes, then multiply with
+		 * STRIPE_LEN.
+		 *
+		 * By this we can avoid u64 division completely.  And we have
+		 * to go rounddown(), not round_down(), as nr_data_stripes is
+		 * not ensured to be power of 2.
+		 */
 		*full_stripe_start =
-			div64_u64(offset, full_stripe_len) * full_stripe_len;
+			rounddown(*stripe_nr, nr_data_stripes(map)) <<
+			BTRFS_STRIPE_LEN_SHIFT;
 
 		/*
 		 * For writes to RAID56, allow to write a full stripe set, but
@@ -6334,16 +6211,16 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
 	 * a single disk).
 	 */
 	if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK)
-		return stripe_len - *stripe_offset;
+		return BTRFS_STRIPE_LEN - *stripe_offset;
 	return U64_MAX;
 }
 
 static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
-		          u32 stripe_index, u64 stripe_offset, u64 stripe_nr)
+			  u32 stripe_index, u64 stripe_offset, u32 stripe_nr)
 {
 	dst->dev = map->stripes[stripe_index].dev;
 	dst->physical = map->stripes[stripe_index].physical +
-			stripe_offset + stripe_nr * map->stripe_len;
+			stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT);
 }
 
 int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
@@ -6356,35 +6233,35 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	struct map_lookup *map;
 	u64 map_offset;
 	u64 stripe_offset;
-	u64 stripe_nr;
-	u64 stripe_len;
+	u32 stripe_nr;
 	u32 stripe_index;
 	int data_stripes;
 	int i;
 	int ret = 0;
 	int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
 	int num_stripes;
+	int num_copies;
 	int max_errors = 0;
-	int tgtdev_indexes = 0;
 	struct btrfs_io_context *bioc = NULL;
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 	int dev_replace_is_ongoing = 0;
-	int num_alloc_stripes;
-	int patch_the_first_stripe_for_dev_replace = 0;
-	u64 physical_to_patch_in_first_stripe = 0;
+	u16 num_alloc_stripes;
 	u64 raid56_full_stripe_start = (u64)-1;
 	u64 max_len;
 
 	ASSERT(bioc_ret);
 	ASSERT(op != BTRFS_MAP_DISCARD);
 
+	num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize);
+	if (mirror_num > num_copies)
+		return -EINVAL;
+
 	em = btrfs_get_chunk_map(fs_info, logical, *length);
 	if (IS_ERR(em))
 		return PTR_ERR(em);
 
 	map = em->map_lookup;
 	data_stripes = nr_data_stripes(map);
-	stripe_len = map->stripe_len;
 
 	map_offset = logical - em->start;
 	max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr,
@@ -6400,25 +6277,11 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	if (!dev_replace_is_ongoing)
 		up_read(&dev_replace->rwsem);
 
-	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
-	    !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
-		ret = get_extra_mirror_from_replace(fs_info, logical, *length,
-						    dev_replace->srcdev->devid,
-						    &mirror_num,
-					    &physical_to_patch_in_first_stripe);
-		if (ret)
-			goto out;
-		else
-			patch_the_first_stripe_for_dev_replace = 1;
-	} else if (mirror_num > map->num_stripes) {
-		mirror_num = 0;
-	}
-
 	num_stripes = 1;
 	stripe_index = 0;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
-		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
-				&stripe_index);
+		stripe_index = stripe_nr % map->num_stripes;
+		stripe_nr /= map->num_stripes;
 		if (!need_full_stripe(op))
 			mirror_num = 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
@@ -6444,8 +6307,8 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 		u32 factor = map->num_stripes / map->sub_stripes;
 
-		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
-		stripe_index *= map->sub_stripes;
+		stripe_index = (stripe_nr % factor) * map->sub_stripes;
+		stripe_nr /= factor;
 
 		if (need_full_stripe(op))
 			num_stripes = map->sub_stripes;
@@ -6460,11 +6323,17 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 		}
 
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-		ASSERT(map->stripe_len == BTRFS_STRIPE_LEN);
 		if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
-			/* push stripe_nr back to the start of the full stripe */
-			stripe_nr = div64_u64(raid56_full_stripe_start,
-					stripe_len * data_stripes);
+			/*
+			 * Push stripe_nr back to the start of the full stripe
+			 * For those cases needing a full stripe, @stripe_nr
+			 * is the full stripe number.
+			 *
+			 * Originally we go raid56_full_stripe_start / full_stripe_len,
+			 * but that can be expensive.  Here we just divide
+			 * @stripe_nr with @data_stripes.
+			 */
+			stripe_nr /= data_stripes;
 
 			/* RAID[56] write or recovery. Return all stripes */
 			num_stripes = map->num_stripes;
@@ -6473,7 +6342,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 			/* Return the length to the full stripe end */
 			*length = min(logical + *length,
 				      raid56_full_stripe_start + em->start +
-				      data_stripes * stripe_len) - logical;
+				      (data_stripes << BTRFS_STRIPE_LEN_SHIFT)) - logical;
 			stripe_index = 0;
 			stripe_offset = 0;
 		} else {
@@ -6482,25 +6351,24 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 			 * Mirror #2 is RAID5 parity block.
 			 * Mirror #3 is RAID6 Q block.
 			 */
-			stripe_nr = div_u64_rem(stripe_nr,
-					data_stripes, &stripe_index);
+			stripe_index = stripe_nr % data_stripes;
+			stripe_nr /= data_stripes;
 			if (mirror_num > 1)
 				stripe_index = data_stripes + mirror_num - 2;
 
 			/* We distribute the parity blocks across stripes */
-			div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
-					&stripe_index);
+			stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
 			if (!need_full_stripe(op) && mirror_num <= 1)
 				mirror_num = 1;
 		}
 	} else {
 		/*
-		 * after this, stripe_nr is the number of stripes on this
+		 * After this, stripe_nr is the number of stripes on this
 		 * device we have to walk to find the data, and stripe_index is
 		 * the number of our device in the stripe array
 		 */
-		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
-				&stripe_index);
+		stripe_index = stripe_nr % map->num_stripes;
+		stripe_nr /= map->num_stripes;
 		mirror_num = stripe_index + 1;
 	}
 	if (stripe_index >= map->num_stripes) {
@@ -6512,13 +6380,16 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	}
 
 	num_alloc_stripes = num_stripes;
-	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
-		if (op == BTRFS_MAP_WRITE)
-			num_alloc_stripes <<= 1;
-		if (op == BTRFS_MAP_GET_READ_MIRRORS)
-			num_alloc_stripes++;
-		tgtdev_indexes = num_stripes;
-	}
+	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
+	    op != BTRFS_MAP_READ)
+		/*
+		 * For replace case, we need to add extra stripes for extra
+		 * duplicated stripes.
+		 *
+		 * For both WRITE and GET_READ_MIRRORS, we may have at most
+		 * 2 more stripes (DUP types, otherwise 1).
+		 */
+		num_alloc_stripes += 2;
 
 	/*
 	 * If this I/O maps to a single device, try to return the device and
@@ -6529,53 +6400,53 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	    !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) &&
 	    (!need_full_stripe(op) || !dev_replace_is_ongoing ||
 	     !dev_replace->tgtdev)) {
-		if (patch_the_first_stripe_for_dev_replace) {
-			smap->dev = dev_replace->tgtdev;
-			smap->physical = physical_to_patch_in_first_stripe;
-			*mirror_num_ret = map->num_stripes + 1;
-		} else {
-			set_io_stripe(smap, map, stripe_index, stripe_offset,
-				      stripe_nr);
-			*mirror_num_ret = mirror_num;
-		}
+		set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr);
+		*mirror_num_ret = mirror_num;
 		*bioc_ret = NULL;
 		ret = 0;
 		goto out;
 	}
 
-	bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
+	bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes);
 	if (!bioc) {
 		ret = -ENOMEM;
 		goto out;
 	}
+	bioc->map_type = map->type;
 
-	for (i = 0; i < num_stripes; i++) {
-		set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset,
-			      stripe_nr);
-		stripe_index++;
-	}
-
-	/* Build raid_map */
+	/*
+	 * For RAID56 full map, we need to make sure the stripes[] follows the
+	 * rule that data stripes are all ordered, then followed with P and Q
+	 * (if we have).
+	 *
+	 * It's still mostly the same as other profiles, just with extra rotation.
+	 */
 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
 	    (need_full_stripe(op) || mirror_num > 1)) {
-		u64 tmp;
-		unsigned rot;
-
-		/* Work out the disk rotation on this stripe-set */
-		div_u64_rem(stripe_nr, num_stripes, &rot);
-
-		/* Fill in the logical address of each stripe */
-		tmp = stripe_nr * data_stripes;
-		for (i = 0; i < data_stripes; i++)
-			bioc->raid_map[(i + rot) % num_stripes] =
-				em->start + (tmp + i) * map->stripe_len;
-
-		bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
-		if (map->type & BTRFS_BLOCK_GROUP_RAID6)
-			bioc->raid_map[(i + rot + 1) % num_stripes] =
-				RAID6_Q_STRIPE;
-
-		sort_parity_stripes(bioc, num_stripes);
+		/*
+		 * For RAID56 @stripe_nr is already the number of full stripes
+		 * before us, which is also the rotation value (needs to modulo
+		 * with num_stripes).
+		 *
+		 * In this case, we just add @stripe_nr with @i, then do the
+		 * modulo, to reduce one modulo call.
+		 */
+		bioc->full_stripe_logical = em->start +
+			((stripe_nr * data_stripes) << BTRFS_STRIPE_LEN_SHIFT);
+		for (i = 0; i < num_stripes; i++)
+			set_io_stripe(&bioc->stripes[i], map,
+				      (i + stripe_nr) % num_stripes,
+				      stripe_offset, stripe_nr);
+	} else {
+		/*
+		 * For all other non-RAID56 profiles, just copy the target
+		 * stripe into the bioc.
+		 */
+		for (i = 0; i < num_stripes; i++) {
+			set_io_stripe(&bioc->stripes[i], map, stripe_index,
+				      stripe_offset, stripe_nr);
+			stripe_index++;
+		}
 	}
 
 	if (need_full_stripe(op))
@@ -6583,27 +6454,15 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 
 	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
 	    need_full_stripe(op)) {
-		handle_ops_on_dev_replace(op, &bioc, dev_replace, logical,
+		handle_ops_on_dev_replace(op, bioc, dev_replace, logical,
 					  &num_stripes, &max_errors);
 	}
 
 	*bioc_ret = bioc;
-	bioc->map_type = map->type;
 	bioc->num_stripes = num_stripes;
 	bioc->max_errors = max_errors;
 	bioc->mirror_num = mirror_num;
 
-	/*
-	 * this is the case that REQ_READ && dev_replace_is_ongoing &&
-	 * mirror_num == num_stripes + 1 && dev_replace target drive is
-	 * available as a mirror
-	 */
-	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
-		WARN_ON(num_stripes > 1);
-		bioc->stripes[0].dev = dev_replace->tgtdev;
-		bioc->stripes[0].physical = physical_to_patch_in_first_stripe;
-		bioc->mirror_num = map->num_stripes + 1;
-	}
 out:
 	if (dev_replace_is_ongoing) {
 		lockdep_assert_held(&dev_replace->rwsem);
@@ -6941,7 +6800,6 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
 	map->num_stripes = num_stripes;
 	map->io_width = btrfs_chunk_io_width(leaf, chunk);
 	map->io_align = btrfs_chunk_io_align(leaf, chunk);
-	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
 	map->type = type;
 	/*
 	 * We can't use the sub_stripes value, as for profiles other than
@@ -8161,3 +8019,76 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
 
 	return true;
 }
+
+static void map_raid56_repair_block(struct btrfs_io_context *bioc,
+				    struct btrfs_io_stripe *smap,
+				    u64 logical)
+{
+	int data_stripes = nr_bioc_data_stripes(bioc);
+	int i;
+
+	for (i = 0; i < data_stripes; i++) {
+		u64 stripe_start = bioc->full_stripe_logical +
+				   (i << BTRFS_STRIPE_LEN_SHIFT);
+
+		if (logical >= stripe_start &&
+		    logical < stripe_start + BTRFS_STRIPE_LEN)
+			break;
+	}
+	ASSERT(i < data_stripes);
+	smap->dev = bioc->stripes[i].dev;
+	smap->physical = bioc->stripes[i].physical +
+			((logical - bioc->full_stripe_logical) &
+			 BTRFS_STRIPE_LEN_MASK);
+}
+
+/*
+ * Map a repair write into a single device.
+ *
+ * A repair write is triggered by read time repair or scrub, which would only
+ * update the contents of a single device.
+ * Not update any other mirrors nor go through RMW path.
+ *
+ * Callers should ensure:
+ *
+ * - Call btrfs_bio_counter_inc_blocked() first
+ * - The range does not cross stripe boundary
+ * - Has a valid @mirror_num passed in.
+ */
+int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
+			   struct btrfs_io_stripe *smap, u64 logical,
+			   u32 length, int mirror_num)
+{
+	struct btrfs_io_context *bioc = NULL;
+	u64 map_length = length;
+	int mirror_ret = mirror_num;
+	int ret;
+
+	ASSERT(mirror_num > 0);
+
+	ret = __btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
+				&bioc, smap, &mirror_ret, true);
+	if (ret < 0)
+		return ret;
+
+	/* The map range should not cross stripe boundary. */
+	ASSERT(map_length >= length);
+
+	/* Already mapped to single stripe. */
+	if (!bioc)
+		goto out;
+
+	/* Map the RAID56 multi-stripe writes to a single one. */
+	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+		map_raid56_repair_block(bioc, smap, logical);
+		goto out;
+	}
+
+	ASSERT(mirror_num <= bioc->num_stripes);
+	smap->dev = bioc->stripes[mirror_num - 1].dev;
+	smap->physical = bioc->stripes[mirror_num - 1].physical;
+out:
+	btrfs_put_bioc(bioc);
+	ASSERT(smap->dev);
+	return 0;
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 7e51f2238f72..bf47a1a70813 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -17,7 +17,11 @@
 
 extern struct mutex uuid_mutex;
 
-#define BTRFS_STRIPE_LEN	SZ_64K
+#define BTRFS_STRIPE_LEN		SZ_64K
+#define BTRFS_STRIPE_LEN_SHIFT		(16)
+#define BTRFS_STRIPE_LEN_MASK		(BTRFS_STRIPE_LEN - 1)
+
+static_assert(const_ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT);
 
 /* Used by sanity check for btrfs_raid_types. */
 #define const_ffs(n) (__builtin_ctzll(n) + 1)
@@ -404,17 +408,74 @@ struct btrfs_io_context {
 	u64 map_type; /* get from map_lookup->type */
 	struct bio *orig_bio;
 	atomic_t error;
-	int max_errors;
-	int num_stripes;
-	int mirror_num;
-	int num_tgtdevs;
-	int *tgtdev_map;
+	u16 max_errors;
+
+	/*
+	 * The total number of stripes, including the extra duplicated
+	 * stripe for replace.
+	 */
+	u16 num_stripes;
+
+	/*
+	 * The mirror_num of this bioc.
+	 *
+	 * This is for reads which use 0 as mirror_num, thus we should return a
+	 * valid mirror_num (>0) for the reader.
+	 */
+	u16 mirror_num;
+
+	/*
+	 * The following two members are for dev-replace case only.
+	 *
+	 * @replace_nr_stripes:	Number of duplicated stripes which need to be
+	 *			written to replace target.
+	 *			Should be <= 2 (2 for DUP, otherwise <= 1).
+	 * @replace_stripe_src:	The array indicates where the duplicated stripes
+	 *			are from.
+	 *
+	 * The @replace_stripe_src[] array is mostly for RAID56 cases.
+	 * As non-RAID56 stripes share the same contents of the mapped range,
+	 * thus no need to bother where the duplicated ones are from.
+	 *
+	 * But for RAID56 case, all stripes contain different contents, thus
+	 * we need a way to know the mapping.
+	 *
+	 * There is an example for the two members, using a RAID5 write:
+	 *
+	 *   num_stripes:	4 (3 + 1 duplicated write)
+	 *   stripes[0]:	dev = devid 1, physical = X
+	 *   stripes[1]:	dev = devid 2, physical = Y
+	 *   stripes[2]:	dev = devid 3, physical = Z
+	 *   stripes[3]:	dev = devid 0, physical = Y
+	 *
+	 * replace_nr_stripes = 1
+	 * replace_stripe_src = 1	<- Means stripes[1] is involved in replace.
+	 *				   The duplicated stripe index would be
+	 *				   (@num_stripes - 1).
+	 *
+	 * Note, that we can still have cases replace_nr_stripes = 2 for DUP.
+	 * In that case, all stripes share the same content, thus we don't
+	 * need to bother @replace_stripe_src value at all.
+	 */
+	u16 replace_nr_stripes;
+	s16 replace_stripe_src;
 	/*
-	 * logical block numbers for the start of each stripe
-	 * The last one or two are p/q.  These are sorted,
-	 * so raid_map[0] is the start of our full stripe
+	 * Logical bytenr of the full stripe start, only for RAID56 cases.
+	 *
+	 * When this value is set to other than (u64)-1, the stripes[] should
+	 * follow this pattern:
+	 *
+	 * (real_stripes = num_stripes - replace_nr_stripes)
+	 * (data_stripes = (is_raid6) ? (real_stripes - 2) : (real_stripes - 1))
+	 *
+	 * stripes[0]:			The first data stripe
+	 * stripes[1]:			The second data stripe
+	 * ...
+	 * stripes[data_stripes - 1]:	The last data stripe
+	 * stripes[data_stripes]:	The P stripe
+	 * stripes[data_stripes + 1]:	The Q stripe (only for RAID6).
 	 */
-	u64 *raid_map;
+	u64 full_stripe_logical;
 	struct btrfs_io_stripe stripes[];
 };
 
@@ -446,7 +507,6 @@ struct map_lookup {
 	u64 type;
 	int io_align;
 	int io_width;
-	u32 stripe_len;
 	int num_stripes;
 	int sub_stripes;
 	int verified_stripes; /* For mount time dev extent verification */
@@ -527,6 +587,9 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 		      struct btrfs_io_context **bioc_ret,
 		      struct btrfs_io_stripe *smap, int *mirror_num_ret,
 		      int need_raid_map);
+int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
+			   struct btrfs_io_stripe *smap, u64 logical,
+			   u32 length, int mirror_num);
 struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
 					       u64 logical, u64 *length_ret,
 					       u32 *num_stripes);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index da7bb9187b68..8acb05e176c5 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -350,8 +350,6 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 	zlib_inflateEnd(&workspace->strm);
 	if (data_in)
 		kunmap_local(data_in);
-	if (!ret)
-		zero_fill_bio(cb->orig_bio);
 	return ret;
 }
 
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 8db47f93007f..39828af4a4e8 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1639,14 +1639,14 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio)
 {
 	u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
 	struct btrfs_inode *inode = bbio->inode;
-	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct btrfs_fs_info *fs_info = bbio->fs_info;
 	struct btrfs_block_group *cache;
 	bool ret = false;
 
 	if (!btrfs_is_zoned(fs_info))
 		return false;
 
-	if (!is_data_inode(&inode->vfs_inode))
+	if (!inode || !is_data_inode(&inode->vfs_inode))
 		return false;
 
 	if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index e34f1ab99d56..f798da267590 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -609,7 +609,6 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 		}
 	}
 	ret = 0;
-	zero_fill_bio(cb->orig_bio);
 done:
 	if (workspace->in_buf.src)
 		kunmap_local(workspace->in_buf.src);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 203f16c48c19..abfd2acb3053 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -1491,6 +1491,7 @@ STATIC int
 xfs_alloc_ag_vextent_near(
 	struct xfs_alloc_arg	*args)
 {
+	int			flags = args->flags | XFS_ALLOC_FLAG_TRYFLUSH;
 	struct xfs_alloc_cur	acur = {};
 	int			error;		/* error code */
 	int			i;		/* result code, temporary */
@@ -1564,8 +1565,11 @@ xfs_alloc_ag_vextent_near(
 	if (!acur.len) {
 		if (acur.busy) {
 			trace_xfs_alloc_near_busy(args);
-			xfs_extent_busy_flush(args->mp, args->pag,
-					      acur.busy_gen);
+			error = xfs_extent_busy_flush(args->tp, args->pag,
+					      acur.busy_gen, flags);
+			if (error)
+				goto out;
+			flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
 			goto restart;
 		}
 		trace_xfs_alloc_size_neither(args);
@@ -1592,6 +1596,7 @@ STATIC int				/* error */
 xfs_alloc_ag_vextent_size(
 	xfs_alloc_arg_t	*args)		/* allocation argument structure */
 {
+	int		flags = args->flags | XFS_ALLOC_FLAG_TRYFLUSH;
 	struct xfs_agf	*agf = args->agbp->b_addr;
 	struct xfs_btree_cur *bno_cur;	/* cursor for bno btree */
 	struct xfs_btree_cur *cnt_cur;	/* cursor for cnt btree */
@@ -1670,8 +1675,13 @@ xfs_alloc_ag_vextent_size(
 				xfs_btree_del_cursor(cnt_cur,
 						     XFS_BTREE_NOERROR);
 				trace_xfs_alloc_size_busy(args);
-				xfs_extent_busy_flush(args->mp,
-							args->pag, busy_gen);
+				error = xfs_extent_busy_flush(args->tp, args->pag,
+						busy_gen, flags);
+				if (error) {
+					cnt_cur = NULL;
+					goto error0;
+				}
+				flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
 				goto restart;
 			}
 		}
@@ -1755,7 +1765,13 @@ xfs_alloc_ag_vextent_size(
 		if (busy) {
 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 			trace_xfs_alloc_size_busy(args);
-			xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
+			error = xfs_extent_busy_flush(args->tp, args->pag,
+					busy_gen, flags);
+			if (error) {
+				cnt_cur = NULL;
+				goto error0;
+			}
+			flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
 			goto restart;
 		}
 		goto out_nominleft;
@@ -2629,6 +2645,7 @@ xfs_alloc_fix_freelist(
 	targs.agno = args->agno;
 	targs.alignment = targs.minlen = targs.prod = 1;
 	targs.pag = pag;
+	targs.flags = args->flags & XFS_ALLOC_FLAG_FREEING;
 	error = xfs_alloc_read_agfl(pag, tp, &agflbp);
 	if (error)
 		goto out_agbp_relse;
@@ -3572,6 +3589,7 @@ xfs_free_extent_fix_freelist(
 	args.mp = tp->t_mountp;
 	args.agno = pag->pag_agno;
 	args.pag = pag;
+	args.flags = XFS_ALLOC_FLAG_FREEING;
 
 	/*
 	 * validate that the block number is legal - the enables us to detect
@@ -3580,7 +3598,7 @@ xfs_free_extent_fix_freelist(
 	if (args.agno >= args.mp->m_sb.sb_agcount)
 		return -EFSCORRUPTED;
 
-	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
+	error = xfs_alloc_fix_freelist(&args, args.flags);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 2b246d74c189..5038fba87784 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -24,6 +24,7 @@ unsigned int xfs_agfl_size(struct xfs_mount *mp);
 #define	XFS_ALLOC_FLAG_NORMAP	0x00000004  /* don't modify the rmapbt */
 #define	XFS_ALLOC_FLAG_NOSHRINK	0x00000008  /* don't shrink the freelist */
 #define	XFS_ALLOC_FLAG_CHECK	0x00000010  /* test only, don't modify args */
+#define	XFS_ALLOC_FLAG_TRYFLUSH	0x00000020  /* don't block in busyextent flush*/
 
 /*
  * Argument structure for xfs_alloc routines.
@@ -57,6 +58,7 @@ typedef struct xfs_alloc_arg {
 #ifdef DEBUG
 	bool		alloc_minlen_only; /* allocate exact minlen extent */
 #endif
+	int		flags;		/* XFS_ALLOC_FLAG_* */
 } xfs_alloc_arg_t;
 
 /*
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 1b71174ec0d6..2ba28e4257fe 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -496,9 +496,9 @@ xrep_fix_freelist(
 	args.agno = sc->sa.pag->pag_agno;
 	args.alignment = 1;
 	args.pag = sc->sa.pag;
+	args.flags = can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK;
 
-	return xfs_alloc_fix_freelist(&args,
-			can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
+	return xfs_alloc_fix_freelist(&args, args.flags);
 }
 
 /*
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index f3d328e4a440..ea1c1857bf5b 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -567,18 +567,41 @@ xfs_extent_busy_clear(
 /*
  * Flush out all busy extents for this AG.
  */
-void
+int
 xfs_extent_busy_flush(
-	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
 	struct xfs_perag	*pag,
-	unsigned		busy_gen)
+	unsigned		busy_gen,
+	int			flags)
 {
 	DEFINE_WAIT		(wait);
 	int			error;
 
-	error = xfs_log_force(mp, XFS_LOG_SYNC);
+	error = xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
 	if (error)
-		return;
+		return error;
+
+	/*
+	 * If we are holding busy extents, the caller may not want to block
+	 * straight away. If we are being told just to try a flush or progress
+	 * has been made since we last skipped a busy extent, return
+	 * immediately to allow the caller to try again. If we are freeing
+	 * extents, we might actually be holding the only free extents in the
+	 * transaction busy list and the log force won't resolve that
+	 * situation. In this case, return -EAGAIN in that case to tell the
+	 * caller it needs to commit the busy extents it holds before retrying
+	 * the extent free operation.
+	 */
+	if (!list_empty(&tp->t_busy)) {
+		if (flags & XFS_ALLOC_FLAG_TRYFLUSH)
+			return 0;
+
+		if (busy_gen != READ_ONCE(pag->pagb_gen))
+			return 0;
+
+		if (flags & XFS_ALLOC_FLAG_FREEING)
+			return -EAGAIN;
+	}
 
 	do {
 		prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
@@ -588,6 +611,7 @@ xfs_extent_busy_flush(
 	} while (1);
 
 	finish_wait(&pag->pagb_wait, &wait);
+	return 0;
 }
 
 void
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index 4a118131059f..edeedb92e0df 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -51,9 +51,9 @@ bool
 xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t *bno,
 		xfs_extlen_t *len, unsigned *busy_gen);
 
-void
-xfs_extent_busy_flush(struct xfs_mount *mp, struct xfs_perag *pag,
-	unsigned busy_gen);
+int
+xfs_extent_busy_flush(struct xfs_trans *tp, struct xfs_perag *pag,
+	unsigned busy_gen, int flags);
 
 void
 xfs_extent_busy_wait_all(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 011b50469301..3c5a9e9952ec 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -336,6 +336,25 @@ xfs_trans_get_efd(
 	return efdp;
 }
 
+/*
+ * Fill the EFD with all extents from the EFI and set the counter.
+ * Note: the EFD should comtain at least one extents already.
+ */
+static void xfs_fill_efd_with_efi(struct xfs_efd_log_item *efdp)
+{
+	struct xfs_efi_log_item *efip = efdp->efd_efip;
+	uint                    i;
+
+	if (efdp->efd_next_extent == efip->efi_format.efi_nextents)
+		return;
+
+	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
+	       efdp->efd_format.efd_extents[i] =
+		       efip->efi_format.efi_extents[i];
+	}
+	efdp->efd_next_extent = efip->efi_format.efi_nextents;
+}
+
 /*
  * Free an extent and log it to the EFD. Note that the transaction is marked
  * dirty regardless of whether the extent free succeeds or fails to support the
@@ -369,6 +388,10 @@ xfs_trans_free_extent(
 	error = __xfs_free_extent(tp, xefi->xefi_startblock,
 			xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE,
 			xefi->xefi_flags & XFS_EFI_SKIP_DISCARD);
+	if (error == -EAGAIN) {
+		xfs_fill_efd_with_efi(efdp);
+		return error;
+	}
 	/*
 	 * Mark the transaction dirty, even on error. This ensures the
 	 * transaction is aborted, which:
@@ -476,7 +499,8 @@ xfs_extent_free_finish_item(
 	xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
 
 	error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
-	kmem_cache_free(xfs_extfree_item_cache, xefi);
+	if (error != -EAGAIN)
+		kmem_cache_free(xfs_extfree_item_cache, xefi);
 	return error;
 }
 
@@ -633,6 +657,17 @@ xfs_efi_item_recover(
 		fake.xefi_blockcount = extp->ext_len;
 
 		error = xfs_trans_free_extent(tp, efdp, &fake);
+		if (error == -EAGAIN) {
+			xfs_free_extent_later(tp, fake.xefi_startblock,
+				fake.xefi_blockcount, &XFS_RMAP_OINFO_ANY_OWNER);
+			/*
+			 * try to free as many extents as possible with current
+			 * transaction
+			 */
+			error = 0;
+			continue;
+		};
+
 		if (error == -EFSCORRUPTED)
 			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
 					extp, sizeof(*extp));
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 22c13933c8f8..2fc98d313708 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -78,7 +78,6 @@ xfs_filestream_pick_ag(
 		*longest = 0;
 		err = xfs_bmap_longest_free_extent(pag, NULL, longest);
 		if (err) {
-			xfs_perag_rele(pag);
 			if (err != -EAGAIN)
 				break;
 			/* Couldn't lock the AGF, skip this AG. */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index c9a7e270a428..54d97687f6b3 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -435,18 +435,23 @@ xfs_iget_check_free_state(
 }
 
 /* Make all pending inactivation work start immediately. */
-static void
+static bool
 xfs_inodegc_queue_all(
 	struct xfs_mount	*mp)
 {
 	struct xfs_inodegc	*gc;
 	int			cpu;
+	bool			ret = false;
 
 	for_each_online_cpu(cpu) {
 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
-		if (!llist_empty(&gc->list))
+		if (!llist_empty(&gc->list)) {
 			mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
+			ret = true;
+		}
 	}
+
+	return ret;
 }
 
 /*
@@ -1908,24 +1913,41 @@ xfs_inodegc_flush(
 
 /*
  * Flush all the pending work and then disable the inode inactivation background
- * workers and wait for them to stop.
+ * workers and wait for them to stop.  Caller must hold sb->s_umount to
+ * coordinate changes in the inodegc_enabled state.
  */
 void
 xfs_inodegc_stop(
 	struct xfs_mount	*mp)
 {
+	bool			rerun;
+
 	if (!xfs_clear_inodegc_enabled(mp))
 		return;
 
+	/*
+	 * Drain all pending inodegc work, including inodes that could be
+	 * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan
+	 * threads that sample the inodegc state just prior to us clearing it.
+	 * The inodegc flag state prevents new threads from queuing more
+	 * inodes, so we queue pending work items and flush the workqueue until
+	 * all inodegc lists are empty.  IOWs, we cannot use drain_workqueue
+	 * here because it does not allow other unserialized mechanisms to
+	 * reschedule inodegc work while this draining is in progress.
+	 */
 	xfs_inodegc_queue_all(mp);
-	drain_workqueue(mp->m_inodegc_wq);
+	do {
+		flush_workqueue(mp->m_inodegc_wq);
+		rerun = xfs_inodegc_queue_all(mp);
+	} while (rerun);
 
 	trace_xfs_inodegc_stop(mp, __return_address);
 }
 
 /*
  * Enable the inode inactivation background workers and schedule deferred inode
- * inactivation work if there is any.
+ * inactivation work if there is any.  Caller must hold sb->s_umount to
+ * coordinate changes in the inodegc_enabled state.
  */
 void
 xfs_inodegc_start(
@@ -2068,7 +2090,8 @@ xfs_inodegc_queue(
 		queue_delay = 0;
 
 	trace_xfs_inodegc_queue(mp, __return_address);
-	mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay);
+	mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
+			queue_delay);
 	put_cpu_ptr(gc);
 
 	if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
@@ -2112,7 +2135,8 @@ xfs_inodegc_cpu_dead(
 
 	if (xfs_is_inodegc_enabled(mp)) {
 		trace_xfs_inodegc_queue(mp, __return_address);
-		mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0);
+		mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
+				0);
 	}
 	put_cpu_ptr(gc);
 }
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 322eb2ee6c55..00bfe9683fa8 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2540,30 +2540,27 @@ xlog_recover_process_intents(
 	struct xfs_log_item	*lip;
 	struct xfs_ail		*ailp;
 	int			error = 0;
-#if defined(DEBUG) || defined(XFS_WARN)
-	xfs_lsn_t		last_lsn;
-#endif
+	xfs_lsn_t		threshold_lsn;
 
 	ailp = log->l_ailp;
+	threshold_lsn = xfs_ail_max_lsn(ailp);
 	spin_lock(&ailp->ail_lock);
-#if defined(DEBUG) || defined(XFS_WARN)
-	last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
-#endif
+
 	for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
 	     lip != NULL;
 	     lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
 		const struct xfs_item_ops	*ops;
+		/*
+		 * Orignal redo EFI could be splitted into new EFIs. Those
+		 * new EFIs are supposed to be processed in capture_list.
+		 * Stop here when original redo intents are done.
+		 */
+		if (XFS_LSN_CMP(threshold_lsn, lip->li_lsn) < 0)
+			break;
 
 		if (!xlog_item_is_intent(lip))
 			break;
 
-		/*
-		 * We should never see a redo item with a LSN higher than
-		 * the last transaction we found in the log at the start
-		 * of recovery.
-		 */
-		ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);
-
 		/*
 		 * NOTE: If your intent processing routine can create more
 		 * deferred ops, you /must/ attach them to the capture list in
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 7d4109af193e..2825f55eca88 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -137,7 +137,7 @@ xfs_ail_min_lsn(
 /*
  * Return the maximum lsn held in the AIL, or zero if the AIL is empty.
  */
-static xfs_lsn_t
+xfs_lsn_t
 xfs_ail_max_lsn(
 	struct xfs_ail		*ailp)
 {
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index d5400150358e..86b4f29b2a6e 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -106,6 +106,7 @@ void			xfs_ail_push_all(struct xfs_ail *);
 void			xfs_ail_push_all_sync(struct xfs_ail *);
 struct xfs_log_item	*xfs_ail_min(struct xfs_ail  *ailp);
 xfs_lsn_t		xfs_ail_min_lsn(struct xfs_ail *ailp);
+xfs_lsn_t		xfs_ail_max_lsn(struct xfs_ail *ailp);
 
 struct xfs_log_item *	xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
 					struct xfs_ail_cursor *cur,
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 7b9a0ed1b11f..1ab1f6cf7ca5 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -212,9 +212,7 @@ __xfs_xattr_put_listent(
 	offset = context->buffer + context->count;
 	memcpy(offset, prefix, prefix_len);
 	offset += prefix_len;
-	strncpy(offset, (char *)name, namelen);			/* real name */
-	offset += namelen;
-	*offset = '\0';
+	strscpy(offset, (char *)name, namelen + 1);			/* real name */
 
 compute_size:
 	context->count += prefix_len + namelen + 1;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d766be7152e1..b3e7529ff55e 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -500,6 +500,7 @@ void bio_associate_blkg(struct bio *bio);
 void bio_associate_blkg_from_css(struct bio *bio,
 				 struct cgroup_subsys_state *css);
 void bio_clone_blkg_association(struct bio *dst, struct bio *src);
+void blkcg_punt_bio_submit(struct bio *bio);
 #else	/* CONFIG_BLK_CGROUP */
 static inline void bio_associate_blkg(struct bio *bio) { }
 static inline void bio_associate_blkg_from_css(struct bio *bio,
@@ -507,6 +508,10 @@ static inline void bio_associate_blkg_from_css(struct bio *bio,
 { }
 static inline void bio_clone_blkg_association(struct bio *dst,
 					      struct bio *src) { }
+static inline void blkcg_punt_bio_submit(struct bio *bio)
+{
+	submit_bio(bio);
+}
 #endif	/* CONFIG_BLK_CGROUP */
 
 static inline void bio_set_dev(struct bio *bio, struct block_device *bdev)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 99be590f952f..fb8843990d28 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -404,18 +404,11 @@ enum req_flag_bits {
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
 	__REQ_BACKGROUND,	/* background IO */
 	__REQ_NOWAIT,           /* Don't wait if request will block */
-	/*
-	 * When a shared kthread needs to issue a bio for a cgroup, doing
-	 * so synchronously can lead to priority inversions as the kthread
-	 * can be trapped waiting for that cgroup.  CGROUP_PUNT flag makes
-	 * submit_bio() punt the actual issuing to a dedicated per-blkcg
-	 * work item to avoid such priority inversions.
-	 */
-	__REQ_CGROUP_PUNT,
 	__REQ_POLLED,		/* caller polls for completion using bio_poll */
 	__REQ_ALLOC_CACHE,	/* allocate IO from cache if available */
 	__REQ_SWAP,		/* swap I/O */
 	__REQ_DRV,		/* for driver use */
+	__REQ_FS_PRIVATE,	/* for file system (submitter) use */
 
 	/*
 	 * Command specific flags, keep last:
@@ -443,14 +436,13 @@ enum req_flag_bits {
 #define REQ_RAHEAD	(__force blk_opf_t)(1ULL << __REQ_RAHEAD)
 #define REQ_BACKGROUND	(__force blk_opf_t)(1ULL << __REQ_BACKGROUND)
 #define REQ_NOWAIT	(__force blk_opf_t)(1ULL << __REQ_NOWAIT)
-#define REQ_CGROUP_PUNT	(__force blk_opf_t)(1ULL << __REQ_CGROUP_PUNT)
-
-#define REQ_NOUNMAP	(__force blk_opf_t)(1ULL << __REQ_NOUNMAP)
 #define REQ_POLLED	(__force blk_opf_t)(1ULL << __REQ_POLLED)
 #define REQ_ALLOC_CACHE	(__force blk_opf_t)(1ULL << __REQ_ALLOC_CACHE)
-
-#define REQ_DRV		(__force blk_opf_t)(1ULL << __REQ_DRV)
 #define REQ_SWAP	(__force blk_opf_t)(1ULL << __REQ_SWAP)
+#define REQ_DRV		(__force blk_opf_t)(1ULL << __REQ_DRV)
+#define REQ_FS_PRIVATE	(__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE)
+
+#define REQ_NOUNMAP	(__force blk_opf_t)(1ULL << __REQ_NOUNMAP)
 
 #define REQ_FAILFAST_MASK \
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
diff --git a/include/linux/crc32c.h b/include/linux/crc32c.h
index bd21af828ff6..357ae4611a45 100644
--- a/include/linux/crc32c.h
+++ b/include/linux/crc32c.h
@@ -5,7 +5,6 @@
 #include <linux/types.h>
 
 extern u32 crc32c(u32 crc, const void *address, unsigned int length);
-extern const char *crc32c_impl(void);
 
 /* This macro exists for backwards-compatibility. */
 #define crc32c_le crc32c
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 46020373e155..fba937999fbf 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -70,8 +70,6 @@ struct writeback_control {
 	 */
 	unsigned no_cgroup_owner:1;
 
-	unsigned punt_to_cgroup:1;	/* cgrp punting, see __REQ_CGROUP_PUNT */
-
 	/* To enable batching of swap writes to non-block-device backends,
 	 * "plug" can be set point to a 'struct swap_iocb *'.  When all swap
 	 * writes have been submitted, if with swap_iocb is not NULL,
@@ -97,9 +95,6 @@ static inline blk_opf_t wbc_to_write_flags(struct writeback_control *wbc)
 {
 	blk_opf_t flags = 0;
 
-	if (wbc->punt_to_cgroup)
-		flags = REQ_CGROUP_PUNT;
-
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		flags |= REQ_SYNC;
 	else if (wbc->for_kupdate || wbc->for_background)
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 75d7d22c3a27..8ea9cea9bfeb 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -2422,7 +2422,7 @@ DECLARE_EVENT_CLASS(btrfs_raid56_bio,
 	),
 
 	TP_fast_assign_btrfs(rbio->bioc->fs_info,
-		__entry->full_stripe	= rbio->bioc->raid_map[0];
+		__entry->full_stripe	= rbio->bioc->full_stripe_logical;
 		__entry->physical	= bio->bi_iter.bi_sector << SECTOR_SHIFT;
 		__entry->len		= bio->bi_iter.bi_size;
 		__entry->opf		= bio_op(bio);
diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c
index 5ca0d815a95d..649e687413a0 100644
--- a/lib/libcrc32c.c
+++ b/lib/libcrc32c.c
@@ -65,12 +65,6 @@ static void __exit libcrc32c_mod_fini(void)
 	crypto_free_shash(tfm);
 }
 
-const char *crc32c_impl(void)
-{
-	return crypto_shash_driver_name(tfm);
-}
-EXPORT_SYMBOL(crc32c_impl);
-
 module_init(libcrc32c_mod_init);
 module_exit(libcrc32c_mod_fini);
 
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index f62b85832ae0..ca55caf3cd63 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -202,6 +202,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
 		"__reiserfs_panic",
 		"__stack_chk_fail",
 		"__ubsan_handle_builtin_unreachable",
+		"btrfs_assertfail",
 		"cpu_bringup_and_idle",
 		"cpu_startup_entry",
 		"do_exit",
-- 
2.41.0

From fc201fdbe85c6e376c997f8b285d1ab894e05943 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Wed, 14 Jun 2023 19:31:49 +0200
Subject: [PATCH 06/12] Implement amd-pstate guided driver

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 .../admin-guide/kernel-parameters.txt         |  40 ++--
 Documentation/admin-guide/pm/amd-pstate.rst   |  31 ++-
 drivers/acpi/cppc_acpi.c                      | 121 ++++++++++-
 drivers/cpufreq/amd-pstate.c                  | 200 ++++++++++++------
 drivers/cpufreq/cpufreq.c                     |   3 +-
 include/acpi/cppc_acpi.h                      |  11 +
 include/linux/amd-pstate.h                    |   2 +
 include/linux/cpufreq.h                       |   5 +-
 8 files changed, 319 insertions(+), 94 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 97303fa40350..dddaba21a9a7 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -339,6 +339,29 @@
 			             This mode requires kvm-amd.avic=1.
 			             (Default when IOMMU HW support is present.)
 
+	amd_pstate=	[X86]
+			disable
+			  Do not enable amd_pstate as the default
+			  scaling driver for the supported processors
+			passive
+			  Use amd_pstate with passive mode as a scaling driver.
+			  In this mode autonomous selection is disabled.
+			  Driver requests a desired performance level and platform
+			  tries to match the same performance level if it is
+			  satisfied by guaranteed performance level.
+			active
+			  Use amd_pstate_epp driver instance as the scaling driver,
+			  driver provides a hint to the hardware if software wants
+			  to bias toward performance (0x0) or energy efficiency (0xff)
+			  to the CPPC firmware. then CPPC power algorithm will
+			  calculate the runtime workload and adjust the realtime cores
+			  frequency.
+			guided
+			  Activate guided autonomous mode. Driver requests minimum and
+			  maximum performance level and the platform autonomously
+			  selects a performance level in this range and appropriate
+			  to the current workload.
+
 	amijoy.map=	[HW,JOY] Amiga joystick support
 			Map of devices attached to JOY0DAT and JOY1DAT
 			Format: <a>,<b>
@@ -7077,20 +7100,3 @@
 				xmon commands.
 			off	xmon is disabled.
 
-	amd_pstate=	[X86]
-			disable
-			  Do not enable amd_pstate as the default
-			  scaling driver for the supported processors
-			passive
-			  Use amd_pstate as a scaling driver, driver requests a
-			  desired performance on this abstract scale and the power
-			  management firmware translates the requests into actual
-			  hardware states (core frequency, data fabric and memory
-			  clocks etc.)
-			active
-			  Use amd_pstate_epp driver instance as the scaling driver,
-			  driver provides a hint to the hardware if software wants
-			  to bias toward performance (0x0) or energy efficiency (0xff)
-			  to the CPPC firmware. then CPPC power algorithm will
-			  calculate the runtime workload and adjust the realtime cores
-			  frequency.
diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst
index 6e5298b521b1..1cf40f69278c 100644
--- a/Documentation/admin-guide/pm/amd-pstate.rst
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
@@ -303,13 +303,18 @@ efficiency frequency management method on AMD processors.
 AMD Pstate Driver Operation Modes
 =================================
 
-``amd_pstate`` CPPC has two operation modes: CPPC Autonomous(active) mode and
-CPPC non-autonomous(passive) mode.
-active mode and passive mode can be chosen by different kernel parameters.
-When in Autonomous mode, CPPC ignores requests done in the Desired Performance
-Target register and takes into account only the values set to the Minimum requested
-performance, Maximum requested performance, and Energy Performance Preference
-registers. When Autonomous is disabled, it only considers the Desired Performance Target.
+``amd_pstate`` CPPC has 3 operation modes: autonomous (active) mode,
+non-autonomous (passive) mode and guided autonomous (guided) mode.
+Active/passive/guided mode can be chosen by different kernel parameters.
+
+- In autonomous mode, platform ignores the desired performance level request
+  and takes into account only the values set to the minimum, maximum and energy
+  performance preference registers.
+- In non-autonomous mode, platform gets desired performance level
+  from OS directly through Desired Performance Register.
+- In guided-autonomous mode, platform sets operating performance level
+  autonomously according to the current workload and within the limits set by
+  OS through min and max performance registers.
 
 Active Mode
 ------------
@@ -338,6 +343,15 @@ to the Performance Reduction Tolerance register. Above the nominal performance l
 processor must provide at least nominal performance requested and go higher if current
 operating conditions allow.
 
+Guided Mode
+-----------
+
+``amd_pstate=guided``
+
+If ``amd_pstate=guided`` is passed to kernel command line option then this mode
+is activated.  In this mode, driver requests minimum and maximum performance
+level and the platform autonomously selects a performance level in this range
+and appropriate to the current workload.
 
 User Space Interface in ``sysfs`` - General
 ===========================================
@@ -358,6 +372,9 @@ control its functionality at the system level.  They are located in the
 	"passive"
 		The driver is functional and in the ``passive mode``
 
+	"guided"
+		The driver is functional and in the ``guided mode``
+
 	"disable"
 		The driver is unregistered and not functional now.
 
diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c
index c51d3ccb4cca..02a4bfb54967 100644
--- a/drivers/acpi/cppc_acpi.c
+++ b/drivers/acpi/cppc_acpi.c
@@ -1433,6 +1433,103 @@ int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable)
 }
 EXPORT_SYMBOL_GPL(cppc_set_epp_perf);
 
+/*
+ * cppc_get_auto_sel_caps - Read autonomous selection register.
+ * @cpunum : CPU from which to read register.
+ * @perf_caps : struct where autonomous selection register value is updated.
+ */
+int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps)
+{
+	struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpunum);
+	struct cpc_register_resource *auto_sel_reg;
+	u64  auto_sel;
+
+	if (!cpc_desc) {
+		pr_debug("No CPC descriptor for CPU:%d\n", cpunum);
+		return -ENODEV;
+	}
+
+	auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE];
+
+	if (!CPC_SUPPORTED(auto_sel_reg))
+		pr_warn_once("Autonomous mode is not unsupported!\n");
+
+	if (CPC_IN_PCC(auto_sel_reg)) {
+		int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpunum);
+		struct cppc_pcc_data *pcc_ss_data = NULL;
+		int ret = 0;
+
+		if (pcc_ss_id < 0)
+			return -ENODEV;
+
+		pcc_ss_data = pcc_data[pcc_ss_id];
+
+		down_write(&pcc_ss_data->pcc_lock);
+
+		if (send_pcc_cmd(pcc_ss_id, CMD_READ) >= 0) {
+			cpc_read(cpunum, auto_sel_reg, &auto_sel);
+			perf_caps->auto_sel = (bool)auto_sel;
+		} else {
+			ret = -EIO;
+		}
+
+		up_write(&pcc_ss_data->pcc_lock);
+
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cppc_get_auto_sel_caps);
+
+/*
+ * cppc_set_auto_sel - Write autonomous selection register.
+ * @cpu    : CPU to which to write register.
+ * @enable : the desired value of autonomous selection resiter to be updated.
+ */
+int cppc_set_auto_sel(int cpu, bool enable)
+{
+	int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu);
+	struct cpc_register_resource *auto_sel_reg;
+	struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu);
+	struct cppc_pcc_data *pcc_ss_data = NULL;
+	int ret = -EINVAL;
+
+	if (!cpc_desc) {
+		pr_debug("No CPC descriptor for CPU:%d\n", cpu);
+		return -ENODEV;
+	}
+
+	auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE];
+
+	if (CPC_IN_PCC(auto_sel_reg)) {
+		if (pcc_ss_id < 0) {
+			pr_debug("Invalid pcc_ss_id\n");
+			return -ENODEV;
+		}
+
+		if (CPC_SUPPORTED(auto_sel_reg)) {
+			ret = cpc_write(cpu, auto_sel_reg, enable);
+			if (ret)
+				return ret;
+		}
+
+		pcc_ss_data = pcc_data[pcc_ss_id];
+
+		down_write(&pcc_ss_data->pcc_lock);
+		/* after writing CPC, transfer the ownership of PCC to platform */
+		ret = send_pcc_cmd(pcc_ss_id, CMD_WRITE);
+		up_write(&pcc_ss_data->pcc_lock);
+	} else {
+		ret = -ENOTSUPP;
+		pr_debug("_CPC in PCC is not supported\n");
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cppc_set_auto_sel);
+
+
 /**
  * cppc_set_enable - Set to enable CPPC on the processor by writing the
  * Continuous Performance Control package EnableRegister field.
@@ -1488,7 +1585,7 @@ EXPORT_SYMBOL_GPL(cppc_set_enable);
 int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls)
 {
 	struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu);
-	struct cpc_register_resource *desired_reg;
+	struct cpc_register_resource *desired_reg, *min_perf_reg, *max_perf_reg;
 	int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu);
 	struct cppc_pcc_data *pcc_ss_data = NULL;
 	int ret = 0;
@@ -1499,6 +1596,8 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls)
 	}
 
 	desired_reg = &cpc_desc->cpc_regs[DESIRED_PERF];
+	min_perf_reg = &cpc_desc->cpc_regs[MIN_PERF];
+	max_perf_reg = &cpc_desc->cpc_regs[MAX_PERF];
 
 	/*
 	 * This is Phase-I where we want to write to CPC registers
@@ -1507,7 +1606,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls)
 	 * Since read_lock can be acquired by multiple CPUs simultaneously we
 	 * achieve that goal here
 	 */
-	if (CPC_IN_PCC(desired_reg)) {
+	if (CPC_IN_PCC(desired_reg) || CPC_IN_PCC(min_perf_reg) || CPC_IN_PCC(max_perf_reg)) {
 		if (pcc_ss_id < 0) {
 			pr_debug("Invalid pcc_ss_id\n");
 			return -ENODEV;
@@ -1530,13 +1629,19 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls)
 		cpc_desc->write_cmd_status = 0;
 	}
 
-	/*
-	 * Skip writing MIN/MAX until Linux knows how to come up with
-	 * useful values.
-	 */
 	cpc_write(cpu, desired_reg, perf_ctrls->desired_perf);
 
-	if (CPC_IN_PCC(desired_reg))
+	/**
+	 * Only write if min_perf and max_perf not zero. Some drivers pass zero
+	 * value to min and max perf, but they don't mean to set the zero value,
+	 * they just don't want to write to those registers.
+	 */
+	if (perf_ctrls->min_perf)
+		cpc_write(cpu, min_perf_reg, perf_ctrls->min_perf);
+	if (perf_ctrls->max_perf)
+		cpc_write(cpu, max_perf_reg, perf_ctrls->max_perf);
+
+	if (CPC_IN_PCC(desired_reg) || CPC_IN_PCC(min_perf_reg) || CPC_IN_PCC(max_perf_reg))
 		up_read(&pcc_ss_data->pcc_lock);	/* END Phase-I */
 	/*
 	 * This is Phase-II where we transfer the ownership of PCC to Platform
@@ -1584,7 +1689,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls)
 	 * case during a CMD_READ and if there are pending writes it delivers
 	 * the write command before servicing the read command
 	 */
-	if (CPC_IN_PCC(desired_reg)) {
+	if (CPC_IN_PCC(desired_reg) || CPC_IN_PCC(min_perf_reg) || CPC_IN_PCC(max_perf_reg)) {
 		if (down_write_trylock(&pcc_ss_data->pcc_lock)) {/* BEGIN Phase-II */
 			/* Update only if there are pending write commands */
 			if (pcc_ss_data->pending_pcc_write_cmd)
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index a7eb25066c27..ddd346a239e0 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -63,7 +63,6 @@ static struct cpufreq_driver *current_pstate_driver;
 static struct cpufreq_driver amd_pstate_driver;
 static struct cpufreq_driver amd_pstate_epp_driver;
 static int cppc_state = AMD_PSTATE_DISABLE;
-struct kobject *amd_pstate_kobj;
 
 /*
  * AMD Energy Preference Performance (EPP)
@@ -106,6 +105,8 @@ static unsigned int epp_values[] = {
 	[EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE,
  };
 
+typedef int (*cppc_mode_transition_fn)(int);
+
 static inline int get_mode_idx_from_str(const char *str, size_t size)
 {
 	int i;
@@ -308,7 +309,22 @@ static int cppc_init_perf(struct amd_cpudata *cpudata)
 		   cppc_perf.lowest_nonlinear_perf);
 	WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf);
 
-	return 0;
+	if (cppc_state == AMD_PSTATE_ACTIVE)
+		return 0;
+
+	ret = cppc_get_auto_sel_caps(cpudata->cpu, &cppc_perf);
+	if (ret) {
+		pr_warn("failed to get auto_sel, ret: %d\n", ret);
+		return 0;
+	}
+
+	ret = cppc_set_auto_sel(cpudata->cpu,
+			(cppc_state == AMD_PSTATE_PASSIVE) ? 0 : 1);
+
+	if (ret)
+		pr_warn("failed to set auto_sel, ret: %d\n", ret);
+
+	return ret;
 }
 
 DEFINE_STATIC_CALL(amd_pstate_init_perf, pstate_init_perf);
@@ -385,12 +401,18 @@ static inline bool amd_pstate_sample(struct amd_cpudata *cpudata)
 }
 
 static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf,
-			      u32 des_perf, u32 max_perf, bool fast_switch)
+			      u32 des_perf, u32 max_perf, bool fast_switch, int gov_flags)
 {
 	u64 prev = READ_ONCE(cpudata->cppc_req_cached);
 	u64 value = prev;
 
 	des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf);
+
+	if ((cppc_state == AMD_PSTATE_GUIDED) && (gov_flags & CPUFREQ_GOV_DYNAMIC_SWITCHING)) {
+		min_perf = des_perf;
+		des_perf = 0;
+	}
+
 	value &= ~AMD_CPPC_MIN_PERF(~0L);
 	value |= AMD_CPPC_MIN_PERF(min_perf);
 
@@ -451,7 +473,8 @@ static int amd_pstate_update_freq(struct cpufreq_policy *policy,
 	if (!fast_switch)
 		cpufreq_freq_transition_begin(policy, &freqs);
 
-	amd_pstate_update(cpudata, min_perf, des_perf, max_perf, fast_switch);
+	amd_pstate_update(cpudata, min_perf, des_perf,
+			max_perf, fast_switch, policy->governor->flags);
 
 	if (!fast_switch)
 		cpufreq_freq_transition_end(policy, &freqs, false);
@@ -506,7 +529,8 @@ static void amd_pstate_adjust_perf(unsigned int cpu,
 	target_freq = div_u64(des_perf * max_freq, max_perf);
 	policy->cur = target_freq;
 
-	amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true);
+	amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true,
+			policy->governor->flags);
 	cpufreq_cpu_put(policy);
 }
 
@@ -844,6 +868,98 @@ static ssize_t show_energy_performance_preference(
 	return sysfs_emit(buf, "%s\n", energy_perf_strings[preference]);
 }
 
+static void amd_pstate_driver_cleanup(void)
+{
+	amd_pstate_enable(false);
+	cppc_state = AMD_PSTATE_DISABLE;
+	current_pstate_driver = NULL;
+}
+
+static int amd_pstate_register_driver(int mode)
+{
+	int ret;
+
+	if (mode == AMD_PSTATE_PASSIVE || mode == AMD_PSTATE_GUIDED)
+		current_pstate_driver = &amd_pstate_driver;
+	else if (mode == AMD_PSTATE_ACTIVE)
+		current_pstate_driver = &amd_pstate_epp_driver;
+	else
+		return -EINVAL;
+
+	cppc_state = mode;
+	ret = cpufreq_register_driver(current_pstate_driver);
+	if (ret) {
+		amd_pstate_driver_cleanup();
+		return ret;
+	}
+	return 0;
+}
+
+static int amd_pstate_unregister_driver(int dummy)
+{
+	cpufreq_unregister_driver(current_pstate_driver);
+	amd_pstate_driver_cleanup();
+	return 0;
+}
+
+static int amd_pstate_change_mode_without_dvr_change(int mode)
+{
+	int cpu = 0;
+
+	cppc_state = mode;
+
+	if (boot_cpu_has(X86_FEATURE_CPPC) || cppc_state == AMD_PSTATE_ACTIVE)
+		return 0;
+
+	for_each_present_cpu(cpu) {
+		cppc_set_auto_sel(cpu, (cppc_state == AMD_PSTATE_PASSIVE) ? 0 : 1);
+	}
+
+	return 0;
+}
+
+static int amd_pstate_change_driver_mode(int mode)
+{
+	int ret;
+
+	ret = amd_pstate_unregister_driver(0);
+	if (ret)
+		return ret;
+
+	ret = amd_pstate_register_driver(mode);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static cppc_mode_transition_fn mode_state_machine[AMD_PSTATE_MAX][AMD_PSTATE_MAX] = {
+	[AMD_PSTATE_DISABLE]         = {
+		[AMD_PSTATE_DISABLE]     = NULL,
+		[AMD_PSTATE_PASSIVE]     = amd_pstate_register_driver,
+		[AMD_PSTATE_ACTIVE]      = amd_pstate_register_driver,
+		[AMD_PSTATE_GUIDED]      = amd_pstate_register_driver,
+	},
+	[AMD_PSTATE_PASSIVE]         = {
+		[AMD_PSTATE_DISABLE]     = amd_pstate_unregister_driver,
+		[AMD_PSTATE_PASSIVE]     = NULL,
+		[AMD_PSTATE_ACTIVE]      = amd_pstate_change_driver_mode,
+		[AMD_PSTATE_GUIDED]      = amd_pstate_change_mode_without_dvr_change,
+	},
+	[AMD_PSTATE_ACTIVE]          = {
+		[AMD_PSTATE_DISABLE]     = amd_pstate_unregister_driver,
+		[AMD_PSTATE_PASSIVE]     = amd_pstate_change_driver_mode,
+		[AMD_PSTATE_ACTIVE]      = NULL,
+		[AMD_PSTATE_GUIDED]      = amd_pstate_change_driver_mode,
+	},
+	[AMD_PSTATE_GUIDED]          = {
+		[AMD_PSTATE_DISABLE]     = amd_pstate_unregister_driver,
+		[AMD_PSTATE_PASSIVE]     = amd_pstate_change_mode_without_dvr_change,
+		[AMD_PSTATE_ACTIVE]      = amd_pstate_change_driver_mode,
+		[AMD_PSTATE_GUIDED]      = NULL,
+	},
+};
+
 static ssize_t amd_pstate_show_status(char *buf)
 {
 	if (!current_pstate_driver)
@@ -852,55 +968,22 @@ static ssize_t amd_pstate_show_status(char *buf)
 	return sysfs_emit(buf, "%s\n", amd_pstate_mode_string[cppc_state]);
 }
 
-static void amd_pstate_driver_cleanup(void)
-{
-	current_pstate_driver = NULL;
-}
-
 static int amd_pstate_update_status(const char *buf, size_t size)
 {
-	int ret = 0;
 	int mode_idx;
 
-	if (size > 7 || size < 6)
+	if (size > strlen("passive") || size < strlen("active"))
 		return -EINVAL;
+
 	mode_idx = get_mode_idx_from_str(buf, size);
 
-	switch(mode_idx) {
-	case AMD_PSTATE_DISABLE:
-		if (current_pstate_driver) {
-			cpufreq_unregister_driver(current_pstate_driver);
-			amd_pstate_driver_cleanup();
-		}
-		break;
-	case AMD_PSTATE_PASSIVE:
-		if (current_pstate_driver) {
-			if (current_pstate_driver == &amd_pstate_driver)
-				return 0;
-			cpufreq_unregister_driver(current_pstate_driver);
-		}
+	if (mode_idx < 0 || mode_idx >= AMD_PSTATE_MAX)
+		return -EINVAL;
 
-		current_pstate_driver = &amd_pstate_driver;
-		cppc_state = AMD_PSTATE_PASSIVE;
-		ret = cpufreq_register_driver(current_pstate_driver);
-		break;
-	case AMD_PSTATE_ACTIVE:
-		if (current_pstate_driver) {
-			if (current_pstate_driver == &amd_pstate_epp_driver)
-				return 0;
-			cpufreq_unregister_driver(current_pstate_driver);
-		}
+	if (mode_state_machine[cppc_state][mode_idx])
+		return mode_state_machine[cppc_state][mode_idx](mode_idx);
 
-		current_pstate_driver = &amd_pstate_epp_driver;
-		cppc_state = AMD_PSTATE_ACTIVE;
-		ret = cpufreq_register_driver(current_pstate_driver);
-		break;
-	default:
-		ret = -EINVAL;
-		break;
-	}
-
-	return ret;
+	return 0;
 }
 
 static ssize_t show_status(struct kobject *kobj,
@@ -958,6 +1041,7 @@ static struct attribute *pstate_global_attributes[] = {
 };
 
 static const struct attribute_group amd_pstate_global_attr_group = {
+	.name = "amd_pstate",
 	.attrs = pstate_global_attributes,
 };
 
@@ -1278,6 +1362,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = {
 
 static int __init amd_pstate_init(void)
 {
+	struct device *dev_root;
 	int ret;
 
 	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
@@ -1304,7 +1389,7 @@ static int __init amd_pstate_init(void)
 	/* capability check */
 	if (boot_cpu_has(X86_FEATURE_CPPC)) {
 		pr_debug("AMD CPPC MSR based functionality is supported\n");
-		if (cppc_state == AMD_PSTATE_PASSIVE)
+		if (cppc_state != AMD_PSTATE_ACTIVE)
 			current_pstate_driver->adjust_perf = amd_pstate_adjust_perf;
 	} else {
 		pr_debug("AMD CPPC shared memory based functionality is supported\n");
@@ -1324,24 +1409,19 @@ static int __init amd_pstate_init(void)
 	if (ret)
 		pr_err("failed to register with return %d\n", ret);
 
-	amd_pstate_kobj = kobject_create_and_add("amd_pstate", &cpu_subsys.dev_root->kobj);
-	if (!amd_pstate_kobj) {
-		ret = -EINVAL;
-		pr_err("global sysfs registration failed.\n");
-		goto kobject_free;
-	}
-
-	ret = sysfs_create_group(amd_pstate_kobj, &amd_pstate_global_attr_group);
-	if (ret) {
-		pr_err("sysfs attribute export failed with error %d.\n", ret);
-		goto global_attr_free;
+	dev_root = bus_get_dev_root(&cpu_subsys);
+	if (dev_root) {
+		ret = sysfs_create_group(&dev_root->kobj, &amd_pstate_global_attr_group);
+		put_device(dev_root);
+		if (ret) {
+			pr_err("sysfs attribute export failed with error %d.\n", ret);
+			goto global_attr_free;
+		}
 	}
 
 	return ret;
 
 global_attr_free:
-	kobject_put(amd_pstate_kobj);
-kobject_free:
 	cpufreq_unregister_driver(current_pstate_driver);
 	return ret;
 }
@@ -1366,7 +1446,7 @@ static int __init amd_pstate_param(char *str)
 		if (cppc_state == AMD_PSTATE_ACTIVE)
 			current_pstate_driver = &amd_pstate_epp_driver;
 
-		if (cppc_state == AMD_PSTATE_PASSIVE)
+		if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED)
 			current_pstate_driver = &amd_pstate_driver;
 
 		return 0;
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index a0742c787014..a575dfa4b66c 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2823,7 +2823,8 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 	     (driver_data->setpolicy && (driver_data->target_index ||
 		    driver_data->target)) ||
 	     (!driver_data->get_intermediate != !driver_data->target_intermediate) ||
-	     (!driver_data->online != !driver_data->offline))
+	     (!driver_data->online != !driver_data->offline) ||
+		 (driver_data->adjust_perf && !driver_data->fast_switch))
 		return -EINVAL;
 
 	pr_debug("trying to register driver %s\n", driver_data->name);
diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h
index 6b487a5bd638..6126c977ece0 100644
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -109,6 +109,7 @@ struct cppc_perf_caps {
 	u32 lowest_freq;
 	u32 nominal_freq;
 	u32 energy_perf;
+	bool auto_sel;
 };
 
 struct cppc_perf_ctrls {
@@ -153,6 +154,8 @@ extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val);
 extern int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val);
 extern int cppc_get_epp_perf(int cpunum, u64 *epp_perf);
 extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable);
+extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps);
+extern int cppc_set_auto_sel(int cpu, bool enable);
 #else /* !CONFIG_ACPI_CPPC_LIB */
 static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
 {
@@ -214,6 +217,14 @@ static inline int cppc_get_epp_perf(int cpunum, u64 *epp_perf)
 {
 	return -ENOTSUPP;
 }
+static inline int cppc_set_auto_sel(int cpu, bool enable)
+{
+	return -ENOTSUPP;
+}
+static inline int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps)
+{
+	return -ENOTSUPP;
+}
 #endif /* !CONFIG_ACPI_CPPC_LIB */
 
 #endif /* _CPPC_ACPI_H*/
diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h
index f5f22418e64b..c10ebf8c42e6 100644
--- a/include/linux/amd-pstate.h
+++ b/include/linux/amd-pstate.h
@@ -97,6 +97,7 @@ enum amd_pstate_mode {
 	AMD_PSTATE_DISABLE = 0,
 	AMD_PSTATE_PASSIVE,
 	AMD_PSTATE_ACTIVE,
+	AMD_PSTATE_GUIDED,
 	AMD_PSTATE_MAX,
 };
 
@@ -104,6 +105,7 @@ static const char * const amd_pstate_mode_string[] = {
 	[AMD_PSTATE_DISABLE]     = "disable",
 	[AMD_PSTATE_PASSIVE]     = "passive",
 	[AMD_PSTATE_ACTIVE]      = "active",
+	[AMD_PSTATE_GUIDED]      = "guided",
 	NULL,
 };
 #endif /* _LINUX_AMD_PSTATE_H */
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 65623233ab2f..b5d734b0fb28 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -340,7 +340,10 @@ struct cpufreq_driver {
 	/*
 	 * ->fast_switch() replacement for drivers that use an internal
 	 * representation of performance levels and can pass hints other than
-	 * the target performance level to the hardware.
+	 * the target performance level to the hardware. This can only be set
+	 * if ->fast_switch is set too, because in those cases (under specific
+	 * conditions) scale invariance can be disabled, which causes the
+	 * schedutil governor to fall back to the latter.
 	 */
 	void		(*adjust_perf)(unsigned int cpu,
 				       unsigned long min_perf,
-- 
2.41.0

From b8494f3bb33800c959246a7736931d7d2b8fc345 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Mon, 17 Apr 2023 18:28:52 +0200
Subject: [PATCH 07/12] ksm

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 arch/alpha/kernel/syscalls/syscall.tbl      |   1 +
 arch/arm/tools/syscall.tbl                  |   1 +
 arch/arm64/include/asm/unistd.h             |   2 +-
 arch/arm64/include/asm/unistd32.h           |   2 +
 arch/ia64/kernel/syscalls/syscall.tbl       |   1 +
 arch/m68k/kernel/syscalls/syscall.tbl       |   1 +
 arch/microblaze/kernel/syscalls/syscall.tbl |   1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |   1 +
 arch/parisc/kernel/syscalls/syscall.tbl     |   1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    |   1 +
 arch/s390/kernel/syscalls/syscall.tbl       |   1 +
 arch/sh/kernel/syscalls/syscall.tbl         |   1 +
 arch/sparc/kernel/syscalls/syscall.tbl      |   1 +
 arch/x86/entry/syscalls/syscall_32.tbl      |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl      |   1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     |   1 +
 include/linux/ksm.h                         |   4 +
 include/linux/syscalls.h                    |   1 +
 include/uapi/asm-generic/unistd.h           |   5 +-
 kernel/sys_ni.c                             |   1 +
 mm/ksm.c                                    |  82 +++++++++-----
 mm/madvise.c                                | 117 ++++++++++++++++++++
 24 files changed, 199 insertions(+), 31 deletions(-)

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 8ebacf37a8cf..c9d25f85d86d 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -490,3 +490,4 @@
 558	common	process_mrelease		sys_process_mrelease
 559	common  futex_waitv                     sys_futex_waitv
 560	common	set_mempolicy_home_node		sys_ni_syscall
+561	common	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index ac964612d8b0..90933eabe115 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -464,3 +464,4 @@
 448	common	process_mrelease		sys_process_mrelease
 449	common	futex_waitv			sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 037feba03a51..64a514f90131 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -39,7 +39,7 @@
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		451
+#define __NR_compat_syscalls		452
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 604a2053d006..91f2bb7199af 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -907,6 +907,8 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease)
 __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
 #define __NR_set_mempolicy_home_node 450
 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
+#define __NR_pmadv_ksm 451
+__SYSCALL(__NR_pmadv_ksm, sys_pmadv_ksm)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index 72c929d9902b..0d5b1d14b2b5 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -371,3 +371,4 @@
 448	common	process_mrelease		sys_process_mrelease
 449	common  futex_waitv                     sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index b1f3940bc298..5ccf925567da 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -450,3 +450,4 @@
 448	common	process_mrelease		sys_process_mrelease
 449	common  futex_waitv                     sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 820145e47350..6b76208597f3 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -456,3 +456,4 @@
 448	common	process_mrelease		sys_process_mrelease
 449	common  futex_waitv                     sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 253ff994ed2e..e4aeedb17c38 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -389,3 +389,4 @@
 448	n32	process_mrelease		sys_process_mrelease
 449	n32	futex_waitv			sys_futex_waitv
 450	n32	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	n32	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 3f1886ad9d80..fe88db51efa0 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -365,3 +365,4 @@
 448	n64	process_mrelease		sys_process_mrelease
 449	n64	futex_waitv			sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	n64	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 8f243e35a7b2..674cb940bd15 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -438,3 +438,4 @@
 448	o32	process_mrelease		sys_process_mrelease
 449	o32	futex_waitv			sys_futex_waitv
 450	o32	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	o32	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 0e42fceb2d5e..5914aa460255 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -448,3 +448,4 @@
 448	common	process_mrelease		sys_process_mrelease
 449	common	futex_waitv			sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index a0be127475b1..347894da4eb6 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -537,3 +537,4 @@
 448	common	process_mrelease		sys_process_mrelease
 449	common  futex_waitv                     sys_futex_waitv
 450 	nospu	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 799147658dee..1cd523748bd2 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -453,3 +453,4 @@
 448  common	process_mrelease	sys_process_mrelease		sys_process_mrelease
 449  common	futex_waitv		sys_futex_waitv			sys_futex_waitv
 450  common	set_mempolicy_home_node	sys_set_mempolicy_home_node	sys_set_mempolicy_home_node
+451  common	pmadv_ksm		sys_pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 2de85c977f54..cfc75fa43eae 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -453,3 +453,4 @@
 448	common	process_mrelease		sys_process_mrelease
 449	common  futex_waitv                     sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 4398cc6fb68d..d2c0a6426f6b 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -496,3 +496,4 @@
 448	common	process_mrelease		sys_process_mrelease
 449	common  futex_waitv                     sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 320480a8db4f..331aaf1a782f 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -455,3 +455,4 @@
 448	i386	process_mrelease	sys_process_mrelease
 449	i386	futex_waitv		sys_futex_waitv
 450	i386	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	i386	pmadv_ksm		sys_pmadv_ksm
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index c84d12608cd2..14902db4c01f 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -372,6 +372,7 @@
 448	common	process_mrelease	sys_process_mrelease
 449	common	futex_waitv		sys_futex_waitv
 450	common	set_mempolicy_home_node	sys_set_mempolicy_home_node
+451	common	pmadv_ksm		sys_pmadv_ksm
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 52c94ab5c205..1518e261d882 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -421,3 +421,4 @@
 448	common	process_mrelease		sys_process_mrelease
 449	common  futex_waitv                     sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pmadv_ksm			sys_pmadv_ksm
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 7e232ba59b86..57ed92987717 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -16,6 +16,10 @@
 #include <linux/sched/coredump.h>
 
 #ifdef CONFIG_KSM
+int ksm_madvise_merge(struct mm_struct *mm, struct vm_area_struct *vma,
+		const vm_flags_t *vm_flags);
+int ksm_madvise_unmerge(struct vm_area_struct *vma, unsigned long start,
+		unsigned long end, const vm_flags_t *vm_flags);
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags);
 int __ksm_enter(struct mm_struct *mm);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 33a0ee3bcb2e..62f14e800839 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -919,6 +919,7 @@ asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
 asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec,
 			size_t vlen, int behavior, unsigned int flags);
 asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags);
+asmlinkage long sys_pmadv_ksm(int pidfd, int behavior, unsigned int flags);
 asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 			unsigned long prot, unsigned long pgoff,
 			unsigned long flags);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 45fa180cc56a..40f7e6d04af0 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -886,8 +886,11 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
 #define __NR_set_mempolicy_home_node 450
 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
 
+#define __NR_pmadv_ksm 451
+__SYSCALL(__NR_pmadv_ksm, sys_pmadv_ksm)
+
 #undef __NR_syscalls
-#define __NR_syscalls 451
+#define __NR_syscalls 452
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 860b2dcf3ac4..810e1fcaff94 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -292,6 +292,7 @@ COND_SYSCALL(mincore);
 COND_SYSCALL(madvise);
 COND_SYSCALL(process_madvise);
 COND_SYSCALL(process_mrelease);
+COND_SYSCALL(pmadv_ksm);
 COND_SYSCALL(remap_file_pages);
 COND_SYSCALL(mbind);
 COND_SYSCALL(get_mempolicy);
diff --git a/mm/ksm.c b/mm/ksm.c
index 2b8d30068cbb..ab9a157873f4 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2471,52 +2471,76 @@ static int ksm_scan_thread(void *nothing)
 	return 0;
 }
 
-int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
-		unsigned long end, int advice, unsigned long *vm_flags)
+int ksm_madvise_merge(struct mm_struct *mm, struct vm_area_struct *vma,
+		const vm_flags_t *vm_flags)
 {
-	struct mm_struct *mm = vma->vm_mm;
 	int err;
 
-	switch (advice) {
-	case MADV_MERGEABLE:
-		/*
-		 * Be somewhat over-protective for now!
-		 */
-		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
-				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
-				 VM_HUGETLB | VM_MIXEDMAP))
-			return 0;		/* just ignore the advice */
+	/*
+	 * Be somewhat over-protective for now!
+	 */
+	if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
+			 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
+			 VM_HUGETLB | VM_MIXEDMAP))
+		return 0;		/* just ignore the advice */
 
-		if (vma_is_dax(vma))
-			return 0;
+	if (vma_is_dax(vma))
+		return 0;
 
 #ifdef VM_SAO
 		if (*vm_flags & VM_SAO)
 			return 0;
 #endif
 #ifdef VM_SPARC_ADI
-		if (*vm_flags & VM_SPARC_ADI)
-			return 0;
+	if (*vm_flags & VM_SPARC_ADI)
+		return 0;
 #endif
 
-		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
-			err = __ksm_enter(mm);
-			if (err)
-				return err;
-		}
+	if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+		err = __ksm_enter(mm);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+int ksm_madvise_unmerge(struct vm_area_struct *vma, unsigned long start,
+		unsigned long end, const vm_flags_t *vm_flags)
+{
+	int err;
+
+	if (!(*vm_flags & VM_MERGEABLE))
+		return 0;		/* just ignore the advice */
+
+	if (vma->anon_vma) {
+		err = unmerge_ksm_pages(vma, start, end);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
+		unsigned long end, int advice, unsigned long *vm_flags)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	int err;
+
+	switch (advice) {
+	case MADV_MERGEABLE:
+		err = ksm_madvise_merge(mm, vma, vm_flags);
+		if (err)
+			return err;
 
 		*vm_flags |= VM_MERGEABLE;
 		break;
 
 	case MADV_UNMERGEABLE:
-		if (!(*vm_flags & VM_MERGEABLE))
-			return 0;		/* just ignore the advice */
-
-		if (vma->anon_vma) {
-			err = unmerge_ksm_pages(vma, start, end);
-			if (err)
-				return err;
-		}
+		err = ksm_madvise_unmerge(vma, start, end, vm_flags);
+		if (err)
+			return err;
 
 		*vm_flags &= ~VM_MERGEABLE;
 		break;
diff --git a/mm/madvise.c b/mm/madvise.c
index 340125d08c03..36e756355f04 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1522,3 +1522,120 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 out:
 	return ret;
 }
+
+SYSCALL_DEFINE3(pmadv_ksm, int, pidfd, int, behaviour, unsigned int, flags)
+{
+#ifdef CONFIG_KSM
+	ssize_t ret;
+	struct pid *pid;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	unsigned int f_flags;
+	struct vm_area_struct *vma;
+	struct vma_iterator vmi;
+
+	if (flags != 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	switch (behaviour) {
+		case MADV_MERGEABLE:
+		case MADV_UNMERGEABLE:
+			break;
+		default:
+			ret = -EINVAL;
+			goto out;
+			break;
+	}
+
+	pid = pidfd_get_pid(pidfd, &f_flags);
+	if (IS_ERR(pid)) {
+		ret = PTR_ERR(pid);
+		goto out;
+	}
+
+	task = get_pid_task(pid, PIDTYPE_PID);
+	if (!task) {
+		ret = -ESRCH;
+		goto put_pid;
+	}
+
+	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
+	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
+	if (IS_ERR_OR_NULL(mm)) {
+		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+		goto release_task;
+	}
+
+	/* Require CAP_SYS_NICE for influencing process performance. */
+	if (!capable(CAP_SYS_NICE)) {
+		ret = -EPERM;
+		goto release_mm;
+	}
+
+	if (mmap_write_lock_killable(mm)) {
+		ret = -EINTR;
+		goto release_mm;
+	}
+
+	vma_iter_init(&vmi, mm, 0);
+	for_each_vma(vmi, vma) {
+		switch (behaviour) {
+			case MADV_MERGEABLE:
+				ret = ksm_madvise_merge(vma->vm_mm, vma, &vma->vm_flags);
+				if (!ret)
+					vm_flags_set(vma, VM_MERGEABLE);
+				break;
+			case MADV_UNMERGEABLE:
+				ret = ksm_madvise_unmerge(vma, vma->vm_start, vma->vm_end, &vma->vm_flags);
+				if (!ret)
+					vm_flags_clear(vma, VM_MERGEABLE);
+				break;
+			default:
+				/* look, ma, no brain */
+				break;
+		}
+		if (ret)
+			break;
+	}
+
+	mmap_write_unlock(mm);
+
+release_mm:
+	mmput(mm);
+release_task:
+	put_task_struct(task);
+put_pid:
+	put_pid(pid);
+out:
+	return ret;
+#else /* CONFIG_KSM */
+	return -ENOSYS;
+#endif /* CONFIG_KSM */
+}
+
+#ifdef CONFIG_KSM
+static ssize_t ksm_show(struct kobject *kobj, struct kobj_attribute *attr,
+			 char *buf)
+{
+	return sprintf(buf, "%u\n", __NR_pmadv_ksm);
+}
+static struct kobj_attribute pmadv_ksm_attr = __ATTR_RO(ksm);
+
+static struct attribute *pmadv_sysfs_attrs[] = {
+	&pmadv_ksm_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group pmadv_sysfs_attr_group = {
+	.attrs = pmadv_sysfs_attrs,
+	.name = "pmadv",
+};
+
+static int __init pmadv_sysfs_init(void)
+{
+	return sysfs_create_group(kernel_kobj, &pmadv_sysfs_attr_group);
+}
+subsys_initcall(pmadv_sysfs_init);
+#endif /* CONFIG_KSM */
-- 
2.41.0

From 743ab3b7a7cffc8436cf2bbd9db4849b56fbae80 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Fri, 9 Jun 2023 20:06:28 +0200
Subject: [PATCH 08/12] kvm-lru

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 Documentation/admin-guide/mm/multigen_lru.rst |   6 +-
 arch/arm64/include/asm/kvm_host.h             |   6 +
 arch/arm64/include/asm/kvm_pgtable.h          |  55 +++++++
 arch/arm64/kvm/arm.c                          |   1 +
 arch/arm64/kvm/hyp/pgtable.c                  |  61 +-------
 arch/arm64/kvm/mmu.c                          |  53 ++++++-
 arch/powerpc/include/asm/kvm_host.h           |   8 +
 arch/powerpc/include/asm/kvm_ppc.h            |   1 +
 arch/powerpc/kvm/book3s.c                     |   6 +
 arch/powerpc/kvm/book3s.h                     |   1 +
 arch/powerpc/kvm/book3s_64_mmu_radix.c        |  65 +++++++-
 arch/powerpc/kvm/book3s_hv.c                  |   5 +
 arch/x86/include/asm/kvm_host.h               |  13 ++
 arch/x86/kvm/mmu.h                            |   6 -
 arch/x86/kvm/mmu/spte.h                       |   1 -
 arch/x86/kvm/mmu/tdp_mmu.c                    |  34 +++++
 include/linux/kvm_host.h                      |  22 +++
 include/linux/mmu_notifier.h                  |  79 ++++++----
 include/linux/mmzone.h                        |   6 +-
 include/trace/events/kvm.h                    |  15 --
 mm/mmu_notifier.c                             |  48 ++----
 mm/rmap.c                                     |   8 +-
 mm/vmscan.c                                   | 139 ++++++++++++++++--
 virt/kvm/kvm_main.c                           | 114 ++++++++------
 24 files changed, 546 insertions(+), 207 deletions(-)

diff --git a/Documentation/admin-guide/mm/multigen_lru.rst b/Documentation/admin-guide/mm/multigen_lru.rst
index 33e068830497..0ae2a6d4d94c 100644
--- a/Documentation/admin-guide/mm/multigen_lru.rst
+++ b/Documentation/admin-guide/mm/multigen_lru.rst
@@ -48,6 +48,10 @@ Values Components
        verified on x86 varieties other than Intel and AMD. If it is
        disabled, the multi-gen LRU will suffer a negligible
        performance degradation.
+0x0008 Clearing the accessed bit in KVM page table entries in large
+       batches, when KVM MMU sets it (e.g., on x86_64). This can
+       improve the performance of guests when the host is under memory
+       pressure.
 [yYnN] Apply to all the components above.
 ====== ===============================================================
 
@@ -56,7 +60,7 @@ E.g.,
 
     echo y >/sys/kernel/mm/lru_gen/enabled
     cat /sys/kernel/mm/lru_gen/enabled
-    0x0007
+    0x000f
     echo 5 >/sys/kernel/mm/lru_gen/enabled
     cat /sys/kernel/mm/lru_gen/enabled
     0x0005
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 77d941ad3aed..5cc3c0a2c181 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -1094,4 +1094,10 @@ static inline void kvm_hyp_reserve(void) { }
 void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu);
 bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu);
 
+#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young
+static inline bool kvm_arch_has_test_clear_young(void)
+{
+	return cpu_has_hw_af() && !is_protected_kvm_enabled();
+}
+
 #endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 93bd0975b15f..ad7cae0d8f3a 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -44,6 +44,49 @@ typedef u64 kvm_pte_t;
 
 #define KVM_PHYS_INVALID		(-1ULL)
 
+#define KVM_PTE_TYPE			BIT(1)
+#define KVM_PTE_TYPE_BLOCK		0
+#define KVM_PTE_TYPE_PAGE		1
+#define KVM_PTE_TYPE_TABLE		1
+
+#define KVM_PTE_LEAF_ATTR_LO		GENMASK(11, 2)
+
+#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX	GENMASK(4, 2)
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP	GENMASK(7, 6)
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO	3
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW	1
+#define KVM_PTE_LEAF_ATTR_LO_S1_SH	GENMASK(9, 8)
+#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS	3
+#define KVM_PTE_LEAF_ATTR_LO_S1_AF	BIT(10)
+
+#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR	GENMASK(5, 2)
+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R	BIT(6)
+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W	BIT(7)
+#define KVM_PTE_LEAF_ATTR_LO_S2_SH	GENMASK(9, 8)
+#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS	3
+#define KVM_PTE_LEAF_ATTR_LO_S2_AF	BIT(10)
+
+#define KVM_PTE_LEAF_ATTR_HI		GENMASK(63, 51)
+
+#define KVM_PTE_LEAF_ATTR_HI_SW		GENMASK(58, 55)
+
+#define KVM_PTE_LEAF_ATTR_HI_S1_XN	BIT(54)
+
+#define KVM_PTE_LEAF_ATTR_HI_S2_XN	BIT(54)
+
+#define KVM_PTE_LEAF_ATTR_S2_PERMS	(KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
+					 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
+					 KVM_PTE_LEAF_ATTR_HI_S2_XN)
+
+#define KVM_INVALID_PTE_OWNER_MASK	GENMASK(9, 2)
+#define KVM_MAX_OWNER_ID		1
+
+/*
+ * Used to indicate a pte for which a 'break-before-make' sequence is in
+ * progress.
+ */
+#define KVM_INVALID_PTE_LOCKED		BIT(10)
+
 static inline bool kvm_pte_valid(kvm_pte_t pte)
 {
 	return pte & KVM_PTE_VALID;
@@ -110,6 +153,7 @@ static inline bool kvm_level_supports_block_mapping(u32 level)
  * @put_page:			Decrement the refcount on a page. When the
  *				refcount reaches 0 the page is automatically
  *				freed.
+ * @put_page_rcu:		RCU variant of the above.
  * @page_count:			Return the refcount of a page.
  * @phys_to_virt:		Convert a physical address into a virtual
  *				address	mapped in the current context.
@@ -127,6 +171,7 @@ struct kvm_pgtable_mm_ops {
 	void		(*free_removed_table)(void *addr, u32 level);
 	void		(*get_page)(void *addr);
 	void		(*put_page)(void *addr);
+	void		(*put_page_rcu)(void *addr);
 	int		(*page_count)(void *addr);
 	void*		(*phys_to_virt)(phys_addr_t phys);
 	phys_addr_t	(*virt_to_phys)(void *addr);
@@ -224,6 +269,16 @@ static inline bool kvm_pgtable_walk_shared(const struct kvm_pgtable_visit_ctx *c
 	return ctx->flags & KVM_PGTABLE_WALK_SHARED;
 }
 
+static inline bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
+{
+	if (!kvm_pgtable_walk_shared(ctx)) {
+		WRITE_ONCE(*ctx->ptep, new);
+		return true;
+	}
+
+	return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old;
+}
+
 /**
  * struct kvm_pgtable_walker - Hook into a page-table walk.
  * @cb:		Callback function to invoke during the walk.
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index fbafcbbcc463..438cc36f2c38 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -191,6 +191,7 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
  */
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+	kvm_free_stage2_pgd(&kvm->arch.mmu);
 	bitmap_free(kvm->arch.pmu_filter);
 	free_cpumask_var(kvm->arch.supported_cpus);
 
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index acd233e5586a..1d31ea48d351 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -12,49 +12,6 @@
 #include <asm/stage2_pgtable.h>
 
 
-#define KVM_PTE_TYPE			BIT(1)
-#define KVM_PTE_TYPE_BLOCK		0
-#define KVM_PTE_TYPE_PAGE		1
-#define KVM_PTE_TYPE_TABLE		1
-
-#define KVM_PTE_LEAF_ATTR_LO		GENMASK(11, 2)
-
-#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX	GENMASK(4, 2)
-#define KVM_PTE_LEAF_ATTR_LO_S1_AP	GENMASK(7, 6)
-#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO	3
-#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW	1
-#define KVM_PTE_LEAF_ATTR_LO_S1_SH	GENMASK(9, 8)
-#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS	3
-#define KVM_PTE_LEAF_ATTR_LO_S1_AF	BIT(10)
-
-#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR	GENMASK(5, 2)
-#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R	BIT(6)
-#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W	BIT(7)
-#define KVM_PTE_LEAF_ATTR_LO_S2_SH	GENMASK(9, 8)
-#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS	3
-#define KVM_PTE_LEAF_ATTR_LO_S2_AF	BIT(10)
-
-#define KVM_PTE_LEAF_ATTR_HI		GENMASK(63, 51)
-
-#define KVM_PTE_LEAF_ATTR_HI_SW		GENMASK(58, 55)
-
-#define KVM_PTE_LEAF_ATTR_HI_S1_XN	BIT(54)
-
-#define KVM_PTE_LEAF_ATTR_HI_S2_XN	BIT(54)
-
-#define KVM_PTE_LEAF_ATTR_S2_PERMS	(KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
-					 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
-					 KVM_PTE_LEAF_ATTR_HI_S2_XN)
-
-#define KVM_INVALID_PTE_OWNER_MASK	GENMASK(9, 2)
-#define KVM_MAX_OWNER_ID		1
-
-/*
- * Used to indicate a pte for which a 'break-before-make' sequence is in
- * progress.
- */
-#define KVM_INVALID_PTE_LOCKED		BIT(10)
-
 struct kvm_pgtable_walk_data {
 	struct kvm_pgtable_walker	*walker;
 
@@ -715,16 +672,6 @@ static bool stage2_pte_is_locked(kvm_pte_t pte)
 	return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED);
 }
 
-static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
-{
-	if (!kvm_pgtable_walk_shared(ctx)) {
-		WRITE_ONCE(*ctx->ptep, new);
-		return true;
-	}
-
-	return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old;
-}
-
 /**
  * stage2_try_break_pte() - Invalidates a pte according to the
  *			    'break-before-make' requirements of the
@@ -1054,8 +1001,12 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
 		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
 					       kvm_granule_size(ctx->level));
 
-	if (childp)
-		mm_ops->put_page(childp);
+	if (childp) {
+		if (mm_ops->put_page_rcu)
+			mm_ops->put_page_rcu(childp);
+		else
+			mm_ops->put_page(childp);
+	}
 
 	return 0;
 }
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 3b9d4d24c361..26a8d955b49c 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -172,6 +172,21 @@ static int kvm_host_page_count(void *addr)
 	return page_count(virt_to_page(addr));
 }
 
+static void kvm_s2_rcu_put_page(struct rcu_head *head)
+{
+	put_page(container_of(head, struct page, rcu_head));
+}
+
+static void kvm_s2_put_page_rcu(void *addr)
+{
+	struct page *page = virt_to_page(addr);
+
+	if (kvm_host_page_count(addr) == 1)
+		kvm_account_pgtable_pages(addr, -1);
+
+	call_rcu(&page->rcu_head, kvm_s2_rcu_put_page);
+}
+
 static phys_addr_t kvm_host_pa(void *addr)
 {
 	return __pa(addr);
@@ -704,6 +719,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
 	.free_removed_table	= stage2_free_removed_table,
 	.get_page		= kvm_host_get_page,
 	.put_page		= kvm_s2_put_page,
+	.put_page_rcu		= kvm_s2_put_page_rcu,
 	.page_count		= kvm_host_page_count,
 	.phys_to_virt		= kvm_host_va,
 	.virt_to_phys		= kvm_host_pa,
@@ -1662,6 +1678,42 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 					   range->start << PAGE_SHIFT);
 }
 
+static int stage2_test_clear_young(const struct kvm_pgtable_visit_ctx *ctx,
+				   enum kvm_pgtable_walk_flags flags)
+{
+	kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF;
+
+	VM_WARN_ON_ONCE(!page_count(virt_to_page(ctx->ptep)));
+
+	if (!kvm_pte_valid(new))
+		return 0;
+
+	if (new == ctx->old)
+		return 0;
+
+	if (kvm_should_clear_young(ctx->arg, ctx->addr / PAGE_SIZE))
+		stage2_try_set_pte(ctx, new);
+
+	return 0;
+}
+
+bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	u64 start = range->start * PAGE_SIZE;
+	u64 end = range->end * PAGE_SIZE;
+	struct kvm_pgtable_walker walker = {
+		.cb	= stage2_test_clear_young,
+		.arg	= range,
+		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_SHARED,
+	};
+
+	BUILD_BUG_ON(is_hyp_code());
+
+	kvm_pgtable_walk(kvm->arch.mmu.pgt, start, end - start, &walker);
+
+	return false;
+}
+
 phys_addr_t kvm_mmu_get_httbr(void)
 {
 	return __pa(hyp_pgtable->pgd);
@@ -1877,7 +1929,6 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
-	kvm_free_stage2_pgd(&kvm->arch.mmu);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 959f566a455c..3822c1c25d3c 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -883,4 +883,12 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
 
+#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young
+static inline bool kvm_arch_has_test_clear_young(void)
+{
+	return IS_ENABLED(CONFIG_KVM_BOOK3S_HV_POSSIBLE) &&
+	       cpu_has_feature(CPU_FTR_HVMODE) && cpu_has_feature(CPU_FTR_ARCH_300) &&
+	       radix_enabled();
+}
+
 #endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 6bef23d6d0e3..e420ed5dc856 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -276,6 +276,7 @@ struct kvmppc_ops {
 	bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range);
 	bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
 	bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
+	bool (*test_clear_young)(struct kvm *kvm, struct kvm_gfn_range *range);
 	bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
 	void (*free_memslot)(struct kvm_memory_slot *slot);
 	int (*init_vm)(struct kvm *kvm);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 57f4e7896d67..b03be0072bf8 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -877,6 +877,12 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 	return kvm->arch.kvm_ops->test_age_gfn(kvm, range);
 }
 
+bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	return !kvm->arch.kvm_ops->test_clear_young ||
+	       kvm->arch.kvm_ops->test_clear_young(kvm, range);
+}
+
 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	return kvm->arch.kvm_ops->set_spte_gfn(kvm, range);
diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
index 58391b4b32ed..fa2659e21ccc 100644
--- a/arch/powerpc/kvm/book3s.h
+++ b/arch/powerpc/kvm/book3s.h
@@ -12,6 +12,7 @@ extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
 extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range);
 extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
 extern bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_test_clear_young_hv(struct kvm *kvm, struct kvm_gfn_range *range);
 extern bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
 
 extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 9d3743ca16d5..ff8393c4b0f4 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -1083,6 +1083,65 @@ bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 	return ref;
 }
 
+bool kvm_test_clear_young_hv(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	bool err;
+	gfn_t gfn = range->start;
+
+	rcu_read_lock();
+
+	err = !kvm_is_radix(kvm);
+	if (err)
+		goto unlock;
+
+	/*
+	 * Case 1:  This function          kvmppc_switch_mmu_to_hpt()
+	 *
+	 *          rcu_read_lock()
+	 *          Test kvm_is_radix()    kvm->arch.radix = 0
+	 *          Use kvm->arch.pgtable  synchronize_rcu()
+	 *          rcu_read_unlock()
+	 *                                 kvmppc_free_radix()
+	 *
+	 *
+	 * Case 2:  This function          kvmppc_switch_mmu_to_radix()
+	 *
+	 *                                 kvmppc_init_vm_radix()
+	 *                                 smp_wmb()
+	 *          Test kvm_is_radix()    kvm->arch.radix = 1
+	 *          smp_rmb()
+	 *          Use kvm->arch.pgtable
+	 */
+	smp_rmb();
+
+	while (gfn < range->end) {
+		pte_t *ptep;
+		pte_t old, new;
+		unsigned int shift;
+
+		ptep = find_kvm_secondary_pte_unlocked(kvm, gfn * PAGE_SIZE, &shift);
+		if (!ptep)
+			goto next;
+
+		VM_WARN_ON_ONCE(!page_count(virt_to_page(ptep)));
+
+		old = READ_ONCE(*ptep);
+		if (!pte_present(old) || !pte_young(old))
+			goto next;
+
+		new = pte_mkold(old);
+
+		if (kvm_should_clear_young(range, gfn))
+			pte_xchg(ptep, old, new);
+next:
+		gfn += shift ? BIT(shift - PAGE_SHIFT) : 1;
+	}
+unlock:
+	rcu_read_unlock();
+
+	return err;
+}
+
 /* Returns the number of PAGE_SIZE pages that are dirty */
 static int kvm_radix_test_clear_dirty(struct kvm *kvm,
 				struct kvm_memory_slot *memslot, int pagenum)
@@ -1464,13 +1523,15 @@ int kvmppc_radix_init(void)
 {
 	unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
 
-	kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
+	kvm_pte_cache = kmem_cache_create("kvm-pte", size, size,
+					  SLAB_TYPESAFE_BY_RCU, pte_ctor);
 	if (!kvm_pte_cache)
 		return -ENOMEM;
 
 	size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;
 
-	kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
+	kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size,
+					  SLAB_TYPESAFE_BY_RCU, pmd_ctor);
 	if (!kvm_pmd_cache) {
 		kmem_cache_destroy(kvm_pte_cache);
 		return -ENOMEM;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6ba68dd6190b..b878d097ec0e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -5242,6 +5242,8 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
 	spin_lock(&kvm->mmu_lock);
 	kvm->arch.radix = 0;
 	spin_unlock(&kvm->mmu_lock);
+	/* see the comments in kvm_test_clear_young_hv() */
+	synchronize_rcu();
 	kvmppc_free_radix(kvm);
 
 	lpcr = LPCR_VPM1;
@@ -5266,6 +5268,8 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
 	if (err)
 		return err;
 	kvmppc_rmap_reset(kvm);
+	/* see the comments in kvm_test_clear_young_hv() */
+	smp_wmb();
 	/* Mutual exclusion with kvm_unmap_gfn_range etc. */
 	spin_lock(&kvm->mmu_lock);
 	kvm->arch.radix = 1;
@@ -6165,6 +6169,7 @@ static struct kvmppc_ops kvm_ops_hv = {
 	.unmap_gfn_range = kvm_unmap_gfn_range_hv,
 	.age_gfn = kvm_age_gfn_hv,
 	.test_age_gfn = kvm_test_age_gfn_hv,
+	.test_clear_young = kvm_test_clear_young_hv,
 	.set_spte_gfn = kvm_set_spte_gfn_hv,
 	.free_memslot = kvmppc_core_free_memslot_hv,
 	.init_vm =  kvmppc_core_init_vm_hv,
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 808c292ad3f4..7b8567e5a470 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1783,6 +1783,7 @@ struct kvm_arch_async_pf {
 
 extern u32 __read_mostly kvm_nr_uret_msrs;
 extern u64 __read_mostly host_efer;
+extern u64 __read_mostly shadow_accessed_mask;
 extern bool __read_mostly allow_smaller_maxphyaddr;
 extern bool __read_mostly enable_apicv;
 extern struct kvm_x86_ops kvm_x86_ops;
@@ -1866,6 +1867,11 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
 			     bool mask);
 
 extern bool tdp_enabled;
+#ifdef CONFIG_X86_64
+extern bool tdp_mmu_enabled;
+#else
+#define tdp_mmu_enabled false
+#endif
 
 u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
 
@@ -2204,4 +2210,11 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
 	 KVM_X86_QUIRK_FIX_HYPERCALL_INSN |	\
 	 KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
 
+#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young
+static inline bool kvm_arch_has_test_clear_young(void)
+{
+	return IS_ENABLED(CONFIG_X86_64) &&
+	       (!IS_REACHABLE(CONFIG_KVM) || (tdp_mmu_enabled && shadow_accessed_mask));
+}
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 0f38b78ab04b..c59c7b56a3b7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -253,12 +253,6 @@ static inline bool kvm_shadow_root_allocated(struct kvm *kvm)
 	return smp_load_acquire(&kvm->arch.shadow_root_allocated);
 }
 
-#ifdef CONFIG_X86_64
-extern bool tdp_mmu_enabled;
-#else
-#define tdp_mmu_enabled false
-#endif
-
 static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
 {
 	return !tdp_mmu_enabled || kvm_shadow_root_allocated(kvm);
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 1279db2eab44..a82c4fa1c47b 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -153,7 +153,6 @@ extern u64 __read_mostly shadow_mmu_writable_mask;
 extern u64 __read_mostly shadow_nx_mask;
 extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
 extern u64 __read_mostly shadow_user_mask;
-extern u64 __read_mostly shadow_accessed_mask;
 extern u64 __read_mostly shadow_dirty_mask;
 extern u64 __read_mostly shadow_mmio_value;
 extern u64 __read_mostly shadow_mmio_mask;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index a8f10825e288..e6435275e4f2 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1299,6 +1299,40 @@ bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
 }
 
+bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	struct kvm_mmu_page *root;
+	int offset = ffs(shadow_accessed_mask) - 1;
+
+	if (kvm_shadow_root_allocated(kvm))
+		return true;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) {
+		struct tdp_iter iter;
+
+		if (kvm_mmu_page_as_id(root) != range->slot->as_id)
+			continue;
+
+		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) {
+			u64 *sptep = rcu_dereference(iter.sptep);
+
+			VM_WARN_ON_ONCE(!page_count(virt_to_page(sptep)));
+
+			if (!(iter.old_spte & shadow_accessed_mask))
+				continue;
+
+			if (kvm_should_clear_young(range, iter.gfn))
+				clear_bit(offset, (unsigned long *)sptep);
+		}
+	}
+
+	rcu_read_unlock();
+
+	return false;
+}
+
 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
 			 struct kvm_gfn_range *range)
 {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a9adf75344be..1714f82a0c47 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -258,6 +258,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #ifdef KVM_ARCH_WANT_MMU_NOTIFIER
 struct kvm_gfn_range {
 	struct kvm_memory_slot *slot;
+	void *args;
 	gfn_t start;
 	gfn_t end;
 	pte_t pte;
@@ -267,6 +268,27 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_should_clear_young(struct kvm_gfn_range *range, gfn_t gfn);
+bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range);
+#endif
+
+/*
+ * Architectures that implement kvm_arch_test_clear_young() should override
+ * kvm_arch_has_test_clear_young().
+ *
+ * kvm_arch_has_test_clear_young() is allowed to return false positive, i.e., it
+ * can return true if kvm_arch_test_clear_young() is supported but disabled due
+ * to some runtime constraint. In this case, kvm_arch_test_clear_young() should
+ * return true; otherwise, it should return false.
+ *
+ * For each young KVM PTE, kvm_arch_test_clear_young() should call
+ * kvm_should_clear_young() to decide whether to clear the accessed bit.
+ */
+#ifndef kvm_arch_has_test_clear_young
+static inline bool kvm_arch_has_test_clear_young(void)
+{
+	return false;
+}
 #endif
 
 enum {
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 64a3e051c3c4..748e5bddb05f 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -60,6 +60,8 @@ enum mmu_notifier_event {
 };
 
 #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
+#define MMU_NOTIFIER_RANGE_LOCKLESS	(1 << 1)
+#define MMU_NOTIFIER_RANGE_YOUNG	(1 << 2)
 
 struct mmu_notifier_ops {
 	/*
@@ -102,25 +104,9 @@ struct mmu_notifier_ops {
 				 unsigned long start,
 				 unsigned long end);
 
-	/*
-	 * clear_young is a lightweight version of clear_flush_young. Like the
-	 * latter, it is supposed to test-and-clear the young/accessed bitflag
-	 * in the secondary pte, but it may omit flushing the secondary tlb.
-	 */
-	int (*clear_young)(struct mmu_notifier *subscription,
-			   struct mm_struct *mm,
-			   unsigned long start,
-			   unsigned long end);
-
-	/*
-	 * test_young is called to check the young/accessed bitflag in
-	 * the secondary pte. This is used to know if the page is
-	 * frequently used without actually clearing the flag or tearing
-	 * down the secondary mapping on the page.
-	 */
-	int (*test_young)(struct mmu_notifier *subscription,
-			  struct mm_struct *mm,
-			  unsigned long address);
+	int (*test_clear_young)(struct mmu_notifier *mn, struct mm_struct *mm,
+				unsigned long start, unsigned long end,
+				bool clear, unsigned long *bitmap);
 
 	/*
 	 * change_pte is called in cases that pte mapping to page is changed:
@@ -387,11 +373,9 @@ extern void __mmu_notifier_release(struct mm_struct *mm);
 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 					  unsigned long start,
 					  unsigned long end);
-extern int __mmu_notifier_clear_young(struct mm_struct *mm,
-				      unsigned long start,
-				      unsigned long end);
-extern int __mmu_notifier_test_young(struct mm_struct *mm,
-				     unsigned long address);
+extern int __mmu_notifier_test_clear_young(struct mm_struct *mm,
+					   unsigned long start, unsigned long end,
+					   bool clear, unsigned long *bitmap);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
 				      unsigned long address, pte_t pte);
 extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r);
@@ -428,7 +412,7 @@ static inline int mmu_notifier_clear_young(struct mm_struct *mm,
 					   unsigned long end)
 {
 	if (mm_has_notifiers(mm))
-		return __mmu_notifier_clear_young(mm, start, end);
+		return __mmu_notifier_test_clear_young(mm, start, end, true, NULL);
 	return 0;
 }
 
@@ -436,7 +420,36 @@ static inline int mmu_notifier_test_young(struct mm_struct *mm,
 					  unsigned long address)
 {
 	if (mm_has_notifiers(mm))
-		return __mmu_notifier_test_young(mm, address);
+		return __mmu_notifier_test_clear_young(mm, address, address + 1, false, NULL);
+	return 0;
+}
+
+/*
+ * mmu_notifier_test_clear_young() returns nonzero if any of the KVM PTEs within
+ * a given range was young: MMU_NOTIFIER_RANGE_LOCKLESS if the fast path was
+ * successful, MMU_NOTIFIER_RANGE_YOUNG otherwise.
+ *
+ * The last parameter to the function is a bitmap and only the fast path
+ * supports it: if it is NULL, the function falls back to the slow path if the
+ * fast path was unsuccessful; otherwise, the function bails out.
+ *
+ * The bitmap has the following specifications:
+ * 1. The number of bits should be at least (end-start)/PAGE_SIZE.
+ * 2. The offset of each bit should be relative to the end, i.e., the offset
+ *    corresponding to addr should be (end-addr)/PAGE_SIZE-1. This is convenient
+ *    for batching while forward looping.
+ *
+ * When testing, this function sets the corresponding bit in the bitmap for each
+ * young KVM PTE. When clearing, this function clears the accessed bit for each
+ * young KVM PTE whose corresponding bit in the bitmap is set.
+ */
+static inline int mmu_notifier_test_clear_young(struct mm_struct *mm,
+						unsigned long start, unsigned long end,
+						bool clear, unsigned long *bitmap)
+{
+	if (mm_has_notifiers(mm))
+		return __mmu_notifier_test_clear_young(mm, start, end, clear, bitmap);
+
 	return 0;
 }
 
@@ -684,12 +697,26 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return 0;
 }
 
+static inline int mmu_notifier_clear_young(struct mm_struct *mm,
+					   unsigned long start,
+					   unsigned long end)
+{
+	return 0;
+}
+
 static inline int mmu_notifier_test_young(struct mm_struct *mm,
 					  unsigned long address)
 {
 	return 0;
 }
 
+static inline int mmu_notifier_test_clear_young(struct mm_struct *mm,
+						unsigned long start, unsigned long end,
+						bool clear, unsigned long *bitmap)
+{
+	return 0;
+}
+
 static inline void mmu_notifier_change_pte(struct mm_struct *mm,
 					   unsigned long address, pte_t pte)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9fb1b03b83b2..bcb125f42635 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -379,6 +379,7 @@ enum {
 	LRU_GEN_CORE,
 	LRU_GEN_MM_WALK,
 	LRU_GEN_NONLEAF_YOUNG,
+	LRU_GEN_KVM_MMU_WALK,
 	NR_LRU_GEN_CAPS
 };
 
@@ -485,7 +486,7 @@ struct lru_gen_mm_walk {
 };
 
 void lru_gen_init_lruvec(struct lruvec *lruvec);
-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
 
 #ifdef CONFIG_MEMCG
 
@@ -573,8 +574,9 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
 {
 }
 
-static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 {
+	return false;
 }
 
 #ifdef CONFIG_MEMCG
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 3bd31ea23fee..46c347e56e60 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -489,21 +489,6 @@ TRACE_EVENT(kvm_age_hva,
 		  __entry->start, __entry->end)
 );
 
-TRACE_EVENT(kvm_test_age_hva,
-	TP_PROTO(unsigned long hva),
-	TP_ARGS(hva),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	hva		)
-	),
-
-	TP_fast_assign(
-		__entry->hva		= hva;
-	),
-
-	TP_printk("mmu notifier test age hva: %#016lx", __entry->hva)
-);
-
 #endif /* _TRACE_KVM_MAIN_H */
 
 /* This part must be outside protection */
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 50c0dde1354f..c7e9747c9920 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -382,44 +382,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return young;
 }
 
-int __mmu_notifier_clear_young(struct mm_struct *mm,
-			       unsigned long start,
-			       unsigned long end)
+int __mmu_notifier_test_clear_young(struct mm_struct *mm,
+				    unsigned long start, unsigned long end,
+				    bool clear, unsigned long *bitmap)
 {
-	struct mmu_notifier *subscription;
-	int young = 0, id;
+	int idx;
+	struct mmu_notifier *mn;
+	int young = 0;
 
-	id = srcu_read_lock(&srcu);
-	hlist_for_each_entry_rcu(subscription,
-				 &mm->notifier_subscriptions->list, hlist,
-				 srcu_read_lock_held(&srcu)) {
-		if (subscription->ops->clear_young)
-			young |= subscription->ops->clear_young(subscription,
-								mm, start, end);
-	}
-	srcu_read_unlock(&srcu, id);
+	idx = srcu_read_lock(&srcu);
 
-	return young;
-}
+	hlist_for_each_entry_srcu(mn, &mm->notifier_subscriptions->list, hlist,
+				  srcu_read_lock_held(&srcu)) {
+		if (mn->ops->test_clear_young)
+			young |= mn->ops->test_clear_young(mn, mm, start, end, clear, bitmap);
 
-int __mmu_notifier_test_young(struct mm_struct *mm,
-			      unsigned long address)
-{
-	struct mmu_notifier *subscription;
-	int young = 0, id;
-
-	id = srcu_read_lock(&srcu);
-	hlist_for_each_entry_rcu(subscription,
-				 &mm->notifier_subscriptions->list, hlist,
-				 srcu_read_lock_held(&srcu)) {
-		if (subscription->ops->test_young) {
-			young = subscription->ops->test_young(subscription, mm,
-							      address);
-			if (young)
-				break;
-		}
+		if (young && !clear)
+			break;
 	}
-	srcu_read_unlock(&srcu, id);
+
+	srcu_read_unlock(&srcu, idx);
 
 	return young;
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index 8632e02661ac..53a524b4f948 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -823,12 +823,10 @@ static bool folio_referenced_one(struct folio *folio,
 			return false; /* To break the loop */
 		}
 
-		if (pvmw.pte) {
-			if (lru_gen_enabled() && pte_young(*pvmw.pte)) {
-				lru_gen_look_around(&pvmw);
+		if (lru_gen_enabled() && pvmw.pte) {
+			if (lru_gen_look_around(&pvmw))
 				referenced++;
-			}
-
+		} else if (pvmw.pte) {
 			if (ptep_clear_flush_young_notify(vma, address,
 						pvmw.pte))
 				referenced++;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7f2767a2e821..f7864a16b3d2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -57,6 +57,7 @@
 #include <linux/khugepaged.h>
 #include <linux/rculist_nulls.h>
 #include <linux/random.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -3180,6 +3181,20 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
 #define get_cap(cap)	static_branch_unlikely(&lru_gen_caps[cap])
 #endif
 
+#if IS_ENABLED(CONFIG_KVM)
+#include <linux/kvm_host.h>
+
+static bool should_walk_kvm_mmu(void)
+{
+	return kvm_arch_has_test_clear_young() && get_cap(LRU_GEN_KVM_MMU_WALK);
+}
+#else
+static bool should_walk_kvm_mmu(void)
+{
+	return false;
+}
+#endif
+
 /******************************************************************************
  *                          shorthand helpers
  ******************************************************************************/
@@ -3937,6 +3952,55 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
 	return folio;
 }
 
+static bool test_spte_young(struct mm_struct *mm, unsigned long addr, unsigned long end,
+			    unsigned long *bitmap, unsigned long *last)
+{
+	if (!should_walk_kvm_mmu())
+		return false;
+
+	if (*last > addr)
+		goto done;
+
+	*last = end - addr > MIN_LRU_BATCH * PAGE_SIZE ?
+		addr + MIN_LRU_BATCH * PAGE_SIZE - 1 : end - 1;
+	bitmap_zero(bitmap, MIN_LRU_BATCH);
+
+	mmu_notifier_test_clear_young(mm, addr, *last + 1, false, bitmap);
+done:
+	return test_bit((*last - addr) / PAGE_SIZE, bitmap);
+}
+
+static void clear_spte_young(struct mm_struct *mm, unsigned long addr,
+			     unsigned long *bitmap, unsigned long *last)
+{
+	int i;
+	unsigned long start, end = *last + 1;
+
+	if (addr + PAGE_SIZE != end)
+		return;
+
+	i = find_last_bit(bitmap, MIN_LRU_BATCH);
+	if (i == MIN_LRU_BATCH)
+		return;
+
+	start = end - (i + 1) * PAGE_SIZE;
+
+	i = find_first_bit(bitmap, MIN_LRU_BATCH);
+
+	end -= i * PAGE_SIZE;
+
+	mmu_notifier_test_clear_young(mm, start, end, true, bitmap);
+}
+
+static void skip_spte_young(struct mm_struct *mm, unsigned long addr,
+			    unsigned long *bitmap, unsigned long *last)
+{
+	if (*last > addr)
+		__clear_bit((*last - addr) / PAGE_SIZE, bitmap);
+
+	clear_spte_young(mm, addr, bitmap, last);
+}
+
 static bool suitable_to_scan(int total, int young)
 {
 	int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
@@ -3952,6 +4016,8 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
 	pte_t *pte;
 	spinlock_t *ptl;
 	unsigned long addr;
+	DECLARE_BITMAP(bitmap, MIN_LRU_BATCH);
+	unsigned long last = 0;
 	int total = 0;
 	int young = 0;
 	struct lru_gen_mm_walk *walk = args->private;
@@ -3970,6 +4036,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
 	pte = pte_offset_map(pmd, start & PMD_MASK);
 restart:
 	for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
+		bool ret;
 		unsigned long pfn;
 		struct folio *folio;
 
@@ -3977,20 +4044,27 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
 		walk->mm_stats[MM_LEAF_TOTAL]++;
 
 		pfn = get_pte_pfn(pte[i], args->vma, addr);
-		if (pfn == -1)
+		if (pfn == -1) {
+			skip_spte_young(args->vma->vm_mm, addr, bitmap, &last);
 			continue;
+		}
 
-		if (!pte_young(pte[i])) {
+		ret = test_spte_young(args->vma->vm_mm, addr, end, bitmap, &last);
+		if (!ret && !pte_young(pte[i])) {
+			skip_spte_young(args->vma->vm_mm, addr, bitmap, &last);
 			walk->mm_stats[MM_LEAF_OLD]++;
 			continue;
 		}
 
 		folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
-		if (!folio)
+		if (!folio) {
+			skip_spte_young(args->vma->vm_mm, addr, bitmap, &last);
 			continue;
+		}
 
-		if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
-			VM_WARN_ON_ONCE(true);
+		clear_spte_young(args->vma->vm_mm, addr, bitmap, &last);
+		if (pte_young(pte[i]))
+			ptep_test_and_clear_young(args->vma, addr, pte + i);
 
 		young++;
 		walk->mm_stats[MM_LEAF_YOUNG]++;
@@ -4599,6 +4673,23 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
  *                          rmap/PT walk feedback
  ******************************************************************************/
 
+static bool should_look_around(struct vm_area_struct *vma, unsigned long addr,
+			       pte_t *pte, int *young)
+{
+	int ret = mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE);
+
+	if (pte_young(*pte)) {
+		ptep_test_and_clear_young(vma, addr, pte);
+		*young = true;
+		return true;
+	}
+
+	if (ret)
+		*young = true;
+
+	return ret & MMU_NOTIFIER_RANGE_LOCKLESS;
+}
+
 /*
  * This function exploits spatial locality when shrink_folio_list() walks the
  * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
@@ -4606,12 +4697,14 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
  * the PTE table to the Bloom filter. This forms a feedback loop between the
  * eviction and the aging.
  */
-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 {
 	int i;
 	unsigned long start;
 	unsigned long end;
 	struct lru_gen_mm_walk *walk;
+	DECLARE_BITMAP(bitmap, MIN_LRU_BATCH);
+	unsigned long last = 0;
 	int young = 0;
 	pte_t *pte = pvmw->pte;
 	unsigned long addr = pvmw->address;
@@ -4625,8 +4718,11 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	lockdep_assert_held(pvmw->ptl);
 	VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
 
+	if (!should_look_around(pvmw->vma, addr, pte, &young))
+		return young;
+
 	if (spin_is_contended(pvmw->ptl))
-		return;
+		return young;
 
 	/* avoid taking the LRU lock under the PTL when possible */
 	walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
@@ -4634,6 +4730,9 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	start = max(addr & PMD_MASK, pvmw->vma->vm_start);
 	end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
 
+	if (end - start == PAGE_SIZE)
+		return young;
+
 	if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
 		if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
 			end = start + MIN_LRU_BATCH * PAGE_SIZE;
@@ -4647,28 +4746,37 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 
 	/* folio_update_gen() requires stable folio_memcg() */
 	if (!mem_cgroup_trylock_pages(memcg))
-		return;
+		return young;
 
 	arch_enter_lazy_mmu_mode();
 
 	pte -= (addr - start) / PAGE_SIZE;
 
 	for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
+		bool ret;
 		unsigned long pfn;
 
 		pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
-		if (pfn == -1)
+		if (pfn == -1) {
+			skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last);
 			continue;
+		}
 
-		if (!pte_young(pte[i]))
+		ret = test_spte_young(pvmw->vma->vm_mm, addr, end, bitmap, &last);
+		if (!ret && !pte_young(pte[i])) {
+			skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last);
 			continue;
+		}
 
 		folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap);
-		if (!folio)
+		if (!folio) {
+			skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last);
 			continue;
+		}
 
-		if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
-			VM_WARN_ON_ONCE(true);
+		clear_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last);
+		if (pte_young(pte[i]))
+			ptep_test_and_clear_young(pvmw->vma, addr, pte + i);
 
 		young++;
 
@@ -4698,6 +4806,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	/* feedback from rmap walkers to page table walkers */
 	if (suitable_to_scan(i, young))
 		update_bloom_filter(lruvec, max_seq, pvmw->pmd);
+
+	return young;
 }
 
 /******************************************************************************
@@ -5717,6 +5827,9 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
 	if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
 		caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
 
+	if (should_walk_kvm_mmu())
+		caps |= BIT(LRU_GEN_KVM_MMU_WALK);
+
 	return sysfs_emit(buf, "0x%04x\n", caps);
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4ec97f72f8ce..bad41a295f7b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -538,6 +538,7 @@ typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
 typedef void (*on_unlock_fn_t)(struct kvm *kvm);
 
 struct kvm_hva_range {
+	void *args;
 	unsigned long start;
 	unsigned long end;
 	pte_t pte;
@@ -546,6 +547,7 @@ struct kvm_hva_range {
 	on_unlock_fn_t on_unlock;
 	bool flush_on_ret;
 	bool may_block;
+	bool lockless;
 };
 
 /*
@@ -599,6 +601,8 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 			hva_end = min(range->end, slot->userspace_addr +
 						  (slot->npages << PAGE_SHIFT));
 
+			gfn_range.args = range->args;
+
 			/*
 			 * To optimize for the likely case where the address
 			 * range is covered by zero or one memslots, don't
@@ -616,7 +620,7 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
 			gfn_range.slot = slot;
 
-			if (!locked) {
+			if (!range->lockless && !locked) {
 				locked = true;
 				KVM_MMU_LOCK(kvm);
 				if (!IS_KVM_NULL_FN(range->on_lock))
@@ -625,6 +629,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 					break;
 			}
 			ret |= range->handler(kvm, &gfn_range);
+
+			if (range->lockless && ret)
+				break;
 		}
 	}
 
@@ -664,25 +671,6 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
 	return __kvm_handle_hva_range(kvm, &range);
 }
 
-static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
-							 unsigned long start,
-							 unsigned long end,
-							 hva_handler_t handler)
-{
-	struct kvm *kvm = mmu_notifier_to_kvm(mn);
-	const struct kvm_hva_range range = {
-		.start		= start,
-		.end		= end,
-		.pte		= __pte(0),
-		.handler	= handler,
-		.on_lock	= (void *)kvm_null_fn,
-		.on_unlock	= (void *)kvm_null_fn,
-		.flush_on_ret	= false,
-		.may_block	= false,
-	};
-
-	return __kvm_handle_hva_range(kvm, &range);
-}
 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 					struct mm_struct *mm,
 					unsigned long address,
@@ -844,37 +832,70 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 	return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
 }
 
-static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
-					struct mm_struct *mm,
-					unsigned long start,
-					unsigned long end)
+struct test_clear_young_args {
+	unsigned long *bitmap;
+	unsigned long end;
+	bool clear;
+	bool young;
+};
+
+bool kvm_should_clear_young(struct kvm_gfn_range *range, gfn_t gfn)
 {
-	trace_kvm_age_hva(start, end);
+	struct test_clear_young_args *args = range->args;
 
-	/*
-	 * Even though we do not flush TLB, this will still adversely
-	 * affect performance on pre-Haswell Intel EPT, where there is
-	 * no EPT Access Bit to clear so that we have to tear down EPT
-	 * tables instead. If we find this unacceptable, we can always
-	 * add a parameter to kvm_age_hva so that it effectively doesn't
-	 * do anything on clear_young.
-	 *
-	 * Also note that currently we never issue secondary TLB flushes
-	 * from clear_young, leaving this job up to the regular system
-	 * cadence. If we find this inaccurate, we might come up with a
-	 * more sophisticated heuristic later.
-	 */
-	return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
+	VM_WARN_ON_ONCE(gfn < range->start || gfn >= range->end);
+
+	args->young = true;
+
+	if (args->bitmap) {
+		int offset = hva_to_gfn_memslot(args->end - 1, range->slot) - gfn;
+
+		if (args->clear)
+			return test_bit(offset, args->bitmap);
+
+		__set_bit(offset, args->bitmap);
+	}
+
+	return args->clear;
 }
 
-static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
-				       struct mm_struct *mm,
-				       unsigned long address)
+static int kvm_mmu_notifier_test_clear_young(struct mmu_notifier *mn, struct mm_struct *mm,
+					     unsigned long start, unsigned long end,
+					     bool clear, unsigned long *bitmap)
 {
-	trace_kvm_test_age_hva(address);
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	struct kvm_hva_range range = {
+		.start		= start,
+		.end		= end,
+		.on_lock	= (void *)kvm_null_fn,
+		.on_unlock	= (void *)kvm_null_fn,
+	};
+
+	trace_kvm_age_hva(start, end);
+
+	if (kvm_arch_has_test_clear_young()) {
+		struct test_clear_young_args args = {
+			.bitmap	= bitmap,
+			.end	= end,
+			.clear	= clear,
+		};
+
+		range.args = &args;
+		range.lockless = true;
+		range.handler = kvm_arch_test_clear_young;
+
+		if (!__kvm_handle_hva_range(kvm, &range))
+			return args.young ? MMU_NOTIFIER_RANGE_LOCKLESS : 0;
+	}
+
+	if (bitmap)
+		return 0;
+
+	range.args = NULL;
+	range.lockless = false;
+	range.handler = clear ? kvm_age_gfn : kvm_test_age_gfn;
 
-	return kvm_handle_hva_range_no_flush(mn, address, address + 1,
-					     kvm_test_age_gfn);
+	return __kvm_handle_hva_range(kvm, &range) ? MMU_NOTIFIER_RANGE_YOUNG : 0;
 }
 
 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
@@ -893,8 +914,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
-	.clear_young		= kvm_mmu_notifier_clear_young,
-	.test_young		= kvm_mmu_notifier_test_young,
+	.test_clear_young	= kvm_mmu_notifier_test_clear_young,
 	.change_pte		= kvm_mmu_notifier_change_pte,
 	.release		= kvm_mmu_notifier_release,
 };
-- 
2.41.0

From 160230318f701eee806555c90b3a96071024cfe5 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Fri, 9 Jun 2023 20:07:21 +0200
Subject: [PATCH 09/12] Per-VMA locks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previous versions:
v3: https://lore.kernel.org/all/20230216051750.3125598-1-surenb@google.com/
v2: https://lore.kernel.org/lkml/20230127194110.533103-1-surenb@google.com/
v1: https://lore.kernel.org/all/20230109205336.3665937-1-surenb@google.com/
RFC: https://lore.kernel.org/all/20220901173516.702122-1-surenb@google.com/

LWN article describing the feature:
https://lwn.net/Articles/906852/

Per-vma locks idea that was discussed during SPF [1] discussion at LSF/MM
last year [2], which concluded with suggestion that “a reader/writer
semaphore could be put into the VMA itself; that would have the effect of
using the VMA as a sort of range lock. There would still be contention at
the VMA level, but it would be an improvement.” This patchset implements
this suggested approach.

When handling page faults we lookup the VMA that contains the faulting
page under RCU protection and try to acquire its lock. If that fails we
fall back to using mmap_lock, similar to how SPF handled this situation.

One notable way the implementation deviates from the proposal is the way
VMAs are read-locked. During some of mm updates, multiple VMAs need to be
locked until the end of the update (e.g. vma_merge, split_vma, etc).
Tracking all the locked VMAs, avoiding recursive locks, figuring out when
it's safe to unlock previously locked VMAs would make the code more
complex. So, instead of the usual lock/unlock pattern, the proposed
solution marks a VMA as locked and provides an efficient way to:
1. Identify locked VMAs.
2. Unlock all locked VMAs in bulk.
We also postpone unlocking the locked VMAs until the end of the update,
when we do mmap_write_unlock. Potentially this keeps a VMA locked for
longer than is absolutely necessary but it results in a big reduction of
code complexity.
Read-locking a VMA is done using two sequence numbers - one in the
vm_area_struct and one in the mm_struct. VMA is considered read-locked
when these sequence numbers are equal. To read-lock a VMA we set the
sequence number in vm_area_struct to be equal to the sequence number in
mm_struct. To unlock all VMAs we increment mm_struct's seq number. This
allows for an efficient way to track locked VMAs and to drop the locks on
all VMAs at the end of the update.

The patchset implements per-VMA locking only for anonymous pages which
are not in swap and avoids userfaultfs as their implementation is more
complex. Additional support for file-back page faults, swapped and user
pages can be added incrementally.

Performance benchmarks show similar although slightly smaller benefits as
with SPF patchset (~75% of SPF benefits). Still, with lower complexity
this approach might be more desirable.

Since RFC was posted in September 2022, two separate Google teams outside
of Android evaluated the patchset and confirmed positive results. Here are
the known usecases when per-VMA locks show benefits:

Android:
Apps with high number of threads (~100) launch times improve by up to 20%.
Each thread mmaps several areas upon startup (Stack and Thread-local
storage (TLS), thread signal stack, indirect ref table), which requires
taking mmap_lock in write mode. Page faults take mmap_lock in read mode.
During app launch, both thread creation and page faults establishing the
active workinget are happening in parallel and that causes lock contention
between mm writers and readers even if updates and page faults are
happening in different VMAs. Per-vma locks prevent this contention by
providing more granular lock.

Google Fibers:
We have several dynamically sized thread pools that spawn new threads
under increased load and reduce their number when idling. For example,
Google's in-process scheduling/threading framework, UMCG/Fibers, is backed
by such a thread pool. When idling, only a small number of idle worker
threads are available; when a spike of incoming requests arrive, each
request is handled in its own "fiber", which is a work item posted onto a
UMCG worker thread; quite often these spikes lead to a number of new
threads spawning. Each new thread needs to allocate and register an RSEQ
section on its TLS, then register itself with the kernel as a UMCG worker
thread, and only after that it can be considered by the in-process
UMCG/Fiber scheduler as available to do useful work. In short, during an
incoming workload spike new threads have to be spawned, and they perform
several syscalls (RSEQ registration, UMCG worker registration, memory
allocations) before they can actually start doing useful work. Removing
any bottlenecks on this thread startup path will greatly improve our
services' latencies when faced with request/workload spikes.
At high scale, mmap_lock contention during thread creation and stack page
faults leads to user-visible multi-second serving latencies in a similar
pattern to Android app startup. Per-VMA locking patchset has been run
successfully in limited experiments with user-facing production workloads.
In these experiments, we observed that the peak thread creation rate was
high enough that thread creation is no longer a bottleneck.

TCP zerocopy receive:
From the point of view of TCP zerocopy receive, the per-vma lock patch is
massively beneficial.
In today's implementation, a process with N threads where N - 1 are
performing zerocopy receive and 1 thread is performing madvise() with the
write lock taken (e.g. needs to change vm_flags) will result in all N -1
receive threads blocking until the madvise is done. Conversely, on a busy
process receiving a lot of data, an madvise operation that does need to
take the mmap lock in write mode will need to wait for all of the receives
to be done - a lose:lose proposition. Per-VMA locking _removes_ by
definition this source of contention entirely.
There are other benefits for receive as well, chiefly a reduction in
cacheline bouncing across receiving threads for locking/unlocking the
single mmap lock. On an RPC style synthetic workload with 4KB RPCs:
1a) The find+lock+unlock VMA path in the base case, without the per-vma
lock patchset, is about 0.7% of cycles as measured by perf.
1b) mmap_read_lock + mmap_read_unlock in the base case is about 0.5%
cycles overall - most of this is within the TCP read hotpath (a small
fraction is 'other' usage in the system).
2a) The find+lock+unlock VMA path, with the per-vma patchset and a trivial
patch written to take advantage of it in TCP, is about 0.4% of cycles
(down from 0.7% above)
2b) mmap_read_lock + mmap_read_unlock in the per-vma patchset is < 0.1%
cycles and is out of the TCP read hotpath entirely (down from 0.5% before,
the remaining usage is the 'other' usage in the system).
So, in addition to entirely removing an onerous source of contention, it
also reduces the CPU cycles of TCP receive zerocopy by about 0.5%+
(compared to overall cycles in perf) for the 'small' RPC scenario.

The patchset structure is:
0001-0008: Enable maple-tree RCU mode
0009-0031: Main per-vma locks patchset
0032-0033: Performance optimizations

Changes since v3:
- Changed patch [3] to move vma_prepare before vma_adjust_trans_huge
- Dropped patch [4] from the set as unnecessary, per Hyeonggon Yoo
- Changed patch [5] to do VMA locking inside vma_prepare, per Liam Howlett
- Dropped patch [6] from the set as unnecessary, per Liam Howlett

[1] https://lore.kernel.org/all/20220128131006.67712-1-michel@lespinasse.org/
[2] https://lwn.net/Articles/893906/
[3] https://lore.kernel.org/all/20230216051750.3125598-15-surenb@google.com/
[4] https://lore.kernel.org/all/20230216051750.3125598-17-surenb@google.com/
[5] https://lore.kernel.org/all/20230216051750.3125598-18-surenb@google.com/
[6] https://lore.kernel.org/all/20230216051750.3125598-22-surenb@google.com/

The patchset applies cleanly over mm-unstable branch.

Laurent Dufour (1):
  powerc/mm: try VMA lock-based page fault handling first

Liam Howlett (4):
  maple_tree: Be more cautious about dead nodes
  maple_tree: Detect dead nodes in mas_start()
  maple_tree: Fix freeing of nodes in rcu mode
  maple_tree: remove extra smp_wmb() from mas_dead_leaves()

Liam R. Howlett (4):
  maple_tree: Fix write memory barrier of nodes once dead for RCU mode
  maple_tree: Add smp_rmb() to dead node detection
  maple_tree: Add RCU lock checking to rcu callback functions
  mm: Enable maple tree RCU mode by default.

Michel Lespinasse (1):
  mm: rcu safe VMA freeing

Suren Baghdasaryan (23):
  mm: introduce CONFIG_PER_VMA_LOCK
  mm: move mmap_lock assert function definitions
  mm: add per-VMA lock and helper functions to control it
  mm: mark VMA as being written when changing vm_flags
  mm/mmap: move vma_prepare before vma_adjust_trans_huge
  mm/khugepaged: write-lock VMA while collapsing a huge page
  mm/mmap: write-lock VMAs in vma_prepare before modifying them
  mm/mremap: write-lock VMA while remapping it to a new address range
  mm: write-lock VMAs before removing them from VMA tree
  mm: conditionally write-lock VMA in free_pgtables
  kernel/fork: assert no VMA readers during its destruction
  mm/mmap: prevent pagefault handler from racing with mmu_notifier
    registration
  mm: introduce vma detached flag
  mm: introduce lock_vma_under_rcu to be used from arch-specific code
  mm: fall back to mmap_lock if vma->anon_vma is not yet set
  mm: add FAULT_FLAG_VMA_LOCK flag
  mm: prevent do_swap_page from handling page faults under VMA lock
  mm: prevent userfaults to be handled under per-vma lock
  mm: introduce per-VMA lock statistics
  x86/mm: try VMA lock-based page fault handling first
  arm64/mm: try VMA lock-based page fault handling first
  mm/mmap: free vm_area_struct without call_rcu in exit_mmap
  mm: separate vma->lock from vm_area_struct

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 arch/arm64/Kconfig                     |   1 +
 arch/arm64/mm/fault.c                  |  35 +++++++
 arch/powerpc/mm/fault.c                |  37 +++++++
 arch/powerpc/platforms/powernv/Kconfig |   1 +
 arch/powerpc/platforms/pseries/Kconfig |   1 +
 arch/s390/Kconfig                      |   1 +
 arch/s390/mm/fault.c                   |  24 +++++
 arch/x86/Kconfig                       |   1 +
 arch/x86/mm/fault.c                    |  36 +++++++
 include/linux/mm.h                     | 127 +++++++++++++++++++++++--
 include/linux/mm_types.h               |  30 +++++-
 include/linux/mmap_lock.h              |  37 ++++---
 include/linux/vm_event_item.h          |   6 ++
 include/linux/vmstat.h                 |   6 ++
 kernel/fork.c                          |  96 ++++++++++++++++---
 mm/Kconfig                             |  12 +++
 mm/Kconfig.debug                       |  12 +++
 mm/init-mm.c                           |   3 +
 mm/internal.h                          |   2 +-
 mm/khugepaged.c                        |   8 ++
 mm/memory.c                            |  72 +++++++++++++-
 mm/mmap.c                              |  48 +++++++---
 mm/mremap.c                            |   1 +
 mm/rmap.c                              |  31 +++---
 mm/vmstat.c                            |   6 ++
 25 files changed, 572 insertions(+), 62 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1023e896d46b..6f104c829731 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -95,6 +95,7 @@ config ARM64
 	select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
 	select ARCH_SUPPORTS_NUMA_BALANCING
 	select ARCH_SUPPORTS_PAGE_TABLE_CHECK
+	select ARCH_SUPPORTS_PER_VMA_LOCK
 	select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
 	select ARCH_WANT_DEFAULT_BPF_JIT
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index d1136259b7b8..6045a5117ac1 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -535,6 +535,9 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
 	unsigned long vm_flags;
 	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
 	unsigned long addr = untagged_addr(far);
+#ifdef CONFIG_PER_VMA_LOCK
+	struct vm_area_struct *vma;
+#endif
 
 	if (kprobe_page_fault(regs, esr))
 		return 0;
@@ -585,6 +588,35 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
 
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
 
+#ifdef CONFIG_PER_VMA_LOCK
+	if (!(mm_flags & FAULT_FLAG_USER))
+		goto lock_mmap;
+
+	vma = lock_vma_under_rcu(mm, addr);
+	if (!vma)
+		goto lock_mmap;
+
+	if (!(vma->vm_flags & vm_flags)) {
+		vma_end_read(vma);
+		goto lock_mmap;
+	}
+	fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs);
+	vma_end_read(vma);
+
+	if (!(fault & VM_FAULT_RETRY)) {
+		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+		goto done;
+	}
+	count_vm_vma_lock_event(VMA_LOCK_RETRY);
+
+	/* Quick path to respond to signals */
+	if (fault_signal_pending(fault, regs)) {
+		if (!user_mode(regs))
+			goto no_context;
+		return 0;
+	}
+lock_mmap:
+#endif /* CONFIG_PER_VMA_LOCK */
 	/*
 	 * As per x86, we may deadlock here. However, since the kernel only
 	 * validly references user space from well defined areas of the code,
@@ -628,6 +660,9 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
 	}
 	mmap_read_unlock(mm);
 
+#ifdef CONFIG_PER_VMA_LOCK
+done:
+#endif
 	/*
 	 * Handle the "normal" (no error) case first.
 	 */
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index af46aa88422b..531177a4ee08 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -474,6 +474,40 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
 	if (is_exec)
 		flags |= FAULT_FLAG_INSTRUCTION;
 
+#ifdef CONFIG_PER_VMA_LOCK
+	if (!(flags & FAULT_FLAG_USER))
+		goto lock_mmap;
+
+	vma = lock_vma_under_rcu(mm, address);
+	if (!vma)
+		goto lock_mmap;
+
+	if (unlikely(access_pkey_error(is_write, is_exec,
+				       (error_code & DSISR_KEYFAULT), vma))) {
+		vma_end_read(vma);
+		goto lock_mmap;
+	}
+
+	if (unlikely(access_error(is_write, is_exec, vma))) {
+		vma_end_read(vma);
+		goto lock_mmap;
+	}
+
+	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
+	vma_end_read(vma);
+
+	if (!(fault & VM_FAULT_RETRY)) {
+		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+		goto done;
+	}
+	count_vm_vma_lock_event(VMA_LOCK_RETRY);
+
+	if (fault_signal_pending(fault, regs))
+		return user_mode(regs) ? 0 : SIGBUS;
+
+lock_mmap:
+#endif /* CONFIG_PER_VMA_LOCK */
+
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
 	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
@@ -550,6 +584,9 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
 
 	mmap_read_unlock(current->mm);
 
+#ifdef CONFIG_PER_VMA_LOCK
+done:
+#endif
 	if (unlikely(fault & VM_FAULT_ERROR))
 		return mm_fault_error(regs, address, fault);
 
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index ae248a161b43..70a46acc70d6 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -16,6 +16,7 @@ config PPC_POWERNV
 	select PPC_DOORBELL
 	select MMU_NOTIFIER
 	select FORCE_SMP
+	select ARCH_SUPPORTS_PER_VMA_LOCK
 	default y
 
 config OPAL_PRD
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 21b22bf16ce6..4ebf2ef2845d 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -22,6 +22,7 @@ config PPC_PSERIES
 	select HOTPLUG_CPU
 	select FORCE_SMP
 	select SWIOTLB
+	select ARCH_SUPPORTS_PER_VMA_LOCK
 	default y
 
 config PARAVIRT
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 35f15c23c491..ab01dd28aec4 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -116,6 +116,7 @@ config S390
 	select ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	select ARCH_SUPPORTS_HUGETLBFS
 	select ARCH_SUPPORTS_NUMA_BALANCING
+	select ARCH_SUPPORTS_PER_VMA_LOCK
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index a2632fd97d00..b65144c392b0 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -407,6 +407,30 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
 		access = VM_WRITE;
 	if (access == VM_WRITE)
 		flags |= FAULT_FLAG_WRITE;
+#ifdef CONFIG_PER_VMA_LOCK
+	if (!(flags & FAULT_FLAG_USER))
+		goto lock_mmap;
+	vma = lock_vma_under_rcu(mm, address);
+	if (!vma)
+		goto lock_mmap;
+	if (!(vma->vm_flags & access)) {
+		vma_end_read(vma);
+		goto lock_mmap;
+	}
+	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
+	vma_end_read(vma);
+	if (!(fault & VM_FAULT_RETRY)) {
+		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+		goto out;
+	}
+	count_vm_vma_lock_event(VMA_LOCK_RETRY);
+	/* Quick path to respond to signals */
+	if (fault_signal_pending(fault, regs)) {
+		fault = VM_FAULT_SIGNAL;
+		goto out;
+	}
+lock_mmap:
+#endif /* CONFIG_PER_VMA_LOCK */
 	mmap_read_lock(mm);
 
 	gmap = NULL;
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a825bf031f49..df21fba77db1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,6 +27,7 @@ config X86_64
 	# Options that are inherently 64-bit kernel only:
 	select ARCH_HAS_GIGANTIC_PAGE
 	select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
+	select ARCH_SUPPORTS_PER_VMA_LOCK
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select HAVE_ARCH_SOFT_DIRTY
 	select MODULES_USE_ELF_RELA
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a498ae1fbe66..e4399983c50c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -19,6 +19,7 @@
 #include <linux/uaccess.h>		/* faulthandler_disabled()	*/
 #include <linux/efi.h>			/* efi_crash_gracefully_on_page_fault()*/
 #include <linux/mm_types.h>
+#include <linux/mm.h>			/* find_and_lock_vma() */
 
 #include <asm/cpufeature.h>		/* boot_cpu_has, ...		*/
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
@@ -1333,6 +1334,38 @@ void do_user_addr_fault(struct pt_regs *regs,
 	}
 #endif
 
+#ifdef CONFIG_PER_VMA_LOCK
+	if (!(flags & FAULT_FLAG_USER))
+		goto lock_mmap;
+
+	vma = lock_vma_under_rcu(mm, address);
+	if (!vma)
+		goto lock_mmap;
+
+	if (unlikely(access_error(error_code, vma))) {
+		vma_end_read(vma);
+		goto lock_mmap;
+	}
+	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
+	vma_end_read(vma);
+
+	if (!(fault & VM_FAULT_RETRY)) {
+		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+		goto done;
+	}
+	count_vm_vma_lock_event(VMA_LOCK_RETRY);
+
+	/* Quick path to respond to signals */
+	if (fault_signal_pending(fault, regs)) {
+		if (!user_mode(regs))
+			kernelmode_fixup_or_oops(regs, error_code, address,
+						 SIGBUS, BUS_ADRERR,
+						 ARCH_DEFAULT_PKEY);
+		return;
+	}
+lock_mmap:
+#endif /* CONFIG_PER_VMA_LOCK */
+
 	/*
 	 * Kernel-mode access to the user address space should only occur
 	 * on well-defined single instructions listed in the exception
@@ -1433,6 +1466,9 @@ void do_user_addr_fault(struct pt_regs *regs,
 	}
 
 	mmap_read_unlock(mm);
+#ifdef CONFIG_PER_VMA_LOCK
+done:
+#endif
 	if (likely(!(fault & VM_FAULT_ERROR)))
 		return;
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7e0a7b60c9f9..f7ab97369477 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -256,6 +256,8 @@ void setup_initial_init_mm(void *start_code, void *end_code,
 struct vm_area_struct *vm_area_alloc(struct mm_struct *);
 struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
 void vm_area_free(struct vm_area_struct *);
+/* Use only if VMA has no other users */
+void __vm_area_free(struct vm_area_struct *vma);
 
 #ifndef CONFIG_MMU
 extern struct rb_root nommu_region_tree;
@@ -478,7 +480,8 @@ static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
 	{ FAULT_FLAG_USER,		"USER" }, \
 	{ FAULT_FLAG_REMOTE,		"REMOTE" }, \
 	{ FAULT_FLAG_INSTRUCTION,	"INSTRUCTION" }, \
-	{ FAULT_FLAG_INTERRUPTIBLE,	"INTERRUPTIBLE" }
+	{ FAULT_FLAG_INTERRUPTIBLE,	"INTERRUPTIBLE" }, \
+	{ FAULT_FLAG_VMA_LOCK,		"VMA_LOCK" }
 
 /*
  * vm_fault is filled by the pagefault handler and passed to the vma's
@@ -623,6 +626,117 @@ struct vm_operations_struct {
 					  unsigned long addr);
 };
 
+#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * Try to read-lock a vma. The function is allowed to occasionally yield false
+ * locked result to avoid performance overhead, in which case we fall back to
+ * using mmap_lock. The function should never yield false unlocked result.
+ */
+static inline bool vma_start_read(struct vm_area_struct *vma)
+{
+	/* Check before locking. A race might cause false locked result. */
+	if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
+		return false;
+
+	if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
+		return false;
+
+	/*
+	 * Overflow might produce false locked result.
+	 * False unlocked result is impossible because we modify and check
+	 * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
+	 * modification invalidates all existing locks.
+	 */
+	if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
+		up_read(&vma->vm_lock->lock);
+		return false;
+	}
+	return true;
+}
+
+static inline void vma_end_read(struct vm_area_struct *vma)
+{
+	rcu_read_lock(); /* keeps vma alive till the end of up_read */
+	up_read(&vma->vm_lock->lock);
+	rcu_read_unlock();
+}
+
+static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
+{
+	mmap_assert_write_locked(vma->vm_mm);
+
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	*mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+	return (vma->vm_lock_seq == *mm_lock_seq);
+}
+
+static inline void vma_start_write(struct vm_area_struct *vma)
+{
+	int mm_lock_seq;
+
+	if (__is_vma_write_locked(vma, &mm_lock_seq))
+		return;
+
+	down_write(&vma->vm_lock->lock);
+	vma->vm_lock_seq = mm_lock_seq;
+	up_write(&vma->vm_lock->lock);
+}
+
+static inline bool vma_try_start_write(struct vm_area_struct *vma)
+{
+	int mm_lock_seq;
+
+	if (__is_vma_write_locked(vma, &mm_lock_seq))
+		return true;
+
+	if (!down_write_trylock(&vma->vm_lock->lock))
+		return false;
+
+	vma->vm_lock_seq = mm_lock_seq;
+	up_write(&vma->vm_lock->lock);
+	return true;
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+{
+	int mm_lock_seq;
+
+	VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
+}
+
+static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
+{
+	/* When detaching vma should be write-locked */
+	if (detached)
+		vma_assert_write_locked(vma);
+	vma->detached = detached;
+}
+
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+					  unsigned long address);
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline void vma_init_lock(struct vm_area_struct *vma) {}
+static inline bool vma_start_read(struct vm_area_struct *vma)
+		{ return false; }
+static inline void vma_end_read(struct vm_area_struct *vma) {}
+static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline bool vma_try_start_write(struct vm_area_struct *vma)
+		{ return true; }
+static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+static inline void vma_mark_detached(struct vm_area_struct *vma,
+				     bool detached) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
+/*
+ * WARNING: vma_init does not initialize vma->vm_lock.
+ * Use vm_area_alloc()/vm_area_free() if vma needs locking.
+ */
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
 	static const struct vm_operations_struct dummy_vm_ops = {};
@@ -631,6 +745,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_mm = mm;
 	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
+	vma_mark_detached(vma, false);
 }
 
 /* Use when VMA is not part of the VMA tree and needs no locking */
@@ -644,28 +759,28 @@ static inline void vm_flags_init(struct vm_area_struct *vma,
 static inline void vm_flags_reset(struct vm_area_struct *vma,
 				  vm_flags_t flags)
 {
-	mmap_assert_write_locked(vma->vm_mm);
+	vma_start_write(vma);
 	vm_flags_init(vma, flags);
 }
 
 static inline void vm_flags_reset_once(struct vm_area_struct *vma,
 				       vm_flags_t flags)
 {
-	mmap_assert_write_locked(vma->vm_mm);
+	vma_start_write(vma);
 	WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
 }
 
 static inline void vm_flags_set(struct vm_area_struct *vma,
 				vm_flags_t flags)
 {
-	mmap_assert_write_locked(vma->vm_mm);
+	vma_start_write(vma);
 	ACCESS_PRIVATE(vma, __vm_flags) |= flags;
 }
 
 static inline void vm_flags_clear(struct vm_area_struct *vma,
 				  vm_flags_t flags)
 {
-	mmap_assert_write_locked(vma->vm_mm);
+	vma_start_write(vma);
 	ACCESS_PRIVATE(vma, __vm_flags) &= ~flags;
 }
 
@@ -686,7 +801,7 @@ static inline void __vm_flags_mod(struct vm_area_struct *vma,
 static inline void vm_flags_mod(struct vm_area_struct *vma,
 				vm_flags_t set, vm_flags_t clear)
 {
-	mmap_assert_write_locked(vma->vm_mm);
+	vma_start_write(vma);
 	__vm_flags_mod(vma, set, clear);
 }
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index a57e6ae78e65..ac4b5df9ba56 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -471,6 +471,10 @@ struct anon_vma_name {
 	char name[];
 };
 
+struct vma_lock {
+	struct rw_semaphore lock;
+};
+
 /*
  * This struct describes a virtual memory area. There is one of these
  * per VM-area/task. A VM area is any part of the process virtual memory
@@ -480,9 +484,16 @@ struct anon_vma_name {
 struct vm_area_struct {
 	/* The first cache line has the info for VMA tree walking. */
 
-	unsigned long vm_start;		/* Our start address within vm_mm. */
-	unsigned long vm_end;		/* The first byte after our end address
-					   within vm_mm. */
+	union {
+		struct {
+			/* VMA covers [vm_start; vm_end) addresses within mm */
+			unsigned long vm_start;
+			unsigned long vm_end;
+		};
+#ifdef CONFIG_PER_VMA_LOCK
+		struct rcu_head vm_rcu;	/* Used for deferred freeing. */
+#endif
+	};
 
 	struct mm_struct *vm_mm;	/* The address space we belong to. */
 
@@ -501,6 +512,14 @@ struct vm_area_struct {
 		vm_flags_t __private __vm_flags;
 	};
 
+#ifdef CONFIG_PER_VMA_LOCK
+	int vm_lock_seq;
+	struct vma_lock *vm_lock;
+
+	/* Flag to indicate areas detached from the mm->mm_mt tree */
+	bool detached;
+#endif
+
 	/*
 	 * For areas with an address space and backing store,
 	 * linkage into the address_space->i_mmap interval tree.
@@ -637,6 +656,9 @@ struct mm_struct {
 					  * init_mm.mmlist, and are protected
 					  * by mmlist_lock
 					  */
+#ifdef CONFIG_PER_VMA_LOCK
+		int mm_lock_seq;
+#endif
 
 
 		unsigned long hiwater_rss; /* High-watermark of RSS usage */
@@ -1037,6 +1059,7 @@ typedef struct {
  *                      mapped after the fault.
  * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached.
  *                        We should only access orig_pte if this flag set.
+ * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock.
  *
  * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
  * whether we would allow page faults to retry by specifying these two
@@ -1074,6 +1097,7 @@ enum fault_flag {
 	FAULT_FLAG_INTERRUPTIBLE =	1 << 9,
 	FAULT_FLAG_UNSHARE =		1 << 10,
 	FAULT_FLAG_ORIG_PTE_VALID =	1 << 11,
+	FAULT_FLAG_VMA_LOCK =		1 << 12,
 };
 
 typedef unsigned int __bitwise zap_flags_t;
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 96e113e23d04..aab8f1b28d26 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -60,6 +60,29 @@ static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
 
 #endif /* CONFIG_TRACING */
 
+static inline void mmap_assert_locked(struct mm_struct *mm)
+{
+	lockdep_assert_held(&mm->mmap_lock);
+	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+}
+
+static inline void mmap_assert_write_locked(struct mm_struct *mm)
+{
+	lockdep_assert_held_write(&mm->mmap_lock);
+	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+}
+
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_end_write_all(struct mm_struct *mm)
+{
+	mmap_assert_write_locked(mm);
+	/* No races during update due to exclusive mmap_lock being held */
+	WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
+}
+#else
+static inline void vma_end_write_all(struct mm_struct *mm) {}
+#endif
+
 static inline void mmap_init_lock(struct mm_struct *mm)
 {
 	init_rwsem(&mm->mmap_lock);
@@ -102,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
 static inline void mmap_write_unlock(struct mm_struct *mm)
 {
 	__mmap_lock_trace_released(mm, true);
+	vma_end_write_all(mm);
 	up_write(&mm->mmap_lock);
 }
 
 static inline void mmap_write_downgrade(struct mm_struct *mm)
 {
 	__mmap_lock_trace_acquire_returned(mm, false, true);
+	vma_end_write_all(mm);
 	downgrade_write(&mm->mmap_lock);
 }
 
@@ -150,18 +175,6 @@ static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
 	up_read_non_owner(&mm->mmap_lock);
 }
 
-static inline void mmap_assert_locked(struct mm_struct *mm)
-{
-	lockdep_assert_held(&mm->mmap_lock);
-	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
-}
-
-static inline void mmap_assert_write_locked(struct mm_struct *mm)
-{
-	lockdep_assert_held_write(&mm->mmap_lock);
-	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
-}
-
 static inline int mmap_lock_is_contended(struct mm_struct *mm)
 {
 	return rwsem_is_contended(&mm->mmap_lock);
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 7f5d1caf5890..8abfa1240040 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -149,6 +149,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_X86
 		DIRECT_MAP_LEVEL2_SPLIT,
 		DIRECT_MAP_LEVEL3_SPLIT,
+#endif
+#ifdef CONFIG_PER_VMA_LOCK_STATS
+		VMA_LOCK_SUCCESS,
+		VMA_LOCK_ABORT,
+		VMA_LOCK_RETRY,
+		VMA_LOCK_MISS,
 #endif
 		NR_VM_EVENT_ITEMS
 };
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 19cf5b6892ce..fed855bae6d8 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -125,6 +125,12 @@ static inline void vm_events_fold_cpu(int cpu)
 #define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
 #endif
 
+#ifdef CONFIG_PER_VMA_LOCK_STATS
+#define count_vm_vma_lock_event(x) count_vm_event(x)
+#else
+#define count_vm_vma_lock_event(x) do {} while (0)
+#endif
+
 #define __count_zid_vm_events(item, zid, delta) \
 	__count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 90e639a39cf6..1efcd64e4313 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -455,13 +455,49 @@ static struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
+#ifdef CONFIG_PER_VMA_LOCK
+
+/* SLAB cache for vm_area_struct.lock */
+static struct kmem_cache *vma_lock_cachep;
+
+static bool vma_lock_alloc(struct vm_area_struct *vma)
+{
+	vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
+	if (!vma->vm_lock)
+		return false;
+
+	init_rwsem(&vma->vm_lock->lock);
+	vma->vm_lock_seq = -1;
+
+	return true;
+}
+
+static inline void vma_lock_free(struct vm_area_struct *vma)
+{
+	kmem_cache_free(vma_lock_cachep, vma->vm_lock);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
+static inline void vma_lock_free(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
 struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
 
 	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
-	if (vma)
-		vma_init(vma, mm);
+	if (!vma)
+		return NULL;
+
+	vma_init(vma, mm);
+	if (!vma_lock_alloc(vma)) {
+		kmem_cache_free(vm_area_cachep, vma);
+		return NULL;
+	}
+
 	return vma;
 }
 
@@ -469,26 +505,54 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 {
 	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 
-	if (new) {
-		ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
-		ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
-		/*
-		 * orig->shared.rb may be modified concurrently, but the clone
-		 * will be reinitialized.
-		 */
-		data_race(memcpy(new, orig, sizeof(*new)));
-		INIT_LIST_HEAD(&new->anon_vma_chain);
-		dup_anon_vma_name(orig, new);
+	if (!new)
+		return NULL;
+
+	ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+	ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+	/*
+	 * orig->shared.rb may be modified concurrently, but the clone
+	 * will be reinitialized.
+	 */
+	data_race(memcpy(new, orig, sizeof(*new)));
+	if (!vma_lock_alloc(new)) {
+		kmem_cache_free(vm_area_cachep, new);
+		return NULL;
 	}
+	INIT_LIST_HEAD(&new->anon_vma_chain);
+	dup_anon_vma_name(orig, new);
+
 	return new;
 }
 
-void vm_area_free(struct vm_area_struct *vma)
+void __vm_area_free(struct vm_area_struct *vma)
 {
 	free_anon_vma_name(vma);
+	vma_lock_free(vma);
 	kmem_cache_free(vm_area_cachep, vma);
 }
 
+#ifdef CONFIG_PER_VMA_LOCK
+static void vm_area_free_rcu_cb(struct rcu_head *head)
+{
+	struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
+						  vm_rcu);
+
+	/* The vma should not be locked while being destroyed. */
+	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
+	__vm_area_free(vma);
+}
+#endif
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+#ifdef CONFIG_PER_VMA_LOCK
+	call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
+#else
+	__vm_area_free(vma);
+#endif
+}
+
 static void account_kernel_stack(struct task_struct *tsk, int account)
 {
 	if (IS_ENABLED(CONFIG_VMAP_STACK)) {
@@ -1132,6 +1196,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	seqcount_init(&mm->write_protect_seq);
 	mmap_init_lock(mm);
 	INIT_LIST_HEAD(&mm->mmlist);
+#ifdef CONFIG_PER_VMA_LOCK
+	mm->mm_lock_seq = 0;
+#endif
 	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
@@ -3074,6 +3141,9 @@ void __init proc_caches_init(void)
 			NULL);
 
 	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
+#ifdef CONFIG_PER_VMA_LOCK
+	vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
+#endif
 	mmap_init();
 	nsproxy_cache_init();
 }
diff --git a/mm/Kconfig b/mm/Kconfig
index cf2e47030fe8..459af2123189 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1202,6 +1202,18 @@ config LRU_GEN_STATS
 	  This option has a per-memcg and per-node memory overhead.
 # }
 
+config ARCH_SUPPORTS_PER_VMA_LOCK
+       def_bool n
+
+config PER_VMA_LOCK
+	def_bool y
+	depends on ARCH_SUPPORTS_PER_VMA_LOCK && MMU && SMP
+	help
+	  Allow per-vma locking during page fault handling.
+
+	  This feature allows locking each virtual memory area separately when
+	  handling page faults instead of taking mmap_lock.
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index c7d27c3f2e96..24df3a9a45b5 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -280,3 +280,15 @@ config DEBUG_KMEMLEAK_AUTO_SCAN
 
 	  If unsure, say Y.
 
+config PER_VMA_LOCK_STATS
+	bool "Statistics for per-vma locks"
+	depends on PER_VMA_LOCK
+	help
+	  Say Y here to enable success, retry and failure counters of page
+	  faults handled under protection of per-vma locks. When enabled, the
+	  counters are exposed in /proc/vmstat. This information is useful for
+	  kernel developers to evaluate effectiveness of per-vma locks and to
+	  identify pathological cases. Counting these events introduces a small
+	  overhead in the page fault path.
+
+	  If in doubt, say N.
diff --git a/mm/init-mm.c b/mm/init-mm.c
index c9327abb771c..33269314e060 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -37,6 +37,9 @@ struct mm_struct init_mm = {
 	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
+#ifdef CONFIG_PER_VMA_LOCK
+	.mm_lock_seq	= 0,
+#endif
 	.user_ns	= &init_user_ns,
 	.cpu_bitmap	= CPU_BITS_NONE,
 #ifdef CONFIG_IOMMU_SVA
diff --git a/mm/internal.h b/mm/internal.h
index 7920a8b7982e..0c455d6e4e3e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -105,7 +105,7 @@ void folio_activate(struct folio *folio);
 
 void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
 		   struct vm_area_struct *start_vma, unsigned long floor,
-		   unsigned long ceiling);
+		   unsigned long ceiling, bool mm_wr_locked);
 void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
 
 struct zap_details;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0ec69b96b497..37a52a0ec9da 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1053,6 +1053,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	if (result != SCAN_SUCCEED)
 		goto out_up_write;
 
+	vma_start_write(vma);
 	anon_vma_lock_write(vma->anon_vma);
 
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
@@ -1516,6 +1517,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 		goto drop_hpage;
 	}
 
+	/* Lock the vma before taking i_mmap and page table locks */
+	vma_start_write(vma);
+
 	/*
 	 * We need to lock the mapping so that from here on, only GUP-fast and
 	 * hardware page walks can access the parts of the page tables that
@@ -1693,6 +1697,10 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
 		result = SCAN_PTE_MAPPED_HUGEPAGE;
 		if ((cc->is_khugepaged || is_target) &&
 		    mmap_write_trylock(mm)) {
+			/* trylock for the same lock inversion as above */
+			if (!vma_try_start_write(vma))
+				goto unlock_next;
+
 			/*
 			 * Re-check whether we have an ->anon_vma, because
 			 * collapse_and_free_pmd() requires that either no
diff --git a/mm/memory.c b/mm/memory.c
index 01a23ad48a04..c76183ced67a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -348,7 +348,7 @@ void free_pgd_range(struct mmu_gather *tlb,
 
 void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
 		   struct vm_area_struct *vma, unsigned long floor,
-		   unsigned long ceiling)
+		   unsigned long ceiling, bool mm_wr_locked)
 {
 	MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
 
@@ -366,6 +366,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
 		 * Hide vma from rmap and truncate_pagecache before freeing
 		 * pgtables
 		 */
+		if (mm_wr_locked)
+			vma_start_write(vma);
 		unlink_anon_vmas(vma);
 		unlink_file_vma(vma);
 
@@ -380,6 +382,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
 			       && !is_vm_hugetlb_page(next)) {
 				vma = next;
 				next = mas_find(&mas, ceiling - 1);
+				if (mm_wr_locked)
+					vma_start_write(vma);
 				unlink_anon_vmas(vma);
 				unlink_file_vma(vma);
 			}
@@ -3698,6 +3702,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	if (!pte_unmap_same(vmf))
 		goto out;
 
+	if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+		ret = VM_FAULT_RETRY;
+		goto out;
+	}
+
 	entry = pte_to_swp_entry(vmf->orig_pte);
 	if (unlikely(non_swap_entry(entry))) {
 		if (is_migration_entry(entry)) {
@@ -5230,6 +5239,67 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 }
 EXPORT_SYMBOL_GPL(handle_mm_fault);
 
+#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
+ * stable and not isolated. If the VMA is not found or is being modified the
+ * function returns NULL.
+ */
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+					  unsigned long address)
+{
+	MA_STATE(mas, &mm->mm_mt, address, address);
+	struct vm_area_struct *vma;
+
+	rcu_read_lock();
+retry:
+	vma = mas_walk(&mas);
+	if (!vma)
+		goto inval;
+
+	/* Only anonymous vmas are supported for now */
+	if (!vma_is_anonymous(vma))
+		goto inval;
+
+	/* find_mergeable_anon_vma uses adjacent vmas which are not locked */
+	if (!vma->anon_vma)
+		goto inval;
+
+	if (!vma_start_read(vma))
+		goto inval;
+
+	/*
+	 * Due to the possibility of userfault handler dropping mmap_lock, avoid
+	 * it for now and fall back to page fault handling under mmap_lock.
+	 */
+	if (userfaultfd_armed(vma)) {
+		vma_end_read(vma);
+		goto inval;
+	}
+
+	/* Check since vm_start/vm_end might change before we lock the VMA */
+	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
+		vma_end_read(vma);
+		goto inval;
+	}
+
+	/* Check if the VMA got isolated after we found it */
+	if (vma->detached) {
+		vma_end_read(vma);
+		count_vm_vma_lock_event(VMA_LOCK_MISS);
+		/* The area was replaced with another one */
+		goto retry;
+	}
+
+	rcu_read_unlock();
+	return vma;
+inval:
+	rcu_read_unlock();
+	count_vm_vma_lock_event(VMA_LOCK_ABORT);
+	return NULL;
+}
+#endif /* CONFIG_PER_VMA_LOCK */
+
 #ifndef __PAGETABLE_P4D_FOLDED
 /*
  * Allocate p4d page table.
diff --git a/mm/mmap.c b/mm/mmap.c
index eefa6f0cda28..7aa6241cd773 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -133,7 +133,7 @@ void unlink_file_vma(struct vm_area_struct *vma)
 /*
  * Close a vm structure and free it.
  */
-static void remove_vma(struct vm_area_struct *vma)
+static void remove_vma(struct vm_area_struct *vma, bool unreachable)
 {
 	might_sleep();
 	if (vma->vm_ops && vma->vm_ops->close)
@@ -141,7 +141,10 @@ static void remove_vma(struct vm_area_struct *vma)
 	if (vma->vm_file)
 		fput(vma->vm_file);
 	mpol_put(vma_policy(vma));
-	vm_area_free(vma);
+	if (unreachable)
+		__vm_area_free(vma);
+	else
+		vm_area_free(vma);
 }
 
 static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
@@ -502,6 +505,15 @@ static inline void init_vma_prep(struct vma_prepare *vp,
  */
 static inline void vma_prepare(struct vma_prepare *vp)
 {
+	vma_start_write(vp->vma);
+	if (vp->adj_next)
+		vma_start_write(vp->adj_next);
+	/* vp->insert is always a newly created VMA, no need for locking */
+	if (vp->remove)
+		vma_start_write(vp->remove);
+	if (vp->remove2)
+		vma_start_write(vp->remove2);
+
 	if (vp->file) {
 		uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
 
@@ -590,6 +602,7 @@ static inline void vma_complete(struct vma_prepare *vp,
 
 	if (vp->remove) {
 again:
+		vma_mark_detached(vp->remove, true);
 		if (vp->file) {
 			uprobe_munmap(vp->remove, vp->remove->vm_start,
 				      vp->remove->vm_end);
@@ -683,12 +696,12 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	if (vma_iter_prealloc(vmi))
 		goto nomem;
 
+	vma_prepare(&vp);
 	vma_adjust_trans_huge(vma, start, end, 0);
 	/* VMA iterator points to previous, so set to start if necessary */
 	if (vma_iter_addr(vmi) != start)
 		vma_iter_set(vmi, start);
 
-	vma_prepare(&vp);
 	vma->vm_start = start;
 	vma->vm_end = end;
 	vma->vm_pgoff = pgoff;
@@ -723,8 +736,8 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		return -ENOMEM;
 
 	init_vma_prep(&vp, vma);
-	vma_adjust_trans_huge(vma, start, end, 0);
 	vma_prepare(&vp);
+	vma_adjust_trans_huge(vma, start, end, 0);
 
 	if (vma->vm_start < start)
 		vma_iter_clear(vmi, vma->vm_start, start);
@@ -994,12 +1007,12 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
 	if (vma_iter_prealloc(vmi))
 		return NULL;
 
-	vma_adjust_trans_huge(vma, vma_start, vma_end, adj_next);
 	init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
 	VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
 		   vp.anon_vma != adjust->anon_vma);
 
 	vma_prepare(&vp);
+	vma_adjust_trans_huge(vma, vma_start, vma_end, adj_next);
 	if (vma_start < vma->vm_start || vma_end > vma->vm_end)
 		vma_expanded = true;
 
@@ -2157,7 +2170,7 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
 		if (vma->vm_flags & VM_ACCOUNT)
 			nr_accounted += nrpages;
 		vm_stat_account(mm, vma->vm_flags, -nrpages);
-		remove_vma(vma);
+		remove_vma(vma, false);
 	}
 	vm_unacct_memory(nr_accounted);
 	validate_mm(mm);
@@ -2180,7 +2193,8 @@ static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
 	update_hiwater_rss(mm);
 	unmap_vmas(&tlb, mt, vma, start, end, mm_wr_locked);
 	free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
-				 next ? next->vm_start : USER_PGTABLES_CEILING);
+				 next ? next->vm_start : USER_PGTABLES_CEILING,
+				 mm_wr_locked);
 	tlb_finish_mmu(&tlb);
 }
 
@@ -2236,10 +2250,10 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	if (new->vm_ops && new->vm_ops->open)
 		new->vm_ops->open(new);
 
-	vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
 	init_vma_prep(&vp, vma);
 	vp.insert = new;
 	vma_prepare(&vp);
+	vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
 
 	if (new_below) {
 		vma->vm_start = addr;
@@ -2283,10 +2297,12 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
 static inline int munmap_sidetree(struct vm_area_struct *vma,
 				   struct ma_state *mas_detach)
 {
+	vma_start_write(vma);
 	mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1);
 	if (mas_store_gfp(mas_detach, vma, GFP_KERNEL))
 		return -ENOMEM;
 
+	vma_mark_detached(vma, true);
 	if (vma->vm_flags & VM_LOCKED)
 		vma->vm_mm->locked_vm -= vma_pages(vma);
 
@@ -2942,9 +2958,9 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		if (vma_iter_prealloc(vmi))
 			goto unacct_fail;
 
-		vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
 		init_vma_prep(&vp, vma);
 		vma_prepare(&vp);
+		vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
 		vma->vm_end = addr + len;
 		vm_flags_set(vma, VM_SOFTDIRTY);
 		vma_iter_store(vmi, vma);
@@ -3077,7 +3093,7 @@ void exit_mmap(struct mm_struct *mm)
 	mmap_write_lock(mm);
 	mt_clear_in_rcu(&mm->mm_mt);
 	free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
-		      USER_PGTABLES_CEILING);
+		      USER_PGTABLES_CEILING, true);
 	tlb_finish_mmu(&tlb);
 
 	/*
@@ -3088,7 +3104,7 @@ void exit_mmap(struct mm_struct *mm)
 	do {
 		if (vma->vm_flags & VM_ACCOUNT)
 			nr_accounted += vma_pages(vma);
-		remove_vma(vma);
+		remove_vma(vma, true);
 		count++;
 		cond_resched();
 	} while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
@@ -3211,6 +3227,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 			get_file(new_vma->vm_file);
 		if (new_vma->vm_ops && new_vma->vm_ops->open)
 			new_vma->vm_ops->open(new_vma);
+		vma_start_write(new_vma);
 		if (vma_link(mm, new_vma))
 			goto out_vma_link;
 		*need_rmap_locks = false;
@@ -3505,6 +3522,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
  * of mm/rmap.c:
  *   - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
  *     hugetlb mapping);
+ *   - all vmas marked locked
  *   - all i_mmap_rwsem locks;
  *   - all anon_vma->rwseml
  *
@@ -3527,6 +3545,13 @@ int mm_take_all_locks(struct mm_struct *mm)
 
 	mutex_lock(&mm_all_locks_mutex);
 
+	mas_for_each(&mas, vma, ULONG_MAX) {
+		if (signal_pending(current))
+			goto out_unlock;
+		vma_start_write(vma);
+	}
+
+	mas_set(&mas, 0);
 	mas_for_each(&mas, vma, ULONG_MAX) {
 		if (signal_pending(current))
 			goto out_unlock;
@@ -3616,6 +3641,7 @@ void mm_drop_all_locks(struct mm_struct *mm)
 		if (vma->vm_file && vma->vm_file->f_mapping)
 			vm_unlock_mapping(vma->vm_file->f_mapping);
 	}
+	vma_end_write_all(mm);
 
 	mutex_unlock(&mm_all_locks_mutex);
 }
diff --git a/mm/mremap.c b/mm/mremap.c
index 411a85682b58..dd541e59edda 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -623,6 +623,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 			return -ENOMEM;
 	}
 
+	vma_start_write(vma);
 	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
 	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
 			   &need_rmap_locks);
diff --git a/mm/rmap.c b/mm/rmap.c
index 53a524b4f948..f10e03da91e3 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -25,21 +25,22 @@
  *     mapping->invalidate_lock (in filemap_fault)
  *       page->flags PG_locked (lock_page)
  *         hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
- *           mapping->i_mmap_rwsem
- *             anon_vma->rwsem
- *               mm->page_table_lock or pte_lock
- *                 swap_lock (in swap_duplicate, swap_info_get)
- *                   mmlist_lock (in mmput, drain_mmlist and others)
- *                   mapping->private_lock (in block_dirty_folio)
- *                     folio_lock_memcg move_lock (in block_dirty_folio)
- *                       i_pages lock (widely used)
- *                         lruvec->lru_lock (in folio_lruvec_lock_irq)
- *                   inode->i_lock (in set_page_dirty's __mark_inode_dirty)
- *                   bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
- *                     sb_lock (within inode_lock in fs/fs-writeback.c)
- *                     i_pages lock (widely used, in set_page_dirty,
- *                               in arch-dependent flush_dcache_mmap_lock,
- *                               within bdi.wb->list_lock in __sync_single_inode)
+ *           vma_start_write
+ *             mapping->i_mmap_rwsem
+ *               anon_vma->rwsem
+ *                 mm->page_table_lock or pte_lock
+ *                   swap_lock (in swap_duplicate, swap_info_get)
+ *                     mmlist_lock (in mmput, drain_mmlist and others)
+ *                     mapping->private_lock (in block_dirty_folio)
+ *                       folio_lock_memcg move_lock (in block_dirty_folio)
+ *                         i_pages lock (widely used)
+ *                           lruvec->lru_lock (in folio_lruvec_lock_irq)
+ *                     inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ *                     bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
+ *                       sb_lock (within inode_lock in fs/fs-writeback.c)
+ *                       i_pages lock (widely used, in set_page_dirty,
+ *                                 in arch-dependent flush_dcache_mmap_lock,
+ *                                 within bdi.wb->list_lock in __sync_single_inode)
  *
  * anon_vma->rwsem,mapping->i_mmap_rwsem   (memory_failure, collect_procs_anon)
  *   ->tasklist_lock
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1ea6a5ce1c41..4f1089a1860e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1399,6 +1399,12 @@ const char * const vmstat_text[] = {
 	"direct_map_level2_splits",
 	"direct_map_level3_splits",
 #endif
+#ifdef CONFIG_PER_VMA_LOCK_STATS
+	"vma_lock_success",
+	"vma_lock_abort",
+	"vma_lock_retry",
+	"vma_lock_miss",
+#endif
 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
 };
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
-- 
2.41.0

From 47ae88ccb7e792a154ede54fb157248f8ab83c1d Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Tue, 30 May 2023 17:44:25 +0200
Subject: [PATCH 10/12] sched

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 Documentation/accounting/psi.rst |   4 +
 arch/x86/kernel/itmt.c           |  23 +-
 arch/x86/kernel/smpboot.c        |   4 +-
 drivers/vhost/vhost.c            |   3 +-
 include/linux/livepatch.h        |   1 +
 include/linux/livepatch_sched.h  |  29 ++
 include/linux/psi.h              |   2 +-
 include/linux/psi_types.h        |  43 +--
 include/linux/sched.h            |  20 +-
 include/linux/sched/sd_flags.h   |   5 +-
 kernel/cgroup/cgroup.c           |  14 +-
 kernel/livepatch/core.c          |   1 +
 kernel/livepatch/transition.c    | 122 ++++++--
 kernel/sched/core.c              | 148 ++++++++--
 kernel/sched/deadline.c          |  17 +-
 kernel/sched/debug.c             |   2 +-
 kernel/sched/fair.c              | 236 ++++++++-------
 kernel/sched/pelt.c              |  60 ++++
 kernel/sched/pelt.h              |  42 ++-
 kernel/sched/psi.c               | 482 +++++++++++++++++--------------
 kernel/sched/rt.c                |  19 ++
 kernel/sched/sched.h             |  34 ++-
 kernel/sched/topology.c          |  11 +-
 23 files changed, 872 insertions(+), 450 deletions(-)
 create mode 100644 include/linux/livepatch_sched.h

diff --git a/Documentation/accounting/psi.rst b/Documentation/accounting/psi.rst
index 5e40b3f437f9..df6062eb3abb 100644
--- a/Documentation/accounting/psi.rst
+++ b/Documentation/accounting/psi.rst
@@ -105,6 +105,10 @@ prevent overly frequent polling. Max limit is chosen as a high enough number
 after which monitors are most likely not needed and psi averages can be used
 instead.
 
+Unprivileged users can also create monitors, with the only limitation that the
+window size must be a multiple of 2s, in order to prevent excessive resource
+usage.
+
 When activated, psi monitor stays active for at least the duration of one
 tracking window to avoid repeated activations/deactivations when system is
 bouncing in and out of the stall state.
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 9ff480e94511..6510883c5e81 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -174,32 +174,19 @@ int arch_asym_cpu_priority(int cpu)
 
 /**
  * sched_set_itmt_core_prio() - Set CPU priority based on ITMT
- * @prio:	Priority of cpu core
- * @core_cpu:	The cpu number associated with the core
+ * @prio:	Priority of @cpu
+ * @cpu:	The CPU number
  *
  * The pstate driver will find out the max boost frequency
  * and call this function to set a priority proportional
- * to the max boost frequency. CPU with higher boost
+ * to the max boost frequency. CPUs with higher boost
  * frequency will receive higher priority.
  *
  * No need to rebuild sched domain after updating
  * the CPU priorities. The sched domains have no
  * dependency on CPU priorities.
  */
-void sched_set_itmt_core_prio(int prio, int core_cpu)
+void sched_set_itmt_core_prio(int prio, int cpu)
 {
-	int cpu, i = 1;
-
-	for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) {
-		int smt_prio;
-
-		/*
-		 * Ensure that the siblings are moved to the end
-		 * of the priority chain and only used when
-		 * all other high priority cpus are out of capacity.
-		 */
-		smt_prio = prio * smp_num_siblings / (i * i);
-		per_cpu(sched_core_priority, cpu) = smt_prio;
-		i++;
-	}
+	per_cpu(sched_core_priority, cpu) = prio;
 }
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9013bb28255a..cea297d97034 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -547,7 +547,7 @@ static int x86_core_flags(void)
 #ifdef CONFIG_SCHED_SMT
 static int x86_smt_flags(void)
 {
-	return cpu_smt_flags() | x86_sched_itmt_flags();
+	return cpu_smt_flags();
 }
 #endif
 #ifdef CONFIG_SCHED_CLUSTER
@@ -578,7 +578,7 @@ static struct sched_domain_topology_level x86_hybrid_topology[] = {
 #ifdef CONFIG_SCHED_MC
 	{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
 #endif
-	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(DIE) },
 	{ NULL, },
 };
 
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f64efda48f21..157ca18b72ed 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -363,8 +363,7 @@ static int vhost_worker(void *data)
 			kcov_remote_start_common(dev->kcov_handle);
 			work->fn(work);
 			kcov_remote_stop();
-			if (need_resched())
-				schedule();
+			cond_resched();
 		}
 	}
 	kthread_unuse_mm(dev->mm);
diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index 293e29960c6e..9b9b38e89563 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -13,6 +13,7 @@
 #include <linux/ftrace.h>
 #include <linux/completion.h>
 #include <linux/list.h>
+#include <linux/livepatch_sched.h>
 
 #if IS_ENABLED(CONFIG_LIVEPATCH)
 
diff --git a/include/linux/livepatch_sched.h b/include/linux/livepatch_sched.h
new file mode 100644
index 000000000000..013794fb5da0
--- /dev/null
+++ b/include/linux/livepatch_sched.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_LIVEPATCH_SCHED_H_
+#define _LINUX_LIVEPATCH_SCHED_H_
+
+#include <linux/jump_label.h>
+#include <linux/static_call_types.h>
+
+#ifdef CONFIG_LIVEPATCH
+
+void __klp_sched_try_switch(void);
+
+#if !defined(CONFIG_PREEMPT_DYNAMIC) || !defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+
+DECLARE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
+
+static __always_inline void klp_sched_try_switch(void)
+{
+	if (static_branch_unlikely(&klp_sched_try_switch_key))
+		__klp_sched_try_switch();
+}
+
+#endif /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
+#else /* !CONFIG_LIVEPATCH */
+static inline void klp_sched_try_switch(void) {}
+static inline void __klp_sched_try_switch(void) {}
+#endif /* CONFIG_LIVEPATCH */
+
+#endif /* _LINUX_LIVEPATCH_SCHED_H_ */
diff --git a/include/linux/psi.h b/include/linux/psi.h
index b029a847def1..ab26200c2803 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -24,7 +24,7 @@ void psi_memstall_leave(unsigned long *flags);
 
 int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
 struct psi_trigger *psi_trigger_create(struct psi_group *group,
-			char *buf, enum psi_res res);
+			char *buf, enum psi_res res, struct file *file);
 void psi_trigger_destroy(struct psi_trigger *t);
 
 __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 1e0a0d7ace3a..040c089581c6 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -151,6 +151,9 @@ struct psi_trigger {
 
 	/* Deferred event(s) from previous ratelimit window */
 	bool pending_event;
+
+	/* Trigger type - PSI_AVGS for unprivileged, PSI_POLL for RT */
+	enum psi_aggregators aggregator;
 };
 
 struct psi_group {
@@ -171,30 +174,34 @@ struct psi_group {
 	/* Aggregator work control */
 	struct delayed_work avgs_work;
 
+	/* Unprivileged triggers against N*PSI_FREQ windows */
+	struct list_head avg_triggers;
+	u32 avg_nr_triggers[NR_PSI_STATES - 1];
+
 	/* Total stall times and sampled pressure averages */
 	u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1];
 	unsigned long avg[NR_PSI_STATES - 1][3];
 
-	/* Monitor work control */
-	struct task_struct __rcu *poll_task;
-	struct timer_list poll_timer;
-	wait_queue_head_t poll_wait;
-	atomic_t poll_wakeup;
-	atomic_t poll_scheduled;
+	/* Monitor RT polling work control */
+	struct task_struct __rcu *rtpoll_task;
+	struct timer_list rtpoll_timer;
+	wait_queue_head_t rtpoll_wait;
+	atomic_t rtpoll_wakeup;
+	atomic_t rtpoll_scheduled;
 
 	/* Protects data used by the monitor */
-	struct mutex trigger_lock;
-
-	/* Configured polling triggers */
-	struct list_head triggers;
-	u32 nr_triggers[NR_PSI_STATES - 1];
-	u32 poll_states;
-	u64 poll_min_period;
-
-	/* Total stall times at the start of monitor activation */
-	u64 polling_total[NR_PSI_STATES - 1];
-	u64 polling_next_update;
-	u64 polling_until;
+	struct mutex rtpoll_trigger_lock;
+
+	/* Configured RT polling triggers */
+	struct list_head rtpoll_triggers;
+	u32 rtpoll_nr_triggers[NR_PSI_STATES - 1];
+	u32 rtpoll_states;
+	u64 rtpoll_min_period;
+
+	/* Total stall times at the start of RT polling monitor activation */
+	u64 rtpoll_total[NR_PSI_STATES - 1];
+	u64 rtpoll_next_update;
+	u64 rtpoll_until;
 };
 
 #else /* CONFIG_PSI */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63d242164b1a..6d654eb4cabd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -36,6 +36,7 @@
 #include <linux/seqlock.h>
 #include <linux/kcsan.h>
 #include <linux/rv.h>
+#include <linux/livepatch_sched.h>
 #include <asm/kmap_size.h>
 
 /* task_struct member predeclarations (sorted alphabetically): */
@@ -2070,6 +2071,9 @@ extern int __cond_resched(void);
 
 #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
 
+void sched_dynamic_klp_enable(void);
+void sched_dynamic_klp_disable(void);
+
 DECLARE_STATIC_CALL(cond_resched, __cond_resched);
 
 static __always_inline int _cond_resched(void)
@@ -2078,6 +2082,7 @@ static __always_inline int _cond_resched(void)
 }
 
 #elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+
 extern int dynamic_cond_resched(void);
 
 static __always_inline int _cond_resched(void)
@@ -2085,20 +2090,25 @@ static __always_inline int _cond_resched(void)
 	return dynamic_cond_resched();
 }
 
-#else
+#else /* !CONFIG_PREEMPTION */
 
 static inline int _cond_resched(void)
 {
+	klp_sched_try_switch();
 	return __cond_resched();
 }
 
-#endif /* CONFIG_PREEMPT_DYNAMIC */
+#endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
 
-#else
+#else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */
 
-static inline int _cond_resched(void) { return 0; }
+static inline int _cond_resched(void)
+{
+	klp_sched_try_switch();
+	return 0;
+}
 
-#endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */
+#endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */
 
 #define cond_resched() ({			\
 	__might_resched(__FILE__, __LINE__, 0);	\
diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index 57bde66d95f7..fad77b5172e2 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -132,12 +132,9 @@ SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
 /*
  * Place busy tasks earlier in the domain
  *
- * SHARED_CHILD: Usually set on the SMT level. Technically could be set further
- *               up, but currently assumed to be set from the base domain
- *               upwards (see update_top_cache_domain()).
  * NEEDS_GROUPS: Load balancing flag.
  */
-SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS)
 
 /*
  * Prefer to place tasks in a sibling domain
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index fd57c1dca1cc..a2c96b2294d7 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3761,7 +3761,7 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
 	}
 
 	psi = cgroup_psi(cgrp);
-	new = psi_trigger_create(psi, buf, res);
+	new = psi_trigger_create(psi, buf, res, of->file);
 	if (IS_ERR(new)) {
 		cgroup_put(cgrp);
 		return PTR_ERR(new);
@@ -3867,6 +3867,14 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
 	return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
 }
 
+static int cgroup_pressure_open(struct kernfs_open_file *of)
+{
+	if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
+	return 0;
+}
+
 static void cgroup_pressure_release(struct kernfs_open_file *of)
 {
 	struct cgroup_file_ctx *ctx = of->priv;
@@ -5266,6 +5274,7 @@ static struct cftype cgroup_psi_files[] = {
 	{
 		.name = "io.pressure",
 		.file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
+		.open = cgroup_pressure_open,
 		.seq_show = cgroup_io_pressure_show,
 		.write = cgroup_io_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -5274,6 +5283,7 @@ static struct cftype cgroup_psi_files[] = {
 	{
 		.name = "memory.pressure",
 		.file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
+		.open = cgroup_pressure_open,
 		.seq_show = cgroup_memory_pressure_show,
 		.write = cgroup_memory_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -5282,6 +5292,7 @@ static struct cftype cgroup_psi_files[] = {
 	{
 		.name = "cpu.pressure",
 		.file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
+		.open = cgroup_pressure_open,
 		.seq_show = cgroup_cpu_pressure_show,
 		.write = cgroup_cpu_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -5291,6 +5302,7 @@ static struct cftype cgroup_psi_files[] = {
 	{
 		.name = "irq.pressure",
 		.file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
+		.open = cgroup_pressure_open,
 		.seq_show = cgroup_irq_pressure_show,
 		.write = cgroup_irq_pressure_write,
 		.poll = cgroup_pressure_poll,
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 4bd2d5e10f20..eea7c8ec6e05 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -33,6 +33,7 @@
  *
  * - klp_ftrace_handler()
  * - klp_update_patch_state()
+ * - __klp_sched_try_switch()
  */
 DEFINE_MUTEX(klp_mutex);
 
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index f1b25ec581e0..e9fd83a02228 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -9,11 +9,14 @@
 
 #include <linux/cpu.h>
 #include <linux/stacktrace.h>
+#include <linux/static_call.h>
 #include "core.h"
 #include "patch.h"
 #include "transition.h"
 
 #define MAX_STACK_ENTRIES  100
+DEFINE_PER_CPU(unsigned long[MAX_STACK_ENTRIES], klp_stack_entries);
+
 #define STACK_ERR_BUF_SIZE 128
 
 #define SIGNALS_TIMEOUT 15
@@ -24,6 +27,25 @@ static int klp_target_state = KLP_UNDEFINED;
 
 static unsigned int klp_signals_cnt;
 
+/*
+ * When a livepatch is in progress, enable klp stack checking in
+ * cond_resched().  This helps CPU-bound kthreads get patched.
+ */
+#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+
+#define klp_cond_resched_enable() sched_dynamic_klp_enable()
+#define klp_cond_resched_disable() sched_dynamic_klp_disable()
+
+#else /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
+DEFINE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
+EXPORT_SYMBOL(klp_sched_try_switch_key);
+
+#define klp_cond_resched_enable() static_branch_enable(&klp_sched_try_switch_key)
+#define klp_cond_resched_disable() static_branch_disable(&klp_sched_try_switch_key)
+
+#endif /* CONFIG_PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
 /*
  * This work can be performed periodically to finish patching or unpatching any
  * "straggler" tasks which failed to transition in the first attempt.
@@ -172,8 +194,8 @@ void klp_update_patch_state(struct task_struct *task)
 	 * barrier (smp_rmb) for two cases:
 	 *
 	 * 1) Enforce the order of the TIF_PATCH_PENDING read and the
-	 *    klp_target_state read.  The corresponding write barrier is in
-	 *    klp_init_transition().
+	 *    klp_target_state read.  The corresponding write barriers are in
+	 *    klp_init_transition() and klp_reverse_transition().
 	 *
 	 * 2) Enforce the order of the TIF_PATCH_PENDING read and a future read
 	 *    of func->transition, if klp_ftrace_handler() is called later on
@@ -240,12 +262,15 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
  */
 static int klp_check_stack(struct task_struct *task, const char **oldname)
 {
-	static unsigned long entries[MAX_STACK_ENTRIES];
+	unsigned long *entries = this_cpu_ptr(klp_stack_entries);
 	struct klp_object *obj;
 	struct klp_func *func;
 	int ret, nr_entries;
 
-	ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));
+	/* Protect 'klp_stack_entries' */
+	lockdep_assert_preemption_disabled();
+
+	ret = stack_trace_save_tsk_reliable(task, entries, MAX_STACK_ENTRIES);
 	if (ret < 0)
 		return -EINVAL;
 	nr_entries = ret;
@@ -307,7 +332,11 @@ static bool klp_try_switch_task(struct task_struct *task)
 	 * functions.  If all goes well, switch the task to the target patch
 	 * state.
 	 */
-	ret = task_call_func(task, klp_check_and_switch_task, &old_name);
+	if (task == current)
+		ret = klp_check_and_switch_task(current, &old_name);
+	else
+		ret = task_call_func(task, klp_check_and_switch_task, &old_name);
+
 	switch (ret) {
 	case 0:		/* success */
 		break;
@@ -334,6 +363,44 @@ static bool klp_try_switch_task(struct task_struct *task)
 	return !ret;
 }
 
+void __klp_sched_try_switch(void)
+{
+	if (likely(!klp_patch_pending(current)))
+		return;
+
+	/*
+	 * This function is called from cond_resched() which is called in many
+	 * places throughout the kernel.  Using the klp_mutex here might
+	 * deadlock.
+	 *
+	 * Instead, disable preemption to prevent racing with other callers of
+	 * klp_try_switch_task().  Thanks to task_call_func() they won't be
+	 * able to switch this task while it's running.
+	 */
+	preempt_disable();
+
+	/*
+	 * Make sure current didn't get patched between the above check and
+	 * preempt_disable().
+	 */
+	if (unlikely(!klp_patch_pending(current)))
+		goto out;
+
+	/*
+	 * Enforce the order of the TIF_PATCH_PENDING read above and the
+	 * klp_target_state read in klp_try_switch_task().  The corresponding
+	 * write barriers are in klp_init_transition() and
+	 * klp_reverse_transition().
+	 */
+	smp_rmb();
+
+	klp_try_switch_task(current);
+
+out:
+	preempt_enable();
+}
+EXPORT_SYMBOL(__klp_sched_try_switch);
+
 /*
  * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
  * Kthreads with TIF_PATCH_PENDING set are woken up.
@@ -440,7 +507,8 @@ void klp_try_complete_transition(void)
 		return;
 	}
 
-	/* we're done, now cleanup the data structures */
+	/* Done!  Now cleanup the data structures. */
+	klp_cond_resched_disable();
 	patch = klp_transition_patch;
 	klp_complete_transition();
 
@@ -492,6 +560,8 @@ void klp_start_transition(void)
 			set_tsk_thread_flag(task, TIF_PATCH_PENDING);
 	}
 
+	klp_cond_resched_enable();
+
 	klp_signals_cnt = 0;
 }
 
@@ -547,8 +617,9 @@ void klp_init_transition(struct klp_patch *patch, int state)
 	 * see a func in transition with a task->patch_state of KLP_UNDEFINED.
 	 *
 	 * Also enforce the order of the klp_target_state write and future
-	 * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() doesn't
-	 * set a task->patch_state to KLP_UNDEFINED.
+	 * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() and
+	 * __klp_sched_try_switch() don't set a task->patch_state to
+	 * KLP_UNDEFINED.
 	 */
 	smp_wmb();
 
@@ -584,14 +655,10 @@ void klp_reverse_transition(void)
 		 klp_target_state == KLP_PATCHED ? "patching to unpatching" :
 						   "unpatching to patching");
 
-	klp_transition_patch->enabled = !klp_transition_patch->enabled;
-
-	klp_target_state = !klp_target_state;
-
 	/*
 	 * Clear all TIF_PATCH_PENDING flags to prevent races caused by
-	 * klp_update_patch_state() running in parallel with
-	 * klp_start_transition().
+	 * klp_update_patch_state() or __klp_sched_try_switch() running in
+	 * parallel with the reverse transition.
 	 */
 	read_lock(&tasklist_lock);
 	for_each_process_thread(g, task)
@@ -601,9 +668,28 @@ void klp_reverse_transition(void)
 	for_each_possible_cpu(cpu)
 		clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING);
 
-	/* Let any remaining calls to klp_update_patch_state() complete */
+	/*
+	 * Make sure all existing invocations of klp_update_patch_state() and
+	 * __klp_sched_try_switch() see the cleared TIF_PATCH_PENDING before
+	 * starting the reverse transition.
+	 */
 	klp_synchronize_transition();
 
+	/*
+	 * All patching has stopped, now re-initialize the global variables to
+	 * prepare for the reverse transition.
+	 */
+	klp_transition_patch->enabled = !klp_transition_patch->enabled;
+	klp_target_state = !klp_target_state;
+
+	/*
+	 * Enforce the order of the klp_target_state write and the
+	 * TIF_PATCH_PENDING writes in klp_start_transition() to ensure
+	 * klp_update_patch_state() and __klp_sched_try_switch() don't set
+	 * task->patch_state to the wrong value.
+	 */
+	smp_wmb();
+
 	klp_start_transition();
 }
 
@@ -617,9 +703,9 @@ void klp_copy_process(struct task_struct *child)
 	 * the task flag up to date with the parent here.
 	 *
 	 * The operation is serialized against all klp_*_transition()
-	 * operations by the tasklist_lock. The only exception is
-	 * klp_update_patch_state(current), but we cannot race with
-	 * that because we are current.
+	 * operations by the tasklist_lock. The only exceptions are
+	 * klp_update_patch_state(current) and __klp_sched_try_switch(), but we
+	 * cannot race with them because we are current.
 	 */
 	if (test_tsk_thread_flag(current, TIF_PATCH_PENDING))
 		set_tsk_thread_flag(child, TIF_PATCH_PENDING);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0d18c3969f90..c1a88c49642a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -261,36 +261,51 @@ void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
 		resched_curr(rq);
 }
 
-/*
- * Find left-most (aka, highest priority) task matching @cookie.
- */
-static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
+static int sched_task_is_throttled(struct task_struct *p, int cpu)
 {
-	struct rb_node *node;
-
-	node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
-	/*
-	 * The idle task always matches any cookie!
-	 */
-	if (!node)
-		return idle_sched_class.pick_task(rq);
+	if (p->sched_class->task_is_throttled)
+		return p->sched_class->task_is_throttled(p, cpu);
 
-	return __node_2_sc(node);
+	return 0;
 }
 
 static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
 {
 	struct rb_node *node = &p->core_node;
+	int cpu = task_cpu(p);
+
+	do {
+		node = rb_next(node);
+		if (!node)
+			return NULL;
+
+		p = __node_2_sc(node);
+		if (p->core_cookie != cookie)
+			return NULL;
+
+	} while (sched_task_is_throttled(p, cpu));
+
+	return p;
+}
 
-	node = rb_next(node);
+/*
+ * Find left-most (aka, highest priority) and unthrottled task matching @cookie.
+ * If no suitable task is found, NULL will be returned.
+ */
+static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
+{
+	struct task_struct *p;
+	struct rb_node *node;
+
+	node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
 	if (!node)
 		return NULL;
 
-	p = container_of(node, struct task_struct, core_node);
-	if (p->core_cookie != cookie)
-		return NULL;
+	p = __node_2_sc(node);
+	if (!sched_task_is_throttled(p, rq->cpu))
+		return p;
 
-	return p;
+	return sched_core_next(p, cookie);
 }
 
 /*
@@ -724,7 +739,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
 		update_irq_load_avg(rq, irq_delta + steal);
 #endif
-	update_rq_clock_pelt(rq, delta);
+	update_rq_clock_task_mult(rq, delta);
 }
 
 void update_rq_clock(struct rq *rq)
@@ -6239,7 +6254,7 @@ static bool try_steal_cookie(int this, int that)
 		goto unlock;
 
 	p = sched_core_find(src, cookie);
-	if (p == src->idle)
+	if (!p)
 		goto unlock;
 
 	do {
@@ -6251,6 +6266,13 @@ static bool try_steal_cookie(int this, int that)
 
 		if (p->core_occupation > dst->idle->core_occupation)
 			goto next;
+		/*
+		 * sched_core_find() and sched_core_next() will ensure that task @p
+		 * is not throttled now, we also need to check whether the runqueue
+		 * of the destination CPU is being throttled.
+		 */
+		if (sched_task_is_throttled(p, this))
+			goto next;
 
 		deactivate_task(src, p, 0);
 		set_task_cpu(p, this);
@@ -8506,6 +8528,7 @@ EXPORT_STATIC_CALL_TRAMP(might_resched);
 static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
 int __sched dynamic_cond_resched(void)
 {
+	klp_sched_try_switch();
 	if (!static_branch_unlikely(&sk_dynamic_cond_resched))
 		return 0;
 	return __cond_resched();
@@ -8654,13 +8677,17 @@ int sched_dynamic_mode(const char *str)
 #error "Unsupported PREEMPT_DYNAMIC mechanism"
 #endif
 
-void sched_dynamic_update(int mode)
+static DEFINE_MUTEX(sched_dynamic_mutex);
+static bool klp_override;
+
+static void __sched_dynamic_update(int mode)
 {
 	/*
 	 * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
 	 * the ZERO state, which is invalid.
 	 */
-	preempt_dynamic_enable(cond_resched);
+	if (!klp_override)
+		preempt_dynamic_enable(cond_resched);
 	preempt_dynamic_enable(might_resched);
 	preempt_dynamic_enable(preempt_schedule);
 	preempt_dynamic_enable(preempt_schedule_notrace);
@@ -8668,36 +8695,79 @@ void sched_dynamic_update(int mode)
 
 	switch (mode) {
 	case preempt_dynamic_none:
-		preempt_dynamic_enable(cond_resched);
+		if (!klp_override)
+			preempt_dynamic_enable(cond_resched);
 		preempt_dynamic_disable(might_resched);
 		preempt_dynamic_disable(preempt_schedule);
 		preempt_dynamic_disable(preempt_schedule_notrace);
 		preempt_dynamic_disable(irqentry_exit_cond_resched);
-		pr_info("Dynamic Preempt: none\n");
+		if (mode != preempt_dynamic_mode)
+			pr_info("Dynamic Preempt: none\n");
 		break;
 
 	case preempt_dynamic_voluntary:
-		preempt_dynamic_enable(cond_resched);
+		if (!klp_override)
+			preempt_dynamic_enable(cond_resched);
 		preempt_dynamic_enable(might_resched);
 		preempt_dynamic_disable(preempt_schedule);
 		preempt_dynamic_disable(preempt_schedule_notrace);
 		preempt_dynamic_disable(irqentry_exit_cond_resched);
-		pr_info("Dynamic Preempt: voluntary\n");
+		if (mode != preempt_dynamic_mode)
+			pr_info("Dynamic Preempt: voluntary\n");
 		break;
 
 	case preempt_dynamic_full:
-		preempt_dynamic_disable(cond_resched);
+		if (!klp_override)
+			preempt_dynamic_disable(cond_resched);
 		preempt_dynamic_disable(might_resched);
 		preempt_dynamic_enable(preempt_schedule);
 		preempt_dynamic_enable(preempt_schedule_notrace);
 		preempt_dynamic_enable(irqentry_exit_cond_resched);
-		pr_info("Dynamic Preempt: full\n");
+		if (mode != preempt_dynamic_mode)
+			pr_info("Dynamic Preempt: full\n");
 		break;
 	}
 
 	preempt_dynamic_mode = mode;
 }
 
+void sched_dynamic_update(int mode)
+{
+	mutex_lock(&sched_dynamic_mutex);
+	__sched_dynamic_update(mode);
+	mutex_unlock(&sched_dynamic_mutex);
+}
+
+#ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
+
+static int klp_cond_resched(void)
+{
+	__klp_sched_try_switch();
+	return __cond_resched();
+}
+
+void sched_dynamic_klp_enable(void)
+{
+	mutex_lock(&sched_dynamic_mutex);
+
+	klp_override = true;
+	static_call_update(cond_resched, klp_cond_resched);
+
+	mutex_unlock(&sched_dynamic_mutex);
+}
+
+void sched_dynamic_klp_disable(void)
+{
+	mutex_lock(&sched_dynamic_mutex);
+
+	klp_override = false;
+	__sched_dynamic_update(preempt_dynamic_mode);
+
+	mutex_unlock(&sched_dynamic_mutex);
+}
+
+#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
 static int __init setup_preempt_mode(char *str)
 {
 	int mode = sched_dynamic_mode(str);
@@ -10332,7 +10402,7 @@ void sched_release_group(struct task_group *tg)
 	spin_unlock_irqrestore(&task_group_lock, flags);
 }
 
-static void sched_change_group(struct task_struct *tsk)
+static struct task_group *sched_get_task_group(struct task_struct *tsk)
 {
 	struct task_group *tg;
 
@@ -10344,7 +10414,13 @@ static void sched_change_group(struct task_struct *tsk)
 	tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
 			  struct task_group, css);
 	tg = autogroup_task_group(tsk, tg);
-	tsk->sched_task_group = tg;
+
+	return tg;
+}
+
+static void sched_change_group(struct task_struct *tsk, struct task_group *group)
+{
+	tsk->sched_task_group = group;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (tsk->sched_class->task_change_group)
@@ -10365,10 +10441,19 @@ void sched_move_task(struct task_struct *tsk)
 {
 	int queued, running, queue_flags =
 		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+	struct task_group *group;
 	struct rq_flags rf;
 	struct rq *rq;
 
 	rq = task_rq_lock(tsk, &rf);
+	/*
+	 * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous
+	 * group changes.
+	 */
+	group = sched_get_task_group(tsk);
+	if (group == tsk->sched_task_group)
+		goto unlock;
+
 	update_rq_clock(rq);
 
 	running = task_current(rq, tsk);
@@ -10379,7 +10464,7 @@ void sched_move_task(struct task_struct *tsk)
 	if (running)
 		put_prev_task(rq, tsk);
 
-	sched_change_group(tsk);
+	sched_change_group(tsk, group);
 
 	if (queued)
 		enqueue_task(rq, tsk, queue_flags);
@@ -10393,6 +10478,7 @@ void sched_move_task(struct task_struct *tsk)
 		resched_curr(rq);
 	}
 
+unlock:
 	task_rq_unlock(rq, tsk, &rf);
 }
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index ac8010f6f3a2..f827067ad03b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -489,13 +489,6 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
 
 static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
 
-void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
-{
-	raw_spin_lock_init(&dl_b->dl_runtime_lock);
-	dl_b->dl_period = period;
-	dl_b->dl_runtime = runtime;
-}
-
 void init_dl_bw(struct dl_bw *dl_b)
 {
 	raw_spin_lock_init(&dl_b->lock);
@@ -2705,6 +2698,13 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
 #endif
 }
 
+#ifdef CONFIG_SCHED_CORE
+static int task_is_throttled_dl(struct task_struct *p, int cpu)
+{
+	return p->dl.dl_throttled;
+}
+#endif
+
 DEFINE_SCHED_CLASS(dl) = {
 
 	.enqueue_task		= enqueue_task_dl,
@@ -2737,6 +2737,9 @@ DEFINE_SCHED_CLASS(dl) = {
 	.switched_to		= switched_to_dl,
 
 	.update_curr		= update_curr_dl,
+#ifdef CONFIG_SCHED_CORE
+	.task_is_throttled	= task_is_throttled_dl,
+#endif
 };
 
 /* Used for dl_bw check and update, used under sched_rt_handler()::mutex */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1637b65ba07a..18e1500e698d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -731,7 +731,7 @@ static void print_cpu(struct seq_file *m, int cpu)
 #define P(x)								\
 do {									\
 	if (sizeof(rq->x) == 4)						\
-		SEQ_printf(m, "  .%-30s: %ld\n", #x, (long)(rq->x));	\
+		SEQ_printf(m, "  .%-30s: %d\n", #x, (int)(rq->x));	\
 	else								\
 		SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rq->x));\
 } while (0)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fce570ff3673..bf803fe5c19b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1082,6 +1082,23 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+static inline bool is_core_idle(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+	int sibling;
+
+	for_each_cpu(sibling, cpu_smt_mask(cpu)) {
+		if (cpu == sibling)
+			continue;
+
+		if (!idle_cpu(sibling))
+			return false;
+	}
+#endif
+
+	return true;
+}
+
 #ifdef CONFIG_NUMA
 #define NUMA_IMBALANCE_MIN 2
 
@@ -1718,23 +1735,6 @@ struct numa_stats {
 	int idle_cpu;
 };
 
-static inline bool is_core_idle(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
-	int sibling;
-
-	for_each_cpu(sibling, cpu_smt_mask(cpu)) {
-		if (cpu == sibling)
-			continue;
-
-		if (!idle_cpu(sibling))
-			return false;
-	}
-#endif
-
-	return true;
-}
-
 struct task_numa_env {
 	struct task_struct *p;
 
@@ -5977,6 +5977,10 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 	cfs_b->period_timer.function = sched_cfs_period_timer;
+
+	/* Add a random offset so that timers interleave */
+	hrtimer_set_expires(&cfs_b->period_timer,
+			    get_random_u32_below(cfs_b->period));
 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
 	cfs_b->slack_started = false;
@@ -9288,96 +9292,61 @@ group_type group_classify(unsigned int imbalance_pct,
 }
 
 /**
- * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks
- * @dst_cpu:	Destination CPU of the load balancing
+ * sched_use_asym_prio - Check whether asym_packing priority must be used
+ * @sd:		The scheduling domain of the load balancing
+ * @cpu:	A CPU
+ *
+ * Always use CPU priority when balancing load between SMT siblings. When
+ * balancing load between cores, it is not sufficient that @cpu is idle. Only
+ * use CPU priority if the whole core is idle.
+ *
+ * Returns: True if the priority of @cpu must be followed. False otherwise.
+ */
+static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
+{
+	if (!sched_smt_active())
+		return true;
+
+	return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
+}
+
+/**
+ * sched_asym - Check if the destination CPU can do asym_packing load balance
+ * @env:	The load balancing environment
  * @sds:	Load-balancing data with statistics of the local group
  * @sgs:	Load-balancing statistics of the candidate busiest group
- * @sg:		The candidate busiest group
- *
- * Check the state of the SMT siblings of both @sds::local and @sg and decide
- * if @dst_cpu can pull tasks.
+ * @group:	The candidate busiest group
  *
- * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of
- * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks
- * only if @dst_cpu has higher priority.
+ * @env::dst_cpu can do asym_packing if it has higher priority than the
+ * preferred CPU of @group.
  *
- * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more
- * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority.
- * Bigger imbalances in the number of busy CPUs will be dealt with in
- * update_sd_pick_busiest().
+ * SMT is a special case. If we are balancing load between cores, @env::dst_cpu
+ * can do asym_packing balance only if all its SMT siblings are idle. Also, it
+ * can only do it if @group is an SMT group and has exactly on busy CPU. Larger
+ * imbalances in the number of CPUS are dealt with in find_busiest_group().
  *
- * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
- * of @dst_cpu are idle and @sg has lower priority.
+ * If we are balancing load within an SMT core, or at DIE domain level, always
+ * proceed.
  *
- * Return: true if @dst_cpu can pull tasks, false otherwise.
+ * Return: true if @env::dst_cpu can do with asym_packing load balance. False
+ * otherwise.
  */
-static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
-				    struct sg_lb_stats *sgs,
-				    struct sched_group *sg)
+static inline bool
+sched_asym(struct lb_env *env, struct sd_lb_stats *sds,  struct sg_lb_stats *sgs,
+	   struct sched_group *group)
 {
-#ifdef CONFIG_SCHED_SMT
-	bool local_is_smt, sg_is_smt;
-	int sg_busy_cpus;
-
-	local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY;
-	sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY;
-
-	sg_busy_cpus = sgs->group_weight - sgs->idle_cpus;
-
-	if (!local_is_smt) {
-		/*
-		 * If we are here, @dst_cpu is idle and does not have SMT
-		 * siblings. Pull tasks if candidate group has two or more
-		 * busy CPUs.
-		 */
-		if (sg_busy_cpus >= 2) /* implies sg_is_smt */
-			return true;
-
-		/*
-		 * @dst_cpu does not have SMT siblings. @sg may have SMT
-		 * siblings and only one is busy. In such case, @dst_cpu
-		 * can help if it has higher priority and is idle (i.e.,
-		 * it has no running tasks).
-		 */
-		return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
-	}
-
-	/* @dst_cpu has SMT siblings. */
-
-	if (sg_is_smt) {
-		int local_busy_cpus = sds->local->group_weight -
-				      sds->local_stat.idle_cpus;
-		int busy_cpus_delta = sg_busy_cpus - local_busy_cpus;
-
-		if (busy_cpus_delta == 1)
-			return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
-
+	/* Ensure that the whole local core is idle, if applicable. */
+	if (!sched_use_asym_prio(env->sd, env->dst_cpu))
 		return false;
-	}
 
 	/*
-	 * @sg does not have SMT siblings. Ensure that @sds::local does not end
-	 * up with more than one busy SMT sibling and only pull tasks if there
-	 * are not busy CPUs (i.e., no CPU has running tasks).
+	 * CPU priorities does not make sense for SMT cores with more than one
+	 * busy sibling.
 	 */
-	if (!sds->local_stat.sum_nr_running)
-		return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
-
-	return false;
-#else
-	/* Always return false so that callers deal with non-SMT cases. */
-	return false;
-#endif
-}
-
-static inline bool
-sched_asym(struct lb_env *env, struct sd_lb_stats *sds,  struct sg_lb_stats *sgs,
-	   struct sched_group *group)
-{
-	/* Only do SMT checks if either local or candidate have SMT siblings */
-	if ((sds->local->flags & SD_SHARE_CPUCAPACITY) ||
-	    (group->flags & SD_SHARE_CPUCAPACITY))
-		return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group);
+	if (group->flags & SD_SHARE_CPUCAPACITY) {
+		if (sgs->group_weight - sgs->idle_cpus != 1)
+			return false;
+	}
 
 	return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
 }
@@ -9567,10 +9536,22 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 		 * contention when accessing shared HW resources.
 		 *
 		 * XXX for now avg_load is not computed and always 0 so we
-		 * select the 1st one.
+		 * select the 1st one, except if @sg is composed of SMT
+		 * siblings.
 		 */
-		if (sgs->avg_load <= busiest->avg_load)
+
+		if (sgs->avg_load < busiest->avg_load)
 			return false;
+
+		if (sgs->avg_load == busiest->avg_load) {
+			/*
+			 * SMT sched groups need more help than non-SMT groups.
+			 * If @sg happens to also be SMT, either choice is good.
+			 */
+			if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
+				return false;
+		}
+
 		break;
 
 	case group_has_spare:
@@ -10045,7 +10026,6 @@ static void update_idle_cpu_scan(struct lb_env *env,
 
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
-	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
 	struct sg_lb_stats tmp_sgs;
@@ -10086,8 +10066,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		sg = sg->next;
 	} while (sg != env->sd->groups);
 
-	/* Tag domain that child domain prefers tasks go to siblings first */
-	sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
+	/*
+	 * Indicate that the child domain of the busiest group prefers tasks
+	 * go to a child's sibling domains first. NB the flags of a sched group
+	 * are those of the child domain.
+	 */
+	if (sds->busiest)
+		sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING);
 
 
 	if (env->sd->flags & SD_NUMA)
@@ -10397,7 +10382,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 			goto out_balanced;
 	}
 
-	/* Try to move all excess tasks to child's sibling domain */
+	/*
+	 * Try to move all excess tasks to a sibling domain of the busiest
+	 * group's child domain.
+	 */
 	if (sds.prefer_sibling && local->group_type == group_has_spare &&
 	    busiest->sum_nr_running > local->sum_nr_running + 1)
 		goto force_balance;
@@ -10499,8 +10487,15 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 		    nr_running == 1)
 			continue;
 
-		/* Make sure we only pull tasks from a CPU of lower priority */
+		/*
+		 * Make sure we only pull tasks from a CPU of lower priority
+		 * when balancing between SMT siblings.
+		 *
+		 * If balancing between cores, let lower priority CPUs help
+		 * SMT cores with more than one busy sibling.
+		 */
 		if ((env->sd->flags & SD_ASYM_PACKING) &&
+		    sched_use_asym_prio(env->sd, i) &&
 		    sched_asym_prefer(i, env->dst_cpu) &&
 		    nr_running == 1)
 			continue;
@@ -10589,12 +10584,19 @@ static inline bool
 asym_active_balance(struct lb_env *env)
 {
 	/*
-	 * ASYM_PACKING needs to force migrate tasks from busy but
-	 * lower priority CPUs in order to pack all tasks in the
-	 * highest priority CPUs.
+	 * ASYM_PACKING needs to force migrate tasks from busy but lower
+	 * priority CPUs in order to pack all tasks in the highest priority
+	 * CPUs. When done between cores, do it only if the whole core if the
+	 * whole core is idle.
+	 *
+	 * If @env::src_cpu is an SMT core with busy siblings, let
+	 * the lower priority @env::dst_cpu help it. Do not follow
+	 * CPU priority.
 	 */
 	return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
-	       sched_asym_prefer(env->dst_cpu, env->src_cpu);
+	       sched_use_asym_prio(env->sd, env->dst_cpu) &&
+	       (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
+		!sched_use_asym_prio(env->sd, env->src_cpu));
 }
 
 static inline bool
@@ -11328,9 +11330,13 @@ static void nohz_balancer_kick(struct rq *rq)
 		 * When ASYM_PACKING; see if there's a more preferred CPU
 		 * currently idle; in which case, kick the ILB to move tasks
 		 * around.
+		 *
+		 * When balancing betwen cores, all the SMT siblings of the
+		 * preferred CPU must be idle.
 		 */
 		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
-			if (sched_asym_prefer(i, cpu)) {
+			if (sched_use_asym_prio(sd, i) &&
+			    sched_asym_prefer(i, cpu)) {
 				flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 				goto unlock;
 			}
@@ -11994,6 +12000,18 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
 
 	return delta > 0;
 }
+
+static int task_is_throttled_fair(struct task_struct *p, int cpu)
+{
+	struct cfs_rq *cfs_rq;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	cfs_rq = task_group(p)->cfs_rq[cpu];
+#else
+	cfs_rq = &cpu_rq(cpu)->cfs;
+#endif
+	return throttled_hierarchy(cfs_rq);
+}
 #else
 static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
 #endif
@@ -12620,6 +12638,10 @@ DEFINE_SCHED_CLASS(fair) = {
 	.task_change_group	= task_change_group_fair,
 #endif
 
+#ifdef CONFIG_SCHED_CORE
+	.task_is_throttled	= task_is_throttled_fair,
+#endif
+
 #ifdef CONFIG_UCLAMP_TASK
 	.uclamp_enabled		= 1,
 #endif
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 0f310768260c..036b0e2cd2b4 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -467,3 +467,63 @@ int update_irq_load_avg(struct rq *rq, u64 running)
 	return ret;
 }
 #endif
+
+__read_mostly unsigned int sched_pelt_lshift;
+
+#ifdef CONFIG_SYSCTL
+static unsigned int sysctl_sched_pelt_multiplier = 1;
+
+int sched_pelt_multiplier(struct ctl_table *table, int write, void *buffer,
+			  size_t *lenp, loff_t *ppos)
+{
+	static DEFINE_MUTEX(mutex);
+	unsigned int old;
+	int ret;
+
+	mutex_lock(&mutex);
+	old = sysctl_sched_pelt_multiplier;
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (ret)
+		goto undo;
+	if (!write)
+		goto done;
+
+	switch (sysctl_sched_pelt_multiplier)  {
+	case 1:
+		fallthrough;
+	case 2:
+		fallthrough;
+	case 4:
+		WRITE_ONCE(sched_pelt_lshift,
+			   sysctl_sched_pelt_multiplier >> 1);
+		goto done;
+	default:
+		ret = -EINVAL;
+	}
+
+undo:
+	sysctl_sched_pelt_multiplier = old;
+done:
+	mutex_unlock(&mutex);
+
+	return ret;
+}
+
+static struct ctl_table sched_pelt_sysctls[] = {
+	{
+		.procname       = "sched_pelt_multiplier",
+		.data           = &sysctl_sched_pelt_multiplier,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = sched_pelt_multiplier,
+	},
+	{}
+};
+
+static int __init sched_pelt_sysctl_init(void)
+{
+	register_sysctl_init("kernel", sched_pelt_sysctls);
+	return 0;
+}
+late_initcall(sched_pelt_sysctl_init);
+#endif
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 3a0e0dc28721..9b35b5072bae 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -61,6 +61,14 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
 	WRITE_ONCE(avg->util_est.enqueued, enqueued);
 }
 
+static inline u64 rq_clock_task_mult(struct rq *rq)
+{
+	lockdep_assert_rq_held(rq);
+	assert_clock_updated(rq);
+
+	return rq->clock_task_mult;
+}
+
 static inline u64 rq_clock_pelt(struct rq *rq)
 {
 	lockdep_assert_rq_held(rq);
@@ -72,7 +80,7 @@ static inline u64 rq_clock_pelt(struct rq *rq)
 /* The rq is idle, we can sync to clock_task */
 static inline void _update_idle_rq_clock_pelt(struct rq *rq)
 {
-	rq->clock_pelt  = rq_clock_task(rq);
+	rq->clock_pelt = rq_clock_task_mult(rq);
 
 	u64_u32_store(rq->clock_idle, rq_clock(rq));
 	/* Paired with smp_rmb in migrate_se_pelt_lag() */
@@ -121,6 +129,27 @@ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
 	rq->clock_pelt += delta;
 }
 
+extern unsigned int sched_pelt_lshift;
+
+/*
+ * absolute time   |1      |2      |3      |4      |5      |6      |
+ * @ mult = 1      --------****************--------****************-
+ * @ mult = 2      --------********----------------********---------
+ * @ mult = 4      --------****--------------------****-------------
+ * clock task mult
+ * @ mult = 2      |   |   |2  |3  |   |   |   |   |5  |6  |   |   |
+ * @ mult = 4      | | | | |2|3| | | | | | | | | | |5|6| | | | | | |
+ *
+ */
+static inline void update_rq_clock_task_mult(struct rq *rq, s64 delta)
+{
+	delta <<= READ_ONCE(sched_pelt_lshift);
+
+	rq->clock_task_mult += delta;
+
+	update_rq_clock_pelt(rq, delta);
+}
+
 /*
  * When rq becomes idle, we have to check if it has lost idle time
  * because it was fully busy. A rq is fully used when the /Sum util_sum
@@ -147,7 +176,7 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq)
 	 * rq's clock_task.
 	 */
 	if (util_sum >= divider)
-		rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
+		rq->lost_idle_time += rq_clock_task_mult(rq) - rq->clock_pelt;
 
 	_update_idle_rq_clock_pelt(rq);
 }
@@ -218,13 +247,18 @@ update_irq_load_avg(struct rq *rq, u64 running)
 	return 0;
 }
 
-static inline u64 rq_clock_pelt(struct rq *rq)
+static inline u64 rq_clock_task_mult(struct rq *rq)
 {
 	return rq_clock_task(rq);
 }
 
+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+	return rq_clock_task_mult(rq);
+}
+
 static inline void
-update_rq_clock_pelt(struct rq *rq, s64 delta) { }
+update_rq_clock_task_mult(struct rq *rq, s64 delta) { }
 
 static inline void
 update_idle_rq_clock_pelt(struct rq *rq) { }
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 02e011cabe91..81fca77397f6 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -160,7 +160,6 @@ __setup("psi=", setup_psi);
 #define EXP_300s	2034		/* 1/exp(2s/300s) */
 
 /* PSI trigger definitions */
-#define WINDOW_MIN_US 500000	/* Min window size is 500ms */
 #define WINDOW_MAX_US 10000000	/* Max window size is 10s */
 #define UPDATES_PER_WINDOW 10	/* 10 updates per window */
 
@@ -186,17 +185,22 @@ static void group_init(struct psi_group *group)
 		seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
 	group->avg_last_update = sched_clock();
 	group->avg_next_update = group->avg_last_update + psi_period;
-	INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
 	mutex_init(&group->avgs_lock);
-	/* Init trigger-related members */
-	atomic_set(&group->poll_scheduled, 0);
-	mutex_init(&group->trigger_lock);
-	INIT_LIST_HEAD(&group->triggers);
-	group->poll_min_period = U32_MAX;
-	group->polling_next_update = ULLONG_MAX;
-	init_waitqueue_head(&group->poll_wait);
-	timer_setup(&group->poll_timer, poll_timer_fn, 0);
-	rcu_assign_pointer(group->poll_task, NULL);
+
+	/* Init avg trigger-related members */
+	INIT_LIST_HEAD(&group->avg_triggers);
+	memset(group->avg_nr_triggers, 0, sizeof(group->avg_nr_triggers));
+	INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
+
+	/* Init rtpoll trigger-related members */
+	atomic_set(&group->rtpoll_scheduled, 0);
+	mutex_init(&group->rtpoll_trigger_lock);
+	INIT_LIST_HEAD(&group->rtpoll_triggers);
+	group->rtpoll_min_period = U32_MAX;
+	group->rtpoll_next_update = ULLONG_MAX;
+	init_waitqueue_head(&group->rtpoll_wait);
+	timer_setup(&group->rtpoll_timer, poll_timer_fn, 0);
+	rcu_assign_pointer(group->rtpoll_task, NULL);
 }
 
 void __init psi_init(void)
@@ -384,92 +388,6 @@ static void collect_percpu_times(struct psi_group *group,
 		*pchanged_states = changed_states;
 }
 
-static u64 update_averages(struct psi_group *group, u64 now)
-{
-	unsigned long missed_periods = 0;
-	u64 expires, period;
-	u64 avg_next_update;
-	int s;
-
-	/* avgX= */
-	expires = group->avg_next_update;
-	if (now - expires >= psi_period)
-		missed_periods = div_u64(now - expires, psi_period);
-
-	/*
-	 * The periodic clock tick can get delayed for various
-	 * reasons, especially on loaded systems. To avoid clock
-	 * drift, we schedule the clock in fixed psi_period intervals.
-	 * But the deltas we sample out of the per-cpu buckets above
-	 * are based on the actual time elapsing between clock ticks.
-	 */
-	avg_next_update = expires + ((1 + missed_periods) * psi_period);
-	period = now - (group->avg_last_update + (missed_periods * psi_period));
-	group->avg_last_update = now;
-
-	for (s = 0; s < NR_PSI_STATES - 1; s++) {
-		u32 sample;
-
-		sample = group->total[PSI_AVGS][s] - group->avg_total[s];
-		/*
-		 * Due to the lockless sampling of the time buckets,
-		 * recorded time deltas can slip into the next period,
-		 * which under full pressure can result in samples in
-		 * excess of the period length.
-		 *
-		 * We don't want to report non-sensical pressures in
-		 * excess of 100%, nor do we want to drop such events
-		 * on the floor. Instead we punt any overage into the
-		 * future until pressure subsides. By doing this we
-		 * don't underreport the occurring pressure curve, we
-		 * just report it delayed by one period length.
-		 *
-		 * The error isn't cumulative. As soon as another
-		 * delta slips from a period P to P+1, by definition
-		 * it frees up its time T in P.
-		 */
-		if (sample > period)
-			sample = period;
-		group->avg_total[s] += sample;
-		calc_avgs(group->avg[s], missed_periods, sample, period);
-	}
-
-	return avg_next_update;
-}
-
-static void psi_avgs_work(struct work_struct *work)
-{
-	struct delayed_work *dwork;
-	struct psi_group *group;
-	u32 changed_states;
-	u64 now;
-
-	dwork = to_delayed_work(work);
-	group = container_of(dwork, struct psi_group, avgs_work);
-
-	mutex_lock(&group->avgs_lock);
-
-	now = sched_clock();
-
-	collect_percpu_times(group, PSI_AVGS, &changed_states);
-	/*
-	 * If there is task activity, periodically fold the per-cpu
-	 * times and feed samples into the running averages. If things
-	 * are idle and there is no data to process, stop the clock.
-	 * Once restarted, we'll catch up the running averages in one
-	 * go - see calc_avgs() and missed_periods.
-	 */
-	if (now >= group->avg_next_update)
-		group->avg_next_update = update_averages(group, now);
-
-	if (changed_states & PSI_STATE_RESCHEDULE) {
-		schedule_delayed_work(dwork, nsecs_to_jiffies(
-				group->avg_next_update - now) + 1);
-	}
-
-	mutex_unlock(&group->avgs_lock);
-}
-
 /* Trigger tracking window manipulations */
 static void window_reset(struct psi_window *win, u64 now, u64 value,
 			 u64 prev_growth)
@@ -516,33 +434,32 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
 	return growth;
 }
 
-static void init_triggers(struct psi_group *group, u64 now)
+static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
+						   enum psi_aggregators aggregator)
 {
 	struct psi_trigger *t;
-
-	list_for_each_entry(t, &group->triggers, node)
-		window_reset(&t->win, now,
-				group->total[PSI_POLL][t->state], 0);
-	memcpy(group->polling_total, group->total[PSI_POLL],
-		   sizeof(group->polling_total));
-	group->polling_next_update = now + group->poll_min_period;
-}
-
-static u64 update_triggers(struct psi_group *group, u64 now)
-{
-	struct psi_trigger *t;
-	bool update_total = false;
-	u64 *total = group->total[PSI_POLL];
+	u64 *total = group->total[aggregator];
+	struct list_head *triggers;
+	u64 *aggregator_total;
+	*update_total = false;
+
+	if (aggregator == PSI_AVGS) {
+		triggers = &group->avg_triggers;
+		aggregator_total = group->avg_total;
+	} else {
+		triggers = &group->rtpoll_triggers;
+		aggregator_total = group->rtpoll_total;
+	}
 
 	/*
 	 * On subsequent updates, calculate growth deltas and let
 	 * watchers know when their specified thresholds are exceeded.
 	 */
-	list_for_each_entry(t, &group->triggers, node) {
+	list_for_each_entry(t, triggers, node) {
 		u64 growth;
 		bool new_stall;
 
-		new_stall = group->polling_total[t->state] != total[t->state];
+		new_stall = aggregator_total[t->state] != total[t->state];
 
 		/* Check for stall activity or a previous threshold breach */
 		if (!new_stall && !t->pending_event)
@@ -560,7 +477,7 @@ static u64 update_triggers(struct psi_group *group, u64 now)
 			 * been through all of them. Also remember to extend the
 			 * polling time if we see new stall activity.
 			 */
-			update_total = true;
+			*update_total = true;
 
 			/* Calculate growth since last update */
 			growth = window_update(&t->win, now, total[t->state]);
@@ -583,52 +500,150 @@ static u64 update_triggers(struct psi_group *group, u64 now)
 		t->pending_event = false;
 	}
 
-	if (update_total)
-		memcpy(group->polling_total, total,
-				sizeof(group->polling_total));
+	return now + group->rtpoll_min_period;
+}
 
-	return now + group->poll_min_period;
+static u64 update_averages(struct psi_group *group, u64 now)
+{
+	unsigned long missed_periods = 0;
+	u64 expires, period;
+	u64 avg_next_update;
+	int s;
+
+	/* avgX= */
+	expires = group->avg_next_update;
+	if (now - expires >= psi_period)
+		missed_periods = div_u64(now - expires, psi_period);
+
+	/*
+	 * The periodic clock tick can get delayed for various
+	 * reasons, especially on loaded systems. To avoid clock
+	 * drift, we schedule the clock in fixed psi_period intervals.
+	 * But the deltas we sample out of the per-cpu buckets above
+	 * are based on the actual time elapsing between clock ticks.
+	 */
+	avg_next_update = expires + ((1 + missed_periods) * psi_period);
+	period = now - (group->avg_last_update + (missed_periods * psi_period));
+	group->avg_last_update = now;
+
+	for (s = 0; s < NR_PSI_STATES - 1; s++) {
+		u32 sample;
+
+		sample = group->total[PSI_AVGS][s] - group->avg_total[s];
+		/*
+		 * Due to the lockless sampling of the time buckets,
+		 * recorded time deltas can slip into the next period,
+		 * which under full pressure can result in samples in
+		 * excess of the period length.
+		 *
+		 * We don't want to report non-sensical pressures in
+		 * excess of 100%, nor do we want to drop such events
+		 * on the floor. Instead we punt any overage into the
+		 * future until pressure subsides. By doing this we
+		 * don't underreport the occurring pressure curve, we
+		 * just report it delayed by one period length.
+		 *
+		 * The error isn't cumulative. As soon as another
+		 * delta slips from a period P to P+1, by definition
+		 * it frees up its time T in P.
+		 */
+		if (sample > period)
+			sample = period;
+		group->avg_total[s] += sample;
+		calc_avgs(group->avg[s], missed_periods, sample, period);
+	}
+
+	return avg_next_update;
+}
+
+static void psi_avgs_work(struct work_struct *work)
+{
+	struct delayed_work *dwork;
+	struct psi_group *group;
+	u32 changed_states;
+	bool update_total;
+	u64 now;
+
+	dwork = to_delayed_work(work);
+	group = container_of(dwork, struct psi_group, avgs_work);
+
+	mutex_lock(&group->avgs_lock);
+
+	now = sched_clock();
+
+	collect_percpu_times(group, PSI_AVGS, &changed_states);
+	/*
+	 * If there is task activity, periodically fold the per-cpu
+	 * times and feed samples into the running averages. If things
+	 * are idle and there is no data to process, stop the clock.
+	 * Once restarted, we'll catch up the running averages in one
+	 * go - see calc_avgs() and missed_periods.
+	 */
+	if (now >= group->avg_next_update) {
+		update_triggers(group, now, &update_total, PSI_AVGS);
+		group->avg_next_update = update_averages(group, now);
+	}
+
+	if (changed_states & PSI_STATE_RESCHEDULE) {
+		schedule_delayed_work(dwork, nsecs_to_jiffies(
+				group->avg_next_update - now) + 1);
+	}
+
+	mutex_unlock(&group->avgs_lock);
+}
+
+static void init_rtpoll_triggers(struct psi_group *group, u64 now)
+{
+	struct psi_trigger *t;
+
+	list_for_each_entry(t, &group->rtpoll_triggers, node)
+		window_reset(&t->win, now,
+				group->total[PSI_POLL][t->state], 0);
+	memcpy(group->rtpoll_total, group->total[PSI_POLL],
+		   sizeof(group->rtpoll_total));
+	group->rtpoll_next_update = now + group->rtpoll_min_period;
 }
 
 /* Schedule polling if it's not already scheduled or forced. */
-static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay,
+static void psi_schedule_rtpoll_work(struct psi_group *group, unsigned long delay,
 				   bool force)
 {
 	struct task_struct *task;
 
 	/*
 	 * atomic_xchg should be called even when !force to provide a
-	 * full memory barrier (see the comment inside psi_poll_work).
+	 * full memory barrier (see the comment inside psi_rtpoll_work).
 	 */
-	if (atomic_xchg(&group->poll_scheduled, 1) && !force)
+	if (atomic_xchg(&group->rtpoll_scheduled, 1) && !force)
 		return;
 
 	rcu_read_lock();
 
-	task = rcu_dereference(group->poll_task);
+	task = rcu_dereference(group->rtpoll_task);
 	/*
 	 * kworker might be NULL in case psi_trigger_destroy races with
 	 * psi_task_change (hotpath) which can't use locks
 	 */
 	if (likely(task))
-		mod_timer(&group->poll_timer, jiffies + delay);
+		mod_timer(&group->rtpoll_timer, jiffies + delay);
 	else
-		atomic_set(&group->poll_scheduled, 0);
+		atomic_set(&group->rtpoll_scheduled, 0);
 
 	rcu_read_unlock();
 }
 
-static void psi_poll_work(struct psi_group *group)
+static void psi_rtpoll_work(struct psi_group *group)
 {
 	bool force_reschedule = false;
 	u32 changed_states;
+	bool update_total;
 	u64 now;
 
-	mutex_lock(&group->trigger_lock);
+	mutex_lock(&group->rtpoll_trigger_lock);
 
 	now = sched_clock();
 
-	if (now > group->polling_until) {
+	if (now > group->rtpoll_until) {
 		/*
 		 * We are either about to start or might stop polling if no
 		 * state change was recorded. Resetting poll_scheduled leaves
@@ -638,7 +653,7 @@ static void psi_poll_work(struct psi_group *group)
 		 * should be negligible and polling_next_update still keeps
 		 * updates correctly on schedule.
 		 */
-		atomic_set(&group->poll_scheduled, 0);
+		atomic_set(&group->rtpoll_scheduled, 0);
 		/*
 		 * A task change can race with the poll worker that is supposed to
 		 * report on it. To avoid missing events, ensure ordering between
@@ -667,60 +682,64 @@ static void psi_poll_work(struct psi_group *group)
 
 	collect_percpu_times(group, PSI_POLL, &changed_states);
 
-	if (changed_states & group->poll_states) {
+	if (changed_states & group->rtpoll_states) {
 		/* Initialize trigger windows when entering polling mode */
-		if (now > group->polling_until)
-			init_triggers(group, now);
+		if (now > group->rtpoll_until)
+			init_rtpoll_triggers(group, now);
 
 		/*
 		 * Keep the monitor active for at least the duration of the
 		 * minimum tracking window as long as monitor states are
 		 * changing.
 		 */
-		group->polling_until = now +
-			group->poll_min_period * UPDATES_PER_WINDOW;
+		group->rtpoll_until = now +
+			group->rtpoll_min_period * UPDATES_PER_WINDOW;
 	}
 
-	if (now > group->polling_until) {
-		group->polling_next_update = ULLONG_MAX;
+	if (now > group->rtpoll_until) {
+		group->rtpoll_next_update = ULLONG_MAX;
 		goto out;
 	}
 
-	if (now >= group->polling_next_update)
-		group->polling_next_update = update_triggers(group, now);
+	if (now >= group->rtpoll_next_update) {
+		group->rtpoll_next_update = update_triggers(group, now, &update_total, PSI_POLL);
+		if (update_total)
+			memcpy(group->rtpoll_total, group->total[PSI_POLL],
+				   sizeof(group->rtpoll_total));
+	}
 
-	psi_schedule_poll_work(group,
-		nsecs_to_jiffies(group->polling_next_update - now) + 1,
+	psi_schedule_rtpoll_work(group,
+		nsecs_to_jiffies(group->rtpoll_next_update - now) + 1,
 		force_reschedule);
 
 out:
-	mutex_unlock(&group->trigger_lock);
+	mutex_unlock(&group->rtpoll_trigger_lock);
 }
 
-static int psi_poll_worker(void *data)
+static int psi_rtpoll_worker(void *data)
 {
 	struct psi_group *group = (struct psi_group *)data;
 
 	sched_set_fifo_low(current);
 
 	while (true) {
-		wait_event_interruptible(group->poll_wait,
-				atomic_cmpxchg(&group->poll_wakeup, 1, 0) ||
+		wait_event_interruptible(group->rtpoll_wait,
+				atomic_cmpxchg(&group->rtpoll_wakeup, 1, 0) ||
 				kthread_should_stop());
 		if (kthread_should_stop())
 			break;
 
-		psi_poll_work(group);
+		psi_rtpoll_work(group);
 	}
 	return 0;
 }
 
 static void poll_timer_fn(struct timer_list *t)
 {
-	struct psi_group *group = from_timer(group, t, poll_timer);
+	struct psi_group *group = from_timer(group, t, rtpoll_timer);
 
-	atomic_set(&group->poll_wakeup, 1);
-	wake_up_interruptible(&group->poll_wait);
+	atomic_set(&group->rtpoll_wakeup, 1);
+	wake_up_interruptible(&group->rtpoll_wait);
 }
 
 static void record_times(struct psi_group_cpu *groupc, u64 now)
@@ -851,8 +870,8 @@ static void psi_group_change(struct psi_group *group, int cpu,
 
 	write_seqcount_end(&groupc->seq);
 
-	if (state_mask & group->poll_states)
-		psi_schedule_poll_work(group, 1, false);
+	if (state_mask & group->rtpoll_states)
+		psi_schedule_rtpoll_work(group, 1, false);
 
 	if (wake_clock && !delayed_work_pending(&group->avgs_work))
 		schedule_delayed_work(&group->avgs_work, PSI_FREQ);
@@ -1005,8 +1024,8 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
 
 		write_seqcount_end(&groupc->seq);
 
-		if (group->poll_states & (1 << PSI_IRQ_FULL))
-			psi_schedule_poll_work(group, 1, false);
+		if (group->rtpoll_states & (1 << PSI_IRQ_FULL))
+			psi_schedule_rtpoll_work(group, 1, false);
 	} while ((group = group->parent));
 }
 #endif
@@ -1101,7 +1120,7 @@ void psi_cgroup_free(struct cgroup *cgroup)
 	cancel_delayed_work_sync(&cgroup->psi->avgs_work);
 	free_percpu(cgroup->psi->pcpu);
 	/* All triggers must be removed by now */
-	WARN_ONCE(cgroup->psi->poll_states, "psi: trigger leak\n");
+	WARN_ONCE(cgroup->psi->rtpoll_states, "psi: trigger leak\n");
 	kfree(cgroup->psi);
 }
 
@@ -1253,16 +1272,23 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 }
 
 struct psi_trigger *psi_trigger_create(struct psi_group *group,
-			char *buf, enum psi_res res)
+			char *buf, enum psi_res res, struct file *file)
 {
 	struct psi_trigger *t;
 	enum psi_states state;
 	u32 threshold_us;
+	bool privileged;
 	u32 window_us;
 
 	if (static_branch_likely(&psi_disabled))
 		return ERR_PTR(-EOPNOTSUPP);
 
+	/*
+	 * Checking the privilege here on file->f_cred implies that a privileged user
+	 * could open the file and delegate the write to an unprivileged one.
+	 */
+	privileged = cap_raised(file->f_cred->cap_effective, CAP_SYS_RESOURCE);
+
 	if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
 		state = PSI_IO_SOME + res * 2;
 	else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
@@ -1278,8 +1304,14 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
 	if (state >= PSI_NONIDLE)
 		return ERR_PTR(-EINVAL);
 
-	if (window_us < WINDOW_MIN_US ||
-		window_us > WINDOW_MAX_US)
+	if (window_us == 0 || window_us > WINDOW_MAX_US)
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * Unprivileged users can only use 2s windows so that averages aggregation
+	 * work is used, and no RT threads need to be spawned.
+	 */
+	if (!privileged && window_us % 2000000)
 		return ERR_PTR(-EINVAL);
 
 	/* Check threshold */
@@ -1301,31 +1333,40 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
 	t->last_event_time = 0;
 	init_waitqueue_head(&t->event_wait);
 	t->pending_event = false;
+	t->aggregator = privileged ? PSI_POLL : PSI_AVGS;
 
-	mutex_lock(&group->trigger_lock);
+	if (privileged) {
+		mutex_lock(&group->rtpoll_trigger_lock);
 
-	if (!rcu_access_pointer(group->poll_task)) {
-		struct task_struct *task;
+		if (!rcu_access_pointer(group->rtpoll_task)) {
+			struct task_struct *task;
 
-		task = kthread_create(psi_poll_worker, group, "psimon");
-		if (IS_ERR(task)) {
-			kfree(t);
-			mutex_unlock(&group->trigger_lock);
-			return ERR_CAST(task);
+			task = kthread_create(psi_rtpoll_worker, group, "psimon");
+			if (IS_ERR(task)) {
+				kfree(t);
+				mutex_unlock(&group->rtpoll_trigger_lock);
+				return ERR_CAST(task);
+			}
+			atomic_set(&group->rtpoll_wakeup, 0);
+			wake_up_process(task);
+			rcu_assign_pointer(group->rtpoll_task, task);
 		}
-		atomic_set(&group->poll_wakeup, 0);
-		wake_up_process(task);
-		rcu_assign_pointer(group->poll_task, task);
-	}
 
-	list_add(&t->node, &group->triggers);
-	group->poll_min_period = min(group->poll_min_period,
-		div_u64(t->win.size, UPDATES_PER_WINDOW));
-	group->nr_triggers[t->state]++;
-	group->poll_states |= (1 << t->state);
+		list_add(&t->node, &group->rtpoll_triggers);
+		group->rtpoll_min_period = min(group->rtpoll_min_period,
+			div_u64(t->win.size, UPDATES_PER_WINDOW));
+		group->rtpoll_nr_triggers[t->state]++;
+		group->rtpoll_states |= (1 << t->state);
+
+		mutex_unlock(&group->rtpoll_trigger_lock);
+	} else {
+		mutex_lock(&group->avgs_lock);
 
-	mutex_unlock(&group->trigger_lock);
+		list_add(&t->node, &group->avg_triggers);
+		group->avg_nr_triggers[t->state]++;
 
+		mutex_unlock(&group->avgs_lock);
+	}
 	return t;
 }
 
@@ -1349,51 +1390,64 @@ void psi_trigger_destroy(struct psi_trigger *t)
 	 */
 	wake_up_pollfree(&t->event_wait);
 
-	mutex_lock(&group->trigger_lock);
-
-	if (!list_empty(&t->node)) {
-		struct psi_trigger *tmp;
-		u64 period = ULLONG_MAX;
-
-		list_del(&t->node);
-		group->nr_triggers[t->state]--;
-		if (!group->nr_triggers[t->state])
-			group->poll_states &= ~(1 << t->state);
-		/* reset min update period for the remaining triggers */
-		list_for_each_entry(tmp, &group->triggers, node)
-			period = min(period, div_u64(tmp->win.size,
-					UPDATES_PER_WINDOW));
-		group->poll_min_period = period;
-		/* Destroy poll_task when the last trigger is destroyed */
-		if (group->poll_states == 0) {
-			group->polling_until = 0;
-			task_to_destroy = rcu_dereference_protected(
-					group->poll_task,
-					lockdep_is_held(&group->trigger_lock));
-			rcu_assign_pointer(group->poll_task, NULL);
-			del_timer(&group->poll_timer);
+	if (t->aggregator == PSI_AVGS) {
+		mutex_lock(&group->avgs_lock);
+		if (!list_empty(&t->node)) {
+			list_del(&t->node);
+			group->avg_nr_triggers[t->state]--;
 		}
+		mutex_unlock(&group->avgs_lock);
+	} else {
+		mutex_lock(&group->rtpoll_trigger_lock);
+		if (!list_empty(&t->node)) {
+			struct psi_trigger *tmp;
+			u64 period = ULLONG_MAX;
+
+			list_del(&t->node);
+			group->rtpoll_nr_triggers[t->state]--;
+			if (!group->rtpoll_nr_triggers[t->state])
+				group->rtpoll_states &= ~(1 << t->state);
+			/*
+			 * Reset min update period for the remaining triggers
+			 * iff the destroying trigger had the min window size.
+			 */
+			if (group->rtpoll_min_period == div_u64(t->win.size, UPDATES_PER_WINDOW)) {
+				list_for_each_entry(tmp, &group->rtpoll_triggers, node)
+					period = min(period, div_u64(tmp->win.size,
+							UPDATES_PER_WINDOW));
+				group->rtpoll_min_period = period;
+			}
+			/* Destroy rtpoll_task when the last trigger is destroyed */
+			if (group->rtpoll_states == 0) {
+				group->rtpoll_until = 0;
+				task_to_destroy = rcu_dereference_protected(
+						group->rtpoll_task,
+						lockdep_is_held(&group->rtpoll_trigger_lock));
+				rcu_assign_pointer(group->rtpoll_task, NULL);
+				del_timer(&group->rtpoll_timer);
+			}
+		}
+		mutex_unlock(&group->rtpoll_trigger_lock);
 	}
 
-	mutex_unlock(&group->trigger_lock);
-
 	/*
-	 * Wait for psi_schedule_poll_work RCU to complete its read-side
+	 * Wait for psi_schedule_rtpoll_work RCU to complete its read-side
 	 * critical section before destroying the trigger and optionally the
-	 * poll_task.
+	 * rtpoll_task.
 	 */
 	synchronize_rcu();
 	/*
-	 * Stop kthread 'psimon' after releasing trigger_lock to prevent a
-	 * deadlock while waiting for psi_poll_work to acquire trigger_lock
+	 * Stop kthread 'psimon' after releasing rtpoll_trigger_lock to prevent
+	 * a deadlock while waiting for psi_rtpoll_work to acquire
+	 * rtpoll_trigger_lock
 	 */
 	if (task_to_destroy) {
 		/*
 		 * After the RCU grace period has expired, the worker
-		 * can no longer be found through group->poll_task.
+		 * can no longer be found through group->rtpoll_task.
 		 */
 		kthread_stop(task_to_destroy);
-		atomic_set(&group->poll_scheduled, 0);
+		atomic_set(&group->rtpoll_scheduled, 0);
 	}
 	kfree(t);
 }
@@ -1435,27 +1489,19 @@ static int psi_cpu_show(struct seq_file *m, void *v)
 	return psi_show(m, &psi_system, PSI_CPU);
 }
 
-static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *))
-{
-	if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
-		return -EPERM;
-
-	return single_open(file, psi_show, NULL);
-}
-
 static int psi_io_open(struct inode *inode, struct file *file)
 {
-	return psi_open(file, psi_io_show);
+	return single_open(file, psi_io_show, NULL);
 }
 
 static int psi_memory_open(struct inode *inode, struct file *file)
 {
-	return psi_open(file, psi_memory_show);
+	return single_open(file, psi_memory_show, NULL);
 }
 
 static int psi_cpu_open(struct inode *inode, struct file *file)
 {
-	return psi_open(file, psi_cpu_show);
+	return single_open(file, psi_cpu_show, NULL);
 }
 
 static ssize_t psi_write(struct file *file, const char __user *user_buf,
@@ -1489,7 +1535,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
 		return -EBUSY;
 	}
 
-	new = psi_trigger_create(&psi_system, buf, res);
+	new = psi_trigger_create(&psi_system, buf, res, file);
 	if (IS_ERR(new)) {
 		mutex_unlock(&seq->lock);
 		return PTR_ERR(new);
@@ -1569,7 +1615,7 @@ static int psi_irq_show(struct seq_file *m, void *v)
 
 static int psi_irq_open(struct inode *inode, struct file *file)
 {
-	return psi_open(file, psi_irq_show);
+	return single_open(file, psi_irq_show, NULL);
 }
 
 static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4f5796dd26a5..00e0e5074115 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2681,6 +2681,21 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
 		return 0;
 }
 
+#ifdef CONFIG_SCHED_CORE
+static int task_is_throttled_rt(struct task_struct *p, int cpu)
+{
+	struct rt_rq *rt_rq;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+	rt_rq = task_group(p)->rt_rq[cpu];
+#else
+	rt_rq = &cpu_rq(cpu)->rt;
+#endif
+
+	return rt_rq_throttled(rt_rq);
+}
+#endif
+
 DEFINE_SCHED_CLASS(rt) = {
 
 	.enqueue_task		= enqueue_task_rt,
@@ -2714,6 +2729,10 @@ DEFINE_SCHED_CLASS(rt) = {
 
 	.update_curr		= update_curr_rt,
 
+#ifdef CONFIG_SCHED_CORE
+	.task_is_throttled	= task_is_throttled_rt,
+#endif
+
 #ifdef CONFIG_UCLAMP_TASK
 	.uclamp_enabled		= 1,
 #endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3e8df6d31c1e..cbea91a9d764 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -286,12 +286,6 @@ struct rt_bandwidth {
 
 void __dl_clear_params(struct task_struct *p);
 
-struct dl_bandwidth {
-	raw_spinlock_t		dl_runtime_lock;
-	u64			dl_runtime;
-	u64			dl_period;
-};
-
 static inline int dl_bandwidth_enabled(void)
 {
 	return sysctl_sched_rt_runtime >= 0;
@@ -1018,6 +1012,7 @@ struct rq {
 	u64			clock;
 	/* Ensure that all clocks are in the same cache line */
 	u64			clock_task ____cacheline_aligned;
+	u64			clock_task_mult;
 	u64			clock_pelt;
 	unsigned long		lost_idle_time;
 	u64			clock_pelt_idle;
@@ -1772,6 +1767,13 @@ queue_balance_callback(struct rq *rq,
 	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
 			__sd; __sd = __sd->parent)
 
+/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_SHARED_CHILD)) |
+static const unsigned int SD_SHARED_CHILD_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
 /**
  * highest_flag_domain - Return highest sched_domain containing flag.
  * @cpu:	The CPU whose highest level of sched domain is to
@@ -1779,16 +1781,25 @@ queue_balance_callback(struct rq *rq,
  * @flag:	The flag to check for the highest sched_domain
  *		for the given CPU.
  *
- * Returns the highest sched_domain of a CPU which contains the given flag.
+ * Returns the highest sched_domain of a CPU which contains @flag. If @flag has
+ * the SDF_SHARED_CHILD metaflag, all the children domains also have @flag.
  */
 static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
 {
 	struct sched_domain *sd, *hsd = NULL;
 
 	for_each_domain(cpu, sd) {
-		if (!(sd->flags & flag))
+		if (sd->flags & flag) {
+			hsd = sd;
+			continue;
+		}
+
+		/*
+		 * Stop the search if @flag is known to be shared at lower
+		 * levels. It will not be found further up.
+		 */
+		if (flag & SD_SHARED_CHILD_MASK)
 			break;
-		hsd = sd;
 	}
 
 	return hsd;
@@ -2224,6 +2235,10 @@ struct sched_class {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	void (*task_change_group)(struct task_struct *p);
 #endif
+
+#ifdef CONFIG_SCHED_CORE
+	int (*task_is_throttled)(struct task_struct *p, int cpu);
+#endif
 };
 
 static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
@@ -2374,7 +2389,6 @@ extern struct rt_bandwidth def_rt_bandwidth;
 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
 extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
 
-extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
 extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
 extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
 
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 051aaf65c749..ca4472281c28 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -209,8 +209,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 DEFINE_STATIC_KEY_FALSE(sched_energy_present);
 static unsigned int sysctl_sched_energy_aware = 1;
-DEFINE_MUTEX(sched_energy_mutex);
-bool sched_energy_update;
+static DEFINE_MUTEX(sched_energy_mutex);
+static bool sched_energy_update;
 
 void rebuild_sched_domains_energy(void)
 {
@@ -719,8 +719,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 
 		if (sd_parent_degenerate(tmp, parent)) {
 			tmp->parent = parent->parent;
-			if (parent->parent)
+
+			if (parent->parent) {
 				parent->parent->child = tmp;
+				if (tmp->flags & SD_SHARE_CPUCAPACITY)
+					parent->parent->groups->flags |= SD_SHARE_CPUCAPACITY;
+			}
+
 			/*
 			 * Transfer SD_PREFER_SIBLING down in case of a
 			 * degenerate parent; the spans match for this
-- 
2.41.0

From 5d8fbab87081a9cd7664a506396c5fe0a07a606b Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Wed, 14 Jun 2023 19:27:15 +0200
Subject: [PATCH 11/12] surface

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 MAINTAINERS                                   |   7 +
 arch/x86/kernel/acpi/boot.c                   |  24 +
 drivers/acpi/acpi_tad.c                       |  35 +-
 drivers/acpi/scan.c                           |   3 +
 drivers/bluetooth/btusb.c                     |  15 +
 drivers/hid/Kconfig                           |   4 +
 drivers/hid/Makefile                          |   3 +
 drivers/hid/hid-multitouch.c                  | 196 ++++++-
 drivers/hid/ipts/Kconfig                      |  14 +
 drivers/hid/ipts/Makefile                     |  14 +
 drivers/hid/ipts/cmd.c                        |  62 ++
 drivers/hid/ipts/cmd.h                        |  61 ++
 drivers/hid/ipts/context.h                    |  51 ++
 drivers/hid/ipts/control.c                    | 495 ++++++++++++++++
 drivers/hid/ipts/control.h                    | 127 +++++
 drivers/hid/ipts/desc.h                       |  81 +++
 drivers/hid/ipts/hid.c                        | 343 +++++++++++
 drivers/hid/ipts/hid.h                        |  22 +
 drivers/hid/ipts/main.c                       | 127 +++++
 drivers/hid/ipts/mei.c                        | 189 +++++++
 drivers/hid/ipts/mei.h                        |  67 +++
 drivers/hid/ipts/receiver.c                   | 249 ++++++++
 drivers/hid/ipts/receiver.h                   |  17 +
 drivers/hid/ipts/resources.c                  | 135 +++++
 drivers/hid/ipts/resources.h                  |  42 ++
 drivers/hid/ipts/spec-data.h                  | 100 ++++
 drivers/hid/ipts/spec-device.h                | 285 ++++++++++
 drivers/hid/ipts/spec-hid.h                   |  35 ++
 drivers/hid/ipts/thread.c                     |  85 +++
 drivers/hid/ipts/thread.h                     |  60 ++
 drivers/hid/ithc/Kbuild                       |   6 +
 drivers/hid/ithc/Kconfig                      |  12 +
 drivers/hid/ithc/ithc-debug.c                 |  96 ++++
 drivers/hid/ithc/ithc-dma.c                   | 258 +++++++++
 drivers/hid/ithc/ithc-dma.h                   |  67 +++
 drivers/hid/ithc/ithc-main.c                  | 534 ++++++++++++++++++
 drivers/hid/ithc/ithc-regs.c                  |  64 +++
 drivers/hid/ithc/ithc-regs.h                  | 186 ++++++
 drivers/hid/ithc/ithc.h                       |  60 ++
 drivers/i2c/i2c-core-acpi.c                   |  35 ++
 drivers/input/misc/soc_button_array.c         |  33 +-
 drivers/iommu/intel/iommu.c                   |  54 ++
 drivers/iommu/intel/irq_remapping.c           |  16 +
 drivers/leds/Kconfig                          |  12 +
 drivers/leds/Makefile                         |   1 +
 drivers/leds/leds-tps68470.c                  | 185 ++++++
 drivers/media/i2c/Kconfig                     |  11 +
 drivers/media/i2c/Makefile                    |   1 +
 drivers/media/i2c/dw9719.c                    | 425 ++++++++++++++
 drivers/media/i2c/ov7251.c                    |   4 +-
 drivers/media/pci/intel/ipu3/ipu3-cio2-main.c |  56 +-
 drivers/media/v4l2-core/v4l2-async.c          |   4 +
 drivers/media/v4l2-core/v4l2-fwnode.c         |   4 -
 drivers/misc/mei/hw-me-regs.h                 |   1 +
 drivers/misc/mei/pci-me.c                     |   1 +
 drivers/net/wireless/ath/ath10k/core.c        |  58 ++
 drivers/net/wireless/marvell/mwifiex/pcie.c   |  19 +
 .../wireless/marvell/mwifiex/pcie_quirks.c    |  37 +-
 .../wireless/marvell/mwifiex/pcie_quirks.h    |   2 +
 drivers/pci/pci-driver.c                      |   3 +
 drivers/pci/quirks.c                          |  36 ++
 drivers/platform/surface/Kconfig              |   7 +
 drivers/platform/surface/Makefile             |   1 +
 drivers/platform/surface/surface3-wmi.c       |   7 +
 .../surface/surface_aggregator_registry.c     |   2 +-
 .../surface/surface_aggregator_tabletsw.c     | 185 ++++--
 drivers/platform/surface/surface_gpe.c        |  17 +
 .../surface/surfacebook1_dgpu_switch.c        | 162 ++++++
 drivers/platform/surface/surfacepro3_button.c |  30 +-
 drivers/platform/x86/intel/int3472/discrete.c |  14 +
 drivers/platform/x86/intel/int3472/tps68470.c |  12 +-
 drivers/usb/core/quirks.c                     |   3 +
 include/linux/mfd/tps68470.h                  |   5 +
 include/linux/pci.h                           |   1 +
 include/linux/surface_aggregator/device.h     |   6 +-
 sound/soc/codecs/rt5645.c                     |   9 +
 .../intel/common/soc-acpi-intel-cht-match.c   |   8 +
 77 files changed, 5542 insertions(+), 156 deletions(-)
 create mode 100644 drivers/hid/ipts/Kconfig
 create mode 100644 drivers/hid/ipts/Makefile
 create mode 100644 drivers/hid/ipts/cmd.c
 create mode 100644 drivers/hid/ipts/cmd.h
 create mode 100644 drivers/hid/ipts/context.h
 create mode 100644 drivers/hid/ipts/control.c
 create mode 100644 drivers/hid/ipts/control.h
 create mode 100644 drivers/hid/ipts/desc.h
 create mode 100644 drivers/hid/ipts/hid.c
 create mode 100644 drivers/hid/ipts/hid.h
 create mode 100644 drivers/hid/ipts/main.c
 create mode 100644 drivers/hid/ipts/mei.c
 create mode 100644 drivers/hid/ipts/mei.h
 create mode 100644 drivers/hid/ipts/receiver.c
 create mode 100644 drivers/hid/ipts/receiver.h
 create mode 100644 drivers/hid/ipts/resources.c
 create mode 100644 drivers/hid/ipts/resources.h
 create mode 100644 drivers/hid/ipts/spec-data.h
 create mode 100644 drivers/hid/ipts/spec-device.h
 create mode 100644 drivers/hid/ipts/spec-hid.h
 create mode 100644 drivers/hid/ipts/thread.c
 create mode 100644 drivers/hid/ipts/thread.h
 create mode 100644 drivers/hid/ithc/Kbuild
 create mode 100644 drivers/hid/ithc/Kconfig
 create mode 100644 drivers/hid/ithc/ithc-debug.c
 create mode 100644 drivers/hid/ithc/ithc-dma.c
 create mode 100644 drivers/hid/ithc/ithc-dma.h
 create mode 100644 drivers/hid/ithc/ithc-main.c
 create mode 100644 drivers/hid/ithc/ithc-regs.c
 create mode 100644 drivers/hid/ithc/ithc-regs.h
 create mode 100644 drivers/hid/ithc/ithc.h
 create mode 100644 drivers/leds/leds-tps68470.c
 create mode 100644 drivers/media/i2c/dw9719.c
 create mode 100644 drivers/platform/surface/surfacebook1_dgpu_switch.c

diff --git a/MAINTAINERS b/MAINTAINERS
index ee59cc25b910..363268ade255 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6237,6 +6237,13 @@ T:	git git://linuxtv.org/media_tree.git
 F:	Documentation/devicetree/bindings/media/i2c/dongwoon,dw9714.yaml
 F:	drivers/media/i2c/dw9714.c
 
+DONGWOON DW9719 LENS VOICE COIL DRIVER
+M:	Daniel Scally <djrscally@gmail.com>
+L:	linux-media@vger.kernel.org
+S:	Maintained
+T:	git git://linuxtv.org/media_tree.git
+F:	drivers/media/i2c/dw9719.c
+
 DONGWOON DW9768 LENS VOICE COIL DRIVER
 M:	Dongchun Zhu <dongchun.zhu@mediatek.com>
 L:	linux-media@vger.kernel.org
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 21b542a6866c..4e94eed20088 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -22,6 +22,7 @@
 #include <linux/efi-bgrt.h>
 #include <linux/serial_core.h>
 #include <linux/pgtable.h>
+#include <linux/dmi.h>
 
 #include <asm/e820/api.h>
 #include <asm/irqdomain.h>
@@ -1252,6 +1253,24 @@ static void __init mp_config_acpi_legacy_irqs(void)
 	}
 }
 
+static const struct dmi_system_id surface_quirk[] __initconst = {
+	{
+		.ident = "Microsoft Surface Laptop 4 (AMD 15\")",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Microsoft Corporation"),
+			DMI_MATCH(DMI_PRODUCT_SKU, "Surface_Laptop_4_1952:1953")
+		},
+	},
+	{
+		.ident = "Microsoft Surface Laptop 4 (AMD 13\")",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Microsoft Corporation"),
+			DMI_MATCH(DMI_PRODUCT_SKU, "Surface_Laptop_4_1958:1959")
+		},
+	},
+	{}
+};
+
 /*
  * Parse IOAPIC related entries in MADT
  * returns 0 on success, < 0 on error
@@ -1307,6 +1326,11 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 		acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0,
 				      acpi_gbl_FADT.sci_interrupt);
 
+	if (dmi_check_system(surface_quirk)) {
+		pr_warn("Surface hack: Override irq 7\n");
+		mp_override_legacy_irq(7, 3, 3, 7);
+	}
+
 	/* Fill in identity legacy mappings where no override */
 	mp_config_acpi_legacy_irqs();
 
diff --git a/drivers/acpi/acpi_tad.c b/drivers/acpi/acpi_tad.c
index e9b8e8305e23..944276934e7e 100644
--- a/drivers/acpi/acpi_tad.c
+++ b/drivers/acpi/acpi_tad.c
@@ -432,6 +432,14 @@ static ssize_t caps_show(struct device *dev, struct device_attribute *attr,
 
 static DEVICE_ATTR_RO(caps);
 
+static struct attribute *acpi_tad_attrs[] = {
+	&dev_attr_caps.attr,
+	NULL,
+};
+static const struct attribute_group acpi_tad_attr_group = {
+	.attrs	= acpi_tad_attrs,
+};
+
 static ssize_t ac_alarm_store(struct device *dev, struct device_attribute *attr,
 			      const char *buf, size_t count)
 {
@@ -480,15 +488,14 @@ static ssize_t ac_status_show(struct device *dev, struct device_attribute *attr,
 
 static DEVICE_ATTR_RW(ac_status);
 
-static struct attribute *acpi_tad_attrs[] = {
-	&dev_attr_caps.attr,
+static struct attribute *acpi_tad_ac_attrs[] = {
 	&dev_attr_ac_alarm.attr,
 	&dev_attr_ac_policy.attr,
 	&dev_attr_ac_status.attr,
 	NULL,
 };
-static const struct attribute_group acpi_tad_attr_group = {
-	.attrs	= acpi_tad_attrs,
+static const struct attribute_group acpi_tad_ac_attr_group = {
+	.attrs	= acpi_tad_ac_attrs,
 };
 
 static ssize_t dc_alarm_store(struct device *dev, struct device_attribute *attr,
@@ -563,13 +570,18 @@ static int acpi_tad_remove(struct platform_device *pdev)
 
 	pm_runtime_get_sync(dev);
 
+	if (dd->capabilities & ACPI_TAD_AC_WAKE)
+		sysfs_remove_group(&dev->kobj, &acpi_tad_ac_attr_group);
+
 	if (dd->capabilities & ACPI_TAD_DC_WAKE)
 		sysfs_remove_group(&dev->kobj, &acpi_tad_dc_attr_group);
 
 	sysfs_remove_group(&dev->kobj, &acpi_tad_attr_group);
 
-	acpi_tad_disable_timer(dev, ACPI_TAD_AC_TIMER);
-	acpi_tad_clear_status(dev, ACPI_TAD_AC_TIMER);
+	if (dd->capabilities & ACPI_TAD_AC_WAKE) {
+		acpi_tad_disable_timer(dev, ACPI_TAD_AC_TIMER);
+		acpi_tad_clear_status(dev, ACPI_TAD_AC_TIMER);
+	}
 	if (dd->capabilities & ACPI_TAD_DC_WAKE) {
 		acpi_tad_disable_timer(dev, ACPI_TAD_DC_TIMER);
 		acpi_tad_clear_status(dev, ACPI_TAD_DC_TIMER);
@@ -604,11 +616,6 @@ static int acpi_tad_probe(struct platform_device *pdev)
 		return -ENODEV;
 	}
 
-	if (!acpi_has_method(handle, "_PRW")) {
-		dev_info(dev, "Missing _PRW\n");
-		return -ENODEV;
-	}
-
 	dd = devm_kzalloc(dev, sizeof(*dd), GFP_KERNEL);
 	if (!dd)
 		return -ENOMEM;
@@ -637,6 +644,12 @@ static int acpi_tad_probe(struct platform_device *pdev)
 	if (ret)
 		goto fail;
 
+	if (caps & ACPI_TAD_AC_WAKE) {
+		ret = sysfs_create_group(&dev->kobj, &acpi_tad_ac_attr_group);
+		if (ret)
+			goto fail;
+	}
+
 	if (caps & ACPI_TAD_DC_WAKE) {
 		ret = sysfs_create_group(&dev->kobj, &acpi_tad_dc_attr_group);
 		if (ret)
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 0c6f06abe3f4..4fc320f424e8 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -2106,6 +2106,9 @@ static acpi_status acpi_bus_check_add_2(acpi_handle handle, u32 lvl_not_used,
 
 static void acpi_default_enumeration(struct acpi_device *device)
 {
+	if (!acpi_dev_ready_for_enumeration(device))
+		return;
+
 	/*
 	 * Do not enumerate devices with enumeration_by_parent flag set as
 	 * they will be enumerated by their respective parents.
diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index d40de705b9ed..f179601f1a6e 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -65,6 +65,7 @@ static struct usb_driver btusb_driver;
 #define BTUSB_INTEL_BROKEN_INITIAL_NCMD BIT(25)
 #define BTUSB_INTEL_NO_WBS_SUPPORT	BIT(26)
 #define BTUSB_ACTIONS_SEMI		BIT(27)
+#define BTUSB_LOWER_LESCAN_INTERVAL	BIT(28)
 
 static const struct usb_device_id btusb_table[] = {
 	/* Generic Bluetooth USB device */
@@ -468,6 +469,7 @@ static const struct usb_device_id blacklist_table[] = {
 	{ USB_DEVICE(0x1286, 0x2044), .driver_info = BTUSB_MARVELL },
 	{ USB_DEVICE(0x1286, 0x2046), .driver_info = BTUSB_MARVELL },
 	{ USB_DEVICE(0x1286, 0x204e), .driver_info = BTUSB_MARVELL },
+	{ USB_DEVICE(0x1286, 0x204c), .driver_info = BTUSB_LOWER_LESCAN_INTERVAL },
 
 	/* Intel Bluetooth devices */
 	{ USB_DEVICE(0x8087, 0x0025), .driver_info = BTUSB_INTEL_COMBINED },
@@ -4036,6 +4038,19 @@ static int btusb_probe(struct usb_interface *intf,
 	if (id->driver_info & BTUSB_MARVELL)
 		hdev->set_bdaddr = btusb_set_bdaddr_marvell;
 
+	/* The Marvell 88W8897 combined wifi and bluetooth card is known for
+	 * very bad bt+wifi coexisting performance.
+	 *
+	 * Decrease the passive BT Low Energy scan interval a bit
+	 * (0x0190 * 0.625 msec = 250 msec) and make the scan window shorter
+	 * (0x000a * 0,625 msec = 6.25 msec). This allows for significantly
+	 * higher wifi throughput while passively scanning for BT LE devices.
+	 */
+	if (id->driver_info & BTUSB_LOWER_LESCAN_INTERVAL) {
+		hdev->le_scan_interval = 0x0190;
+		hdev->le_scan_window = 0x000a;
+	}
+
 	if (IS_ENABLED(CONFIG_BT_HCIBTUSB_MTK) &&
 	    (id->driver_info & BTUSB_MEDIATEK)) {
 		hdev->setup = btusb_mtk_setup;
diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig
index 4ce012f83253..91e9e4f58c27 100644
--- a/drivers/hid/Kconfig
+++ b/drivers/hid/Kconfig
@@ -1316,4 +1316,8 @@ source "drivers/hid/amd-sfh-hid/Kconfig"
 
 source "drivers/hid/surface-hid/Kconfig"
 
+source "drivers/hid/ipts/Kconfig"
+
+source "drivers/hid/ithc/Kconfig"
+
 endif # HID_SUPPORT
diff --git a/drivers/hid/Makefile b/drivers/hid/Makefile
index 5d37cacbde33..a3ff62e922f1 100644
--- a/drivers/hid/Makefile
+++ b/drivers/hid/Makefile
@@ -167,3 +167,6 @@ obj-$(INTEL_ISH_FIRMWARE_DOWNLOADER)	+= intel-ish-hid/
 obj-$(CONFIG_AMD_SFH_HID)       += amd-sfh-hid/
 
 obj-$(CONFIG_SURFACE_HID_CORE)  += surface-hid/
+
+obj-$(CONFIG_HID_IPTS)          += ipts/
+obj-$(CONFIG_HID_ITHC)          += ithc/
diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c
index e31be0cb8b85..508a250ff4bf 100644
--- a/drivers/hid/hid-multitouch.c
+++ b/drivers/hid/hid-multitouch.c
@@ -34,7 +34,10 @@
 #include <linux/device.h>
 #include <linux/hid.h>
 #include <linux/module.h>
+#include <linux/pm_runtime.h>
 #include <linux/slab.h>
+#include <linux/suspend.h>
+#include <linux/usb.h>
 #include <linux/input/mt.h>
 #include <linux/jiffies.h>
 #include <linux/string.h>
@@ -47,6 +50,7 @@ MODULE_DESCRIPTION("HID multitouch panels");
 MODULE_LICENSE("GPL");
 
 #include "hid-ids.h"
+#include "usbhid/usbhid.h"
 
 /* quirks to control the device */
 #define MT_QUIRK_NOT_SEEN_MEANS_UP	BIT(0)
@@ -72,12 +76,18 @@ MODULE_LICENSE("GPL");
 #define MT_QUIRK_FORCE_MULTI_INPUT	BIT(20)
 #define MT_QUIRK_DISABLE_WAKEUP		BIT(21)
 #define MT_QUIRK_ORIENTATION_INVERT	BIT(22)
+#define MT_QUIRK_HAS_TYPE_COVER_BACKLIGHT	BIT(23)
+#define MT_QUIRK_HAS_TYPE_COVER_TABLET_MODE_SWITCH	BIT(24)
 
 #define MT_INPUTMODE_TOUCHSCREEN	0x02
 #define MT_INPUTMODE_TOUCHPAD		0x03
 
 #define MT_BUTTONTYPE_CLICKPAD		0
 
+#define MS_TYPE_COVER_FEATURE_REPORT_USAGE	0xff050086
+#define MS_TYPE_COVER_TABLET_MODE_SWITCH_USAGE	0xff050072
+#define MS_TYPE_COVER_APPLICATION	0xff050050
+
 enum latency_mode {
 	HID_LATENCY_NORMAL = 0,
 	HID_LATENCY_HIGH = 1,
@@ -169,6 +179,8 @@ struct mt_device {
 
 	struct list_head applications;
 	struct list_head reports;
+
+	struct notifier_block pm_notifier;
 };
 
 static void mt_post_parse_default_settings(struct mt_device *td,
@@ -213,6 +225,7 @@ static void mt_post_parse(struct mt_device *td, struct mt_application *app);
 #define MT_CLS_GOOGLE				0x0111
 #define MT_CLS_RAZER_BLADE_STEALTH		0x0112
 #define MT_CLS_SMART_TECH			0x0113
+#define MT_CLS_WIN_8_MS_SURFACE_TYPE_COVER	0x0114
 
 #define MT_DEFAULT_MAXCONTACT	10
 #define MT_MAX_MAXCONTACT	250
@@ -397,6 +410,17 @@ static const struct mt_class mt_classes[] = {
 			MT_QUIRK_CONTACT_CNT_ACCURATE |
 			MT_QUIRK_SEPARATE_APP_REPORT,
 	},
+	{ .name = MT_CLS_WIN_8_MS_SURFACE_TYPE_COVER,
+		.quirks = MT_QUIRK_HAS_TYPE_COVER_BACKLIGHT |
+			MT_QUIRK_HAS_TYPE_COVER_TABLET_MODE_SWITCH |
+			MT_QUIRK_ALWAYS_VALID |
+			MT_QUIRK_IGNORE_DUPLICATES |
+			MT_QUIRK_HOVERING |
+			MT_QUIRK_CONTACT_CNT_ACCURATE |
+			MT_QUIRK_STICKY_FINGERS |
+			MT_QUIRK_WIN8_PTP_BUTTONS,
+		.export_all_inputs = true
+	},
 	{ }
 };
 
@@ -1370,6 +1394,9 @@ static int mt_input_mapping(struct hid_device *hdev, struct hid_input *hi,
 	    field->application != HID_CP_CONSUMER_CONTROL &&
 	    field->application != HID_GD_WIRELESS_RADIO_CTLS &&
 	    field->application != HID_GD_SYSTEM_MULTIAXIS &&
+	    !(field->application == MS_TYPE_COVER_APPLICATION &&
+	      application->quirks & MT_QUIRK_HAS_TYPE_COVER_TABLET_MODE_SWITCH &&
+	      usage->hid == MS_TYPE_COVER_TABLET_MODE_SWITCH_USAGE) &&
 	    !(field->application == HID_VD_ASUS_CUSTOM_MEDIA_KEYS &&
 	      application->quirks & MT_QUIRK_ASUS_CUSTOM_UP))
 		return -1;
@@ -1397,6 +1424,21 @@ static int mt_input_mapping(struct hid_device *hdev, struct hid_input *hi,
 		return 1;
 	}
 
+	/*
+	 * The Microsoft Surface Pro Typecover has a non-standard HID
+	 * tablet mode switch on a vendor specific usage page with vendor
+	 * specific usage.
+	 */
+	if (field->application == MS_TYPE_COVER_APPLICATION &&
+	    application->quirks & MT_QUIRK_HAS_TYPE_COVER_TABLET_MODE_SWITCH &&
+	    usage->hid == MS_TYPE_COVER_TABLET_MODE_SWITCH_USAGE) {
+		usage->type = EV_SW;
+		usage->code = SW_TABLET_MODE;
+		*max = SW_MAX;
+		*bit = hi->input->swbit;
+		return 1;
+	}
+
 	if (rdata->is_mt_collection)
 		return mt_touch_input_mapping(hdev, hi, field, usage, bit, max,
 					      application);
@@ -1418,6 +1460,7 @@ static int mt_input_mapped(struct hid_device *hdev, struct hid_input *hi,
 {
 	struct mt_device *td = hid_get_drvdata(hdev);
 	struct mt_report_data *rdata;
+	struct input_dev *input;
 
 	rdata = mt_find_report_data(td, field->report);
 	if (rdata && rdata->is_mt_collection) {
@@ -1425,6 +1468,19 @@ static int mt_input_mapped(struct hid_device *hdev, struct hid_input *hi,
 		return -1;
 	}
 
+	/*
+	 * We own an input device which acts as a tablet mode switch for
+	 * the Surface Pro Typecover.
+	 */
+	if (field->application == MS_TYPE_COVER_APPLICATION &&
+	    rdata->application->quirks & MT_QUIRK_HAS_TYPE_COVER_TABLET_MODE_SWITCH &&
+	    usage->hid == MS_TYPE_COVER_TABLET_MODE_SWITCH_USAGE) {
+		input = hi->input;
+		input_set_capability(input, EV_SW, SW_TABLET_MODE);
+		input_report_switch(input, SW_TABLET_MODE, 0);
+		return -1;
+	}
+
 	/* let hid-core decide for the others */
 	return 0;
 }
@@ -1434,11 +1490,21 @@ static int mt_event(struct hid_device *hid, struct hid_field *field,
 {
 	struct mt_device *td = hid_get_drvdata(hid);
 	struct mt_report_data *rdata;
+	struct input_dev *input;
 
 	rdata = mt_find_report_data(td, field->report);
 	if (rdata && rdata->is_mt_collection)
 		return mt_touch_event(hid, field, usage, value);
 
+	if (field->application == MS_TYPE_COVER_APPLICATION &&
+	    rdata->application->quirks & MT_QUIRK_HAS_TYPE_COVER_TABLET_MODE_SWITCH &&
+	    usage->hid == MS_TYPE_COVER_TABLET_MODE_SWITCH_USAGE) {
+		input = field->hidinput->input;
+		input_report_switch(input, SW_TABLET_MODE, (value & 0xFF) != 0x22);
+		input_sync(input);
+		return 1;
+	}
+
 	return 0;
 }
 
@@ -1591,6 +1657,42 @@ static void mt_post_parse(struct mt_device *td, struct mt_application *app)
 		app->quirks &= ~MT_QUIRK_CONTACT_CNT_ACCURATE;
 }
 
+static int get_type_cover_field(struct hid_report_enum *rep_enum,
+				struct hid_field **field, int usage)
+{
+	struct hid_report *rep;
+	struct hid_field *cur_field;
+	int i, j;
+
+	list_for_each_entry(rep, &rep_enum->report_list, list) {
+		for (i = 0; i < rep->maxfield; i++) {
+			cur_field = rep->field[i];
+			if (cur_field->application != MS_TYPE_COVER_APPLICATION)
+				continue;
+			for (j = 0; j < cur_field->maxusage; j++) {
+				if (cur_field->usage[j].hid == usage) {
+					*field = cur_field;
+					return true;
+				}
+			}
+		}
+	}
+	return false;
+}
+
+static void request_type_cover_tablet_mode_switch(struct hid_device *hdev)
+{
+	struct hid_field *field;
+
+	if (get_type_cover_field(&hdev->report_enum[HID_INPUT_REPORT],
+				 &field,
+				 MS_TYPE_COVER_TABLET_MODE_SWITCH_USAGE)) {
+		hid_hw_request(hdev, field->report, HID_REQ_GET_REPORT);
+	} else {
+		hid_err(hdev, "couldn't find tablet mode field\n");
+	}
+}
+
 static int mt_input_configured(struct hid_device *hdev, struct hid_input *hi)
 {
 	struct mt_device *td = hid_get_drvdata(hdev);
@@ -1640,6 +1742,13 @@ static int mt_input_configured(struct hid_device *hdev, struct hid_input *hi)
 		/* force BTN_STYLUS to allow tablet matching in udev */
 		__set_bit(BTN_STYLUS, hi->input->keybit);
 		break;
+	case MS_TYPE_COVER_APPLICATION:
+		if (td->mtclass.quirks & MT_QUIRK_HAS_TYPE_COVER_TABLET_MODE_SWITCH) {
+			suffix = "Tablet Mode Switch";
+			request_type_cover_tablet_mode_switch(hdev);
+			break;
+		}
+		fallthrough;
 	default:
 		suffix = "UNKNOWN";
 		break;
@@ -1728,6 +1837,46 @@ static void mt_expired_timeout(struct timer_list *t)
 	clear_bit_unlock(MT_IO_FLAGS_RUNNING, &td->mt_io_flags);
 }
 
+static void update_keyboard_backlight(struct hid_device *hdev, bool enabled)
+{
+	struct usb_device *udev = hid_to_usb_dev(hdev);
+	struct hid_field *field = NULL;
+
+	/* Wake up the device in case it's already suspended */
+	pm_runtime_get_sync(&udev->dev);
+
+	if (!get_type_cover_field(&hdev->report_enum[HID_FEATURE_REPORT],
+				  &field,
+				  MS_TYPE_COVER_FEATURE_REPORT_USAGE)) {
+		hid_err(hdev, "couldn't find backlight field\n");
+		goto out;
+	}
+
+	field->value[field->index] = enabled ? 0x01ff00ff : 0x00ff00ff;
+	hid_hw_request(hdev, field->report, HID_REQ_SET_REPORT);
+
+out:
+	pm_runtime_put_sync(&udev->dev);
+}
+
+static int mt_pm_notifier(struct notifier_block *notifier,
+			  unsigned long pm_event,
+			  void *unused)
+{
+	struct mt_device *td =
+		container_of(notifier, struct mt_device, pm_notifier);
+	struct hid_device *hdev = td->hdev;
+
+	if (td->mtclass.quirks & MT_QUIRK_HAS_TYPE_COVER_BACKLIGHT) {
+		if (pm_event == PM_SUSPEND_PREPARE)
+			update_keyboard_backlight(hdev, 0);
+		else if (pm_event == PM_POST_SUSPEND)
+			update_keyboard_backlight(hdev, 1);
+	}
+
+	return NOTIFY_DONE;
+}
+
 static int mt_probe(struct hid_device *hdev, const struct hid_device_id *id)
 {
 	int ret, i;
@@ -1751,6 +1900,9 @@ static int mt_probe(struct hid_device *hdev, const struct hid_device_id *id)
 	td->inputmode_value = MT_INPUTMODE_TOUCHSCREEN;
 	hid_set_drvdata(hdev, td);
 
+	td->pm_notifier.notifier_call = mt_pm_notifier;
+	register_pm_notifier(&td->pm_notifier);
+
 	INIT_LIST_HEAD(&td->applications);
 	INIT_LIST_HEAD(&td->reports);
 
@@ -1789,15 +1941,19 @@ static int mt_probe(struct hid_device *hdev, const struct hid_device_id *id)
 	timer_setup(&td->release_timer, mt_expired_timeout, 0);
 
 	ret = hid_parse(hdev);
-	if (ret != 0)
+	if (ret != 0) {
+		unregister_pm_notifier(&td->pm_notifier);
 		return ret;
+	}
 
 	if (mtclass->quirks & MT_QUIRK_FIX_CONST_CONTACT_ID)
 		mt_fix_const_fields(hdev, HID_DG_CONTACTID);
 
 	ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT);
-	if (ret)
+	if (ret) {
+		unregister_pm_notifier(&td->pm_notifier);
 		return ret;
+	}
 
 	ret = sysfs_create_group(&hdev->dev.kobj, &mt_attribute_group);
 	if (ret)
@@ -1826,13 +1982,24 @@ static int mt_suspend(struct hid_device *hdev, pm_message_t state)
 
 static int mt_reset_resume(struct hid_device *hdev)
 {
+	struct mt_device *td = hid_get_drvdata(hdev);
+
 	mt_release_contacts(hdev);
 	mt_set_modes(hdev, HID_LATENCY_NORMAL, true, true);
+
+	/* Request an update on the typecover folding state on resume
+	 * after reset.
+	 */
+	if (td->mtclass.quirks & MT_QUIRK_HAS_TYPE_COVER_TABLET_MODE_SWITCH)
+		request_type_cover_tablet_mode_switch(hdev);
+
 	return 0;
 }
 
 static int mt_resume(struct hid_device *hdev)
 {
+	struct mt_device *td = hid_get_drvdata(hdev);
+
 	/* Some Elan legacy devices require SET_IDLE to be set on resume.
 	 * It should be safe to send it to other devices too.
 	 * Tested on 3M, Stantum, Cypress, Zytronic, eGalax, and Elan panels. */
@@ -1841,6 +2008,10 @@ static int mt_resume(struct hid_device *hdev)
 
 	mt_set_modes(hdev, HID_LATENCY_NORMAL, true, true);
 
+	/* Request an update on the typecover folding state on resume. */
+	if (td->mtclass.quirks & MT_QUIRK_HAS_TYPE_COVER_TABLET_MODE_SWITCH)
+		request_type_cover_tablet_mode_switch(hdev);
+
 	return 0;
 }
 #endif
@@ -1848,7 +2019,23 @@ static int mt_resume(struct hid_device *hdev)
 static void mt_remove(struct hid_device *hdev)
 {
 	struct mt_device *td = hid_get_drvdata(hdev);
+	struct hid_field *field;
+	struct input_dev *input;
 
+	/* Reset tablet mode switch on disconnect. */
+	if (td->mtclass.quirks & MT_QUIRK_HAS_TYPE_COVER_TABLET_MODE_SWITCH) {
+		if (get_type_cover_field(&hdev->report_enum[HID_INPUT_REPORT],
+					 &field,
+					 MS_TYPE_COVER_TABLET_MODE_SWITCH_USAGE)) {
+			input = field->hidinput->input;
+			input_report_switch(input, SW_TABLET_MODE, 0);
+			input_sync(input);
+		} else {
+			hid_err(hdev, "couldn't find tablet mode field\n");
+		}
+	}
+
+	unregister_pm_notifier(&td->pm_notifier);
 	del_timer_sync(&td->release_timer);
 
 	sysfs_remove_group(&hdev->dev.kobj, &mt_attribute_group);
@@ -2226,6 +2413,11 @@ static const struct hid_device_id mt_devices[] = {
 		MT_USB_DEVICE(USB_VENDOR_ID_XIROKU,
 			USB_DEVICE_ID_XIROKU_CSR2) },
 
+	/* Microsoft Surface type cover */
+	{ .driver_data = MT_CLS_WIN_8_MS_SURFACE_TYPE_COVER,
+		HID_DEVICE(HID_BUS_ANY, HID_GROUP_ANY,
+			USB_VENDOR_ID_MICROSOFT, 0x09c0) },
+
 	/* Google MT devices */
 	{ .driver_data = MT_CLS_GOOGLE,
 		HID_DEVICE(HID_BUS_ANY, HID_GROUP_ANY, USB_VENDOR_ID_GOOGLE,
diff --git a/drivers/hid/ipts/Kconfig b/drivers/hid/ipts/Kconfig
new file mode 100644
index 000000000000..297401bd388d
--- /dev/null
+++ b/drivers/hid/ipts/Kconfig
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+config HID_IPTS
+	tristate "Intel Precise Touch & Stylus"
+	depends on INTEL_MEI
+	depends on HID
+	help
+	  Say Y here if your system has a touchscreen using Intels
+	  Precise Touch & Stylus (IPTS) technology.
+
+	  If unsure say N.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called ipts.
diff --git a/drivers/hid/ipts/Makefile b/drivers/hid/ipts/Makefile
new file mode 100644
index 000000000000..0fe655bccdc0
--- /dev/null
+++ b/drivers/hid/ipts/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# Makefile for the IPTS touchscreen driver
+#
+
+obj-$(CONFIG_HID_IPTS) += ipts.o
+ipts-objs := cmd.o
+ipts-objs += control.o
+ipts-objs += hid.o
+ipts-objs += main.o
+ipts-objs += mei.o
+ipts-objs += receiver.o
+ipts-objs += resources.o
+ipts-objs += thread.o
diff --git a/drivers/hid/ipts/cmd.c b/drivers/hid/ipts/cmd.c
new file mode 100644
index 000000000000..7fd69271ccd5
--- /dev/null
+++ b/drivers/hid/ipts/cmd.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2020-2023 Dorian Stoll
+ *
+ * Linux driver for Intel Precise Touch & Stylus
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+
+#include "cmd.h"
+#include "context.h"
+#include "mei.h"
+#include "spec-device.h"
+
+int ipts_cmd_recv_timeout(struct ipts_context *ipts, enum ipts_command_code code,
+			  struct ipts_response *rsp, u64 timeout)
+{
+	int ret = 0;
+
+	if (!ipts)
+		return -EFAULT;
+
+	if (!rsp)
+		return -EFAULT;
+
+	/*
+	 * In a response, the command code will have the most significant bit flipped to 1.
+	 * If code is passed to ipts_mei_recv as is, no messages will be received.
+	 */
+	ret = ipts_mei_recv(&ipts->mei, code | IPTS_RSP_BIT, rsp, timeout);
+	if (ret < 0)
+		return ret;
+
+	dev_dbg(ipts->dev, "Received 0x%02X with status 0x%02X\n", code, rsp->status);
+
+	/*
+	 * Some devices will always return this error.
+	 * It is allowed to ignore it and to try continuing.
+	 */
+	if (rsp->status == IPTS_STATUS_COMPAT_CHECK_FAIL)
+		rsp->status = IPTS_STATUS_SUCCESS;
+
+	return 0;
+}
+
+int ipts_cmd_send(struct ipts_context *ipts, enum ipts_command_code code, void *data, size_t size)
+{
+	struct ipts_command cmd = { 0 };
+
+	if (!ipts)
+		return -EFAULT;
+
+	cmd.cmd = code;
+
+	if (data && size > 0)
+		memcpy(cmd.payload, data, size);
+
+	dev_dbg(ipts->dev, "Sending 0x%02X with %ld bytes payload\n", code, size);
+	return ipts_mei_send(&ipts->mei, &cmd, sizeof(cmd.cmd) + size);
+}
diff --git a/drivers/hid/ipts/cmd.h b/drivers/hid/ipts/cmd.h
new file mode 100644
index 000000000000..924758ffee67
--- /dev/null
+++ b/drivers/hid/ipts/cmd.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2020-2023 Dorian Stoll
+ *
+ * Linux driver for Intel Precise Touch & Stylus
+ */
+
+#ifndef IPTS_CMD_H
+#define IPTS_CMD_H
+
+#include <linux/types.h>
+
+#include "context.h"
+#include "spec-device.h"
+
+/*
+ * The default timeout for receiving responses
+ */
+#define IPTS_CMD_DEFAULT_TIMEOUT 1000
+
+/*
+ * ipts_cmd_recv_timeout() - Receives a response to a command.
+ * @ipts: The IPTS driver context.
+ * @code: The type of the command / response.
+ * @rsp: The address that the received response will be copied to.
+ * @timeout: How many milliseconds the function will wait at most.
+ *
+ * A negative timeout means to wait forever.
+ *
+ * Returns: 0 on success, <0 on error, -EAGAIN if no response has been received.
+ */
+int ipts_cmd_recv_timeout(struct ipts_context *ipts, enum ipts_command_code code,
+			  struct ipts_response *rsp, u64 timeout);
+
+/*
+ * ipts_cmd_recv() - Receives a response to a command.
+ * @ipts: The IPTS driver context.
+ * @code: The type of the command / response.
+ * @rsp: The address that the received response will be copied to.
+ *
+ * Returns: 0 on success, <0 on error, -EAGAIN if no response has been received.
+ */
+static inline int ipts_cmd_recv(struct ipts_context *ipts, enum ipts_command_code code,
+				struct ipts_response *rsp)
+{
+	return ipts_cmd_recv_timeout(ipts, code, rsp, IPTS_CMD_DEFAULT_TIMEOUT);
+}
+
+/*
+ * ipts_cmd_send() - Executes a command on the device.
+ * @ipts: The IPTS driver context.
+ * @code: The type of the command to execute.
+ * @data: The payload containing parameters for the command.
+ * @size: The size of the payload.
+ *
+ * Returns: 0 on success, <0 on error.
+ */
+int ipts_cmd_send(struct ipts_context *ipts, enum ipts_command_code code, void *data, size_t size);
+
+#endif /* IPTS_CMD_H */
diff --git a/drivers/hid/ipts/context.h b/drivers/hid/ipts/context.h
new file mode 100644
index 000000000000..3450a95e66ee
--- /dev/null
+++ b/drivers/hid/ipts/context.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2020-2023 Dorian Stoll
+ *
+ * Linux driver for Intel Precise Touch & Stylus
+ */
+
+#ifndef IPTS_CONTEXT_H
+#define IPTS_CONTEXT_H
+
+#include <linux/completion.h>
+#include <linux/device.h>
+#include <linux/hid.h>
+#include <linux/mei_cl_bus.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+#include "mei.h"
+#include "resources.h"
+#include "spec-device.h"
+#include "thread.h"
+
+struct ipts_context {
+	struct device *dev;
+	struct ipts_mei mei;
+
+	enum ipts_mode mode;
+
+	/*
+	 * Prevents concurrent GET_FEATURE reports.
+	 */
+	struct mutex feature_lock;
+	struct completion feature_event;
+
+	/*
+	 * These are not inside of struct ipts_resources
+	 * because they don't own the memory they point to.
+	 */
+	struct ipts_buffer feature_report;
+	struct ipts_buffer descriptor;
+
+	struct hid_device *hid;
+	struct ipts_device_info info;
+	struct ipts_resources resources;
+
+	struct ipts_thread receiver_loop;
+};
+
+#endif /* IPTS_CONTEXT_H */
diff --git a/drivers/hid/ipts/control.c b/drivers/hid/ipts/control.c
new file mode 100644
index 000000000000..2f61500b5119
--- /dev/null
+++ b/drivers/hid/ipts/control.c
@@ -0,0 +1,495 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2020-2023 Dorian Stoll
+ *
+ * Linux driver for Intel Precise Touch & Stylus
+ */
+
+#include <linux/delay.h>
+#include <linux/dev_printk.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+
+#include "cmd.h"
+#include "context.h"
+#include "control.h"
+#include "desc.h"
+#include "hid.h"
+#include "receiver.h"
+#include "resources.h"
+#include "spec-data.h"
+#include "spec-device.h"
+
+static int ipts_control_get_device_info(struct ipts_context *ipts, struct ipts_device_info *info)
+{
+	int ret = 0;
+	struct ipts_response rsp = { 0 };
+
+	if (!ipts)
+		return -EFAULT;
+
+	if (!info)
+		return -EFAULT;
+
+	ret = ipts_cmd_send(ipts, IPTS_CMD_GET_DEVICE_INFO, NULL, 0);
+	if (ret) {
+		dev_err(ipts->dev, "GET_DEVICE_INFO: send failed: %d\n", ret);
+		return ret;
+	}
+
+	ret = ipts_cmd_recv(ipts, IPTS_CMD_GET_DEVICE_INFO, &rsp);
+	if (ret) {
+		dev_err(ipts->dev, "GET_DEVICE_INFO: recv failed: %d\n", ret);
+		return ret;
+	}
+
+	if (rsp.status != IPTS_STATUS_SUCCESS) {
+		dev_err(ipts->dev, "GET_DEVICE_INFO: cmd failed: %d\n", rsp.status);
+		return -EBADR;
+	}
+
+	memcpy(info, rsp.payload, sizeof(*info));
+	return 0;
+}
+
+static int ipts_control_set_mode(struct ipts_context *ipts, enum ipts_mode mode)
+{
+	int ret = 0;
+	struct ipts_set_mode cmd = { 0 };
+	struct ipts_response rsp = { 0 };
+
+	if (!ipts)
+		return -EFAULT;
+
+	cmd.mode = mode;
+
+	ret = ipts_cmd_send(ipts, IPTS_CMD_SET_MODE, &cmd, sizeof(cmd));
+	if (ret) {
+		dev_err(ipts->dev, "SET_MODE: send failed: %d\n", ret);
+		return ret;
+	}
+
+	ret = ipts_cmd_recv(ipts, IPTS_CMD_SET_MODE, &rsp);
+	if (ret) {
+		dev_err(ipts->dev, "SET_MODE: recv failed: %d\n", ret);
+		return ret;
+	}
+
+	if (rsp.status != IPTS_STATUS_SUCCESS) {
+		dev_err(ipts->dev, "SET_MODE: cmd failed: %d\n", rsp.status);
+		return -EBADR;
+	}
+
+	return 0;
+}
+
+static int ipts_control_set_mem_window(struct ipts_context *ipts, struct ipts_resources *res)
+{
+	int ret = 0;
+	struct ipts_mem_window cmd = { 0 };
+	struct ipts_response rsp = { 0 };
+
+	if (!ipts)
+		return -EFAULT;
+
+	if (!res)
+		return -EFAULT;
+
+	for (int i = 0; i < IPTS_BUFFERS; i++) {
+		cmd.data_addr_lower[i] = lower_32_bits(res->data[i].dma_address);
+		cmd.data_addr_upper[i] = upper_32_bits(res->data[i].dma_address);
+		cmd.feedback_addr_lower[i] = lower_32_bits(res->feedback[i].dma_address);
+		cmd.feedback_addr_upper[i] = upper_32_bits(res->feedback[i].dma_address);
+	}
+
+	cmd.workqueue_addr_lower = lower_32_bits(res->workqueue.dma_address);
+	cmd.workqueue_addr_upper = upper_32_bits(res->workqueue.dma_address);
+
+	cmd.doorbell_addr_lower = lower_32_bits(res->doorbell.dma_address);
+	cmd.doorbell_addr_upper = upper_32_bits(res->doorbell.dma_address);
+
+	cmd.hid2me_addr_lower = lower_32_bits(res->hid2me.dma_address);
+	cmd.hid2me_addr_upper = upper_32_bits(res->hid2me.dma_address);
+
+	cmd.workqueue_size = IPTS_WORKQUEUE_SIZE;
+	cmd.workqueue_item_size = IPTS_WORKQUEUE_ITEM_SIZE;
+
+	ret = ipts_cmd_send(ipts, IPTS_CMD_SET_MEM_WINDOW, &cmd, sizeof(cmd));
+	if (ret) {
+		dev_err(ipts->dev, "SET_MEM_WINDOW: send failed: %d\n", ret);
+		return ret;
+	}
+
+	ret = ipts_cmd_recv(ipts, IPTS_CMD_SET_MEM_WINDOW, &rsp);
+	if (ret) {
+		dev_err(ipts->dev, "SET_MEM_WINDOW: recv failed: %d\n", ret);
+		return ret;
+	}
+
+	if (rsp.status != IPTS_STATUS_SUCCESS) {
+		dev_err(ipts->dev, "SET_MEM_WINDOW: cmd failed: %d\n", rsp.status);
+		return -EBADR;
+	}
+
+	return 0;
+}
+
+static int ipts_control_get_descriptor(struct ipts_context *ipts)
+{
+	int ret = 0;
+	struct ipts_data_header *header = NULL;
+	struct ipts_get_descriptor cmd = { 0 };
+	struct ipts_response rsp = { 0 };
+
+	if (!ipts)
+		return -EFAULT;
+
+	if (!ipts->resources.descriptor.address)
+		return -EFAULT;
+
+	memset(ipts->resources.descriptor.address, 0, ipts->resources.descriptor.size);
+
+	cmd.addr_lower = lower_32_bits(ipts->resources.descriptor.dma_address);
+	cmd.addr_upper = upper_32_bits(ipts->resources.descriptor.dma_address);
+	cmd.magic = 8;
+
+	ret = ipts_cmd_send(ipts, IPTS_CMD_GET_DESCRIPTOR, &cmd, sizeof(cmd));
+	if (ret) {
+		dev_err(ipts->dev, "GET_DESCRIPTOR: send failed: %d\n", ret);
+		return ret;
+	}
+
+	ret = ipts_cmd_recv(ipts, IPTS_CMD_GET_DESCRIPTOR, &rsp);
+	if (ret) {
+		dev_err(ipts->dev, "GET_DESCRIPTOR: recv failed: %d\n", ret);
+		return ret;
+	}
+
+	if (rsp.status != IPTS_STATUS_SUCCESS) {
+		dev_err(ipts->dev, "GET_DESCRIPTOR: cmd failed: %d\n", rsp.status);
+		return -EBADR;
+	}
+
+	header = (struct ipts_data_header *)ipts->resources.descriptor.address;
+
+	if (header->type == IPTS_DATA_TYPE_DESCRIPTOR) {
+		ipts->descriptor.address = &header->data[8];
+		ipts->descriptor.size = header->size - 8;
+
+		return 0;
+	}
+
+	return -ENODATA;
+}
+
+int ipts_control_request_flush(struct ipts_context *ipts)
+{
+	int ret = 0;
+	struct ipts_quiesce_io cmd = { 0 };
+
+	if (!ipts)
+		return -EFAULT;
+
+	ret = ipts_cmd_send(ipts, IPTS_CMD_QUIESCE_IO, &cmd, sizeof(cmd));
+	if (ret)
+		dev_err(ipts->dev, "QUIESCE_IO: send failed: %d\n", ret);
+
+	return ret;
+}
+
+int ipts_control_wait_flush(struct ipts_context *ipts)
+{
+	int ret = 0;
+	struct ipts_response rsp = { 0 };
+
+	if (!ipts)
+		return -EFAULT;
+
+	ret = ipts_cmd_recv(ipts, IPTS_CMD_QUIESCE_IO, &rsp);
+	if (ret) {
+		dev_err(ipts->dev, "QUIESCE_IO: recv failed: %d\n", ret);
+		return ret;
+	}
+
+	if (rsp.status == IPTS_STATUS_TIMEOUT)
+		return -EAGAIN;
+
+	if (rsp.status != IPTS_STATUS_SUCCESS) {
+		dev_err(ipts->dev, "QUIESCE_IO: cmd failed: %d\n", rsp.status);
+		return -EBADR;
+	}
+
+	return 0;
+}
+
+int ipts_control_request_data(struct ipts_context *ipts)
+{
+	int ret = 0;
+
+	if (!ipts)
+		return -EFAULT;
+
+	ret = ipts_cmd_send(ipts, IPTS_CMD_READY_FOR_DATA, NULL, 0);
+	if (ret)
+		dev_err(ipts->dev, "READY_FOR_DATA: send failed: %d\n", ret);
+
+	return ret;
+}
+
+int ipts_control_wait_data(struct ipts_context *ipts, bool shutdown)
+{
+	int ret = 0;
+	struct ipts_response rsp = { 0 };
+
+	if (!ipts)
+		return -EFAULT;
+
+	if (!shutdown)
+		ret = ipts_cmd_recv_timeout(ipts, IPTS_CMD_READY_FOR_DATA, &rsp, 0);
+	else
+		ret = ipts_cmd_recv(ipts, IPTS_CMD_READY_FOR_DATA, &rsp);
+
+	if (ret) {
+		if (ret != -EAGAIN)
+			dev_err(ipts->dev, "READY_FOR_DATA: recv failed: %d\n", ret);
+
+		return ret;
+	}
+
+	/*
+	 * During shutdown, it is possible that the sensor has already been disabled.
+	 */
+	if (rsp.status == IPTS_STATUS_SENSOR_DISABLED)
+		return 0;
+
+	if (rsp.status == IPTS_STATUS_TIMEOUT)
+		return -EAGAIN;
+
+	if (rsp.status != IPTS_STATUS_SUCCESS) {
+		dev_err(ipts->dev, "READY_FOR_DATA: cmd failed: %d\n", rsp.status);
+		return -EBADR;
+	}
+
+	return 0;
+}
+
+int ipts_control_send_feedback(struct ipts_context *ipts, u32 buffer)
+{
+	int ret = 0;
+	struct ipts_feedback cmd = { 0 };
+	struct ipts_response rsp = { 0 };
+
+	if (!ipts)
+		return -EFAULT;
+
+	cmd.buffer = buffer;
+
+	ret = ipts_cmd_send(ipts, IPTS_CMD_FEEDBACK, &cmd, sizeof(cmd));
+	if (ret) {
+		dev_err(ipts->dev, "FEEDBACK: send failed: %d\n", ret);
+		return ret;
+	}
+
+	ret = ipts_cmd_recv(ipts, IPTS_CMD_FEEDBACK, &rsp);
+	if (ret) {
+		dev_err(ipts->dev, "FEEDBACK: recv failed: %d\n", ret);
+		return ret;
+	}
+
+	/*
+	 * We don't know what feedback data looks like so we are sending zeros.
+	 * See also ipts_control_refill_buffer.
+	 */
+	if (rsp.status == IPTS_STATUS_INVALID_PARAMS)
+		return 0;
+
+	if (rsp.status != IPTS_STATUS_SUCCESS) {
+		dev_err(ipts->dev, "FEEDBACK: cmd failed: %d\n", rsp.status);
+		return -EBADR;
+	}
+
+	return 0;
+}
+
+int ipts_control_hid2me_feedback(struct ipts_context *ipts, enum ipts_feedback_cmd_type cmd,
+				 enum ipts_feedback_data_type type, void *data, size_t size)
+{
+	struct ipts_feedback_header *header = NULL;
+
+	if (!ipts)
+		return -EFAULT;
+
+	if (!ipts->resources.hid2me.address)
+		return -EFAULT;
+
+	memset(ipts->resources.hid2me.address, 0, ipts->resources.hid2me.size);
+	header = (struct ipts_feedback_header *)ipts->resources.hid2me.address;
+
+	header->cmd_type = cmd;
+	header->data_type = type;
+	header->size = size;
+	header->buffer = IPTS_HID2ME_BUFFER;
+
+	if (size + sizeof(*header) > ipts->resources.hid2me.size)
+		return -EINVAL;
+
+	if (data && size > 0)
+		memcpy(header->payload, data, size);
+
+	return ipts_control_send_feedback(ipts, IPTS_HID2ME_BUFFER);
+}
+
+static inline int ipts_control_reset_sensor(struct ipts_context *ipts)
+{
+	return ipts_control_hid2me_feedback(ipts, IPTS_FEEDBACK_CMD_TYPE_SOFT_RESET,
+					    IPTS_FEEDBACK_DATA_TYPE_VENDOR, NULL, 0);
+}
+
+int ipts_control_start(struct ipts_context *ipts)
+{
+	int ret = 0;
+	struct ipts_device_info info = { 0 };
+
+	if (!ipts)
+		return -EFAULT;
+
+	dev_info(ipts->dev, "Starting IPTS\n");
+
+	ret = ipts_control_get_device_info(ipts, &info);
+	if (ret) {
+		dev_err(ipts->dev, "Failed to get device info: %d\n", ret);
+		return ret;
+	}
+
+	ipts->info = info;
+
+	ret = ipts_resources_init(&ipts->resources, ipts->dev, info.data_size, info.feedback_size);
+	if (ret) {
+		dev_err(ipts->dev, "Failed to allocate buffers: %d", ret);
+		return ret;
+	}
+
+	dev_info(ipts->dev, "IPTS EDS Version: %d\n", info.intf_eds);
+
+	/*
+	 * Handle newer devices
+	 */
+	if (info.intf_eds > 1) {
+		/*
+		 * Fetching the descriptor will only work on newer devices.
+		 * For older devices, a fallback descriptor will be used.
+		 */
+		ret = ipts_control_get_descriptor(ipts);
+		if (ret) {
+			dev_err(ipts->dev, "Failed to fetch HID descriptor: %d\n", ret);
+			return ret;
+		}
+
+		/*
+		 * Newer devices can be directly initialized in doorbell mode.
+		 */
+		ipts->mode = IPTS_MODE_DOORBELL;
+	}
+
+	ret = ipts_control_set_mode(ipts, ipts->mode);
+	if (ret) {
+		dev_err(ipts->dev, "Failed to set mode: %d\n", ret);
+		return ret;
+	}
+
+	ret = ipts_control_set_mem_window(ipts, &ipts->resources);
+	if (ret) {
+		dev_err(ipts->dev, "Failed to set memory window: %d\n", ret);
+		return ret;
+	}
+
+	ret = ipts_receiver_start(ipts);
+	if (ret) {
+		dev_err(ipts->dev, "Failed to start receiver: %d\n", ret);
+		return ret;
+	}
+
+	ret = ipts_control_request_data(ipts);
+	if (ret) {
+		dev_err(ipts->dev, "Failed to request data: %d\n", ret);
+		return ret;
+	}
+
+	ret = ipts_hid_init(ipts, info);
+	if (ret) {
+		dev_err(ipts->dev, "Failed to initialize HID device: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int _ipts_control_stop(struct ipts_context *ipts)
+{
+	int ret = 0;
+
+	if (!ipts)
+		return -EFAULT;
+
+	dev_info(ipts->dev, "Stopping IPTS\n");
+
+	ret = ipts_receiver_stop(ipts);
+	if (ret) {
+		dev_err(ipts->dev, "Failed to stop receiver: %d\n", ret);
+		return ret;
+	}
+
+	ret = ipts_control_reset_sensor(ipts);
+	if (ret) {
+		dev_err(ipts->dev, "Failed to reset sensor: %d\n", ret);
+		return ret;
+	}
+
+	ret = ipts_resources_free(&ipts->resources);
+	if (ret) {
+		dev_err(ipts->dev, "Failed to free resources: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int ipts_control_stop(struct ipts_context *ipts)
+{
+	int ret = 0;
+
+	ret = _ipts_control_stop(ipts);
+	if (ret)
+		return ret;
+
+	ret = ipts_hid_free(ipts);
+	if (ret) {
+		dev_err(ipts->dev, "Failed to free HID device: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int ipts_control_restart(struct ipts_context *ipts)
+{
+	int ret = 0;
+
+	ret = _ipts_control_stop(ipts);
+	if (ret)
+		return ret;
+
+	/*
+	 * Give the sensor some time to come back from resetting
+	 */
+	msleep(1000);
+
+	ret = ipts_control_start(ipts);
+	if (ret)
+		return ret;
+
+	return 0;
+}
diff --git a/drivers/hid/ipts/control.h b/drivers/hid/ipts/control.h
new file mode 100644
index 000000000000..744bb92d682a
--- /dev/null
+++ b/drivers/hid/ipts/control.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2020-2023 Dorian Stoll
+ *
+ * Linux driver for Intel Precise Touch & Stylus
+ */
+
+#ifndef IPTS_CONTROL_H
+#define IPTS_CONTROL_H
+
+#include <linux/types.h>
+
+#include "context.h"
+#include "spec-data.h"
+#include "spec-device.h"
+
+/*
+ * ipts_control_request_flush() - Stop the data flow.
+ * @ipts: The IPTS driver context.
+ *
+ * Runs the command to stop the data flow on the device.
+ * All outstanding data needs to be acknowledged using feedback before the command will return.
+ *
+ * Returns: 0 on success, <0 on error.
+ */
+int ipts_control_request_flush(struct ipts_context *ipts);
+
+/*
+ * ipts_control_wait_flush() - Wait until data flow has been stopped.
+ * @ipts: The IPTS driver context.
+ *
+ * Returns: 0 on success, <0 on error.
+ */
+int ipts_control_wait_flush(struct ipts_context *ipts);
+
+/*
+ * ipts_control_wait_flush() - Notify the device that the driver can receive new data.
+ * @ipts: The IPTS driver context.
+ *
+ * Returns: 0 on success, <0 on error.
+ */
+int ipts_control_request_data(struct ipts_context *ipts);
+
+/*
+ * ipts_control_wait_data() - Wait until new data is available.
+ * @ipts: The IPTS driver context.
+ * @block: Whether to block execution until data is available.
+ *
+ * In doorbell mode, this function will never return while the data flow is active. Instead,
+ * the doorbell will be incremented when new data is available.
+ *
+ * Returns: 0 on success, <0 on error, -EAGAIN if no data is available.
+ */
+int ipts_control_wait_data(struct ipts_context *ipts, bool block);
+
+/*
+ * ipts_control_send_feedback() - Submits a feedback buffer to the device.
+ * @ipts: The IPTS driver context.
+ * @buffer: The ID of the buffer containing feedback data.
+ *
+ * Returns: 0 on success, <0 on error.
+ */
+int ipts_control_send_feedback(struct ipts_context *ipts, u32 buffer);
+
+/*
+ * ipts_control_hid2me_feedback() - Sends HID2ME feedback, a special type of feedback.
+ * @ipts: The IPTS driver context.
+ * @cmd: The command that will be run on the device.
+ * @type: The type of the payload that is sent to the device.
+ * @data: The payload of the feedback command.
+ * @size: The size of the payload.
+ *
+ * HID2ME feedback is a special type of feedback, because it allows interfacing with
+ * the HID API of the device at any moment, without requiring a buffer that has to
+ * be acknowledged.
+ *
+ * Returns: 0 on success, <0 on error.
+ */
+int ipts_control_hid2me_feedback(struct ipts_context *ipts, enum ipts_feedback_cmd_type cmd,
+				 enum ipts_feedback_data_type type, void *data, size_t size);
+
+/*
+ * ipts_control_refill_buffer() - Acknowledges that data in a buffer has been processed.
+ * @ipts: The IPTS driver context.
+ * @buffer: The buffer that has been processed and can be refilled.
+ *
+ * Returns: 0 on success, <0 on error.
+ */
+static inline int ipts_control_refill_buffer(struct ipts_context *ipts, u32 buffer)
+{
+	/*
+	 * IPTS expects structured data in the feedback buffer matching the buffer that will be
+	 * refilled. We don't know what that data looks like, so we just keep the buffer empty.
+	 * This results in an INVALID_PARAMS error, but the buffer gets refilled without an issue.
+	 * Sending a minimal structure with the buffer ID fixes the error, but breaks refilling
+	 * the buffers on some devices.
+	 */
+
+	return ipts_control_send_feedback(ipts, buffer);
+}
+
+/*
+ * ipts_control_start() - Initialized the device and starts the data flow.
+ * @ipts: The IPTS driver context.
+ *
+ * Returns: 0 on success, <0 on error.
+ */
+int ipts_control_start(struct ipts_context *ipts);
+
+/*
+ * ipts_control_stop() - Stops the data flow and resets the device.
+ * @ipts: The IPTS driver context.
+ *
+ * Returns: 0 on success, <0 on error.
+ */
+int ipts_control_stop(struct ipts_context *ipts);
+
+/*
+ * ipts_control_restart() - Stops the device and starts it again.
+ * @ipts: The IPTS driver context.
+ *
+ * Returns: 0 on success, <0 on error.
+ */
+int ipts_control_restart(struct ipts_context *ipts);
+
+#endif /* IPTS_CONTROL_H */
diff --git a/drivers/hid/ipts/desc.h b/drivers/hid/ipts/desc.h
new file mode 100644
index 000000000000..c058974a03a1
--- /dev/null
+++ b/drivers/hid/ipts/desc.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2022-2023 Dorian Stoll
+ *
+ * Linux driver for Intel Precise Touch & Stylus
+ */
+
+#ifndef IPTS_DESC_H
+#define IPTS_DESC_H
+
+#include <linux/types.h>
+
+#define IPTS_HID_REPORT_SINGLETOUCH 64
+#define IPTS_HID_REPORT_DATA	    65
+#define IPTS_HID_REPORT_SET_MODE    66
+
+#define IPTS_HID_REPORT_DATA_SIZE 7485
+
+/*
+ * HID descriptor for singletouch data.
+ * This descriptor should be present on all IPTS devices.
+ */
+static const u8 ipts_singletouch_descriptor[] = {
+	0x05, 0x0D,	  /*  Usage Page (Digitizer),            */
+	0x09, 0x04,	  /*  Usage (Touchscreen),               */
+	0xA1, 0x01,	  /*  Collection (Application),          */
+	0x85, 0x40,	  /*      Report ID (64),                */
+	0x09, 0x42,	  /*      Usage (Tip Switch),            */
+	0x15, 0x00,	  /*      Logical Minimum (0),           */
+	0x25, 0x01,	  /*      Logical Maximum (1),           */
+	0x75, 0x01,	  /*      Report Size (1),               */
+	0x95, 0x01,	  /*      Report Count (1),              */
+	0x81, 0x02,	  /*      Input (Variable),              */
+	0x95, 0x07,	  /*      Report Count (7),              */
+	0x81, 0x03,	  /*      Input (Constant, Variable),    */
+	0x05, 0x01,	  /*      Usage Page (Desktop),          */
+	0x09, 0x30,	  /*      Usage (X),                     */
+	0x75, 0x10,	  /*      Report Size (16),              */
+	0x95, 0x01,	  /*      Report Count (1),              */
+	0xA4,		  /*      Push,                          */
+	0x55, 0x0E,	  /*      Unit Exponent (14),            */
+	0x65, 0x11,	  /*      Unit (Centimeter),             */
+	0x46, 0x76, 0x0B, /*      Physical Maximum (2934),       */
+	0x26, 0xFF, 0x7F, /*      Logical Maximum (32767),       */
+	0x81, 0x02,	  /*      Input (Variable),              */
+	0x09, 0x31,	  /*      Usage (Y),                     */
+	0x46, 0x74, 0x06, /*      Physical Maximum (1652),       */
+	0x26, 0xFF, 0x7F, /*      Logical Maximum (32767),       */
+	0x81, 0x02,	  /*      Input (Variable),              */
+	0xB4,		  /*      Pop,                           */
+	0xC0,		  /*  End Collection                     */
+};
+
+/*
+ * Fallback HID descriptor for older devices that do not have
+ * the ability to query their HID descriptor.
+ */
+static const u8 ipts_fallback_descriptor[] = {
+	0x05, 0x0D,	  /*  Usage Page (Digitizer),            */
+	0x09, 0x0F,	  /*  Usage (Capacitive Hm Digitizer),   */
+	0xA1, 0x01,	  /*  Collection (Application),          */
+	0x85, 0x41,	  /*      Report ID (65),                */
+	0x09, 0x56,	  /*      Usage (Scan Time),             */
+	0x95, 0x01,	  /*      Report Count (1),              */
+	0x75, 0x10,	  /*      Report Size (16),              */
+	0x81, 0x02,	  /*      Input (Variable),              */
+	0x09, 0x61,	  /*      Usage (Gesture Char Quality),  */
+	0x75, 0x08,	  /*      Report Size (8),               */
+	0x96, 0x3D, 0x1D, /*      Report Count (7485),           */
+	0x81, 0x03,	  /*      Input (Constant, Variable),    */
+	0x85, 0x42,	  /*      Report ID (66),                */
+	0x06, 0x00, 0xFF, /*      Usage Page (FF00h),            */
+	0x09, 0xC8,	  /*      Usage (C8h),                   */
+	0x75, 0x08,	  /*      Report Size (8),               */
+	0x95, 0x01,	  /*      Report Count (1),              */
+	0xB1, 0x02,	  /*      Feature (Variable),            */
+	0xC0,		  /*  End Collection,                    */
+};
+
+#endif /* IPTS_DESC_H */
diff --git a/drivers/hid/ipts/hid.c b/drivers/hid/ipts/hid.c
new file mode 100644
index 000000000000..a2471219615b
--- /dev/null
+++ b/drivers/hid/ipts/hid.c
@@ -0,0 +1,343 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2022-2023 Dorian Stoll
+ *
+ * Linux driver for Intel Precise Touch & Stylus
+ */
+
+#include <linux/completion.h>
+#include <linux/gfp.h>
+#include <linux/hid.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include "context.h"
+#include "control.h"
+#include "desc.h"
+#include "hid.h"
+#include "spec-data.h"
+#include "spec-device.h"
+#include "spec-hid.h"
+
+static int ipts_hid_start(struct hid_device *hid)
+{
+	return 0;
+}
+
+static void ipts_hid_stop(struct hid_device *hid)
+{
+}
+
+static int ipts_hid_switch_mode(struct ipts_context *ipts, enum ipts_mode mode)
+{
+	if (!ipts)
+		return -EFAULT;
+
+	if (ipts->mode == mode)
+		return 0;
+
+	/*
+	 * This is only allowed on older devices.
+	 */
+	if (ipts->info.intf_eds > 1)
+		return 0;
+
+	ipts->mode = mode;
+	return ipts_control_restart(ipts);
+}
+
+static int ipts_hid_parse(struct hid_device *hid)
+{
+	int ret = 0;
+	struct ipts_context *ipts = NULL;
+
+	bool has_native_descriptor = false;
+
+	u8 *buffer = NULL;
+	size_t size = 0;
+
+	if (!hid)
+		return -ENODEV;
+
+	ipts = hid->driver_data;
+
+	if (!ipts)
+		return -EFAULT;
+
+	size = sizeof(ipts_singletouch_descriptor);
+	has_native_descriptor = ipts->descriptor.address && ipts->descriptor.size > 0;
+
+	if (has_native_descriptor)
+		size += ipts->descriptor.size;
+	else
+		size += sizeof(ipts_fallback_descriptor);
+
+	buffer = kzalloc(size, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+
+	memcpy(buffer, ipts_singletouch_descriptor, sizeof(ipts_singletouch_descriptor));
+
+	if (has_native_descriptor) {
+		memcpy(&buffer[sizeof(ipts_singletouch_descriptor)], ipts->descriptor.address,
+		       ipts->descriptor.size);
+	} else {
+		memcpy(&buffer[sizeof(ipts_singletouch_descriptor)], ipts_fallback_descriptor,
+		       sizeof(ipts_fallback_descriptor));
+	}
+
+	ret = hid_parse_report(hid, buffer, size);
+	kfree(buffer);
+
+	if (ret) {
+		dev_err(ipts->dev, "Failed to parse HID descriptor: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int ipts_hid_get_feature(struct ipts_context *ipts, unsigned char reportnum, __u8 *buf,
+				size_t size, enum ipts_feedback_data_type type)
+{
+	int ret = 0;
+
+	if (!ipts)
+		return -EFAULT;
+
+	if (!buf)
+		return -EFAULT;
+
+	mutex_lock(&ipts->feature_lock);
+
+	memset(buf, 0, size);
+	buf[0] = reportnum;
+
+	memset(&ipts->feature_report, 0, sizeof(ipts->feature_report));
+	reinit_completion(&ipts->feature_event);
+
+	ret = ipts_control_hid2me_feedback(ipts, IPTS_FEEDBACK_CMD_TYPE_NONE, type, buf, size);
+	if (ret) {
+		dev_err(ipts->dev, "Failed to send hid2me feedback: %d\n", ret);
+		goto out;
+	}
+
+	ret = wait_for_completion_timeout(&ipts->feature_event, msecs_to_jiffies(5000));
+	if (ret == 0) {
+		dev_warn(ipts->dev, "GET_FEATURES timed out!\n");
+		ret = -EIO;
+		goto out;
+	}
+
+	if (!ipts->feature_report.address) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (ipts->feature_report.size > size) {
+		ret = -ETOOSMALL;
+		goto out;
+	}
+
+	ret = ipts->feature_report.size;
+	memcpy(buf, ipts->feature_report.address, ipts->feature_report.size);
+
+out:
+	mutex_unlock(&ipts->feature_lock);
+	return ret;
+}
+
+static int ipts_hid_set_feature(struct ipts_context *ipts, unsigned char reportnum, __u8 *buf,
+				size_t size, enum ipts_feedback_data_type type)
+{
+	int ret = 0;
+
+	if (!ipts)
+		return -EFAULT;
+
+	if (!buf)
+		return -EFAULT;
+
+	buf[0] = reportnum;
+
+	ret = ipts_control_hid2me_feedback(ipts, IPTS_FEEDBACK_CMD_TYPE_NONE, type, buf, size);
+	if (ret)
+		dev_err(ipts->dev, "Failed to send hid2me feedback: %d\n", ret);
+
+	return ret;
+}
+
+static int ipts_hid_raw_request(struct hid_device *hid, unsigned char reportnum, __u8 *buf,
+				size_t size, unsigned char rtype, int reqtype)
+{
+	int ret = 0;
+	struct ipts_context *ipts = NULL;
+
+	enum ipts_feedback_data_type type = IPTS_FEEDBACK_DATA_TYPE_VENDOR;
+
+	if (!hid)
+		return -ENODEV;
+
+	ipts = hid->driver_data;
+
+	if (!ipts)
+		return -EFAULT;
+
+	if (!buf)
+		return -EFAULT;
+
+	if (rtype == HID_OUTPUT_REPORT && reqtype == HID_REQ_SET_REPORT)
+		type = IPTS_FEEDBACK_DATA_TYPE_OUTPUT_REPORT;
+	else if (rtype == HID_FEATURE_REPORT && reqtype == HID_REQ_GET_REPORT)
+		type = IPTS_FEEDBACK_DATA_TYPE_GET_FEATURES;
+	else if (rtype == HID_FEATURE_REPORT && reqtype == HID_REQ_SET_REPORT)
+		type = IPTS_FEEDBACK_DATA_TYPE_SET_FEATURES;
+	else
+		return -EIO;
+
+	// Implemente mode switching report for older devices without native HID support
+	if (type == IPTS_FEEDBACK_DATA_TYPE_SET_FEATURES && reportnum == IPTS_HID_REPORT_SET_MODE) {
+		ret = ipts_hid_switch_mode(ipts, buf[1]);
+		if (ret) {
+			dev_err(ipts->dev, "Failed to switch modes: %d\n", ret);
+			return ret;
+		}
+	}
+
+	if (reqtype == HID_REQ_GET_REPORT)
+		return ipts_hid_get_feature(ipts, reportnum, buf, size, type);
+	else
+		return ipts_hid_set_feature(ipts, reportnum, buf, size, type);
+}
+
+static int ipts_hid_output_report(struct hid_device *hid, __u8 *data, size_t size)
+{
+	struct ipts_context *ipts = NULL;
+
+	if (!hid)
+		return -ENODEV;
+
+	ipts = hid->driver_data;
+
+	return ipts_control_hid2me_feedback(ipts, IPTS_FEEDBACK_CMD_TYPE_NONE,
+					    IPTS_FEEDBACK_DATA_TYPE_OUTPUT_REPORT, data, size);
+}
+
+static struct hid_ll_driver ipts_hid_driver = {
+	.start = ipts_hid_start,
+	.stop = ipts_hid_stop,
+	.open = ipts_hid_start,
+	.close = ipts_hid_stop,
+	.parse = ipts_hid_parse,
+	.raw_request = ipts_hid_raw_request,
+	.output_report = ipts_hid_output_report,
+};
+
+int ipts_hid_input_data(struct ipts_context *ipts, u32 buffer)
+{
+	u8 *temp = NULL;
+	struct ipts_hid_header *frame = NULL;
+	struct ipts_data_header *header = NULL;
+
+	if (!ipts)
+		return -EFAULT;
+
+	if (!ipts->hid)
+		return -ENODEV;
+
+	header = (struct ipts_data_header *)ipts->resources.data[buffer].address;
+
+	temp = ipts->resources.report.address;
+	memset(temp, 0, ipts->resources.report.size);
+
+	if (!header)
+		return -EFAULT;
+
+	if (header->size == 0)
+		return 0;
+
+	if (header->type == IPTS_DATA_TYPE_HID)
+		return hid_input_report(ipts->hid, HID_INPUT_REPORT, header->data, header->size, 1);
+
+	if (header->type == IPTS_DATA_TYPE_GET_FEATURES) {
+		ipts->feature_report.address = header->data;
+		ipts->feature_report.size = header->size;
+
+		complete_all(&ipts->feature_event);
+		return 0;
+	}
+
+	if (header->type != IPTS_DATA_TYPE_FRAME)
+		return 0;
+
+	if (header->size + 3 + sizeof(struct ipts_hid_header) > IPTS_HID_REPORT_DATA_SIZE)
+		return -ERANGE;
+
+	/*
+	 * Synthesize a HID report matching the devices that natively send HID reports
+	 */
+	temp[0] = IPTS_HID_REPORT_DATA;
+
+	frame = (struct ipts_hid_header *)&temp[3];
+	frame->type = IPTS_HID_FRAME_TYPE_RAW;
+	frame->size = header->size + sizeof(*frame);
+
+	memcpy(frame->data, header->data, header->size);
+
+	return hid_input_report(ipts->hid, HID_INPUT_REPORT, temp, IPTS_HID_REPORT_DATA_SIZE, 1);
+}
+
+int ipts_hid_init(struct ipts_context *ipts, struct ipts_device_info info)
+{
+	int ret = 0;
+
+	if (!ipts)
+		return -EFAULT;
+
+	if (ipts->hid)
+		return 0;
+
+	ipts->hid = hid_allocate_device();
+	if (IS_ERR(ipts->hid)) {
+		int err = PTR_ERR(ipts->hid);
+
+		dev_err(ipts->dev, "Failed to allocate HID device: %d\n", err);
+		return err;
+	}
+
+	ipts->hid->driver_data = ipts;
+	ipts->hid->dev.parent = ipts->dev;
+	ipts->hid->ll_driver = &ipts_hid_driver;
+
+	ipts->hid->vendor = info.vendor;
+	ipts->hid->product = info.product;
+	ipts->hid->group = HID_GROUP_GENERIC;
+
+	snprintf(ipts->hid->name, sizeof(ipts->hid->name), "IPTS %04X:%04X", info.vendor,
+		 info.product);
+
+	ret = hid_add_device(ipts->hid);
+	if (ret) {
+		dev_err(ipts->dev, "Failed to add HID device: %d\n", ret);
+		ipts_hid_free(ipts);
+		return ret;
+	}
+
+	return 0;
+}
+
+int ipts_hid_free(struct ipts_context *ipts)
+{
+	if (!ipts)
+		return -EFAULT;
+
+	if (!ipts->hid)
+		return 0;
+
+	hid_destroy_device(ipts->hid);
+	ipts->hid = NULL;
+
+	return 0;
+}
diff --git a/drivers/hid/ipts/hid.h b/drivers/hid/ipts/hid.h
new file mode 100644
index 000000000000..62bf3cd48608
--- /dev/null
+++ b/drivers/hid/ipts/hid.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2022-2023 Dorian Stoll
+ *
+ * Linux driver for Intel Precise Touch & Stylus
+ */
+
+#ifndef IPTS_HID_H
+#define IPTS_HID_H
+
+#include <linux/types.h>
+
+#include "context.h"
+#include "spec-device.h"
+
+int ipts_hid_input_data(struct ipts_context *ipts, u32 buffer);
+
+int ipts_hid_init(struct ipts_context *ipts, struct ipts_device_info info);
+int ipts_hid_free(struct ipts_context *ipts);
+
+#endif /* IPTS_HID_H */
diff --git a/drivers/hid/ipts/main.c b/drivers/hid/ipts/main.c
new file mode 100644
index 000000000000..0f20c6c08c38
--- /dev/null
+++ b/drivers/hid/ipts/main.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2020-2023 Dorian Stoll
+ *
+ * Linux driver for Intel Precise Touch & Stylus
+ */
+
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/mei_cl_bus.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+
+#include "context.h"
+#include "control.h"
+#include "mei.h"
+#include "receiver.h"
+#include "spec-device.h"
+
+/*
+ * The MEI client ID for IPTS functionality.
+ */
+#define IPTS_ID UUID_LE(0x3e8d0870, 0x271a, 0x4208, 0x8e, 0xb5, 0x9a, 0xcb, 0x94, 0x02, 0xae, 0x04)
+
+static int ipts_set_dma_mask(struct mei_cl_device *cldev)
+{
+	if (!cldev)
+		return -EFAULT;
+
+	if (!dma_coerce_mask_and_coherent(&cldev->dev, DMA_BIT_MASK(64)))
+		return 0;
+
+	return dma_coerce_mask_and_coherent(&cldev->dev, DMA_BIT_MASK(32));
+}
+
+static int ipts_probe(struct mei_cl_device *cldev, const struct mei_cl_device_id *id)
+{
+	int ret = 0;
+	struct ipts_context *ipts = NULL;
+
+	if (!cldev)
+		return -EFAULT;
+
+	ret = ipts_set_dma_mask(cldev);
+	if (ret) {
+		dev_err(&cldev->dev, "Failed to set DMA mask for IPTS: %d\n", ret);
+		return ret;
+	}
+
+	ret = mei_cldev_enable(cldev);
+	if (ret) {
+		dev_err(&cldev->dev, "Failed to enable MEI device: %d\n", ret);
+		return ret;
+	}
+
+	ipts = devm_kzalloc(&cldev->dev, sizeof(*ipts), GFP_KERNEL);
+	if (!ipts) {
+		mei_cldev_disable(cldev);
+		return -ENOMEM;
+	}
+
+	ret = ipts_mei_init(&ipts->mei, cldev);
+	if (ret) {
+		dev_err(&cldev->dev, "Failed to init MEI bus logic: %d\n", ret);
+		return ret;
+	}
+
+	ipts->dev = &cldev->dev;
+	ipts->mode = IPTS_MODE_EVENT;
+
+	mutex_init(&ipts->feature_lock);
+	init_completion(&ipts->feature_event);
+
+	mei_cldev_set_drvdata(cldev, ipts);
+
+	ret = ipts_control_start(ipts);
+	if (ret) {
+		dev_err(&cldev->dev, "Failed to start IPTS: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void ipts_remove(struct mei_cl_device *cldev)
+{
+	int ret = 0;
+	struct ipts_context *ipts = NULL;
+
+	if (!cldev) {
+		pr_err("MEI device is NULL!");
+		return;
+	}
+
+	ipts = mei_cldev_get_drvdata(cldev);
+
+	ret = ipts_control_stop(ipts);
+	if (ret)
+		dev_err(&cldev->dev, "Failed to stop IPTS: %d\n", ret);
+
+	mei_cldev_disable(cldev);
+}
+
+static struct mei_cl_device_id ipts_device_id_table[] = {
+	{ .uuid = IPTS_ID, .version = MEI_CL_VERSION_ANY },
+	{},
+};
+MODULE_DEVICE_TABLE(mei, ipts_device_id_table);
+
+static struct mei_cl_driver ipts_driver = {
+	.id_table = ipts_device_id_table,
+	.name = "ipts",
+	.probe = ipts_probe,
+	.remove = ipts_remove,
+};
+module_mei_cl_driver(ipts_driver);
+
+MODULE_DESCRIPTION("IPTS touchscreen driver");
+MODULE_AUTHOR("Dorian Stoll <dorian.stoll@tmsp.io>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/hid/ipts/mei.c b/drivers/hid/ipts/mei.c
new file mode 100644
index 000000000000..26666fd99b0c
--- /dev/null
+++ b/drivers/hid/ipts/mei.c
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2023 Dorian Stoll
+ *
+ * Linux driver for Intel Precise Touch & Stylus
+ */
+
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+#include <linux/mei_cl_bus.h>
+#include <linux/printk.h>
+#include <linux/rwsem.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+#include "context.h"
+#include "mei.h"
+
+static void locked_list_add(struct list_head *new, struct list_head *head,
+			    struct rw_semaphore *lock)
+{
+	down_write(lock);
+	list_add(new, head);
+	up_write(lock);
+}
+
+static void locked_list_del(struct list_head *entry, struct rw_semaphore *lock)
+{
+	down_write(lock);
+	list_del(entry);
+	up_write(lock);
+}
+
+static void ipts_mei_incoming(struct mei_cl_device *cldev)
+{
+	ssize_t ret = 0;
+	struct ipts_mei_message *entry = NULL;
+	struct ipts_context *ipts = NULL;
+
+	if (!cldev) {
+		pr_err("MEI device is NULL!");
+		return;
+	}
+
+	ipts = mei_cldev_get_drvdata(cldev);
+	if (!ipts) {
+		pr_err("IPTS driver context is NULL!");
+		return;
+	}
+
+	entry = devm_kzalloc(ipts->dev, sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return;
+
+	INIT_LIST_HEAD(&entry->list);
+
+	do {
+		ret = mei_cldev_recv(cldev, (u8 *)&entry->rsp, sizeof(entry->rsp));
+	} while (ret == -EINTR);
+
+	if (ret < 0) {
+		dev_err(ipts->dev, "Error while reading response: %ld\n", ret);
+		return;
+	}
+
+	if (ret == 0) {
+		dev_err(ipts->dev, "Received empty response\n");
+		return;
+	}
+
+	locked_list_add(&entry->list, &ipts->mei.messages, &ipts->mei.message_lock);
+	wake_up_all(&ipts->mei.message_queue);
+}
+
+static int ipts_mei_search(struct ipts_mei *mei, enum ipts_command_code code,
+			   struct ipts_response *rsp)
+{
+	struct ipts_mei_message *entry = NULL;
+
+	if (!mei)
+		return -EFAULT;
+
+	if (!rsp)
+		return -EFAULT;
+
+	down_read(&mei->message_lock);
+
+	/*
+	 * Iterate over the list of received messages, and check if there is one
+	 * matching the requested command code.
+	 */
+	list_for_each_entry(entry, &mei->messages, list) {
+		if (entry->rsp.cmd == code)
+			break;
+	}
+
+	up_read(&mei->message_lock);
+
+	/*
+	 * If entry is not the list head, this means that the loop above has been stopped early,
+	 * and that we found a matching element. We drop the message from the list and return it.
+	 */
+	if (!list_entry_is_head(entry, &mei->messages, list)) {
+		locked_list_del(&entry->list, &mei->message_lock);
+
+		*rsp = entry->rsp;
+		devm_kfree(&mei->cldev->dev, entry);
+
+		return 0;
+	}
+
+	return -EAGAIN;
+}
+
+int ipts_mei_recv(struct ipts_mei *mei, enum ipts_command_code code, struct ipts_response *rsp,
+		  u64 timeout)
+{
+	int ret = 0;
+
+	if (!mei)
+		return -EFAULT;
+
+	/*
+	 * A timeout of 0 means check and return immideately.
+	 */
+	if (timeout == 0)
+		return ipts_mei_search(mei, code, rsp);
+
+	/*
+	 * A timeout of less than 0 means to wait forever.
+	 */
+	if (timeout < 0) {
+		wait_event(mei->message_queue, ipts_mei_search(mei, code, rsp) == 0);
+		return 0;
+	}
+
+	ret = wait_event_timeout(mei->message_queue, ipts_mei_search(mei, code, rsp) == 0,
+				 msecs_to_jiffies(timeout));
+
+	if (ret > 0)
+		return 0;
+
+	return -EAGAIN;
+}
+
+int ipts_mei_send(struct ipts_mei *mei, void *data, size_t length)
+{
+	int ret = 0;
+
+	if (!mei)
+		return -EFAULT;
+
+	if (!mei->cldev)
+		return -EFAULT;
+
+	if (!data)
+		return -EFAULT;
+
+	do {
+		ret = mei_cldev_send(mei->cldev, (u8 *)data, length);
+	} while (ret == -EINTR);
+
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+int ipts_mei_init(struct ipts_mei *mei, struct mei_cl_device *cldev)
+{
+	if (!mei)
+		return -EFAULT;
+
+	if (!cldev)
+		return -EFAULT;
+
+	mei->cldev = cldev;
+
+	INIT_LIST_HEAD(&mei->messages);
+	init_waitqueue_head(&mei->message_queue);
+	init_rwsem(&mei->message_lock);
+
+	mei_cldev_register_rx_cb(cldev, ipts_mei_incoming);
+
+	return 0;
+}
diff --git a/drivers/hid/ipts/mei.h b/drivers/hid/ipts/mei.h
new file mode 100644
index 000000000000..eadacae54c40
--- /dev/null
+++ b/drivers/hid/ipts/mei.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2023 Dorian Stoll
+ *
+ * Linux driver for Intel Precise Touch & Stylus
+ */
+
+#ifndef IPTS_MEI_H
+#define IPTS_MEI_H
+
+#include <linux/list.h>
+#include <linux/mei_cl_bus.h>
+#include <linux/rwsem.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+#include "spec-device.h"
+
+struct ipts_mei_message {
+	struct list_head list;
+	struct ipts_response rsp;
+};
+
+struct ipts_mei {
+	struct mei_cl_device *cldev;
+
+	struct list_head messages;
+
+	wait_queue_head_t message_queue;
+	struct rw_semaphore message_lock;
+};
+
+/*
+ * ipts_mei_recv() - Receive data from a MEI device.
+ * @mei: The IPTS MEI device context.
+ * @code: The IPTS command code to look for.
+ * @rsp: The address that the received data will be copied to.
+ * @timeout: How many milliseconds the function will wait at most.
+ *
+ * A negative timeout means to wait forever.
+ *
+ * Returns: 0 on success, <0 on error, -EAGAIN if no response has been received.
+ */
+int ipts_mei_recv(struct ipts_mei *mei, enum ipts_command_code code, struct ipts_response *rsp,
+		  u64 timeout);
+
+/*
+ * ipts_mei_send() - Send data to a MEI device.
+ * @ipts: The IPTS MEI device context.
+ * @data: The data to send.
+ * @size: The size of the data.
+ *
+ * Returns: 0 on success, <0 on error.
+ */
+int ipts_mei_send(struct ipts_mei *mei, void *data, size_t length);
+
+/*
+ * ipts_mei_init() - Initialize the MEI device context.
+ * @mei: The MEI device context to initialize.
+ * @cldev: The MEI device the context will be bound to.
+ *
+ * Returns: 0 on success, <0 on error.
+ */
+int ipts_mei_init(struct ipts_mei *mei, struct mei_cl_device *cldev);
+
+#endif /* IPTS_MEI_H */
diff --git a/drivers/hid/ipts/receiver.c b/drivers/hid/ipts/receiver.c
new file mode 100644
index 000000000000..f56e9ed32d57
--- /dev/null
+++ b/drivers/hid/ipts/receiver.c
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2020-2023 Dorian Stoll
+ *
+ * Linux driver for Intel Precise Touch & Stylus
+ */
+
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/kthread.h>
+#include <linux/time64.h>
+#include <linux/timekeeping.h>
+#include <linux/types.h>
+
+#include "cmd.h"
+#include "context.h"
+#include "control.h"
+#include "hid.h"
+#include "resources.h"
+#include "spec-device.h"
+#include "thread.h"
+
+static void ipts_receiver_next_doorbell(struct ipts_context *ipts)
+{
+	u32 *doorbell = (u32 *)ipts->resources.doorbell.address;
+	*doorbell = *doorbell + 1;
+}
+
+static u32 ipts_receiver_current_doorbell(struct ipts_context *ipts)
+{
+	u32 *doorbell = (u32 *)ipts->resources.doorbell.address;
+	return *doorbell;
+}
+
+static void ipts_receiver_backoff(time64_t last, u32 n)
+{
+	/*
+	 * If the last change was less than n seconds ago,
+	 * sleep for a shorter period so that new data can be
+	 * processed quickly. If there was no change for more than
+	 * n seconds, sleep longer to avoid wasting CPU cycles.
+	 */
+	if (last + n > ktime_get_seconds())
+		usleep_range(1 * USEC_PER_MSEC, 5 * USEC_PER_MSEC);
+	else
+		msleep(200);
+}
+
+static int ipts_receiver_event_loop(struct ipts_thread *thread)
+{
+	int ret = 0;
+	u32 buffer = 0;
+
+	struct ipts_context *ipts = NULL;
+	time64_t last = ktime_get_seconds();
+
+	if (!thread)
+		return -EFAULT;
+
+	ipts = thread->data;
+
+	if (!ipts)
+		return -EFAULT;
+
+	dev_info(ipts->dev, "IPTS running in event mode\n");
+
+	while (!ipts_thread_should_stop(thread)) {
+		for (int i = 0; i < IPTS_BUFFERS; i++) {
+			ret = ipts_control_wait_data(ipts, false);
+			if (ret == -EAGAIN)
+				break;
+
+			if (ret) {
+				dev_err(ipts->dev, "Failed to wait for data: %d\n", ret);
+				continue;
+			}
+
+			buffer = ipts_receiver_current_doorbell(ipts) % IPTS_BUFFERS;
+			ipts_receiver_next_doorbell(ipts);
+
+			ret = ipts_hid_input_data(ipts, buffer);
+			if (ret)
+				dev_err(ipts->dev, "Failed to process buffer: %d\n", ret);
+
+			ret = ipts_control_refill_buffer(ipts, buffer);
+			if (ret)
+				dev_err(ipts->dev, "Failed to send feedback: %d\n", ret);
+
+			ret = ipts_control_request_data(ipts);
+			if (ret)
+				dev_err(ipts->dev, "Failed to request data: %d\n", ret);
+
+			last = ktime_get_seconds();
+		}
+
+		ipts_receiver_backoff(last, 5);
+	}
+
+	ret = ipts_control_request_flush(ipts);
+	if (ret) {
+		dev_err(ipts->dev, "Failed to request flush: %d\n", ret);
+		return ret;
+	}
+
+	ret = ipts_control_wait_data(ipts, true);
+	if (ret) {
+		dev_err(ipts->dev, "Failed to wait for data: %d\n", ret);
+
+		if (ret != -EAGAIN)
+			return ret;
+		else
+			return 0;
+	}
+
+	ret = ipts_control_wait_flush(ipts);
+	if (ret) {
+		dev_err(ipts->dev, "Failed to wait for flush: %d\n", ret);
+
+		if (ret != -EAGAIN)
+			return ret;
+		else
+			return 0;
+	}
+
+	return 0;
+}
+
+static int ipts_receiver_doorbell_loop(struct ipts_thread *thread)
+{
+	int ret = 0;
+	u32 buffer = 0;
+
+	u32 doorbell = 0;
+	u32 lastdb = 0;
+
+	struct ipts_context *ipts = NULL;
+	time64_t last = ktime_get_seconds();
+
+	if (!thread)
+		return -EFAULT;
+
+	ipts = thread->data;
+
+	if (!ipts)
+		return -EFAULT;
+
+	dev_info(ipts->dev, "IPTS running in doorbell mode\n");
+
+	while (true) {
+		if (ipts_thread_should_stop(thread)) {
+			ret = ipts_control_request_flush(ipts);
+			if (ret) {
+				dev_err(ipts->dev, "Failed to request flush: %d\n", ret);
+				return ret;
+			}
+		}
+
+		doorbell = ipts_receiver_current_doorbell(ipts);
+
+		/*
+		 * After filling up one of the data buffers, IPTS will increment
+		 * the doorbell. The value of the doorbell stands for the *next*
+		 * buffer that IPTS is going to fill.
+		 */
+		while (lastdb != doorbell) {
+			buffer = lastdb % IPTS_BUFFERS;
+
+			ret = ipts_hid_input_data(ipts, buffer);
+			if (ret)
+				dev_err(ipts->dev, "Failed to process buffer: %d\n", ret);
+
+			ret = ipts_control_refill_buffer(ipts, buffer);
+			if (ret)
+				dev_err(ipts->dev, "Failed to send feedback: %d\n", ret);
+
+			last = ktime_get_seconds();
+			lastdb++;
+		}
+
+		if (ipts_thread_should_stop(thread))
+			break;
+
+		ipts_receiver_backoff(last, 5);
+	}
+
+	ret = ipts_control_wait_data(ipts, true);
+	if (ret) {
+		dev_err(ipts->dev, "Failed to wait for data: %d\n", ret);
+
+		if (ret != -EAGAIN)
+			return ret;
+		else
+			return 0;
+	}
+
+	ret = ipts_control_wait_flush(ipts);
+	if (ret) {
+		dev_err(ipts->dev, "Failed to wait for flush: %d\n", ret);
+
+		if (ret != -EAGAIN)
+			return ret;
+		else
+			return 0;
+	}
+
+	return 0;
+}
+
+int ipts_receiver_start(struct ipts_context *ipts)
+{
+	int ret = 0;
+
+	if (!ipts)
+		return -EFAULT;
+
+	if (ipts->mode == IPTS_MODE_EVENT) {
+		ret = ipts_thread_start(&ipts->receiver_loop, ipts_receiver_event_loop, ipts,
+					"ipts_event");
+	} else if (ipts->mode == IPTS_MODE_DOORBELL) {
+		ret = ipts_thread_start(&ipts->receiver_loop, ipts_receiver_doorbell_loop, ipts,
+					"ipts_doorbell");
+	} else {
+		ret = -EINVAL;
+	}
+
+	if (ret) {
+		dev_err(ipts->dev, "Failed to start receiver loop: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int ipts_receiver_stop(struct ipts_context *ipts)
+{
+	int ret = 0;
+
+	if (!ipts)
+		return -EFAULT;
+
+	ret = ipts_thread_stop(&ipts->receiver_loop);
+	if (ret) {
+		dev_err(ipts->dev, "Failed to stop receiver loop: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
diff --git a/drivers/hid/ipts/receiver.h b/drivers/hid/ipts/receiver.h
new file mode 100644
index 000000000000..96070f34fbca
--- /dev/null
+++ b/drivers/hid/ipts/receiver.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2020-2023 Dorian Stoll
+ *
+ * Linux driver for Intel Precise Touch & Stylus
+ */
+
+#ifndef IPTS_RECEIVER_H
+#define IPTS_RECEIVER_H
+
+#include "context.h"
+
+int ipts_receiver_start(struct ipts_context *ipts);
+int ipts_receiver_stop(struct ipts_context *ipts);
+
+#endif /* IPTS_RECEIVER_H */
diff --git a/drivers/hid/ipts/resources.c b/drivers/hid/ipts/resources.c
new file mode 100644
index 000000000000..5e924d58c488
--- /dev/null
+++ b/drivers/hid/ipts/resources.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2020-2023 Dorian Stoll
+ *
+ * Linux driver for Intel Precise Touch & Stylus
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/types.h>
+
+#include "desc.h"
+#include "resources.h"
+#include "spec-device.h"
+
+static int ipts_resources_alloc_buffer(struct ipts_buffer *buffer, struct device *dev, size_t size)
+{
+	if (!buffer)
+		return -EFAULT;
+
+	if (buffer->address)
+		return 0;
+
+	buffer->address = dma_alloc_coherent(dev, size, &buffer->dma_address, GFP_KERNEL);
+
+	if (!buffer->address)
+		return -ENOMEM;
+
+	buffer->size = size;
+	buffer->device = dev;
+
+	return 0;
+}
+
+static void ipts_resources_free_buffer(struct ipts_buffer *buffer)
+{
+	if (!buffer->address)
+		return;
+
+	dma_free_coherent(buffer->device, buffer->size, buffer->address, buffer->dma_address);
+
+	buffer->address = NULL;
+	buffer->size = 0;
+
+	buffer->dma_address = 0;
+	buffer->device = NULL;
+}
+
+int ipts_resources_init(struct ipts_resources *res, struct device *dev, size_t ds, size_t fs)
+{
+	int ret = 0;
+
+	/*
+	 * Some compilers (AOSP clang) complain about a redefined
+	 * variable when this is declared inside of the for loop.
+	 */
+	int i = 0;
+
+	if (!res)
+		return -EFAULT;
+
+	for (i = 0; i < IPTS_BUFFERS; i++) {
+		ret = ipts_resources_alloc_buffer(&res->data[i], dev, ds);
+		if (ret)
+			goto err;
+	}
+
+	for (i = 0; i < IPTS_BUFFERS; i++) {
+		ret = ipts_resources_alloc_buffer(&res->feedback[i], dev, fs);
+		if (ret)
+			goto err;
+	}
+
+	ret = ipts_resources_alloc_buffer(&res->doorbell, dev, sizeof(u32));
+	if (ret)
+		goto err;
+
+	ret = ipts_resources_alloc_buffer(&res->workqueue, dev, sizeof(u32));
+	if (ret)
+		goto err;
+
+	ret = ipts_resources_alloc_buffer(&res->hid2me, dev, fs);
+	if (ret)
+		goto err;
+
+	ret = ipts_resources_alloc_buffer(&res->descriptor, dev, ds + 8);
+	if (ret)
+		goto err;
+
+	if (!res->report.address) {
+		res->report.size = IPTS_HID_REPORT_DATA_SIZE;
+		res->report.address = kzalloc(res->report.size, GFP_KERNEL);
+
+		if (!res->report.address) {
+			ret = -ENOMEM;
+			goto err;
+		}
+	}
+
+	return 0;
+
+err:
+
+	ipts_resources_free(res);
+	return ret;
+}
+
+int ipts_resources_free(struct ipts_resources *res)
+{
+	/*
+	 * Some compilers (AOSP clang) complain about a redefined
+	 * variable when this is declared inside of the for loop.
+	 */
+	int i = 0;
+
+	if (!res)
+		return -EFAULT;
+
+	for (i = 0; i < IPTS_BUFFERS; i++)
+		ipts_resources_free_buffer(&res->data[i]);
+
+	for (i = 0; i < IPTS_BUFFERS; i++)
+		ipts_resources_free_buffer(&res->feedback[i]);
+
+	ipts_resources_free_buffer(&res->doorbell);
+	ipts_resources_free_buffer(&res->workqueue);
+	ipts_resources_free_buffer(&res->hid2me);
+	ipts_resources_free_buffer(&res->descriptor);
+
+	kfree(res->report.address);
+	res->report.address = NULL;
+	res->report.size = 0;
+
+	return 0;
+}
diff --git a/drivers/hid/ipts/resources.h b/drivers/hid/ipts/resources.h
new file mode 100644
index 000000000000..e0c400f420b9
--- /dev/null
+++ b/drivers/hid/ipts/resources.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2020-2023 Dorian Stoll
+ *
+ * Linux driver for Intel Precise Touch & Stylus
+ */
+
+#ifndef IPTS_RESOURCES_H
+#define IPTS_RESOURCES_H
+
+#include <linux/device.h>
+#include <linux/types.h>
+
+#include "spec-device.h"
+
+struct ipts_buffer {
+	u8 *address;
+	size_t size;
+
+	dma_addr_t dma_address;
+	struct device *device;
+};
+
+struct ipts_resources {
+	struct ipts_buffer data[IPTS_BUFFERS];
+	struct ipts_buffer feedback[IPTS_BUFFERS];
+
+	struct ipts_buffer doorbell;
+	struct ipts_buffer workqueue;
+	struct ipts_buffer hid2me;
+
+	struct ipts_buffer descriptor;
+
+	// Buffer for synthesizing HID reports
+	struct ipts_buffer report;
+};
+
+int ipts_resources_init(struct ipts_resources *res, struct device *dev, size_t ds, size_t fs);
+int ipts_resources_free(struct ipts_resources *res);
+
+#endif /* IPTS_RESOURCES_H */
diff --git a/drivers/hid/ipts/spec-data.h b/drivers/hid/ipts/spec-data.h
new file mode 100644
index 000000000000..e8dd98895a7e
--- /dev/null
+++ b/drivers/hid/ipts/spec-data.h
@@ -0,0 +1,100 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2020-2023 Dorian Stoll
+ *
+ * Linux driver for Intel Precise Touch & Stylus
+ */
+
+#ifndef IPTS_SPEC_DATA_H
+#define IPTS_SPEC_DATA_H
+
+#include <linux/build_bug.h>
+#include <linux/types.h>
+
+/**
+ * enum ipts_feedback_cmd_type - Commands that can be executed on the sensor through feedback.
+ */
+enum ipts_feedback_cmd_type {
+	IPTS_FEEDBACK_CMD_TYPE_NONE = 0,
+	IPTS_FEEDBACK_CMD_TYPE_SOFT_RESET = 1,
+	IPTS_FEEDBACK_CMD_TYPE_GOTO_ARMED = 2,
+	IPTS_FEEDBACK_CMD_TYPE_GOTO_SENSING = 3,
+	IPTS_FEEDBACK_CMD_TYPE_GOTO_SLEEP = 4,
+	IPTS_FEEDBACK_CMD_TYPE_GOTO_DOZE = 5,
+	IPTS_FEEDBACK_CMD_TYPE_HARD_RESET = 6,
+};
+
+/**
+ * enum ipts_feedback_data_type - Defines what data a feedback buffer contains.
+ * @IPTS_FEEDBACK_DATA_TYPE_VENDOR:        The buffer contains vendor specific feedback.
+ * @IPTS_FEEDBACK_DATA_TYPE_SET_FEATURES:  The buffer contains a HID set features report.
+ * @IPTS_FEEDBACK_DATA_TYPE_GET_FEATURES:  The buffer contains a HID get features report.
+ * @IPTS_FEEDBACK_DATA_TYPE_OUTPUT_REPORT: The buffer contains a HID output report.
+ * @IPTS_FEEDBACK_DATA_TYPE_STORE_DATA:    The buffer contains calibration data for the sensor.
+ */
+enum ipts_feedback_data_type {
+	IPTS_FEEDBACK_DATA_TYPE_VENDOR = 0,
+	IPTS_FEEDBACK_DATA_TYPE_SET_FEATURES = 1,
+	IPTS_FEEDBACK_DATA_TYPE_GET_FEATURES = 2,
+	IPTS_FEEDBACK_DATA_TYPE_OUTPUT_REPORT = 3,
+	IPTS_FEEDBACK_DATA_TYPE_STORE_DATA = 4,
+};
+
+/**
+ * struct ipts_feedback_header - Header that is prefixed to the data in a feedback buffer.
+ * @cmd_type:   A command that should be executed on the sensor.
+ * @size:       The size of the payload to be written.
+ * @buffer:     The ID of the buffer that contains this feedback data.
+ * @protocol:   The protocol version of the EDS.
+ * @data_type:  The type of data that the buffer contains.
+ * @spi_offset: The offset at which to write the payload data to the sensor.
+ * @payload:    Payload for the feedback command, or 0 if no payload is sent.
+ */
+struct ipts_feedback_header {
+	enum ipts_feedback_cmd_type cmd_type;
+	u32 size;
+	u32 buffer;
+	u32 protocol;
+	enum ipts_feedback_data_type data_type;
+	u32 spi_offset;
+	u8 reserved[40];
+	u8 payload[];
+} __packed;
+
+static_assert(sizeof(struct ipts_feedback_header) == 64);
+
+/**
+ * enum ipts_data_type - Defines what type of data a buffer contains.
+ * @IPTS_DATA_TYPE_FRAME:        Raw data frame.
+ * @IPTS_DATA_TYPE_ERROR:        Error data.
+ * @IPTS_DATA_TYPE_VENDOR:       Vendor specific data.
+ * @IPTS_DATA_TYPE_HID:          A HID report.
+ * @IPTS_DATA_TYPE_GET_FEATURES: The response to a GET_FEATURES HID2ME command.
+ */
+enum ipts_data_type {
+	IPTS_DATA_TYPE_FRAME = 0x00,
+	IPTS_DATA_TYPE_ERROR = 0x01,
+	IPTS_DATA_TYPE_VENDOR = 0x02,
+	IPTS_DATA_TYPE_HID = 0x03,
+	IPTS_DATA_TYPE_GET_FEATURES = 0x04,
+	IPTS_DATA_TYPE_DESCRIPTOR = 0x05,
+};
+
+/**
+ * struct ipts_data_header - Header that is prefixed to the data in a data buffer.
+ * @type: What data the buffer contains.
+ * @size: How much data the buffer contains.
+ * @buffer: Which buffer the data is in.
+ */
+struct ipts_data_header {
+	enum ipts_data_type type;
+	u32 size;
+	u32 buffer;
+	u8 reserved[52];
+	u8 data[];
+} __packed;
+
+static_assert(sizeof(struct ipts_data_header) == 64);
+
+#endif /* IPTS_SPEC_DATA_H */
diff --git a/drivers/hid/ipts/spec-device.h b/drivers/hid/ipts/spec-device.h
new file mode 100644
index 000000000000..93f673d981f7
--- /dev/null
+++ b/drivers/hid/ipts/spec-device.h
@@ -0,0 +1,285 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2020-2023 Dorian Stoll
+ *
+ * Linux driver for Intel Precise Touch & Stylus
+ */
+
+#ifndef IPTS_SPEC_DEVICE_H
+#define IPTS_SPEC_DEVICE_H
+
+#include <linux/build_bug.h>
+#include <linux/types.h>
+
+/*
+ * The amount of buffers that IPTS can use for data transfer.
+ */
+#define IPTS_BUFFERS 16
+
+/*
+ * The buffer ID that is used for HID2ME feedback
+ */
+#define IPTS_HID2ME_BUFFER IPTS_BUFFERS
+
+/**
+ * enum ipts_command - Commands that can be sent to the IPTS hardware.
+ * @IPTS_CMD_GET_DEVICE_INFO:  Retrieves vendor information from the device.
+ * @IPTS_CMD_SET_MODE:         Changes the mode that the device will operate in.
+ * @IPTS_CMD_SET_MEM_WINDOW:   Configures memory buffers for passing data between device and driver.
+ * @IPTS_CMD_QUIESCE_IO:       Stops the data flow from the device to the driver.
+ * @IPTS_CMD_READY_FOR_DATA:   Informs the device that the driver is ready to receive data.
+ * @IPTS_CMD_FEEDBACK:         Informs the device that a buffer was processed and can be refilled.
+ * @IPTS_CMD_CLEAR_MEM_WINDOW: Stops the data flow and clears the buffer addresses on the device.
+ * @IPTS_CMD_RESET_SENSOR:     Resets the sensor to its default state.
+ * @IPTS_CMD_GET_DESCRIPTOR:   Retrieves the HID descriptor of the device.
+ */
+enum ipts_command_code {
+	IPTS_CMD_GET_DEVICE_INFO = 0x01,
+	IPTS_CMD_SET_MODE = 0x02,
+	IPTS_CMD_SET_MEM_WINDOW = 0x03,
+	IPTS_CMD_QUIESCE_IO = 0x04,
+	IPTS_CMD_READY_FOR_DATA = 0x05,
+	IPTS_CMD_FEEDBACK = 0x06,
+	IPTS_CMD_CLEAR_MEM_WINDOW = 0x07,
+	IPTS_CMD_RESET_SENSOR = 0x0B,
+	IPTS_CMD_GET_DESCRIPTOR = 0x0F,
+};
+
+/**
+ * enum ipts_status - Possible status codes returned by the IPTS device.
+ * @IPTS_STATUS_SUCCESS:                 Operation completed successfully.
+ * @IPTS_STATUS_INVALID_PARAMS:          Command contained an invalid payload.
+ * @IPTS_STATUS_ACCESS_DENIED:           ME could not validate a buffer address.
+ * @IPTS_STATUS_CMD_SIZE_ERROR:          Command contains an invalid payload.
+ * @IPTS_STATUS_NOT_READY:               Buffer addresses have not been set.
+ * @IPTS_STATUS_REQUEST_OUTSTANDING:     There is an outstanding command of the same type.
+ * @IPTS_STATUS_NO_SENSOR_FOUND:         No sensor could be found.
+ * @IPTS_STATUS_OUT_OF_MEMORY:           Not enough free memory for requested operation.
+ * @IPTS_STATUS_INTERNAL_ERROR:          An unexpected error occurred.
+ * @IPTS_STATUS_SENSOR_DISABLED:         The sensor has been disabled and must be reinitialized.
+ * @IPTS_STATUS_COMPAT_CHECK_FAIL:       Compatibility revision check between sensor and ME failed.
+ *                                       The host can ignore this error and attempt to continue.
+ * @IPTS_STATUS_SENSOR_EXPECTED_RESET:   The sensor went through a reset initiated by the driver.
+ * @IPTS_STATUS_SENSOR_UNEXPECTED_RESET: The sensor went through an unexpected reset.
+ * @IPTS_STATUS_RESET_FAILED:            Requested sensor reset failed to complete.
+ * @IPTS_STATUS_TIMEOUT:                 The operation timed out.
+ * @IPTS_STATUS_TEST_MODE_FAIL:          Test mode pattern did not match expected values.
+ * @IPTS_STATUS_SENSOR_FAIL_FATAL:       The sensor reported an error during reset sequence.
+ *                                       Further progress is not possible.
+ * @IPTS_STATUS_SENSOR_FAIL_NONFATAL:    The sensor reported an error during reset sequence.
+ *                                       The driver can attempt to continue.
+ * @IPTS_STATUS_INVALID_DEVICE_CAPS:     The device reported invalid capabilities.
+ * @IPTS_STATUS_QUIESCE_IO_IN_PROGRESS:  Command cannot be completed until Quiesce IO is done.
+ */
+enum ipts_status {
+	IPTS_STATUS_SUCCESS = 0x00,
+	IPTS_STATUS_INVALID_PARAMS = 0x01,
+	IPTS_STATUS_ACCESS_DENIED = 0x02,
+	IPTS_STATUS_CMD_SIZE_ERROR = 0x03,
+	IPTS_STATUS_NOT_READY = 0x04,
+	IPTS_STATUS_REQUEST_OUTSTANDING = 0x05,
+	IPTS_STATUS_NO_SENSOR_FOUND = 0x06,
+	IPTS_STATUS_OUT_OF_MEMORY = 0x07,
+	IPTS_STATUS_INTERNAL_ERROR = 0x08,
+	IPTS_STATUS_SENSOR_DISABLED = 0x09,
+	IPTS_STATUS_COMPAT_CHECK_FAIL = 0x0A,
+	IPTS_STATUS_SENSOR_EXPECTED_RESET = 0x0B,
+	IPTS_STATUS_SENSOR_UNEXPECTED_RESET = 0x0C,
+	IPTS_STATUS_RESET_FAILED = 0x0D,
+	IPTS_STATUS_TIMEOUT = 0x0E,
+	IPTS_STATUS_TEST_MODE_FAIL = 0x0F,
+	IPTS_STATUS_SENSOR_FAIL_FATAL = 0x10,
+	IPTS_STATUS_SENSOR_FAIL_NONFATAL = 0x11,
+	IPTS_STATUS_INVALID_DEVICE_CAPS = 0x12,
+	IPTS_STATUS_QUIESCE_IO_IN_PROGRESS = 0x13,
+};
+
+/**
+ * struct ipts_command - Message that is sent to the device for calling a command.
+ * @cmd:     The command that will be called.
+ * @payload: Payload containing parameters for the called command.
+ */
+struct ipts_command {
+	enum ipts_command_code cmd;
+	u8 payload[320];
+} __packed;
+
+static_assert(sizeof(struct ipts_command) == 324);
+
+/**
+ * enum ipts_mode - Configures what data the device produces and how its sent.
+ * @IPTS_MODE_EVENT:    The device will send an event once a buffer was filled.
+ *                      Older devices will return singletouch data in this mode.
+ * @IPTS_MODE_DOORBELL: The device will notify the driver by incrementing the doorbell value.
+ *                      Older devices will return multitouch data in this mode.
+ */
+enum ipts_mode {
+	IPTS_MODE_EVENT = 0x00,
+	IPTS_MODE_DOORBELL = 0x01,
+};
+
+/**
+ * struct ipts_set_mode - Payload for the SET_MODE command.
+ * @mode: Changes the mode that IPTS will operate in.
+ */
+struct ipts_set_mode {
+	enum ipts_mode mode;
+	u8 reserved[12];
+} __packed;
+
+static_assert(sizeof(struct ipts_set_mode) == 16);
+
+#define IPTS_WORKQUEUE_SIZE	 8192
+#define IPTS_WORKQUEUE_ITEM_SIZE 16
+
+/**
+ * struct ipts_mem_window - Payload for the SET_MEM_WINDOW command.
+ * @data_addr_lower:      Lower 32 bits of the data buffer addresses.
+ * @data_addr_upper:      Upper 32 bits of the data buffer addresses.
+ * @workqueue_addr_lower: Lower 32 bits of the workqueue buffer address.
+ * @workqueue_addr_upper: Upper 32 bits of the workqueue buffer address.
+ * @doorbell_addr_lower:  Lower 32 bits of the doorbell buffer address.
+ * @doorbell_addr_upper:  Upper 32 bits of the doorbell buffer address.
+ * @feedbackaddr_lower:   Lower 32 bits of the feedback buffer addresses.
+ * @feedbackaddr_upper:   Upper 32 bits of the feedback buffer addresses.
+ * @hid2me_addr_lower:    Lower 32 bits of the hid2me buffer address.
+ * @hid2me_addr_upper:    Upper 32 bits of the hid2me buffer address.
+ * @hid2me_size:          Size of the hid2me feedback buffer.
+ * @workqueue_item_size:  Magic value. Must be 16.
+ * @workqueue_size:       Magic value. Must be 8192.
+ *
+ * The workqueue related items in this struct are required for using
+ * GuC submission with binary processing firmware. Since this driver does
+ * not use GuC submission and instead exports raw data to userspace, these
+ * items are not actually used, but they need to be allocated and passed
+ * to the device, otherwise initialization will fail.
+ */
+struct ipts_mem_window {
+	u32 data_addr_lower[IPTS_BUFFERS];
+	u32 data_addr_upper[IPTS_BUFFERS];
+	u32 workqueue_addr_lower;
+	u32 workqueue_addr_upper;
+	u32 doorbell_addr_lower;
+	u32 doorbell_addr_upper;
+	u32 feedback_addr_lower[IPTS_BUFFERS];
+	u32 feedback_addr_upper[IPTS_BUFFERS];
+	u32 hid2me_addr_lower;
+	u32 hid2me_addr_upper;
+	u32 hid2me_size;
+	u8 reserved1;
+	u8 workqueue_item_size;
+	u16 workqueue_size;
+	u8 reserved[32];
+} __packed;
+
+static_assert(sizeof(struct ipts_mem_window) == 320);
+
+/**
+ * struct ipts_quiesce_io - Payload for the QUIESCE_IO command.
+ */
+struct ipts_quiesce_io {
+	u8 reserved[12];
+} __packed;
+
+static_assert(sizeof(struct ipts_quiesce_io) == 12);
+
+/**
+ * struct ipts_feedback - Payload for the FEEDBACK command.
+ * @buffer: The buffer that the device should refill.
+ */
+struct ipts_feedback {
+	u32 buffer;
+	u8 reserved[12];
+} __packed;
+
+static_assert(sizeof(struct ipts_feedback) == 16);
+
+/**
+ * enum ipts_reset_type - Possible ways of resetting the device.
+ * @IPTS_RESET_TYPE_HARD: Perform hardware reset using GPIO pin.
+ * @IPTS_RESET_TYPE_SOFT: Perform software reset using SPI command.
+ */
+enum ipts_reset_type {
+	IPTS_RESET_TYPE_HARD = 0x00,
+	IPTS_RESET_TYPE_SOFT = 0x01,
+};
+
+/**
+ * struct ipts_reset - Payload for the RESET_SENSOR command.
+ * @type: How the device should get reset.
+ */
+struct ipts_reset_sensor {
+	enum ipts_reset_type type;
+	u8 reserved[4];
+} __packed;
+
+static_assert(sizeof(struct ipts_reset_sensor) == 8);
+
+/**
+ * struct ipts_get_descriptor - Payload for the GET_DESCRIPTOR command.
+ * @addr_lower: The lower 32 bits of the descriptor buffer address.
+ * @addr_upper: The upper 32 bits of the descriptor buffer address.
+ * @magic:      A magic value. Must be 8.
+ */
+struct ipts_get_descriptor {
+	u32 addr_lower;
+	u32 addr_upper;
+	u32 magic;
+	u8 reserved[12];
+} __packed;
+
+static_assert(sizeof(struct ipts_get_descriptor) == 24);
+
+/*
+ * The type of a response is indicated by a
+ * command code, with the most significant bit flipped to 1.
+ */
+#define IPTS_RSP_BIT BIT(31)
+
+/**
+ * struct ipts_response - Data returned from the device in response to a command.
+ * @cmd:     The command that this response answers (IPTS_RSP_BIT will be 1).
+ * @status:  The return code of the command.
+ * @payload: The data that was produced by the command.
+ */
+struct ipts_response {
+	enum ipts_command_code cmd;
+	enum ipts_status status;
+	u8 payload[80];
+} __packed;
+
+static_assert(sizeof(struct ipts_response) == 88);
+
+/**
+ * struct ipts_device_info - Vendor information of the IPTS device.
+ * @vendor:        Vendor ID of this device.
+ * @product:       Product ID of this device.
+ * @hw_version:    Hardware revision of this device.
+ * @fw_version:    Firmware revision of this device.
+ * @data_size:     Requested size for a data buffer.
+ * @feedback_size: Requested size for a feedback buffer.
+ * @mode:          Mode that the device currently operates in.
+ * @max_contacts:  Maximum amount of concurrent touches the sensor can process.
+ */
+struct ipts_device_info {
+	u16 vendor;
+	u16 product;
+	u32 hw_version;
+	u32 fw_version;
+	u32 data_size;
+	u32 feedback_size;
+	enum ipts_mode mode;
+	u8 max_contacts;
+	u8 reserved1[3];
+	u8 sensor_min_eds;
+	u8 sensor_maj_eds;
+	u8 me_min_eds;
+	u8 me_maj_eds;
+	u8 intf_eds;
+	u8 reserved2[11];
+} __packed;
+
+static_assert(sizeof(struct ipts_device_info) == 44);
+
+#endif /* IPTS_SPEC_DEVICE_H */
diff --git a/drivers/hid/ipts/spec-hid.h b/drivers/hid/ipts/spec-hid.h
new file mode 100644
index 000000000000..ea70f29ff00c
--- /dev/null
+++ b/drivers/hid/ipts/spec-hid.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2020-2023 Dorian Stoll
+ *
+ * Linux driver for Intel Precise Touch & Stylus
+ */
+
+#ifndef IPTS_SPEC_HID_H
+#define IPTS_SPEC_HID_H
+
+#include <linux/build_bug.h>
+#include <linux/types.h>
+
+/*
+ * Made-up type for passing raw IPTS data in a HID report.
+ */
+#define IPTS_HID_FRAME_TYPE_RAW 0xEE
+
+/**
+ * struct ipts_hid_frame - Header that is prefixed to raw IPTS data wrapped in a HID report.
+ * @size: Size of the data inside the report, including this header.
+ * @type: What type of data does this report contain.
+ */
+struct ipts_hid_header {
+	u32 size;
+	u8 reserved1;
+	u8 type;
+	u8 reserved2;
+	u8 data[];
+} __packed;
+
+static_assert(sizeof(struct ipts_hid_header) == 7);
+
+#endif /* IPTS_SPEC_HID_H */
diff --git a/drivers/hid/ipts/thread.c b/drivers/hid/ipts/thread.c
new file mode 100644
index 000000000000..8b46f775c107
--- /dev/null
+++ b/drivers/hid/ipts/thread.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2023 Dorian Stoll
+ *
+ * Linux driver for Intel Precise Touch & Stylus
+ */
+
+#include <linux/completion.h>
+#include <linux/err.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+
+#include "thread.h"
+
+bool ipts_thread_should_stop(struct ipts_thread *thread)
+{
+	if (!thread)
+		return false;
+
+	return READ_ONCE(thread->should_stop);
+}
+
+static int ipts_thread_runner(void *data)
+{
+	int ret = 0;
+	struct ipts_thread *thread = data;
+
+	if (!thread)
+		return -EFAULT;
+
+	if (!thread->threadfn)
+		return -EFAULT;
+
+	ret = thread->threadfn(thread);
+	complete_all(&thread->done);
+
+	return ret;
+}
+
+int ipts_thread_start(struct ipts_thread *thread, int (*threadfn)(struct ipts_thread *thread),
+		      void *data, const char *name)
+{
+	if (!thread)
+		return -EFAULT;
+
+	if (!threadfn)
+		return -EFAULT;
+
+	init_completion(&thread->done);
+
+	thread->data = data;
+	thread->should_stop = false;
+	thread->threadfn = threadfn;
+
+	thread->thread = kthread_run(ipts_thread_runner, thread, name);
+	return PTR_ERR_OR_ZERO(thread->thread);
+}
+
+int ipts_thread_stop(struct ipts_thread *thread)
+{
+	int ret = 0;
+
+	if (!thread)
+		return -EFAULT;
+
+	if (!thread->thread)
+		return 0;
+
+	WRITE_ONCE(thread->should_stop, true);
+
+	/*
+	 * Make sure that the write has gone through before waiting.
+	 */
+	wmb();
+
+	wait_for_completion(&thread->done);
+	ret = kthread_stop(thread->thread);
+
+	thread->thread = NULL;
+	thread->data = NULL;
+	thread->threadfn = NULL;
+
+	return ret;
+}
diff --git a/drivers/hid/ipts/thread.h b/drivers/hid/ipts/thread.h
new file mode 100644
index 000000000000..a314843599fc
--- /dev/null
+++ b/drivers/hid/ipts/thread.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2023 Dorian Stoll
+ *
+ * Linux driver for Intel Precise Touch & Stylus
+ */
+
+#ifndef IPTS_THREAD_H
+#define IPTS_THREAD_H
+
+#include <linux/completion.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+
+/*
+ * This wrapper over kthread is necessary, because calling kthread_stop makes it impossible
+ * to issue MEI commands from that thread while it shuts itself down. By using a custom
+ * boolean variable and a completion object, we can call kthread_stop only when the thread
+ * already finished all of its work and has returned.
+ */
+struct ipts_thread {
+	struct task_struct *thread;
+
+	bool should_stop;
+	struct completion done;
+
+	void *data;
+	int (*threadfn)(struct ipts_thread *thread);
+};
+
+/*
+ * ipts_thread_should_stop() - Returns true if the thread is asked to terminate.
+ * @thread: The current thread.
+ *
+ * Returns: true if the thread should stop, false if not.
+ */
+bool ipts_thread_should_stop(struct ipts_thread *thread);
+
+/*
+ * ipts_thread_start() - Starts an IPTS thread.
+ * @thread: The thread to initialize and start.
+ * @threadfn: The function to execute.
+ * @data: An argument that will be passed to threadfn.
+ * @name: The name of the new thread.
+ *
+ * Returns: 0 on success, <0 on error.
+ */
+int ipts_thread_start(struct ipts_thread *thread, int (*threadfn)(struct ipts_thread *thread),
+		      void *data, const char name[]);
+
+/*
+ * ipts_thread_stop() - Asks the thread to terminate and waits until it has finished.
+ * @thread: The thread that should stop.
+ *
+ * Returns: The return value of the thread function.
+ */
+int ipts_thread_stop(struct ipts_thread *thread);
+
+#endif /* IPTS_THREAD_H */
diff --git a/drivers/hid/ithc/Kbuild b/drivers/hid/ithc/Kbuild
new file mode 100644
index 000000000000..aea83f2ac07b
--- /dev/null
+++ b/drivers/hid/ithc/Kbuild
@@ -0,0 +1,6 @@
+obj-$(CONFIG_HID_ITHC) := ithc.o
+
+ithc-objs := ithc-main.o ithc-regs.o ithc-dma.o ithc-debug.o
+
+ccflags-y := -std=gnu11 -Wno-declaration-after-statement
+
diff --git a/drivers/hid/ithc/Kconfig b/drivers/hid/ithc/Kconfig
new file mode 100644
index 000000000000..ede713023609
--- /dev/null
+++ b/drivers/hid/ithc/Kconfig
@@ -0,0 +1,12 @@
+config HID_ITHC
+	tristate "Intel Touch Host Controller"
+	depends on PCI
+	depends on HID
+	help
+	  Say Y here if your system has a touchscreen using Intels
+	  Touch Host Controller (ITHC / IPTS) technology.
+
+	  If unsure say N.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called ithc.
diff --git a/drivers/hid/ithc/ithc-debug.c b/drivers/hid/ithc/ithc-debug.c
new file mode 100644
index 000000000000..57bf125c45bd
--- /dev/null
+++ b/drivers/hid/ithc/ithc-debug.c
@@ -0,0 +1,96 @@
+#include "ithc.h"
+
+void ithc_log_regs(struct ithc *ithc) {
+	if (!ithc->prev_regs) return;
+	u32 __iomem *cur = (__iomem void*)ithc->regs;
+	u32 *prev = (void*)ithc->prev_regs;
+	for (int i = 1024; i < sizeof *ithc->regs / 4; i++) {
+		u32 x = readl(cur + i);
+		if (x != prev[i]) {
+			pci_info(ithc->pci, "reg %04x: %08x -> %08x\n", i * 4, prev[i], x);
+			prev[i] = x;
+		}
+	}
+}
+
+static ssize_t ithc_debugfs_cmd_write(struct file *f, const char __user *buf, size_t len, loff_t *offset) {
+	struct ithc *ithc = file_inode(f)->i_private;
+	char cmd[256];
+	if (!ithc || !ithc->pci) return -ENODEV;
+	if (!len) return -EINVAL;
+	if (len >= sizeof cmd) return -EINVAL;
+	if (copy_from_user(cmd, buf, len)) return -EFAULT;
+	cmd[len] = 0;
+	if (cmd[len-1] == '\n') cmd[len-1] = 0;
+	pci_info(ithc->pci, "debug command: %s\n", cmd);
+	u32 n = 0;
+	const char *s = cmd + 1;
+	u32 a[32];
+	while (*s && *s != '\n') {
+		if (n >= ARRAY_SIZE(a)) return -EINVAL;
+		if (*s++ != ' ') return -EINVAL;
+		char *e;
+		a[n++] = simple_strtoul(s, &e, 0);
+		if (e == s) return -EINVAL;
+		s = e;
+	}
+	ithc_log_regs(ithc);
+	switch(cmd[0]) {
+	case 'x': // reset
+		ithc_reset(ithc);
+		break;
+	case 'w': // write register: offset mask value
+		if (n != 3 || (a[0] & 3)) return -EINVAL;
+		pci_info(ithc->pci, "debug write 0x%04x = 0x%08x (mask 0x%08x)\n", a[0], a[2], a[1]);
+		bitsl(((__iomem u32 *)ithc->regs) + a[0] / 4, a[1], a[2]);
+		break;
+	case 'r': // read register: offset
+		if (n != 1 || (a[0] & 3)) return -EINVAL;
+		pci_info(ithc->pci, "debug read 0x%04x = 0x%08x\n", a[0], readl(((__iomem u32 *)ithc->regs) + a[0] / 4));
+		break;
+	case 's': // spi command: cmd offset len data...
+		// read config: s 4 0 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+		// set touch cfg: s 6 12 4 XX
+		if (n < 3 || a[2] > (n - 3) * 4) return -EINVAL;
+		pci_info(ithc->pci, "debug spi command %u with %u bytes of data\n", a[0], a[2]);
+		if (!CHECK(ithc_spi_command, ithc, a[0], a[1], a[2], a + 3))
+			for (u32 i = 0; i < (a[2] + 3) / 4; i++) pci_info(ithc->pci, "resp %u = 0x%08x\n", i, a[3+i]);
+		break;
+	case 'd': // dma command: cmd len data...
+		// get report descriptor: d 7 8 0 0
+		// enable multitouch: d 3 2 0x0105
+		if (n < 2 || a[1] > (n - 2) * 4) return -EINVAL;
+		pci_info(ithc->pci, "debug dma command %u with %u bytes of data\n", a[0], a[1]);
+		if (ithc_dma_tx(ithc, a[0], a[1], a + 2)) pci_err(ithc->pci, "dma tx failed\n");
+		break;
+	default:
+		return -EINVAL;
+	}
+	ithc_log_regs(ithc);
+	return len;
+}
+
+static const struct file_operations ithc_debugfops_cmd = {
+	.owner = THIS_MODULE,
+	.write = ithc_debugfs_cmd_write,
+};
+
+static void ithc_debugfs_devres_release(struct device *dev, void *res) {
+	struct dentry **dbgm = res;
+	if (*dbgm) debugfs_remove_recursive(*dbgm);
+}
+
+int ithc_debug_init(struct ithc *ithc) {
+	struct dentry **dbgm = devres_alloc(ithc_debugfs_devres_release, sizeof *dbgm, GFP_KERNEL);
+	if (!dbgm) return -ENOMEM;
+	devres_add(&ithc->pci->dev, dbgm);
+	struct dentry *dbg = debugfs_create_dir(DEVNAME, NULL);
+	if (IS_ERR(dbg)) return PTR_ERR(dbg);
+	*dbgm = dbg;
+
+	struct dentry *cmd = debugfs_create_file("cmd", 0220, dbg, ithc, &ithc_debugfops_cmd);
+	if (IS_ERR(cmd)) return PTR_ERR(cmd);
+
+	return 0;
+}
+
diff --git a/drivers/hid/ithc/ithc-dma.c b/drivers/hid/ithc/ithc-dma.c
new file mode 100644
index 000000000000..7e89b3496918
--- /dev/null
+++ b/drivers/hid/ithc/ithc-dma.c
@@ -0,0 +1,258 @@
+#include "ithc.h"
+
+static int ithc_dma_prd_alloc(struct ithc *ithc, struct ithc_dma_prd_buffer *p, unsigned num_buffers, unsigned num_pages, enum dma_data_direction dir) {
+	p->num_pages = num_pages;
+	p->dir = dir;
+	p->size = round_up(num_buffers * num_pages * sizeof(struct ithc_phys_region_desc), PAGE_SIZE);
+	p->addr = dmam_alloc_coherent(&ithc->pci->dev, p->size, &p->dma_addr, GFP_KERNEL);
+	if (!p->addr) return -ENOMEM;
+	if (p->dma_addr & (PAGE_SIZE - 1)) return -EFAULT;
+	return 0;
+}
+
+struct ithc_sg_table {
+	void *addr;
+	struct sg_table sgt;
+	enum dma_data_direction dir;
+};
+static void ithc_dma_sgtable_free(struct sg_table *sgt) {
+	struct scatterlist *sg;
+	int i;
+	for_each_sgtable_sg(sgt, sg, i) {
+		struct page *p = sg_page(sg);
+		if (p) __free_page(p);
+	}
+	sg_free_table(sgt);
+}
+static void ithc_dma_data_devres_release(struct device *dev, void *res) {
+	struct ithc_sg_table *sgt = res;
+	if (sgt->addr) vunmap(sgt->addr);
+	dma_unmap_sgtable(dev, &sgt->sgt, sgt->dir, 0);
+	ithc_dma_sgtable_free(&sgt->sgt);
+}
+
+static int ithc_dma_data_alloc(struct ithc* ithc, struct ithc_dma_prd_buffer *prds, struct ithc_dma_data_buffer *b) {
+	// We don't use dma_alloc_coherent for data buffers, because they don't have to be contiguous (we can use one PRD per page) or coherent (they are unidirectional).
+	// Instead we use an sg_table of individually allocated pages (5.13 has dma_alloc_noncontiguous for this, but we'd like to support 5.10 for now).
+	struct page *pages[16];
+	if (prds->num_pages == 0 || prds->num_pages > ARRAY_SIZE(pages)) return -EINVAL;
+	b->active_idx = -1;
+	struct ithc_sg_table *sgt = devres_alloc(ithc_dma_data_devres_release, sizeof *sgt, GFP_KERNEL);
+	if (!sgt) return -ENOMEM;
+	sgt->dir = prds->dir;
+	if (!sg_alloc_table(&sgt->sgt, prds->num_pages, GFP_KERNEL)) {
+		struct scatterlist *sg;
+		int i;
+		bool ok = true;
+		for_each_sgtable_sg(&sgt->sgt, sg, i) {
+			struct page *p = pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); // don't need __GFP_DMA for PCI DMA
+			if (!p) { ok = false; break; }
+			sg_set_page(sg, p, PAGE_SIZE, 0);
+		}
+		if (ok && !dma_map_sgtable(&ithc->pci->dev, &sgt->sgt, prds->dir, 0)) {
+			devres_add(&ithc->pci->dev, sgt);
+			b->sgt = &sgt->sgt;
+			b->addr = sgt->addr = vmap(pages, prds->num_pages, 0, PAGE_KERNEL);
+			if (!b->addr) return -ENOMEM;
+			return 0;
+		}
+		ithc_dma_sgtable_free(&sgt->sgt);
+	}
+	devres_free(sgt);
+	return -ENOMEM;
+}
+
+static int ithc_dma_data_buffer_put(struct ithc *ithc, struct ithc_dma_prd_buffer *prds, struct ithc_dma_data_buffer *b, unsigned idx) {
+	struct ithc_phys_region_desc *prd = prds->addr;
+	prd += idx * prds->num_pages;
+	if (b->active_idx >= 0) { pci_err(ithc->pci, "buffer already active\n"); return -EINVAL; }
+	b->active_idx = idx;
+	if (prds->dir == DMA_TO_DEVICE) {
+		if (b->data_size > PAGE_SIZE) return -EINVAL;
+		prd->addr = sg_dma_address(b->sgt->sgl) >> 10;
+		prd->size = b->data_size | PRD_FLAG_END;
+		flush_kernel_vmap_range(b->addr, b->data_size);
+	} else if (prds->dir == DMA_FROM_DEVICE) {
+		struct scatterlist *sg;
+		int i;
+		for_each_sgtable_dma_sg(b->sgt, sg, i) {
+			prd->addr = sg_dma_address(sg) >> 10;
+			prd->size = sg_dma_len(sg);
+			prd++;
+		}
+		prd[-1].size |= PRD_FLAG_END;
+	}
+	dma_wmb(); // for the prds
+	dma_sync_sgtable_for_device(&ithc->pci->dev, b->sgt, prds->dir);
+	return 0;
+}
+
+static int ithc_dma_data_buffer_get(struct ithc *ithc, struct ithc_dma_prd_buffer *prds, struct ithc_dma_data_buffer *b, unsigned idx) {
+	struct ithc_phys_region_desc *prd = prds->addr;
+	prd += idx * prds->num_pages;
+	if (b->active_idx != idx) { pci_err(ithc->pci, "wrong buffer index\n"); return -EINVAL; }
+	b->active_idx = -1;
+	if (prds->dir == DMA_FROM_DEVICE) {
+		dma_rmb(); // for the prds
+		b->data_size = 0;
+		struct scatterlist *sg;
+		int i;
+		for_each_sgtable_dma_sg(b->sgt, sg, i) {
+			unsigned size = prd->size;
+			b->data_size += size & PRD_SIZE_MASK;
+			if (size & PRD_FLAG_END) break;
+			if ((size & PRD_SIZE_MASK) != sg_dma_len(sg)) { pci_err(ithc->pci, "truncated prd\n"); break; }
+			prd++;
+		}
+		invalidate_kernel_vmap_range(b->addr, b->data_size);
+	}
+	dma_sync_sgtable_for_cpu(&ithc->pci->dev, b->sgt, prds->dir);
+	return 0;
+}
+
+int ithc_dma_rx_init(struct ithc *ithc, u8 channel, const char *devname) {
+	struct ithc_dma_rx *rx = &ithc->dma_rx[channel];
+	mutex_init(&rx->mutex);
+	u32 buf_size = DEVCFG_DMA_RX_SIZE(ithc->config.dma_buf_sizes);
+	unsigned num_pages = (buf_size + PAGE_SIZE - 1) / PAGE_SIZE;
+	pci_dbg(ithc->pci, "allocating rx buffers: num = %u, size = %u, pages = %u\n", NUM_RX_BUF, buf_size, num_pages);
+	CHECK_RET(ithc_dma_prd_alloc, ithc, &rx->prds, NUM_RX_BUF, num_pages, DMA_FROM_DEVICE);
+	for (unsigned i = 0; i < NUM_RX_BUF; i++)
+		CHECK_RET(ithc_dma_data_alloc, ithc, &rx->prds, &rx->bufs[i]);
+	writeb(DMA_RX_CONTROL2_RESET, &ithc->regs->dma_rx[channel].control2);
+	lo_hi_writeq(rx->prds.dma_addr, &ithc->regs->dma_rx[channel].addr);
+	writeb(NUM_RX_BUF - 1, &ithc->regs->dma_rx[channel].num_bufs);
+	writeb(num_pages - 1, &ithc->regs->dma_rx[channel].num_prds);
+	u8 head = readb(&ithc->regs->dma_rx[channel].head);
+	if (head) { pci_err(ithc->pci, "head is nonzero (%u)\n", head); return -EIO; }
+	for (unsigned i = 0; i < NUM_RX_BUF; i++)
+		CHECK_RET(ithc_dma_data_buffer_put, ithc, &rx->prds, &rx->bufs[i], i);
+	writeb(head ^ DMA_RX_WRAP_FLAG, &ithc->regs->dma_rx[channel].tail);
+	return 0;
+}
+void ithc_dma_rx_enable(struct ithc *ithc, u8 channel) {
+	bitsb_set(&ithc->regs->dma_rx[channel].control, DMA_RX_CONTROL_ENABLE | DMA_RX_CONTROL_IRQ_ERROR | DMA_RX_CONTROL_IRQ_DATA);
+	CHECK(waitl, ithc, &ithc->regs->dma_rx[1].status, DMA_RX_STATUS_ENABLED, DMA_RX_STATUS_ENABLED);
+}
+
+int ithc_dma_tx_init(struct ithc *ithc) {
+	struct ithc_dma_tx *tx = &ithc->dma_tx;
+	mutex_init(&tx->mutex);
+	tx->max_size = DEVCFG_DMA_TX_SIZE(ithc->config.dma_buf_sizes);
+	unsigned num_pages = (tx->max_size + PAGE_SIZE - 1) / PAGE_SIZE;
+	pci_dbg(ithc->pci, "allocating tx buffers: size = %u, pages = %u\n", tx->max_size, num_pages);
+	CHECK_RET(ithc_dma_prd_alloc, ithc, &tx->prds, 1, num_pages, DMA_TO_DEVICE);
+	CHECK_RET(ithc_dma_data_alloc, ithc, &tx->prds, &tx->buf);
+	lo_hi_writeq(tx->prds.dma_addr, &ithc->regs->dma_tx.addr);
+	writeb(num_pages - 1, &ithc->regs->dma_tx.num_prds);
+	CHECK_RET(ithc_dma_data_buffer_put, ithc, &ithc->dma_tx.prds, &ithc->dma_tx.buf, 0);
+	return 0;
+}
+
+static int ithc_dma_rx_process_buf(struct ithc *ithc, struct ithc_dma_data_buffer *data, u8 channel, u8 buf) {
+	if (buf >= NUM_RX_BUF) {
+		pci_err(ithc->pci, "invalid dma ringbuffer index\n");
+		return -EINVAL;
+	}
+	ithc_set_active(ithc);
+	u32 len = data->data_size;
+	struct ithc_dma_rx_header *hdr = data->addr;
+	u8 *hiddata = (void *)(hdr + 1);
+	if (len >= sizeof *hdr && hdr->code == DMA_RX_CODE_RESET) {
+		CHECK(ithc_reset, ithc);
+	} else if (len < sizeof *hdr || len != sizeof *hdr + hdr->data_size) {
+		if (hdr->code == DMA_RX_CODE_INPUT_REPORT) {
+			// When the CPU enters a low power state during DMA, we can get truncated messages.
+			// Typically this will be a single touch HID report that is only 1 byte, or a multitouch report that is 257 bytes.
+			// See also ithc_set_active().
+		} else {
+			pci_err(ithc->pci, "invalid dma rx data! channel %u, buffer %u, size %u, code %u, data size %u\n", channel, buf, len, hdr->code, hdr->data_size);
+			print_hex_dump_debug(DEVNAME " data: ", DUMP_PREFIX_OFFSET, 32, 1, hdr, min(len, 0x400u), 0);
+		}
+	} else if (hdr->code == DMA_RX_CODE_REPORT_DESCRIPTOR && hdr->data_size > 8) {
+		CHECK(hid_parse_report, ithc->hid, hiddata + 8, hdr->data_size - 8);
+		WRITE_ONCE(ithc->hid_parse_done, true);
+		wake_up(&ithc->wait_hid_parse);
+	} else if (hdr->code == DMA_RX_CODE_INPUT_REPORT) {
+		CHECK(hid_input_report, ithc->hid, HID_INPUT_REPORT, hiddata, hdr->data_size, 1);
+	} else if (hdr->code == DMA_RX_CODE_FEATURE_REPORT) {
+		bool done = false;
+		mutex_lock(&ithc->hid_get_feature_mutex);
+		if (ithc->hid_get_feature_buf) {
+			if (hdr->data_size < ithc->hid_get_feature_size) ithc->hid_get_feature_size = hdr->data_size;
+			memcpy(ithc->hid_get_feature_buf, hiddata, ithc->hid_get_feature_size);
+			ithc->hid_get_feature_buf = NULL;
+			done = true;
+		}
+		mutex_unlock(&ithc->hid_get_feature_mutex);
+		if (done) wake_up(&ithc->wait_hid_get_feature);
+		else CHECK(hid_input_report, ithc->hid, HID_FEATURE_REPORT, hiddata, hdr->data_size, 1);
+	} else {
+		pci_dbg(ithc->pci, "unhandled dma rx data! channel %u, buffer %u, size %u, code %u\n", channel, buf, len, hdr->code);
+		print_hex_dump_debug(DEVNAME " data: ", DUMP_PREFIX_OFFSET, 32, 1, hdr, min(len, 0x400u), 0);
+	}
+	return 0;
+}
+
+static int ithc_dma_rx_unlocked(struct ithc *ithc, u8 channel) {
+	struct ithc_dma_rx *rx = &ithc->dma_rx[channel];
+	unsigned n = rx->num_received;
+	u8 head_wrap = readb(&ithc->regs->dma_rx[channel].head);
+	while (1) {
+		u8 tail = n % NUM_RX_BUF;
+		u8 tail_wrap = tail | ((n / NUM_RX_BUF) & 1 ? 0 : DMA_RX_WRAP_FLAG);
+		writeb(tail_wrap, &ithc->regs->dma_rx[channel].tail);
+		// ringbuffer is full if tail_wrap == head_wrap
+		// ringbuffer is empty if tail_wrap == head_wrap ^ WRAP_FLAG
+		if (tail_wrap == (head_wrap ^ DMA_RX_WRAP_FLAG)) return 0;
+
+		// take the buffer that the device just filled
+		struct ithc_dma_data_buffer *b = &rx->bufs[n % NUM_RX_BUF];
+		CHECK_RET(ithc_dma_data_buffer_get, ithc, &rx->prds, b, tail);
+		rx->num_received = ++n;
+
+		// process data
+		CHECK(ithc_dma_rx_process_buf, ithc, b, channel, tail);
+
+		// give the buffer back to the device
+		CHECK_RET(ithc_dma_data_buffer_put, ithc, &rx->prds, b, tail);
+	}
+}
+int ithc_dma_rx(struct ithc *ithc, u8 channel) {
+	struct ithc_dma_rx *rx = &ithc->dma_rx[channel];
+	mutex_lock(&rx->mutex);
+	int ret = ithc_dma_rx_unlocked(ithc, channel);
+	mutex_unlock(&rx->mutex);
+	return ret;
+}
+
+static int ithc_dma_tx_unlocked(struct ithc *ithc, u32 cmdcode, u32 datasize, void *data) {
+	pci_dbg(ithc->pci, "dma tx command %u, size %u\n", cmdcode, datasize);
+	struct ithc_dma_tx_header *hdr;
+	u8 padding = datasize & 3 ? 4 - (datasize & 3) : 0;
+	unsigned fullsize = sizeof *hdr + datasize + padding;
+	if (fullsize > ithc->dma_tx.max_size || fullsize > PAGE_SIZE) return -EINVAL;
+	CHECK_RET(ithc_dma_data_buffer_get, ithc, &ithc->dma_tx.prds, &ithc->dma_tx.buf, 0);
+
+	ithc->dma_tx.buf.data_size = fullsize;
+	hdr = ithc->dma_tx.buf.addr;
+	hdr->code = cmdcode;
+	hdr->data_size = datasize;
+	u8 *dest = (void *)(hdr + 1);
+	memcpy(dest, data, datasize);
+	dest += datasize;
+	for (u8 p = 0; p < padding; p++) *dest++ = 0;
+	CHECK_RET(ithc_dma_data_buffer_put, ithc, &ithc->dma_tx.prds, &ithc->dma_tx.buf, 0);
+
+	bitsb_set(&ithc->regs->dma_tx.control, DMA_TX_CONTROL_SEND);
+	CHECK_RET(waitb, ithc, &ithc->regs->dma_tx.control, DMA_TX_CONTROL_SEND, 0);
+	writel(DMA_TX_STATUS_DONE, &ithc->regs->dma_tx.status);
+	return 0;
+}
+int ithc_dma_tx(struct ithc *ithc, u32 cmdcode, u32 datasize, void *data) {
+	mutex_lock(&ithc->dma_tx.mutex);
+	int ret = ithc_dma_tx_unlocked(ithc, cmdcode, datasize, data);
+	mutex_unlock(&ithc->dma_tx.mutex);
+	return ret;
+}
+
diff --git a/drivers/hid/ithc/ithc-dma.h b/drivers/hid/ithc/ithc-dma.h
new file mode 100644
index 000000000000..d9f2c19a13f3
--- /dev/null
+++ b/drivers/hid/ithc/ithc-dma.h
@@ -0,0 +1,67 @@
+#define PRD_SIZE_MASK            0xffffff
+#define PRD_FLAG_END             0x1000000
+#define PRD_FLAG_SUCCESS         0x2000000
+#define PRD_FLAG_ERROR           0x4000000
+
+struct ithc_phys_region_desc {
+	u64 addr; // physical addr/1024
+	u32 size; // num bytes, PRD_FLAG_END marks last prd for data split over multiple prds
+	u32 unused;
+};
+
+#define DMA_RX_CODE_INPUT_REPORT          3
+#define DMA_RX_CODE_FEATURE_REPORT        4
+#define DMA_RX_CODE_REPORT_DESCRIPTOR     5
+#define DMA_RX_CODE_RESET                 7
+
+struct ithc_dma_rx_header {
+	u32 code;
+	u32 data_size;
+	u32 _unknown[14];
+};
+
+#define DMA_TX_CODE_SET_FEATURE           3
+#define DMA_TX_CODE_GET_FEATURE           4
+#define DMA_TX_CODE_OUTPUT_REPORT         5
+#define DMA_TX_CODE_GET_REPORT_DESCRIPTOR 7
+
+struct ithc_dma_tx_header {
+	u32 code;
+	u32 data_size;
+};
+
+struct ithc_dma_prd_buffer {
+	void *addr;
+	dma_addr_t dma_addr;
+	u32 size;
+	u32 num_pages; // per data buffer
+	enum dma_data_direction dir;
+};
+
+struct ithc_dma_data_buffer {
+	void *addr;
+	struct sg_table *sgt;
+	int active_idx;
+	u32 data_size;
+};
+
+struct ithc_dma_tx {
+	struct mutex mutex;
+	u32 max_size;
+	struct ithc_dma_prd_buffer prds;
+	struct ithc_dma_data_buffer buf;
+};
+
+struct ithc_dma_rx {
+	struct mutex mutex;
+	u32 num_received;
+	struct ithc_dma_prd_buffer prds;
+	struct ithc_dma_data_buffer bufs[NUM_RX_BUF];
+};
+
+int ithc_dma_rx_init(struct ithc *ithc, u8 channel, const char *devname);
+void ithc_dma_rx_enable(struct ithc *ithc, u8 channel);
+int ithc_dma_tx_init(struct ithc *ithc);
+int ithc_dma_rx(struct ithc *ithc, u8 channel);
+int ithc_dma_tx(struct ithc *ithc, u32 cmdcode, u32 datasize, void *cmddata);
+
diff --git a/drivers/hid/ithc/ithc-main.c b/drivers/hid/ithc/ithc-main.c
new file mode 100644
index 000000000000..09512b9cb4d3
--- /dev/null
+++ b/drivers/hid/ithc/ithc-main.c
@@ -0,0 +1,534 @@
+#include "ithc.h"
+
+MODULE_DESCRIPTION("Intel Touch Host Controller driver");
+MODULE_LICENSE("Dual BSD/GPL");
+
+// Lakefield
+#define PCI_DEVICE_ID_INTEL_THC_LKF_PORT1    0x98d0
+#define PCI_DEVICE_ID_INTEL_THC_LKF_PORT2    0x98d1
+// Tiger Lake
+#define PCI_DEVICE_ID_INTEL_THC_TGL_LP_PORT1 0xa0d0
+#define PCI_DEVICE_ID_INTEL_THC_TGL_LP_PORT2 0xa0d1
+#define PCI_DEVICE_ID_INTEL_THC_TGL_H_PORT1  0x43d0
+#define PCI_DEVICE_ID_INTEL_THC_TGL_H_PORT2  0x43d1
+// Alder Lake
+#define PCI_DEVICE_ID_INTEL_THC_ADL_S_PORT1  0x7ad8
+#define PCI_DEVICE_ID_INTEL_THC_ADL_S_PORT2  0x7ad9
+#define PCI_DEVICE_ID_INTEL_THC_ADL_P_PORT1  0x51d0
+#define PCI_DEVICE_ID_INTEL_THC_ADL_P_PORT2  0x51d1
+#define PCI_DEVICE_ID_INTEL_THC_ADL_M_PORT1  0x54d0
+#define PCI_DEVICE_ID_INTEL_THC_ADL_M_PORT2  0x54d1
+// Raptor Lake
+#define PCI_DEVICE_ID_INTEL_THC_RPL_S_PORT1  0x7a58
+#define PCI_DEVICE_ID_INTEL_THC_RPL_S_PORT2  0x7a59
+// Meteor Lake
+#define PCI_DEVICE_ID_INTEL_THC_MTL_PORT1    0x7e48
+#define PCI_DEVICE_ID_INTEL_THC_MTL_PORT2    0x7e4a
+
+static const struct pci_device_id ithc_pci_tbl[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_THC_LKF_PORT1) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_THC_LKF_PORT2) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_THC_TGL_LP_PORT1) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_THC_TGL_LP_PORT2) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_THC_TGL_H_PORT1) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_THC_TGL_H_PORT2) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_THC_ADL_S_PORT1) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_THC_ADL_S_PORT2) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_THC_ADL_P_PORT1) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_THC_ADL_P_PORT2) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_THC_ADL_M_PORT1) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_THC_ADL_M_PORT2) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_THC_RPL_S_PORT1) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_THC_RPL_S_PORT2) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_THC_MTL_PORT1) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_THC_MTL_PORT2) },
+	{}
+};
+MODULE_DEVICE_TABLE(pci, ithc_pci_tbl);
+
+// Module parameters
+
+static bool ithc_use_polling = false;
+module_param_named(poll, ithc_use_polling, bool, 0);
+MODULE_PARM_DESC(poll, "Use polling instead of interrupts");
+
+static bool ithc_use_rx0 = false;
+module_param_named(rx0, ithc_use_rx0, bool, 0);
+MODULE_PARM_DESC(rx0, "Use DMA RX channel 0");
+
+static bool ithc_use_rx1 = true;
+module_param_named(rx1, ithc_use_rx1, bool, 0);
+MODULE_PARM_DESC(rx1, "Use DMA RX channel 1");
+
+static bool ithc_log_regs_enabled = false;
+module_param_named(logregs, ithc_log_regs_enabled, bool, 0);
+MODULE_PARM_DESC(logregs, "Log changes in register values (for debugging)");
+
+// Sysfs attributes
+
+static bool ithc_is_config_valid(struct ithc *ithc) {
+	return ithc->config.device_id == DEVCFG_DEVICE_ID_TIC;
+}
+
+static ssize_t vendor_show(struct device *dev, struct device_attribute *attr, char *buf) {
+	struct ithc *ithc = dev_get_drvdata(dev);
+	if (!ithc || !ithc_is_config_valid(ithc)) return -ENODEV;
+	return sprintf(buf, "0x%04x", ithc->config.vendor_id);
+}
+static DEVICE_ATTR_RO(vendor);
+static ssize_t product_show(struct device *dev, struct device_attribute *attr, char *buf) {
+	struct ithc *ithc = dev_get_drvdata(dev);
+	if (!ithc || !ithc_is_config_valid(ithc)) return -ENODEV;
+	return sprintf(buf, "0x%04x", ithc->config.product_id);
+}
+static DEVICE_ATTR_RO(product);
+static ssize_t revision_show(struct device *dev, struct device_attribute *attr, char *buf) {
+	struct ithc *ithc = dev_get_drvdata(dev);
+	if (!ithc || !ithc_is_config_valid(ithc)) return -ENODEV;
+	return sprintf(buf, "%u", ithc->config.revision);
+}
+static DEVICE_ATTR_RO(revision);
+static ssize_t fw_version_show(struct device *dev, struct device_attribute *attr, char *buf) {
+	struct ithc *ithc = dev_get_drvdata(dev);
+	if (!ithc || !ithc_is_config_valid(ithc)) return -ENODEV;
+	u32 v = ithc->config.fw_version;
+	return sprintf(buf, "%i.%i.%i.%i", v >> 24, v >> 16 & 0xff, v >> 8 & 0xff, v & 0xff);
+}
+static DEVICE_ATTR_RO(fw_version);
+
+static const struct attribute_group *ithc_attribute_groups[] = {
+	&(const struct attribute_group){
+		.name = DEVNAME,
+		.attrs = (struct attribute *[]){
+			&dev_attr_vendor.attr,
+			&dev_attr_product.attr,
+			&dev_attr_revision.attr,
+			&dev_attr_fw_version.attr,
+			NULL
+		},
+	},
+	NULL
+};
+
+// HID setup
+
+static int ithc_hid_start(struct hid_device *hdev) { return 0; }
+static void ithc_hid_stop(struct hid_device *hdev) { }
+static int ithc_hid_open(struct hid_device *hdev) { return 0; }
+static void ithc_hid_close(struct hid_device *hdev) { }
+
+static int ithc_hid_parse(struct hid_device *hdev) {
+	struct ithc *ithc = hdev->driver_data;
+	u64 val = 0;
+	WRITE_ONCE(ithc->hid_parse_done, false);
+	CHECK_RET(ithc_dma_tx, ithc, DMA_TX_CODE_GET_REPORT_DESCRIPTOR, sizeof val, &val);
+	if (!wait_event_timeout(ithc->wait_hid_parse, READ_ONCE(ithc->hid_parse_done), msecs_to_jiffies(1000))) return -ETIMEDOUT;
+	return 0;
+}
+
+static int ithc_hid_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, size_t len, unsigned char rtype, int reqtype) {
+	struct ithc *ithc = hdev->driver_data;
+	if (!buf || !len) return -EINVAL;
+	u32 code;
+	if (rtype == HID_OUTPUT_REPORT && reqtype == HID_REQ_SET_REPORT) code = DMA_TX_CODE_OUTPUT_REPORT;
+	else if (rtype == HID_FEATURE_REPORT && reqtype == HID_REQ_SET_REPORT) code = DMA_TX_CODE_SET_FEATURE;
+	else if (rtype == HID_FEATURE_REPORT && reqtype == HID_REQ_GET_REPORT) code = DMA_TX_CODE_GET_FEATURE;
+	else {
+		pci_err(ithc->pci, "unhandled hid request %i %i for report id %i\n", rtype, reqtype, reportnum);
+		return -EINVAL;
+	}
+	buf[0] = reportnum;
+	if (reqtype == HID_REQ_GET_REPORT) {
+		mutex_lock(&ithc->hid_get_feature_mutex);
+		ithc->hid_get_feature_buf = buf;
+		ithc->hid_get_feature_size = len;
+		mutex_unlock(&ithc->hid_get_feature_mutex);
+		int r = CHECK(ithc_dma_tx, ithc, code, 1, buf);
+		if (!r) {
+			r = wait_event_interruptible_timeout(ithc->wait_hid_get_feature, !ithc->hid_get_feature_buf, msecs_to_jiffies(1000));
+			if (!r) r = -ETIMEDOUT;
+			else if (r < 0) r = -EINTR;
+			else r = 0;
+		}
+		mutex_lock(&ithc->hid_get_feature_mutex);
+		ithc->hid_get_feature_buf = NULL;
+		if (!r) r = ithc->hid_get_feature_size;
+		mutex_unlock(&ithc->hid_get_feature_mutex);
+		return r;
+	}
+	CHECK_RET(ithc_dma_tx, ithc, code, len, buf);
+	return 0;
+}
+
+static struct hid_ll_driver ithc_ll_driver = {
+	.start = ithc_hid_start,
+	.stop = ithc_hid_stop,
+	.open = ithc_hid_open,
+	.close = ithc_hid_close,
+	.parse = ithc_hid_parse,
+	.raw_request = ithc_hid_raw_request,
+};
+
+static void ithc_hid_devres_release(struct device *dev, void *res) {
+	struct hid_device **hidm = res;
+	if (*hidm) hid_destroy_device(*hidm);
+}
+
+static int ithc_hid_init(struct ithc *ithc) {
+	struct hid_device **hidm = devres_alloc(ithc_hid_devres_release, sizeof *hidm, GFP_KERNEL);
+	if (!hidm) return -ENOMEM;
+	devres_add(&ithc->pci->dev, hidm);
+	struct hid_device *hid = hid_allocate_device();
+	if (IS_ERR(hid)) return PTR_ERR(hid);
+	*hidm = hid;
+
+	strscpy(hid->name, DEVFULLNAME, sizeof(hid->name));
+	strscpy(hid->phys, ithc->phys, sizeof(hid->phys));
+	hid->ll_driver = &ithc_ll_driver;
+	hid->bus = BUS_PCI;
+	hid->vendor = ithc->config.vendor_id;
+	hid->product = ithc->config.product_id;
+	hid->version = 0x100;
+	hid->dev.parent = &ithc->pci->dev;
+	hid->driver_data = ithc;
+
+	ithc->hid = hid;
+	return 0;
+}
+
+// Interrupts/polling
+
+static void ithc_activity_timer_callback(struct timer_list *t) {
+	struct ithc *ithc = container_of(t, struct ithc, activity_timer);
+	cpu_latency_qos_update_request(&ithc->activity_qos, PM_QOS_DEFAULT_VALUE);
+}
+
+void ithc_set_active(struct ithc *ithc) {
+	// When CPU usage is very low, the CPU can enter various low power states (C2-C10).
+	// This disrupts DMA, causing truncated DMA messages. ERROR_FLAG_DMA_UNKNOWN_12 will be set when this happens.
+	// The amount of truncated messages can become very high, resulting in user-visible effects (laggy/stuttering cursor).
+	// To avoid this, we use a CPU latency QoS request to prevent the CPU from entering low power states during touch interactions.
+	cpu_latency_qos_update_request(&ithc->activity_qos, 0);
+	mod_timer(&ithc->activity_timer, jiffies + msecs_to_jiffies(1000));
+}
+
+static int ithc_set_device_enabled(struct ithc *ithc, bool enable) {
+	u32 x = ithc->config.touch_cfg = (ithc->config.touch_cfg & ~(u32)DEVCFG_TOUCH_MASK) | DEVCFG_TOUCH_UNKNOWN_2
+		| (enable ? DEVCFG_TOUCH_ENABLE | DEVCFG_TOUCH_UNKNOWN_3 | DEVCFG_TOUCH_UNKNOWN_4 : 0);
+	return ithc_spi_command(ithc, SPI_CMD_CODE_WRITE, offsetof(struct ithc_device_config, touch_cfg), sizeof x, &x);
+}
+
+static void ithc_disable_interrupts(struct ithc *ithc) {
+	writel(0, &ithc->regs->error_control);
+	bitsb(&ithc->regs->spi_cmd.control, SPI_CMD_CONTROL_IRQ, 0);
+	bitsb(&ithc->regs->dma_rx[0].control, DMA_RX_CONTROL_IRQ_UNKNOWN_1 | DMA_RX_CONTROL_IRQ_ERROR | DMA_RX_CONTROL_IRQ_UNKNOWN_4 | DMA_RX_CONTROL_IRQ_DATA, 0);
+	bitsb(&ithc->regs->dma_rx[1].control, DMA_RX_CONTROL_IRQ_UNKNOWN_1 | DMA_RX_CONTROL_IRQ_ERROR | DMA_RX_CONTROL_IRQ_UNKNOWN_4 | DMA_RX_CONTROL_IRQ_DATA, 0);
+	bitsb(&ithc->regs->dma_tx.control, DMA_TX_CONTROL_IRQ, 0);
+}
+
+static void ithc_clear_dma_rx_interrupts(struct ithc *ithc, unsigned channel) {
+	writel(DMA_RX_STATUS_ERROR | DMA_RX_STATUS_UNKNOWN_4 | DMA_RX_STATUS_HAVE_DATA, &ithc->regs->dma_rx[channel].status);
+}
+
+static void ithc_clear_interrupts(struct ithc *ithc) {
+	writel(0xffffffff, &ithc->regs->error_flags);
+	writel(ERROR_STATUS_DMA | ERROR_STATUS_SPI, &ithc->regs->error_status);
+	writel(SPI_CMD_STATUS_DONE | SPI_CMD_STATUS_ERROR, &ithc->regs->spi_cmd.status);
+	ithc_clear_dma_rx_interrupts(ithc, 0);
+	ithc_clear_dma_rx_interrupts(ithc, 1);
+	writel(DMA_TX_STATUS_DONE | DMA_TX_STATUS_ERROR | DMA_TX_STATUS_UNKNOWN_2, &ithc->regs->dma_tx.status);
+}
+
+static void ithc_process(struct ithc *ithc) {
+	ithc_log_regs(ithc);
+
+	// read and clear error bits
+	u32 err = readl(&ithc->regs->error_flags);
+	if (err) {
+		if (err & ~ERROR_FLAG_DMA_UNKNOWN_12) pci_err(ithc->pci, "error flags: 0x%08x\n", err);
+		writel(err, &ithc->regs->error_flags);
+	}
+
+	// process DMA rx
+	if (ithc_use_rx0) {
+		ithc_clear_dma_rx_interrupts(ithc, 0);
+		ithc_dma_rx(ithc, 0);
+	}
+	if (ithc_use_rx1) {
+		ithc_clear_dma_rx_interrupts(ithc, 1);
+		ithc_dma_rx(ithc, 1);
+	}
+
+	ithc_log_regs(ithc);
+}
+
+static irqreturn_t ithc_interrupt_thread(int irq, void *arg) {
+	struct ithc *ithc = arg;
+	pci_dbg(ithc->pci, "IRQ! err=%08x/%08x/%08x, cmd=%02x/%08x, rx0=%02x/%08x, rx1=%02x/%08x, tx=%02x/%08x\n",
+		readl(&ithc->regs->error_control), readl(&ithc->regs->error_status), readl(&ithc->regs->error_flags),
+		readb(&ithc->regs->spi_cmd.control), readl(&ithc->regs->spi_cmd.status),
+		readb(&ithc->regs->dma_rx[0].control), readl(&ithc->regs->dma_rx[0].status),
+		readb(&ithc->regs->dma_rx[1].control), readl(&ithc->regs->dma_rx[1].status),
+		readb(&ithc->regs->dma_tx.control), readl(&ithc->regs->dma_tx.status));
+	ithc_process(ithc);
+	return IRQ_HANDLED;
+}
+
+static int ithc_poll_thread(void *arg) {
+	struct ithc *ithc = arg;
+	unsigned sleep = 100;
+	while (!kthread_should_stop()) {
+		u32 n = ithc->dma_rx[1].num_received;
+		ithc_process(ithc);
+		if (n != ithc->dma_rx[1].num_received) sleep = 20;
+		else sleep = min(200u, sleep + (sleep >> 4) + 1);
+		msleep_interruptible(sleep);
+	}
+	return 0;
+}
+
+// Device initialization and shutdown
+
+static void ithc_disable(struct ithc *ithc) {
+	bitsl_set(&ithc->regs->control_bits, CONTROL_QUIESCE);
+	CHECK(waitl, ithc, &ithc->regs->control_bits, CONTROL_IS_QUIESCED, CONTROL_IS_QUIESCED);
+	bitsl(&ithc->regs->control_bits, CONTROL_NRESET, 0);
+	bitsb(&ithc->regs->spi_cmd.control, SPI_CMD_CONTROL_SEND, 0);
+	bitsb(&ithc->regs->dma_tx.control, DMA_TX_CONTROL_SEND, 0);
+	bitsb(&ithc->regs->dma_rx[0].control, DMA_RX_CONTROL_ENABLE, 0);
+	bitsb(&ithc->regs->dma_rx[1].control, DMA_RX_CONTROL_ENABLE, 0);
+	ithc_disable_interrupts(ithc);
+	ithc_clear_interrupts(ithc);
+}
+
+static int ithc_init_device(struct ithc *ithc) {
+	ithc_log_regs(ithc);
+	bool was_enabled = (readl(&ithc->regs->control_bits) & CONTROL_NRESET) != 0;
+	ithc_disable(ithc);
+	CHECK_RET(waitl, ithc, &ithc->regs->control_bits, CONTROL_READY, CONTROL_READY);
+	ithc_set_spi_config(ithc, 10, 0);
+	bitsl_set(&ithc->regs->dma_rx[0].unknown_init_bits, 0x80000000); // seems to help with reading config
+
+	if (was_enabled) if (msleep_interruptible(100)) return -EINTR;
+	bitsl(&ithc->regs->control_bits, CONTROL_QUIESCE, 0);
+	CHECK_RET(waitl, ithc, &ithc->regs->control_bits, CONTROL_IS_QUIESCED, 0);
+	for (int retries = 0; ; retries++) {
+		ithc_log_regs(ithc);
+		bitsl_set(&ithc->regs->control_bits, CONTROL_NRESET);
+		if (!waitl(ithc, &ithc->regs->state, 0xf, 2)) break;
+		if (retries > 5) {
+			pci_err(ithc->pci, "too many retries, failed to reset device\n");
+			return -ETIMEDOUT;
+		}
+		pci_err(ithc->pci, "invalid state, retrying reset\n");
+		bitsl(&ithc->regs->control_bits, CONTROL_NRESET, 0);
+		if (msleep_interruptible(1000)) return -EINTR;
+	}
+	ithc_log_regs(ithc);
+
+	CHECK(waitl, ithc, &ithc->regs->dma_rx[0].status, DMA_RX_STATUS_UNKNOWN_4, DMA_RX_STATUS_UNKNOWN_4);
+
+	// read config
+	for (int retries = 0; ; retries++) {
+		ithc_log_regs(ithc);
+		memset(&ithc->config, 0, sizeof ithc->config);
+		CHECK_RET(ithc_spi_command, ithc, SPI_CMD_CODE_READ, 0, sizeof ithc->config, &ithc->config);
+		u32 *p = (void *)&ithc->config;
+		pci_info(ithc->pci, "config: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+			p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
+		if (ithc_is_config_valid(ithc)) break;
+		if (retries > 10) {
+			pci_err(ithc->pci, "failed to read config, unknown device ID 0x%08x\n", ithc->config.device_id);
+			return -EIO;
+		}
+		pci_err(ithc->pci, "failed to read config, retrying\n");
+		if (msleep_interruptible(100)) return -EINTR;
+	}
+	ithc_log_regs(ithc);
+
+	CHECK_RET(ithc_set_spi_config, ithc, DEVCFG_SPI_MAX_FREQ(ithc->config.spi_config), DEVCFG_SPI_MODE(ithc->config.spi_config));
+	CHECK_RET(ithc_set_device_enabled, ithc, true);
+	ithc_log_regs(ithc);
+	return 0;
+}
+
+int ithc_reset(struct ithc *ithc) {
+	// FIXME This should probably do devres_release_group()+ithc_start(). But because this is called during DMA
+	// processing, that would have to be done asynchronously (schedule_work()?). And with extra locking?
+	pci_err(ithc->pci, "reset\n");
+	CHECK(ithc_init_device, ithc);
+	if (ithc_use_rx0) ithc_dma_rx_enable(ithc, 0);
+	if (ithc_use_rx1) ithc_dma_rx_enable(ithc, 1);
+	ithc_log_regs(ithc);
+	pci_dbg(ithc->pci, "reset completed\n");
+	return 0;
+}
+
+static void ithc_stop(void *res) {
+	struct ithc *ithc = res;
+	pci_dbg(ithc->pci, "stopping\n");
+	ithc_log_regs(ithc);
+	if (ithc->poll_thread) CHECK(kthread_stop, ithc->poll_thread);
+	if (ithc->irq >= 0) disable_irq(ithc->irq);
+	CHECK(ithc_set_device_enabled, ithc, false);
+	ithc_disable(ithc);
+	del_timer_sync(&ithc->activity_timer);
+	cpu_latency_qos_remove_request(&ithc->activity_qos);
+	// clear dma config
+	for(unsigned i = 0; i < 2; i++) {
+		CHECK(waitl, ithc, &ithc->regs->dma_rx[i].status, DMA_RX_STATUS_ENABLED, 0);
+		lo_hi_writeq(0, &ithc->regs->dma_rx[i].addr);
+		writeb(0, &ithc->regs->dma_rx[i].num_bufs);
+		writeb(0, &ithc->regs->dma_rx[i].num_prds);
+	}
+	lo_hi_writeq(0, &ithc->regs->dma_tx.addr);
+	writeb(0, &ithc->regs->dma_tx.num_prds);
+	ithc_log_regs(ithc);
+	pci_dbg(ithc->pci, "stopped\n");
+}
+
+static void ithc_clear_drvdata(void *res) {
+	struct pci_dev *pci = res;
+	pci_set_drvdata(pci, NULL);
+}
+
+static int ithc_start(struct pci_dev *pci) {
+	pci_dbg(pci, "starting\n");
+	if (pci_get_drvdata(pci)) {
+		pci_err(pci, "device already initialized\n");
+		return -EINVAL;
+	}
+	if (!devres_open_group(&pci->dev, ithc_start, GFP_KERNEL)) return -ENOMEM;
+
+	struct ithc *ithc = devm_kzalloc(&pci->dev, sizeof *ithc, GFP_KERNEL);
+	if (!ithc) return -ENOMEM;
+	ithc->irq = -1;
+	ithc->pci = pci;
+	snprintf(ithc->phys, sizeof ithc->phys, "pci-%s/" DEVNAME, pci_name(pci));
+	init_waitqueue_head(&ithc->wait_hid_parse);
+	init_waitqueue_head(&ithc->wait_hid_get_feature);
+	mutex_init(&ithc->hid_get_feature_mutex);
+	pci_set_drvdata(pci, ithc);
+	CHECK_RET(devm_add_action_or_reset, &pci->dev, ithc_clear_drvdata, pci);
+	if (ithc_log_regs_enabled) ithc->prev_regs = devm_kzalloc(&pci->dev, sizeof *ithc->prev_regs, GFP_KERNEL);
+
+	CHECK_RET(pcim_enable_device, pci);
+	pci_set_master(pci);
+	CHECK_RET(pcim_iomap_regions, pci, BIT(0), DEVNAME " regs");
+	CHECK_RET(dma_set_mask_and_coherent, &pci->dev, DMA_BIT_MASK(64));
+	CHECK_RET(pci_set_power_state, pci, PCI_D0);
+	ithc->regs = pcim_iomap_table(pci)[0];
+
+	if (!ithc_use_polling) {
+		CHECK_RET(pci_alloc_irq_vectors, pci, 1, 1, PCI_IRQ_MSI | PCI_IRQ_MSIX);
+		ithc->irq = CHECK(pci_irq_vector, pci, 0);
+		if (ithc->irq < 0) return ithc->irq;
+	}
+
+	CHECK_RET(ithc_init_device, ithc);
+	CHECK(devm_device_add_groups, &pci->dev, ithc_attribute_groups);
+	if (ithc_use_rx0) CHECK_RET(ithc_dma_rx_init, ithc, 0, ithc_use_rx1 ? DEVNAME "0" : DEVNAME);
+	if (ithc_use_rx1) CHECK_RET(ithc_dma_rx_init, ithc, 1, ithc_use_rx0 ? DEVNAME "1" : DEVNAME);
+	CHECK_RET(ithc_dma_tx_init, ithc);
+
+	CHECK_RET(ithc_hid_init, ithc);
+
+	cpu_latency_qos_add_request(&ithc->activity_qos, PM_QOS_DEFAULT_VALUE);
+	timer_setup(&ithc->activity_timer, ithc_activity_timer_callback, 0);
+
+	// add ithc_stop callback AFTER setting up DMA buffers, so that polling/irqs/DMA are disabled BEFORE the buffers are freed
+	CHECK_RET(devm_add_action_or_reset, &pci->dev, ithc_stop, ithc);
+
+	if (ithc_use_polling) {
+		pci_info(pci, "using polling instead of irq\n");
+		// use a thread instead of simple timer because we want to be able to sleep
+		ithc->poll_thread = kthread_run(ithc_poll_thread, ithc, DEVNAME "poll");
+		if (IS_ERR(ithc->poll_thread)) {
+			int err = PTR_ERR(ithc->poll_thread);
+			ithc->poll_thread = NULL;
+			return err;
+		}
+	} else {
+		CHECK_RET(devm_request_threaded_irq, &pci->dev, ithc->irq, NULL, ithc_interrupt_thread, IRQF_TRIGGER_HIGH | IRQF_ONESHOT, DEVNAME, ithc);
+	}
+
+	if (ithc_use_rx0) ithc_dma_rx_enable(ithc, 0);
+	if (ithc_use_rx1) ithc_dma_rx_enable(ithc, 1);
+
+	// hid_add_device can only be called after irq/polling is started and DMA is enabled, because it calls ithc_hid_parse which reads the report descriptor via DMA
+	CHECK_RET(hid_add_device, ithc->hid);
+
+	CHECK(ithc_debug_init, ithc);
+
+	pci_dbg(pci, "started\n");
+	return 0;
+}
+
+static int ithc_probe(struct pci_dev *pci, const struct pci_device_id *id) {
+	pci_dbg(pci, "device probe\n");
+	return ithc_start(pci);
+}
+
+static void ithc_remove(struct pci_dev *pci) {
+	pci_dbg(pci, "device remove\n");
+	// all cleanup is handled by devres
+}
+
+static int ithc_suspend(struct device *dev) {
+	struct pci_dev *pci = to_pci_dev(dev);
+	pci_dbg(pci, "pm suspend\n");
+	devres_release_group(dev, ithc_start);
+	return 0;
+}
+
+static int ithc_resume(struct device *dev) {
+	struct pci_dev *pci = to_pci_dev(dev);
+	pci_dbg(pci, "pm resume\n");
+	return ithc_start(pci);
+}
+
+static int ithc_freeze(struct device *dev) {
+	struct pci_dev *pci = to_pci_dev(dev);
+	pci_dbg(pci, "pm freeze\n");
+	devres_release_group(dev, ithc_start);
+	return 0;
+}
+
+static int ithc_thaw(struct device *dev) {
+	struct pci_dev *pci = to_pci_dev(dev);
+	pci_dbg(pci, "pm thaw\n");
+	return ithc_start(pci);
+}
+
+static int ithc_restore(struct device *dev) {
+	struct pci_dev *pci = to_pci_dev(dev);
+	pci_dbg(pci, "pm restore\n");
+	return ithc_start(pci);
+}
+
+static struct pci_driver ithc_driver = {
+	.name = DEVNAME,
+	.id_table = ithc_pci_tbl,
+	.probe = ithc_probe,
+	.remove = ithc_remove,
+	.driver.pm = &(const struct dev_pm_ops) {
+		.suspend = ithc_suspend,
+		.resume = ithc_resume,
+		.freeze = ithc_freeze,
+		.thaw = ithc_thaw,
+		.restore = ithc_restore,
+	},
+	//.dev_groups = ithc_attribute_groups, // could use this (since 5.14), however the attributes won't have valid values until config has been read anyway
+};
+
+static int __init ithc_init(void) {
+	return pci_register_driver(&ithc_driver);
+}
+
+static void __exit ithc_exit(void) {
+	pci_unregister_driver(&ithc_driver);
+}
+
+module_init(ithc_init);
+module_exit(ithc_exit);
+
diff --git a/drivers/hid/ithc/ithc-regs.c b/drivers/hid/ithc/ithc-regs.c
new file mode 100644
index 000000000000..85d567b05761
--- /dev/null
+++ b/drivers/hid/ithc/ithc-regs.c
@@ -0,0 +1,64 @@
+#include "ithc.h"
+
+#define reg_num(r) (0x1fff & (u16)(__force u64)(r))
+
+void bitsl(__iomem u32 *reg, u32 mask, u32 val) {
+	if (val & ~mask) pr_err("register 0x%x: invalid value 0x%x for bitmask 0x%x\n", reg_num(reg), val, mask);
+	writel((readl(reg) & ~mask) | (val & mask), reg);
+}
+
+void bitsb(__iomem u8 *reg, u8 mask, u8 val) {
+	if (val & ~mask) pr_err("register 0x%x: invalid value 0x%x for bitmask 0x%x\n", reg_num(reg), val, mask);
+	writeb((readb(reg) & ~mask) | (val & mask), reg);
+}
+
+int waitl(struct ithc *ithc, __iomem u32 *reg, u32 mask, u32 val) {
+	pci_dbg(ithc->pci, "waiting for reg 0x%04x mask 0x%08x val 0x%08x\n", reg_num(reg), mask, val);
+	u32 x;
+	if (readl_poll_timeout(reg, x, (x & mask) == val, 200, 1000*1000)) {
+		pci_err(ithc->pci, "timed out waiting for reg 0x%04x mask 0x%08x val 0x%08x\n", reg_num(reg), mask, val);
+		return -ETIMEDOUT;
+	}
+	pci_dbg(ithc->pci, "done waiting\n");
+	return 0;
+}
+
+int waitb(struct ithc *ithc, __iomem u8 *reg, u8 mask, u8 val) {
+	pci_dbg(ithc->pci, "waiting for reg 0x%04x mask 0x%02x val 0x%02x\n", reg_num(reg), mask, val);
+	u8 x;
+	if (readb_poll_timeout(reg, x, (x & mask) == val, 200, 1000*1000)) {
+		pci_err(ithc->pci, "timed out waiting for reg 0x%04x mask 0x%02x val 0x%02x\n", reg_num(reg), mask, val);
+		return -ETIMEDOUT;
+	}
+	pci_dbg(ithc->pci, "done waiting\n");
+	return 0;
+}
+
+int ithc_set_spi_config(struct ithc *ithc, u8 speed, u8 mode) {
+	pci_dbg(ithc->pci, "setting SPI speed to %i, mode %i\n", speed, mode);
+	if (mode == 3) mode = 2;
+	bitsl(&ithc->regs->spi_config,
+		SPI_CONFIG_MODE(0xff) | SPI_CONFIG_SPEED(0xff) | SPI_CONFIG_UNKNOWN_18(0xff) | SPI_CONFIG_SPEED2(0xff),
+		SPI_CONFIG_MODE(mode) | SPI_CONFIG_SPEED(speed) | SPI_CONFIG_UNKNOWN_18(0) | SPI_CONFIG_SPEED2(speed));
+	return 0;
+}
+
+int ithc_spi_command(struct ithc *ithc, u8 command, u32 offset, u32 size, void *data) {
+	pci_dbg(ithc->pci, "SPI command %u, size %u, offset %u\n", command, size, offset);
+	if (size > sizeof ithc->regs->spi_cmd.data) return -EINVAL;
+	CHECK_RET(waitl, ithc, &ithc->regs->spi_cmd.status, SPI_CMD_STATUS_BUSY, 0);
+	writel(SPI_CMD_STATUS_DONE | SPI_CMD_STATUS_ERROR, &ithc->regs->spi_cmd.status);
+	writeb(command, &ithc->regs->spi_cmd.code);
+	writew(size, &ithc->regs->spi_cmd.size);
+	writel(offset, &ithc->regs->spi_cmd.offset);
+	u32 *p = data, n = (size + 3) / 4;
+	for (u32 i = 0; i < n; i++) writel(p[i], &ithc->regs->spi_cmd.data[i]);
+	bitsb_set(&ithc->regs->spi_cmd.control, SPI_CMD_CONTROL_SEND);
+	CHECK_RET(waitl, ithc, &ithc->regs->spi_cmd.status, SPI_CMD_STATUS_BUSY, 0);
+	if ((readl(&ithc->regs->spi_cmd.status) & (SPI_CMD_STATUS_DONE | SPI_CMD_STATUS_ERROR)) != SPI_CMD_STATUS_DONE) return -EIO;
+	if (readw(&ithc->regs->spi_cmd.size) != size) return -EMSGSIZE;
+	for (u32 i = 0; i < n; i++) p[i] = readl(&ithc->regs->spi_cmd.data[i]);
+	writel(SPI_CMD_STATUS_DONE | SPI_CMD_STATUS_ERROR, &ithc->regs->spi_cmd.status);
+	return 0;
+}
+
diff --git a/drivers/hid/ithc/ithc-regs.h b/drivers/hid/ithc/ithc-regs.h
new file mode 100644
index 000000000000..1a96092ed7ee
--- /dev/null
+++ b/drivers/hid/ithc/ithc-regs.h
@@ -0,0 +1,186 @@
+#define CONTROL_QUIESCE                     BIT(1)
+#define CONTROL_IS_QUIESCED                 BIT(2)
+#define CONTROL_NRESET                      BIT(3)
+#define CONTROL_READY                       BIT(29)
+
+#define SPI_CONFIG_MODE(x)                  (((x) & 3) << 2)
+#define SPI_CONFIG_SPEED(x)                 (((x) & 7) << 4)
+#define SPI_CONFIG_UNKNOWN_18(x)            (((x) & 3) << 18)
+#define SPI_CONFIG_SPEED2(x)                (((x) & 0xf) << 20) // high bit = high speed mode?
+
+#define ERROR_CONTROL_UNKNOWN_0             BIT(0)
+#define ERROR_CONTROL_DISABLE_DMA           BIT(1) // clears DMA_RX_CONTROL_ENABLE when a DMA error occurs
+#define ERROR_CONTROL_UNKNOWN_2             BIT(2)
+#define ERROR_CONTROL_UNKNOWN_3             BIT(3)
+#define ERROR_CONTROL_IRQ_DMA_UNKNOWN_9     BIT(9)
+#define ERROR_CONTROL_IRQ_DMA_UNKNOWN_10    BIT(10)
+#define ERROR_CONTROL_IRQ_DMA_UNKNOWN_12    BIT(12)
+#define ERROR_CONTROL_IRQ_DMA_UNKNOWN_13    BIT(13)
+#define ERROR_CONTROL_UNKNOWN_16(x)         (((x) & 0xff) << 16) // spi error code irq?
+#define ERROR_CONTROL_SET_DMA_STATUS        BIT(29) // sets DMA_RX_STATUS_ERROR when a DMA error occurs
+
+#define ERROR_STATUS_DMA                    BIT(28)
+#define ERROR_STATUS_SPI                    BIT(30)
+
+#define ERROR_FLAG_DMA_UNKNOWN_9            BIT(9)
+#define ERROR_FLAG_DMA_UNKNOWN_10           BIT(10)
+#define ERROR_FLAG_DMA_UNKNOWN_12           BIT(12) // set when we receive a truncated DMA message
+#define ERROR_FLAG_DMA_UNKNOWN_13           BIT(13)
+#define ERROR_FLAG_SPI_BUS_TURNAROUND       BIT(16)
+#define ERROR_FLAG_SPI_RESPONSE_TIMEOUT     BIT(17)
+#define ERROR_FLAG_SPI_INTRA_PACKET_TIMEOUT BIT(18)
+#define ERROR_FLAG_SPI_INVALID_RESPONSE     BIT(19)
+#define ERROR_FLAG_SPI_HS_RX_TIMEOUT        BIT(20)
+#define ERROR_FLAG_SPI_TOUCH_IC_INIT        BIT(21)
+
+#define SPI_CMD_CONTROL_SEND                BIT(0) // cleared by device when sending is complete
+#define SPI_CMD_CONTROL_IRQ                 BIT(1)
+
+#define SPI_CMD_CODE_READ                   4
+#define SPI_CMD_CODE_WRITE                  6
+
+#define SPI_CMD_STATUS_DONE                 BIT(0)
+#define SPI_CMD_STATUS_ERROR                BIT(1)
+#define SPI_CMD_STATUS_BUSY                 BIT(3)
+
+#define DMA_TX_CONTROL_SEND                 BIT(0) // cleared by device when sending is complete
+#define DMA_TX_CONTROL_IRQ                  BIT(3)
+
+#define DMA_TX_STATUS_DONE                  BIT(0)
+#define DMA_TX_STATUS_ERROR                 BIT(1)
+#define DMA_TX_STATUS_UNKNOWN_2             BIT(2)
+#define DMA_TX_STATUS_UNKNOWN_3             BIT(3) // busy?
+
+#define DMA_RX_CONTROL_ENABLE               BIT(0)
+#define DMA_RX_CONTROL_IRQ_UNKNOWN_1        BIT(1) // rx1 only?
+#define DMA_RX_CONTROL_IRQ_ERROR            BIT(3) // rx1 only?
+#define DMA_RX_CONTROL_IRQ_UNKNOWN_4        BIT(4) // rx0 only?
+#define DMA_RX_CONTROL_IRQ_DATA             BIT(5)
+
+#define DMA_RX_CONTROL2_UNKNOWN_5           BIT(5) // rx0 only?
+#define DMA_RX_CONTROL2_RESET               BIT(7) // resets ringbuffer indices
+
+#define DMA_RX_WRAP_FLAG                    BIT(7)
+
+#define DMA_RX_STATUS_ERROR                 BIT(3)
+#define DMA_RX_STATUS_UNKNOWN_4             BIT(4) // set in rx0 after using CONTROL_NRESET when it becomes possible to read config (can take >100ms)
+#define DMA_RX_STATUS_HAVE_DATA             BIT(5)
+#define DMA_RX_STATUS_ENABLED               BIT(8)
+
+#define COUNTER_RESET                       BIT(31)
+
+struct ithc_registers {
+	/* 0000 */ u32 _unknown_0000[1024];
+	/* 1000 */ u32 _unknown_1000;
+	/* 1004 */ u32 _unknown_1004;
+	/* 1008 */ u32 control_bits;
+	/* 100c */ u32 _unknown_100c;
+	/* 1010 */ u32 spi_config;
+	/* 1014 */ u32 _unknown_1014[3];
+	/* 1020 */ u32 error_control;
+	/* 1024 */ u32 error_status; // write to clear
+	/* 1028 */ u32 error_flags; // write to clear
+	/* 102c */ u32 _unknown_102c[5];
+	struct {
+		/* 1040 */ u8 control;
+		/* 1041 */ u8 code;
+		/* 1042 */ u16 size;
+		/* 1044 */ u32 status; // write to clear
+		/* 1048 */ u32 offset;
+		/* 104c */ u32 data[16];
+		/* 108c */ u32 _unknown_108c;
+	} spi_cmd;
+	struct {
+		/* 1090 */ u64 addr; // cannot be written with writeq(), must use lo_hi_writeq()
+		/* 1098 */ u8 control;
+		/* 1099 */ u8 _unknown_1099;
+		/* 109a */ u8 _unknown_109a;
+		/* 109b */ u8 num_prds;
+		/* 109c */ u32 status; // write to clear
+	} dma_tx;
+	/* 10a0 */ u32 _unknown_10a0[7];
+	/* 10bc */ u32 state; // is 0xe0000402 (dev config val 0) after CONTROL_NRESET, 0xe0000461 after first touch, 0xe0000401 after DMA_RX_CODE_RESET
+	/* 10c0 */ u32 _unknown_10c0[8];
+	/* 10e0 */ u32 _unknown_10e0_counters[3];
+	/* 10ec */ u32 _unknown_10ec[5];
+	struct {
+		/* 1100/1200 */ u64 addr; // cannot be written with writeq(), must use lo_hi_writeq()
+		/* 1108/1208 */ u8 num_bufs;
+		/* 1109/1209 */ u8 num_prds;
+		/* 110a/120a */ u16 _unknown_110a;
+		/* 110c/120c */ u8 control;
+		/* 110d/120d */ u8 head;
+		/* 110e/120e */ u8 tail;
+		/* 110f/120f */ u8 control2;
+		/* 1110/1210 */ u32 status; // write to clear
+		/* 1114/1214 */ u32 _unknown_1114;
+		/* 1118/1218 */ u64 _unknown_1118_guc_addr;
+		/* 1120/1220 */ u32 _unknown_1120_guc;
+		/* 1124/1224 */ u32 _unknown_1124_guc;
+		/* 1128/1228 */ u32 unknown_init_bits; // bit 2 = guc related, bit 3 = rx1 related, bit 4 = guc related
+		/* 112c/122c */ u32 _unknown_112c;
+		/* 1130/1230 */ u64 _unknown_1130_guc_addr;
+		/* 1138/1238 */ u32 _unknown_1138_guc;
+		/* 113c/123c */ u32 _unknown_113c;
+		/* 1140/1240 */ u32 _unknown_1140_guc;
+		/* 1144/1244 */ u32 _unknown_1144[23];
+		/* 11a0/12a0 */ u32 _unknown_11a0_counters[6];
+		/* 11b8/12b8 */ u32 _unknown_11b8[18];
+	} dma_rx[2];
+};
+static_assert(sizeof(struct ithc_registers) == 0x1300);
+
+#define DEVCFG_DMA_RX_SIZE(x)          ((((x) & 0x3fff) + 1) << 6)
+#define DEVCFG_DMA_TX_SIZE(x)          (((((x) >> 14) & 0x3ff) + 1) << 6)
+
+#define DEVCFG_TOUCH_MASK              0x3f
+#define DEVCFG_TOUCH_ENABLE            BIT(0)
+#define DEVCFG_TOUCH_UNKNOWN_1         BIT(1)
+#define DEVCFG_TOUCH_UNKNOWN_2         BIT(2)
+#define DEVCFG_TOUCH_UNKNOWN_3         BIT(3)
+#define DEVCFG_TOUCH_UNKNOWN_4         BIT(4)
+#define DEVCFG_TOUCH_UNKNOWN_5         BIT(5)
+#define DEVCFG_TOUCH_UNKNOWN_6         BIT(6)
+
+#define DEVCFG_DEVICE_ID_TIC           0x43495424 // "$TIC"
+
+#define DEVCFG_SPI_MAX_FREQ(x)         (((x) >> 1) & 0xf) // high bit = use high speed mode?
+#define DEVCFG_SPI_MODE(x)             (((x) >> 6) & 3)
+#define DEVCFG_SPI_UNKNOWN_8(x)        (((x) >> 8) & 0x3f)
+#define DEVCFG_SPI_NEEDS_HEARTBEAT     BIT(20)
+#define DEVCFG_SPI_HEARTBEAT_INTERVAL  (((x) >> 21) & 7)
+#define DEVCFG_SPI_UNKNOWN_25          BIT(25)
+#define DEVCFG_SPI_UNKNOWN_26          BIT(26)
+#define DEVCFG_SPI_UNKNOWN_27          BIT(27)
+#define DEVCFG_SPI_DELAY               (((x) >> 28) & 7)
+#define DEVCFG_SPI_USE_EXT_READ_CFG    BIT(31)
+
+struct ithc_device_config {
+	u32 _unknown_00;      // 00 = 0xe0000402 (0xe0000401 after DMA_RX_CODE_RESET)
+	u32 _unknown_04;      // 04 = 0x00000000
+	u32 dma_buf_sizes;    // 08 = 0x000a00ff
+	u32 touch_cfg;        // 0c = 0x0000001c
+	u32 _unknown_10;      // 10 = 0x0000001c
+	u32 device_id;        // 14 = 0x43495424 = "$TIC"
+	u32 spi_config;       // 18 = 0xfda00a2e
+	u16 vendor_id;        // 1c = 0x045e = Microsoft Corp.
+	u16 product_id;       // 1e = 0x0c1a
+	u32 revision;         // 20 = 0x00000001
+	u32 fw_version;       // 24 = 0x05008a8b = 5.0.138.139
+	u32 _unknown_28;      // 28 = 0x00000000
+	u32 fw_mode;          // 2c = 0x00000000
+	u32 _unknown_30;      // 30 = 0x00000000
+	u32 _unknown_34;      // 34 = 0x0404035e (u8,u8,u8,u8 = version?)
+	u32 _unknown_38;      // 38 = 0x000001c0 (0x000001c1 after DMA_RX_CODE_RESET)
+	u32 _unknown_3c;      // 3c = 0x00000002
+};
+
+void bitsl(__iomem u32 *reg, u32 mask, u32 val);
+void bitsb(__iomem u8 *reg, u8 mask, u8 val);
+#define bitsl_set(reg, x) bitsl(reg, x, x)
+#define bitsb_set(reg, x) bitsb(reg, x, x)
+int waitl(struct ithc *ithc, __iomem u32 *reg, u32 mask, u32 val);
+int waitb(struct ithc *ithc, __iomem u8 *reg, u8 mask, u8 val);
+int ithc_set_spi_config(struct ithc *ithc, u8 speed, u8 mode);
+int ithc_spi_command(struct ithc *ithc, u8 command, u32 offset, u32 size, void *data);
+
diff --git a/drivers/hid/ithc/ithc.h b/drivers/hid/ithc/ithc.h
new file mode 100644
index 000000000000..6a9b0d480bc1
--- /dev/null
+++ b/drivers/hid/ithc/ithc.h
@@ -0,0 +1,60 @@
+#include <linux/module.h>
+#include <linux/input.h>
+#include <linux/hid.h>
+#include <linux/dma-mapping.h>
+#include <linux/highmem.h>
+#include <linux/pci.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/iopoll.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/miscdevice.h>
+#include <linux/debugfs.h>
+#include <linux/poll.h>
+#include <linux/timer.h>
+#include <linux/pm_qos.h>
+
+#define DEVNAME "ithc"
+#define DEVFULLNAME "Intel Touch Host Controller"
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#define CHECK(fn, ...) ({ int r = fn(__VA_ARGS__); if (r < 0) pci_err(ithc->pci, "%s: %s failed with %i\n", __func__, #fn, r); r; })
+#define CHECK_RET(...) do { int r = CHECK(__VA_ARGS__); if (r < 0) return r; } while(0)
+
+#define NUM_RX_BUF 16
+
+struct ithc;
+
+#include "ithc-regs.h"
+#include "ithc-dma.h"
+
+struct ithc {
+	char phys[32];
+	struct pci_dev *pci;
+	int irq;
+	struct task_struct *poll_thread;
+	struct pm_qos_request activity_qos;
+	struct timer_list activity_timer;
+
+	struct hid_device *hid;
+	bool hid_parse_done;
+	wait_queue_head_t wait_hid_parse;
+	wait_queue_head_t wait_hid_get_feature;
+	struct mutex hid_get_feature_mutex;
+	void *hid_get_feature_buf;
+	size_t hid_get_feature_size;
+
+	struct ithc_registers __iomem *regs;
+	struct ithc_registers *prev_regs; // for debugging
+	struct ithc_device_config config;
+	struct ithc_dma_rx dma_rx[2];
+	struct ithc_dma_tx dma_tx;
+};
+
+int ithc_reset(struct ithc *ithc);
+void ithc_set_active(struct ithc *ithc);
+int ithc_debug_init(struct ithc *ithc);
+void ithc_log_regs(struct ithc *ithc);
+
diff --git a/drivers/i2c/i2c-core-acpi.c b/drivers/i2c/i2c-core-acpi.c
index d6037a328669..a290ebc77aea 100644
--- a/drivers/i2c/i2c-core-acpi.c
+++ b/drivers/i2c/i2c-core-acpi.c
@@ -628,6 +628,28 @@ static int acpi_gsb_i2c_write_bytes(struct i2c_client *client,
 	return (ret == 1) ? 0 : -EIO;
 }
 
+static int acpi_gsb_i2c_write_raw_bytes(struct i2c_client *client,
+		u8 *data, u8 data_len)
+{
+	struct i2c_msg msgs[1];
+	int ret = AE_OK;
+
+	msgs[0].addr = client->addr;
+	msgs[0].flags = client->flags;
+	msgs[0].len = data_len + 1;
+	msgs[0].buf = data;
+
+	ret = i2c_transfer(client->adapter, msgs, ARRAY_SIZE(msgs));
+
+	if (ret < 0) {
+		dev_err(&client->adapter->dev, "i2c write failed: %d\n", ret);
+		return ret;
+	}
+
+	/* 1 transfer must have completed successfully */
+	return (ret == 1) ? 0 : -EIO;
+}
+
 static acpi_status
 i2c_acpi_space_handler(u32 function, acpi_physical_address command,
 			u32 bits, u64 *value64,
@@ -729,6 +751,19 @@ i2c_acpi_space_handler(u32 function, acpi_physical_address command,
 		}
 		break;
 
+	case ACPI_GSB_ACCESS_ATTRIB_RAW_BYTES:
+		if (action == ACPI_READ) {
+			dev_warn(&adapter->dev,
+				 "protocol 0x%02x not supported for client 0x%02x\n",
+				 accessor_type, client->addr);
+			ret = AE_BAD_PARAMETER;
+			goto err;
+		} else {
+			status = acpi_gsb_i2c_write_raw_bytes(client,
+					gsb->data, info->access_length);
+		}
+		break;
+
 	default:
 		dev_warn(&adapter->dev, "protocol 0x%02x not supported for client 0x%02x\n",
 			 accessor_type, client->addr);
diff --git a/drivers/input/misc/soc_button_array.c b/drivers/input/misc/soc_button_array.c
index 09489380afda..0f02411a60f1 100644
--- a/drivers/input/misc/soc_button_array.c
+++ b/drivers/input/misc/soc_button_array.c
@@ -507,8 +507,8 @@ static const struct soc_device_data soc_device_MSHW0028 = {
  * Both, the Surface Pro 4 (surfacepro3_button.c) and the above mentioned
  * devices use MSHW0040 for power and volume buttons, however the way they
  * have to be addressed differs. Make sure that we only load this drivers
- * for the correct devices by checking the OEM Platform Revision provided by
- * the _DSM method.
+ * for the correct devices by checking if the OEM Platform Revision DSM call
+ * exists.
  */
 #define MSHW0040_DSM_REVISION		0x01
 #define MSHW0040_DSM_GET_OMPR		0x02	// get OEM Platform Revision
@@ -519,31 +519,14 @@ static const guid_t MSHW0040_DSM_UUID =
 static int soc_device_check_MSHW0040(struct device *dev)
 {
 	acpi_handle handle = ACPI_HANDLE(dev);
-	union acpi_object *result;
-	u64 oem_platform_rev = 0;	// valid revisions are nonzero
-
-	// get OEM platform revision
-	result = acpi_evaluate_dsm_typed(handle, &MSHW0040_DSM_UUID,
-					 MSHW0040_DSM_REVISION,
-					 MSHW0040_DSM_GET_OMPR, NULL,
-					 ACPI_TYPE_INTEGER);
-
-	if (result) {
-		oem_platform_rev = result->integer.value;
-		ACPI_FREE(result);
-	}
-
-	/*
-	 * If the revision is zero here, the _DSM evaluation has failed. This
-	 * indicates that we have a Pro 4 or Book 1 and this driver should not
-	 * be used.
-	 */
-	if (oem_platform_rev == 0)
-		return -ENODEV;
+	bool exists;
 
-	dev_dbg(dev, "OEM Platform Revision %llu\n", oem_platform_rev);
+	// check if OEM platform revision DSM call exists
+	exists = acpi_check_dsm(handle, &MSHW0040_DSM_UUID,
+				MSHW0040_DSM_REVISION,
+				BIT(MSHW0040_DSM_GET_OMPR));
 
-	return 0;
+	return exists ? 0 : -ENODEV;
 }
 
 /*
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 7c2f4bd33582..f8937235de4b 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -37,6 +37,14 @@
 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
+#define IS_INTEL_IPU(pdev) ((pdev)->vendor == PCI_VENDOR_ID_INTEL &&	\
+			   ((pdev)->device == 0x9a19 ||		\
+			    (pdev)->device == 0x9a39 ||		\
+			    (pdev)->device == 0x4e19 ||		\
+			    (pdev)->device == 0x465d ||		\
+			    (pdev)->device == 0x1919))
+#define IS_IPTS(pdev) ((pdev)->vendor == PCI_VENDOR_ID_INTEL &&	\
+			    ((pdev)->device == 0x9d3e))
 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
 
 #define IOAPIC_RANGE_START	(0xfee00000)
@@ -287,12 +295,16 @@ int intel_iommu_enabled = 0;
 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 
 static int dmar_map_gfx = 1;
+static int dmar_map_ipts = 1;
+static int dmar_map_ipu = 1;
 static int intel_iommu_superpage = 1;
 static int iommu_identity_mapping;
 static int iommu_skip_te_disable;
 
 #define IDENTMAP_GFX		2
 #define IDENTMAP_AZALIA		4
+#define IDENTMAP_IPU		8
+#define IDENTMAP_IPTS		16
 
 const struct iommu_ops intel_iommu_ops;
 
@@ -2588,6 +2600,12 @@ static int device_def_domain_type(struct device *dev)
 
 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
 			return IOMMU_DOMAIN_IDENTITY;
+
+		if ((iommu_identity_mapping & IDENTMAP_IPU) && IS_INTEL_IPU(pdev))
+			return IOMMU_DOMAIN_IDENTITY;
+
+		if ((iommu_identity_mapping & IDENTMAP_IPTS) && IS_IPTS(pdev))
+			return IOMMU_DOMAIN_IDENTITY;
 	}
 
 	return 0;
@@ -2977,6 +2995,12 @@ static int __init init_dmars(void)
 	if (!dmar_map_gfx)
 		iommu_identity_mapping |= IDENTMAP_GFX;
 
+	if (!dmar_map_ipu)
+		iommu_identity_mapping |= IDENTMAP_IPU;
+
+	if (!dmar_map_ipts)
+		iommu_identity_mapping |= IDENTMAP_IPTS;
+
 	check_tylersburg_isoch();
 
 	ret = si_domain_init(hw_pass_through);
@@ -4819,6 +4843,30 @@ static void quirk_iommu_igfx(struct pci_dev *dev)
 	dmar_map_gfx = 0;
 }
 
+static void quirk_iommu_ipu(struct pci_dev *dev)
+{
+	if (!IS_INTEL_IPU(dev))
+		return;
+
+	if (risky_device(dev))
+		return;
+
+	pci_info(dev, "Passthrough IOMMU for integrated Intel IPU\n");
+	dmar_map_ipu = 0;
+}
+
+static void quirk_iommu_ipts(struct pci_dev *dev)
+{
+	if (!IS_IPTS(dev))
+		return;
+
+	if (risky_device(dev))
+		return;
+
+	pci_info(dev, "Passthrough IOMMU for IPTS\n");
+	dmar_map_ipts = 0;
+}
+
 /* G4x/GM45 integrated gfx dmar support is totally busted. */
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
@@ -4854,6 +4902,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
 
+/* disable IPU dmar support */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_iommu_ipu);
+
+/* disable IPTS dmar support */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x9D3E, quirk_iommu_ipts);
+
 static void quirk_iommu_rwbf(struct pci_dev *dev)
 {
 	if (risky_device(dev))
diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
index df9e261af0b5..bc2a0aefedf2 100644
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -390,6 +390,22 @@ static int set_msi_sid(struct irte *irte, struct pci_dev *dev)
 	data.busmatch_count = 0;
 	pci_for_each_dma_alias(dev, set_msi_sid_cb, &data);
 
+	/*
+	 * The Intel Touch Host Controller is at 00:10.6, but for some reason
+	 * the MSI interrupts have request id 01:05.0.
+	 * Disable id verification to work around this.
+	 * FIXME Find proper fix or turn this into a quirk.
+	 */
+	if (dev->vendor == PCI_VENDOR_ID_INTEL && (dev->class >> 8) == PCI_CLASS_INPUT_PEN) {
+		switch(dev->device) {
+		case 0x98d0: case 0x98d1: // LKF
+		case 0xa0d0: case 0xa0d1: // TGL LP
+		case 0x43d0: case 0x43d1: // TGL H
+			set_irte_sid(irte, SVT_NO_VERIFY, SQ_ALL_16, 0);
+			return 0;
+		}
+	}
+
 	/*
 	 * DMA alias provides us with a PCI device and alias.  The only case
 	 * where the it will return an alias on a different bus than the
diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig
index aaa9140bc351..1fb892c9acf1 100644
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@ -827,6 +827,18 @@ config LEDS_TPS6105X
 	  It is a single boost converter primarily for white LEDs and
 	  audio amplifiers.
 
+config LEDS_TPS68470
+	tristate "LED support for TI TPS68470"
+	depends on LEDS_CLASS
+	depends on INTEL_SKL_INT3472
+	help
+	  This driver supports TPS68470 PMIC with LED chip.
+	  It provides two LED controllers, with the ability to drive 2
+	  indicator LEDs and 2 flash LEDs.
+
+	  To compile this driver as a module, choose M and it will be
+	  called leds-tps68470
+
 config LEDS_IP30
 	tristate "LED support for SGI Octane machines"
 	depends on LEDS_CLASS
diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile
index d30395d11fd8..515a69953e73 100644
--- a/drivers/leds/Makefile
+++ b/drivers/leds/Makefile
@@ -80,6 +80,7 @@ obj-$(CONFIG_LEDS_TCA6507)		+= leds-tca6507.o
 obj-$(CONFIG_LEDS_TI_LMU_COMMON)	+= leds-ti-lmu-common.o
 obj-$(CONFIG_LEDS_TLC591XX)		+= leds-tlc591xx.o
 obj-$(CONFIG_LEDS_TPS6105X)		+= leds-tps6105x.o
+obj-$(CONFIG_LEDS_TPS68470)		+= leds-tps68470.o
 obj-$(CONFIG_LEDS_TURRIS_OMNIA)		+= leds-turris-omnia.o
 obj-$(CONFIG_LEDS_WM831X_STATUS)	+= leds-wm831x-status.o
 obj-$(CONFIG_LEDS_WM8350)		+= leds-wm8350.o
diff --git a/drivers/leds/leds-tps68470.c b/drivers/leds/leds-tps68470.c
new file mode 100644
index 000000000000..35aeb5db89c8
--- /dev/null
+++ b/drivers/leds/leds-tps68470.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * LED driver for TPS68470 PMIC
+ *
+ * Copyright (C) 2023 Red Hat
+ *
+ * Authors:
+ *	Kate Hsuan <hpa@redhat.com>
+ */
+
+#include <linux/leds.h>
+#include <linux/mfd/tps68470.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/property.h>
+#include <linux/regmap.h>
+
+
+#define lcdev_to_led(led_cdev) \
+	container_of(led_cdev, struct tps68470_led, lcdev)
+
+#define led_to_tps68470(led, index) \
+	container_of(led, struct tps68470_device, leds[index])
+
+enum tps68470_led_ids {
+	TPS68470_ILED_A,
+	TPS68470_ILED_B,
+	TPS68470_NUM_LEDS
+};
+
+static const char *tps68470_led_names[] = {
+	[TPS68470_ILED_A] = "tps68470-iled_a",
+	[TPS68470_ILED_B] = "tps68470-iled_b",
+};
+
+struct tps68470_led {
+	unsigned int led_id;
+	struct led_classdev lcdev;
+};
+
+struct tps68470_device {
+	struct device *dev;
+	struct regmap *regmap;
+	struct tps68470_led leds[TPS68470_NUM_LEDS];
+};
+
+enum ctrlb_current {
+	CTRLB_2MA	= 0,
+	CTRLB_4MA	= 1,
+	CTRLB_8MA	= 2,
+	CTRLB_16MA	= 3,
+};
+
+static int tps68470_brightness_set(struct led_classdev *led_cdev, enum led_brightness brightness)
+{
+	struct tps68470_led *led = lcdev_to_led(led_cdev);
+	struct tps68470_device *tps68470 = led_to_tps68470(led, led->led_id);
+	struct regmap *regmap = tps68470->regmap;
+
+	switch (led->led_id) {
+	case TPS68470_ILED_A:
+		return regmap_update_bits(regmap, TPS68470_REG_ILEDCTL, TPS68470_ILEDCTL_ENA,
+					  brightness ? TPS68470_ILEDCTL_ENA : 0);
+	case TPS68470_ILED_B:
+		return regmap_update_bits(regmap, TPS68470_REG_ILEDCTL, TPS68470_ILEDCTL_ENB,
+					  brightness ? TPS68470_ILEDCTL_ENB : 0);
+	}
+	return -EINVAL;
+}
+
+static enum led_brightness tps68470_brightness_get(struct led_classdev *led_cdev)
+{
+	struct tps68470_led *led = lcdev_to_led(led_cdev);
+	struct tps68470_device *tps68470 = led_to_tps68470(led, led->led_id);
+	struct regmap *regmap = tps68470->regmap;
+	int ret = 0;
+	int value = 0;
+
+	ret =  regmap_read(regmap, TPS68470_REG_ILEDCTL, &value);
+	if (ret)
+		return dev_err_probe(led_cdev->dev, -EINVAL, "failed on reading register\n");
+
+	switch (led->led_id) {
+	case TPS68470_ILED_A:
+		value = value & TPS68470_ILEDCTL_ENA;
+		break;
+	case TPS68470_ILED_B:
+		value = value & TPS68470_ILEDCTL_ENB;
+		break;
+	}
+
+	return value ? LED_ON : LED_OFF;
+}
+
+
+static int tps68470_ledb_current_init(struct platform_device *pdev,
+				      struct tps68470_device *tps68470)
+{
+	int ret = 0;
+	unsigned int curr;
+
+	/* configure LEDB current if the properties can be got */
+	if (!device_property_read_u32(&pdev->dev, "ti,ledb-current", &curr)) {
+		if (curr > CTRLB_16MA) {
+			dev_err(&pdev->dev,
+				"Invalid LEDB current value: %d\n",
+				curr);
+			return -EINVAL;
+		}
+		ret = regmap_update_bits(tps68470->regmap, TPS68470_REG_ILEDCTL,
+					 TPS68470_ILEDCTL_CTRLB, curr);
+	}
+	return ret;
+}
+
+static int tps68470_leds_probe(struct platform_device *pdev)
+{
+	int i = 0;
+	int ret = 0;
+	struct tps68470_device *tps68470;
+	struct tps68470_led *led;
+	struct led_classdev *lcdev;
+
+	tps68470 = devm_kzalloc(&pdev->dev, sizeof(struct tps68470_device),
+				GFP_KERNEL);
+	if (!tps68470)
+		return -ENOMEM;
+
+	tps68470->dev = &pdev->dev;
+	tps68470->regmap = dev_get_drvdata(pdev->dev.parent);
+
+	for (i = 0; i < TPS68470_NUM_LEDS; i++) {
+		led = &tps68470->leds[i];
+		lcdev = &led->lcdev;
+
+		led->led_id = i;
+
+		lcdev->name = devm_kasprintf(tps68470->dev, GFP_KERNEL, "%s::%s",
+					     tps68470_led_names[i], LED_FUNCTION_INDICATOR);
+		if (!lcdev->name)
+			return -ENOMEM;
+
+		lcdev->max_brightness = 1;
+		lcdev->brightness = 0;
+		lcdev->brightness_set_blocking = tps68470_brightness_set;
+		lcdev->brightness_get = tps68470_brightness_get;
+		lcdev->dev = &pdev->dev;
+
+		ret = devm_led_classdev_register(tps68470->dev, lcdev);
+		if (ret) {
+			dev_err_probe(tps68470->dev, ret,
+				      "error registering led\n");
+			goto err_exit;
+		}
+
+		if (i == TPS68470_ILED_B) {
+			ret = tps68470_ledb_current_init(pdev, tps68470);
+			if (ret)
+				goto err_exit;
+		}
+	}
+
+err_exit:
+	if (ret) {
+		for (i = 0; i < TPS68470_NUM_LEDS; i++) {
+			if (tps68470->leds[i].lcdev.name)
+				devm_led_classdev_unregister(&pdev->dev,
+							     &tps68470->leds[i].lcdev);
+		}
+	}
+
+	return ret;
+}
+static struct platform_driver tps68470_led_driver = {
+	.driver = {
+		   .name = "tps68470-led",
+	},
+	.probe = tps68470_leds_probe,
+};
+
+module_platform_driver(tps68470_led_driver);
+
+MODULE_ALIAS("platform:tps68470-led");
+MODULE_DESCRIPTION("LED driver for TPS68470 PMIC");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/media/i2c/Kconfig b/drivers/media/i2c/Kconfig
index c3d5952ca27e..dbc6d20efbe5 100644
--- a/drivers/media/i2c/Kconfig
+++ b/drivers/media/i2c/Kconfig
@@ -887,6 +887,17 @@ config VIDEO_DW9714
 	  capability. This is designed for linear control of
 	  voice coil motors, controlled via I2C serial interface.
 
+config VIDEO_DW9719
+	tristate "DW9719 lens voice coil support"
+	depends on I2C && VIDEO_DEV
+	select MEDIA_CONTROLLER
+	select VIDEO_V4L2_SUBDEV_API
+	select V4L2_ASYNC
+	help
+	  This is a driver for the DW9719 camera lens voice coil.
+	  This is designed for linear control of  voice coil motors,
+	  controlled via I2C serial interface.
+
 config VIDEO_DW9768
 	tristate "DW9768 lens voice coil support"
 	depends on I2C && VIDEO_DEV
diff --git a/drivers/media/i2c/Makefile b/drivers/media/i2c/Makefile
index 4f5e9d9cee85..eef3ee84d7dc 100644
--- a/drivers/media/i2c/Makefile
+++ b/drivers/media/i2c/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_VIDEO_CS5345) += cs5345.o
 obj-$(CONFIG_VIDEO_CS53L32A) += cs53l32a.o
 obj-$(CONFIG_VIDEO_CX25840) += cx25840/
 obj-$(CONFIG_VIDEO_DW9714) += dw9714.o
+obj-$(CONFIG_VIDEO_DW9719) += dw9719.o
 obj-$(CONFIG_VIDEO_DW9768) += dw9768.o
 obj-$(CONFIG_VIDEO_DW9807_VCM) += dw9807-vcm.o
 obj-$(CONFIG_VIDEO_ET8EK8) += et8ek8/
diff --git a/drivers/media/i2c/dw9719.c b/drivers/media/i2c/dw9719.c
new file mode 100644
index 000000000000..180b04d2a6b3
--- /dev/null
+++ b/drivers/media/i2c/dw9719.c
@@ -0,0 +1,425 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2012 Intel Corporation
+
+/*
+ * Based on linux/modules/camera/drivers/media/i2c/imx/dw9719.c in this repo:
+ * https://github.com/ZenfoneArea/android_kernel_asus_zenfone5
+ */
+
+#include <asm/unaligned.h>
+
+#include <linux/delay.h>
+#include <linux/i2c.h>
+#include <linux/pm_runtime.h>
+#include <linux/regulator/consumer.h>
+#include <linux/types.h>
+
+#include <media/v4l2-common.h>
+#include <media/v4l2-ctrls.h>
+#include <media/v4l2-subdev.h>
+
+#define DW9719_MAX_FOCUS_POS	1023
+#define DW9719_CTRL_STEPS	16
+#define DW9719_CTRL_DELAY_US	1000
+#define DELAY_MAX_PER_STEP_NS	(1000000 * 1023)
+
+#define DW9719_INFO			0
+#define DW9719_ID			0xF1
+#define DW9719_CONTROL			2
+#define DW9719_VCM_CURRENT		3
+
+#define DW9719_MODE			6
+#define DW9719_VCM_FREQ			7
+
+#define DW9719_MODE_SAC3		0x40
+#define DW9719_DEFAULT_VCM_FREQ		0x60
+#define DW9719_ENABLE_RINGING		0x02
+
+#define NUM_REGULATORS			2
+
+#define to_dw9719_device(x) container_of(x, struct dw9719_device, sd)
+
+struct dw9719_device {
+	struct device *dev;
+	struct i2c_client *client;
+	struct regulator_bulk_data regulators[NUM_REGULATORS];
+	struct v4l2_subdev sd;
+
+	struct dw9719_v4l2_ctrls {
+		struct v4l2_ctrl_handler handler;
+		struct v4l2_ctrl *focus;
+	} ctrls;
+};
+
+static int dw9719_i2c_rd8(struct i2c_client *client, u8 reg, u8 *val)
+{
+	struct i2c_msg msg[2];
+	u8 buf[2] = { reg };
+	int ret;
+
+	msg[0].addr = client->addr;
+	msg[0].flags = 0;
+	msg[0].len = 1;
+	msg[0].buf = buf;
+
+	msg[1].addr = client->addr;
+	msg[1].flags = I2C_M_RD;
+	msg[1].len = 1;
+	msg[1].buf = &buf[1];
+	*val = 0;
+
+	ret = i2c_transfer(client->adapter, msg, 2);
+	if (ret < 0)
+		return ret;
+
+	*val = buf[1];
+
+	return 0;
+}
+
+static int dw9719_i2c_wr8(struct i2c_client *client, u8 reg, u8 val)
+{
+	struct i2c_msg msg;
+	int ret;
+
+	u8 buf[2] = { reg, val };
+
+	msg.addr = client->addr;
+	msg.flags = 0;
+	msg.len = sizeof(buf);
+	msg.buf = buf;
+
+	ret = i2c_transfer(client->adapter, &msg, 1);
+
+	return ret < 0 ? ret : 0;
+}
+
+static int dw9719_i2c_wr16(struct i2c_client *client, u8 reg, u16 val)
+{
+	struct i2c_msg msg;
+	u8 buf[3] = { reg };
+	int ret;
+
+	put_unaligned_be16(val, buf + 1);
+
+	msg.addr = client->addr;
+	msg.flags = 0;
+	msg.len = sizeof(buf);
+	msg.buf = buf;
+
+	ret = i2c_transfer(client->adapter, &msg, 1);
+
+	return ret < 0 ? ret : 0;
+}
+
+static int dw9719_detect(struct dw9719_device *dw9719)
+{
+	int ret;
+	u8 val;
+
+	ret = dw9719_i2c_rd8(dw9719->client, DW9719_INFO, &val);
+	if (ret < 0)
+		return ret;
+
+	if (val != DW9719_ID) {
+		dev_err(dw9719->dev, "Failed to detect correct id\n");
+		ret = -ENXIO;
+	}
+
+	return 0;
+}
+
+static int dw9719_power_down(struct dw9719_device *dw9719)
+{
+	return regulator_bulk_disable(NUM_REGULATORS, dw9719->regulators);
+}
+
+static int dw9719_power_up(struct dw9719_device *dw9719)
+{
+	int ret;
+
+	ret = regulator_bulk_enable(NUM_REGULATORS, dw9719->regulators);
+	if (ret)
+		return ret;
+
+	/* Jiggle SCL pin to wake up device */
+	ret = dw9719_i2c_wr8(dw9719->client, DW9719_CONTROL, 1);
+
+	/* Need 100us to transit from SHUTDOWN to STANDBY*/
+	usleep_range(100, 1000);
+
+	ret = dw9719_i2c_wr8(dw9719->client, DW9719_CONTROL,
+			     DW9719_ENABLE_RINGING);
+	if (ret < 0)
+		goto fail_powerdown;
+
+	ret = dw9719_i2c_wr8(dw9719->client, DW9719_MODE, DW9719_MODE_SAC3);
+	if (ret < 0)
+		goto fail_powerdown;
+
+	ret = dw9719_i2c_wr8(dw9719->client, DW9719_VCM_FREQ,
+			     DW9719_DEFAULT_VCM_FREQ);
+	if (ret < 0)
+		goto fail_powerdown;
+
+	return 0;
+
+fail_powerdown:
+	dw9719_power_down(dw9719);
+	return ret;
+}
+
+static int dw9719_t_focus_abs(struct dw9719_device *dw9719, s32 value)
+{
+	int ret;
+
+	value = clamp(value, 0, DW9719_MAX_FOCUS_POS);
+	ret = dw9719_i2c_wr16(dw9719->client, DW9719_VCM_CURRENT, value);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+static int dw9719_set_ctrl(struct v4l2_ctrl *ctrl)
+{
+	struct dw9719_device *dw9719 = container_of(ctrl->handler,
+						    struct dw9719_device,
+						    ctrls.handler);
+	int ret;
+
+	/* Only apply changes to the controls if the device is powered up */
+	if (!pm_runtime_get_if_in_use(dw9719->dev))
+		return 0;
+
+	switch (ctrl->id) {
+	case V4L2_CID_FOCUS_ABSOLUTE:
+		ret = dw9719_t_focus_abs(dw9719, ctrl->val);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	pm_runtime_put(dw9719->dev);
+
+	return ret;
+}
+
+static const struct v4l2_ctrl_ops dw9719_ctrl_ops = {
+	.s_ctrl = dw9719_set_ctrl,
+};
+
+static int __maybe_unused dw9719_suspend(struct device *dev)
+{
+	struct v4l2_subdev *sd = dev_get_drvdata(dev);
+	struct dw9719_device *dw9719 = to_dw9719_device(sd);
+	int ret;
+	int val;
+
+	for (val = dw9719->ctrls.focus->val; val >= 0;
+	     val -= DW9719_CTRL_STEPS) {
+		ret = dw9719_t_focus_abs(dw9719, val);
+		if (ret)
+			return ret;
+
+		usleep_range(DW9719_CTRL_DELAY_US, DW9719_CTRL_DELAY_US + 10);
+	}
+
+	return dw9719_power_down(dw9719);
+}
+
+static int __maybe_unused dw9719_resume(struct device *dev)
+{
+	struct v4l2_subdev *sd = dev_get_drvdata(dev);
+	struct dw9719_device *dw9719 = to_dw9719_device(sd);
+	int current_focus = dw9719->ctrls.focus->val;
+	int ret;
+	int val;
+
+	ret = dw9719_power_up(dw9719);
+	if (ret)
+		return ret;
+
+	for (val = current_focus % DW9719_CTRL_STEPS; val < current_focus;
+	     val += DW9719_CTRL_STEPS) {
+		ret = dw9719_t_focus_abs(dw9719, val);
+		if (ret)
+			goto err_power_down;
+
+		usleep_range(DW9719_CTRL_DELAY_US, DW9719_CTRL_DELAY_US + 10);
+	}
+
+	return 0;
+
+err_power_down:
+	dw9719_power_down(dw9719);
+	return ret;
+}
+
+static int dw9719_open(struct v4l2_subdev *sd, struct v4l2_subdev_fh *fh)
+{
+	return pm_runtime_resume_and_get(sd->dev);
+}
+
+static int dw9719_close(struct v4l2_subdev *sd, struct v4l2_subdev_fh *fh)
+{
+	pm_runtime_put(sd->dev);
+
+	return 0;
+}
+
+static const struct v4l2_subdev_internal_ops dw9719_internal_ops = {
+	.open = dw9719_open,
+	.close = dw9719_close,
+};
+
+static int dw9719_init_controls(struct dw9719_device *dw9719)
+{
+	const struct v4l2_ctrl_ops *ops = &dw9719_ctrl_ops;
+	int ret;
+
+	ret = v4l2_ctrl_handler_init(&dw9719->ctrls.handler, 1);
+	if (ret)
+		return ret;
+
+	dw9719->ctrls.focus = v4l2_ctrl_new_std(&dw9719->ctrls.handler, ops,
+						V4L2_CID_FOCUS_ABSOLUTE, 0,
+						DW9719_MAX_FOCUS_POS, 1, 0);
+
+	if (dw9719->ctrls.handler.error) {
+		dev_err(dw9719->dev, "Error initialising v4l2 ctrls\n");
+		ret = dw9719->ctrls.handler.error;
+		goto err_free_handler;
+	}
+
+	dw9719->sd.ctrl_handler = &dw9719->ctrls.handler;
+
+	return ret;
+
+err_free_handler:
+	v4l2_ctrl_handler_free(&dw9719->ctrls.handler);
+	return ret;
+}
+
+static const struct v4l2_subdev_ops dw9719_ops = { };
+
+static int dw9719_probe(struct i2c_client *client)
+{
+	struct dw9719_device *dw9719;
+	int ret;
+
+	dw9719 = devm_kzalloc(&client->dev, sizeof(*dw9719), GFP_KERNEL);
+	if (!dw9719)
+		return -ENOMEM;
+
+	dw9719->client = client;
+	dw9719->dev = &client->dev;
+
+	dw9719->regulators[0].supply = "vdd";
+	/*
+	 * The DW9719 has only the 1 VDD voltage input, but some PMICs such as
+	 * the TPS68470 PMIC have I2C passthrough capability, to disconnect the
+	 * sensor's I2C pins from the I2C bus when the sensors VSIO (Sensor-IO)
+	 * is off, because some sensors then short these pins to ground;
+	 * and the DW9719 might sit behind this passthrough, this it needs to
+	 * enable VSIO as that will also enable the I2C passthrough.
+	 */
+	dw9719->regulators[1].supply = "vsio";
+
+	ret = devm_regulator_bulk_get(&client->dev, NUM_REGULATORS,
+				      dw9719->regulators);
+	if (ret)
+		return dev_err_probe(&client->dev, ret, "getting regulators\n");
+
+	v4l2_i2c_subdev_init(&dw9719->sd, client, &dw9719_ops);
+	dw9719->sd.flags |= V4L2_SUBDEV_FL_HAS_DEVNODE;
+	dw9719->sd.internal_ops = &dw9719_internal_ops;
+
+	ret = dw9719_init_controls(dw9719);
+	if (ret)
+		return ret;
+
+	ret = media_entity_pads_init(&dw9719->sd.entity, 0, NULL);
+	if (ret < 0)
+		goto err_free_ctrl_handler;
+
+	dw9719->sd.entity.function = MEDIA_ENT_F_LENS;
+
+	/*
+	 * We need the driver to work in the event that pm runtime is disable in
+	 * the kernel, so power up and verify the chip now. In the event that
+	 * runtime pm is disabled this will leave the chip on, so that the lens
+	 * will work.
+	 */
+
+	ret = dw9719_power_up(dw9719);
+	if (ret)
+		goto err_cleanup_media;
+
+	ret = dw9719_detect(dw9719);
+	if (ret)
+		goto err_powerdown;
+
+	pm_runtime_set_active(&client->dev);
+	pm_runtime_get_noresume(&client->dev);
+	pm_runtime_enable(&client->dev);
+
+	ret = v4l2_async_register_subdev(&dw9719->sd);
+	if (ret < 0)
+		goto err_pm_runtime;
+
+	pm_runtime_set_autosuspend_delay(&client->dev, 1000);
+	pm_runtime_use_autosuspend(&client->dev);
+	pm_runtime_put_autosuspend(&client->dev);
+
+	return ret;
+
+err_pm_runtime:
+	pm_runtime_disable(&client->dev);
+	pm_runtime_put_noidle(&client->dev);
+err_powerdown:
+	dw9719_power_down(dw9719);
+err_cleanup_media:
+	media_entity_cleanup(&dw9719->sd.entity);
+err_free_ctrl_handler:
+	v4l2_ctrl_handler_free(&dw9719->ctrls.handler);
+
+	return ret;
+}
+
+static void dw9719_remove(struct i2c_client *client)
+{
+	struct v4l2_subdev *sd = i2c_get_clientdata(client);
+	struct dw9719_device *dw9719 = container_of(sd, struct dw9719_device,
+						    sd);
+
+	pm_runtime_disable(&client->dev);
+	v4l2_async_unregister_subdev(sd);
+	v4l2_ctrl_handler_free(&dw9719->ctrls.handler);
+	media_entity_cleanup(&dw9719->sd.entity);
+}
+
+static const struct i2c_device_id dw9719_id_table[] = {
+	{ "dw9719" },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, dw9719_id_table);
+
+static const struct dev_pm_ops dw9719_pm_ops = {
+	SET_RUNTIME_PM_OPS(dw9719_suspend, dw9719_resume, NULL)
+};
+
+static struct i2c_driver dw9719_i2c_driver = {
+	.driver = {
+		.name = "dw9719",
+		.pm = &dw9719_pm_ops,
+	},
+	.probe_new = dw9719_probe,
+	.remove = dw9719_remove,
+	.id_table = dw9719_id_table,
+};
+module_i2c_driver(dw9719_i2c_driver);
+
+MODULE_AUTHOR("Daniel Scally <djrscally@gmail.com>");
+MODULE_DESCRIPTION("DW9719 VCM Driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/media/i2c/ov7251.c b/drivers/media/i2c/ov7251.c
index 88e987435285..ff7b2c26da83 100644
--- a/drivers/media/i2c/ov7251.c
+++ b/drivers/media/i2c/ov7251.c
@@ -1051,7 +1051,7 @@ static int ov7251_s_ctrl(struct v4l2_ctrl *ctrl)
 	case V4L2_CID_EXPOSURE:
 		ret = ov7251_set_exposure(ov7251, ctrl->val);
 		break;
-	case V4L2_CID_GAIN:
+	case V4L2_CID_ANALOGUE_GAIN:
 		ret = ov7251_set_gain(ov7251, ctrl->val);
 		break;
 	case V4L2_CID_TEST_PATTERN:
@@ -1551,7 +1551,7 @@ static int ov7251_init_ctrls(struct ov7251 *ov7251)
 	ov7251->exposure = v4l2_ctrl_new_std(&ov7251->ctrls, &ov7251_ctrl_ops,
 					     V4L2_CID_EXPOSURE, 1, 32, 1, 32);
 	ov7251->gain = v4l2_ctrl_new_std(&ov7251->ctrls, &ov7251_ctrl_ops,
-					 V4L2_CID_GAIN, 16, 1023, 1, 16);
+					 V4L2_CID_ANALOGUE_GAIN, 16, 1023, 1, 16);
 	v4l2_ctrl_new_std_menu_items(&ov7251->ctrls, &ov7251_ctrl_ops,
 				     V4L2_CID_TEST_PATTERN,
 				     ARRAY_SIZE(ov7251_test_pattern_menu) - 1,
diff --git a/drivers/media/pci/intel/ipu3/ipu3-cio2-main.c b/drivers/media/pci/intel/ipu3/ipu3-cio2-main.c
index 3b76a9d0383a..82681df7d794 100644
--- a/drivers/media/pci/intel/ipu3/ipu3-cio2-main.c
+++ b/drivers/media/pci/intel/ipu3/ipu3-cio2-main.c
@@ -1383,7 +1383,10 @@ static int cio2_notifier_bound(struct v4l2_async_notifier *notifier,
 {
 	struct cio2_device *cio2 = to_cio2_device(notifier);
 	struct sensor_async_subdev *s_asd = to_sensor_asd(asd);
+	struct device *dev = &cio2->pci_dev->dev;
 	struct cio2_queue *q;
+	unsigned int pad;
+	int ret;
 
 	if (cio2->queue[s_asd->csi2.port].sensor)
 		return -EBUSY;
@@ -1394,7 +1397,26 @@ static int cio2_notifier_bound(struct v4l2_async_notifier *notifier,
 	q->sensor = sd;
 	q->csi_rx_base = cio2->base + CIO2_REG_PIPE_BASE(q->csi2.port);
 
-	return 0;
+	for (pad = 0; pad < q->sensor->entity.num_pads; pad++)
+		if (q->sensor->entity.pads[pad].flags &
+					MEDIA_PAD_FL_SOURCE)
+			break;
+
+	if (pad == q->sensor->entity.num_pads) {
+		dev_err(dev, "failed to find src pad for %s\n",
+			q->sensor->name);
+		return -ENXIO;
+	}
+
+	ret = media_create_pad_link(&q->sensor->entity, pad, &q->subdev.entity,
+				    CIO2_PAD_SINK, 0);
+	if (ret) {
+		dev_err(dev, "failed to create link for %s\n",
+			q->sensor->name);
+		return ret;
+	}
+
+	return v4l2_device_register_subdev_nodes(&cio2->v4l2_dev);
 }
 
 /* The .unbind callback */
@@ -1412,38 +1434,6 @@ static void cio2_notifier_unbind(struct v4l2_async_notifier *notifier,
 static int cio2_notifier_complete(struct v4l2_async_notifier *notifier)
 {
 	struct cio2_device *cio2 = to_cio2_device(notifier);
-	struct device *dev = &cio2->pci_dev->dev;
-	struct sensor_async_subdev *s_asd;
-	struct v4l2_async_subdev *asd;
-	struct cio2_queue *q;
-	unsigned int pad;
-	int ret;
-
-	list_for_each_entry(asd, &cio2->notifier.asd_list, asd_list) {
-		s_asd = to_sensor_asd(asd);
-		q = &cio2->queue[s_asd->csi2.port];
-
-		for (pad = 0; pad < q->sensor->entity.num_pads; pad++)
-			if (q->sensor->entity.pads[pad].flags &
-						MEDIA_PAD_FL_SOURCE)
-				break;
-
-		if (pad == q->sensor->entity.num_pads) {
-			dev_err(dev, "failed to find src pad for %s\n",
-				q->sensor->name);
-			return -ENXIO;
-		}
-
-		ret = media_create_pad_link(
-				&q->sensor->entity, pad,
-				&q->subdev.entity, CIO2_PAD_SINK,
-				0);
-		if (ret) {
-			dev_err(dev, "failed to create link for %s\n",
-				q->sensor->name);
-			return ret;
-		}
-	}
 
 	return v4l2_device_register_subdev_nodes(&cio2->v4l2_dev);
 }
diff --git a/drivers/media/v4l2-core/v4l2-async.c b/drivers/media/v4l2-core/v4l2-async.c
index b16b5f4cb91e..33739a979cbc 100644
--- a/drivers/media/v4l2-core/v4l2-async.c
+++ b/drivers/media/v4l2-core/v4l2-async.c
@@ -760,6 +760,10 @@ int v4l2_async_register_subdev(struct v4l2_subdev *sd)
 	struct v4l2_async_notifier *notifier;
 	int ret;
 
+	ret = v4l2_subdev_get_privacy_led(sd);
+	if (ret < 0)
+		return ret;
+
 	/*
 	 * No reference taken. The reference is held by the device
 	 * (struct v4l2_subdev.dev), and async sub-device does not
diff --git a/drivers/media/v4l2-core/v4l2-fwnode.c b/drivers/media/v4l2-core/v4l2-fwnode.c
index 049c2f2001ea..f8c3e40b2b71 100644
--- a/drivers/media/v4l2-core/v4l2-fwnode.c
+++ b/drivers/media/v4l2-core/v4l2-fwnode.c
@@ -1304,10 +1304,6 @@ int v4l2_async_register_subdev_sensor(struct v4l2_subdev *sd)
 
 	v4l2_async_nf_init(notifier);
 
-	ret = v4l2_subdev_get_privacy_led(sd);
-	if (ret < 0)
-		goto out_cleanup;
-
 	ret = v4l2_async_nf_parse_fwnode_sensor(sd->dev, notifier);
 	if (ret < 0)
 		goto out_cleanup;
diff --git a/drivers/misc/mei/hw-me-regs.h b/drivers/misc/mei/hw-me-regs.h
index bdc65d50b945..08723c01d727 100644
--- a/drivers/misc/mei/hw-me-regs.h
+++ b/drivers/misc/mei/hw-me-regs.h
@@ -92,6 +92,7 @@
 #define MEI_DEV_ID_CDF        0x18D3  /* Cedar Fork */
 
 #define MEI_DEV_ID_ICP_LP     0x34E0  /* Ice Lake Point LP */
+#define MEI_DEV_ID_ICP_LP_3   0x34E4  /* Ice Lake Point LP 3 (iTouch) */
 #define MEI_DEV_ID_ICP_N      0x38E0  /* Ice Lake Point N */
 
 #define MEI_DEV_ID_JSP_N      0x4DE0  /* Jasper Lake Point N */
diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c
index 5bf0d50d55a0..c13864512229 100644
--- a/drivers/misc/mei/pci-me.c
+++ b/drivers/misc/mei/pci-me.c
@@ -97,6 +97,7 @@ static const struct pci_device_id mei_me_pci_tbl[] = {
 	{MEI_PCI_DEVICE(MEI_DEV_ID_CMP_H_3, MEI_ME_PCH8_ITOUCH_CFG)},
 
 	{MEI_PCI_DEVICE(MEI_DEV_ID_ICP_LP, MEI_ME_PCH12_CFG)},
+	{MEI_PCI_DEVICE(MEI_DEV_ID_ICP_LP_3, MEI_ME_PCH12_CFG)},
 	{MEI_PCI_DEVICE(MEI_DEV_ID_ICP_N, MEI_ME_PCH12_CFG)},
 
 	{MEI_PCI_DEVICE(MEI_DEV_ID_TGP_LP, MEI_ME_PCH15_CFG)},
diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c
index 5eb131ab916f..67f074a126d1 100644
--- a/drivers/net/wireless/ath/ath10k/core.c
+++ b/drivers/net/wireless/ath/ath10k/core.c
@@ -38,6 +38,9 @@ static bool fw_diag_log;
 /* frame mode values are mapped as per enum ath10k_hw_txrx_mode */
 unsigned int ath10k_frame_mode = ATH10K_HW_TXRX_NATIVE_WIFI;
 
+static char *override_board = "";
+static char *override_board2 = "";
+
 unsigned long ath10k_coredump_mask = BIT(ATH10K_FW_CRASH_DUMP_REGISTERS) |
 				     BIT(ATH10K_FW_CRASH_DUMP_CE_DATA);
 
@@ -50,6 +53,9 @@ module_param(fw_diag_log, bool, 0644);
 module_param_named(frame_mode, ath10k_frame_mode, uint, 0644);
 module_param_named(coredump_mask, ath10k_coredump_mask, ulong, 0444);
 
+module_param(override_board, charp, 0644);
+module_param(override_board2, charp, 0644);
+
 MODULE_PARM_DESC(debug_mask, "Debugging mask");
 MODULE_PARM_DESC(uart_print, "Uart target debugging");
 MODULE_PARM_DESC(skip_otp, "Skip otp failure for calibration in testmode");
@@ -59,6 +65,9 @@ MODULE_PARM_DESC(frame_mode,
 MODULE_PARM_DESC(coredump_mask, "Bitfield of what to include in firmware crash file");
 MODULE_PARM_DESC(fw_diag_log, "Diag based fw log debugging");
 
+MODULE_PARM_DESC(override_board, "Override for board.bin file");
+MODULE_PARM_DESC(override_board2, "Override for board-2.bin file");
+
 static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 	{
 		.id = QCA988X_HW_2_0_VERSION,
@@ -911,6 +920,42 @@ static int ath10k_init_configure_target(struct ath10k *ar)
 	return 0;
 }
 
+static const char *ath10k_override_board_fw_file(struct ath10k *ar,
+						 const char *file)
+{
+	if (strcmp(file, "board.bin") == 0) {
+		if (strcmp(override_board, "") == 0)
+			return file;
+
+		if (strcmp(override_board, "none") == 0) {
+			dev_info(ar->dev, "firmware override: pretending 'board.bin' does not exist\n");
+			return NULL;
+		}
+
+		dev_info(ar->dev, "firmware override: replacing 'board.bin' with '%s'\n",
+			 override_board);
+
+		return override_board;
+	}
+
+	if (strcmp(file, "board-2.bin") == 0) {
+		if (strcmp(override_board2, "") == 0)
+			return file;
+
+		if (strcmp(override_board2, "none") == 0) {
+			dev_info(ar->dev, "firmware override: pretending 'board-2.bin' does not exist\n");
+			return NULL;
+		}
+
+		dev_info(ar->dev, "firmware override: replacing 'board-2.bin' with '%s'\n",
+			 override_board2);
+
+		return override_board2;
+	}
+
+	return file;
+}
+
 static const struct firmware *ath10k_fetch_fw_file(struct ath10k *ar,
 						   const char *dir,
 						   const char *file)
@@ -925,6 +970,19 @@ static const struct firmware *ath10k_fetch_fw_file(struct ath10k *ar,
 	if (dir == NULL)
 		dir = ".";
 
+	/* HACK: Override board.bin and board-2.bin files if specified.
+	 *
+	 * Some Surface devices perform better with a different board
+	 * configuration. To this end, one would need to replace the board.bin
+	 * file with the modified config and remove the board-2.bin file.
+	 * Unfortunately, that's not a solution that we can easily package. So
+	 * we add module options to perform these overrides here.
+	 */
+
+	file = ath10k_override_board_fw_file(ar, file);
+	if (!file)
+		return ERR_PTR(-ENOENT);
+
 	snprintf(filename, sizeof(filename), "%s/%s", dir, file);
 	ret = firmware_request_nowarn(&fw, filename, ar->dev);
 	ath10k_dbg(ar, ATH10K_DBG_BOOT, "boot fw request '%s': %d\n",
diff --git a/drivers/net/wireless/marvell/mwifiex/pcie.c b/drivers/net/wireless/marvell/mwifiex/pcie.c
index 9a698a16a8f3..5e1a341f63df 100644
--- a/drivers/net/wireless/marvell/mwifiex/pcie.c
+++ b/drivers/net/wireless/marvell/mwifiex/pcie.c
@@ -368,6 +368,7 @@ static int mwifiex_pcie_probe(struct pci_dev *pdev,
 					const struct pci_device_id *ent)
 {
 	struct pcie_service_card *card;
+	struct pci_dev *parent_pdev = pci_upstream_bridge(pdev);
 	int ret;
 
 	pr_debug("info: vendor=0x%4.04X device=0x%4.04X rev=%d\n",
@@ -409,6 +410,12 @@ static int mwifiex_pcie_probe(struct pci_dev *pdev,
 		return -1;
 	}
 
+	/* disable bridge_d3 for Surface gen4+ devices to fix fw crashing
+	 * after suspend
+	 */
+	if (card->quirks & QUIRK_NO_BRIDGE_D3)
+		parent_pdev->bridge_d3 = false;
+
 	return 0;
 }
 
@@ -1762,9 +1769,21 @@ mwifiex_pcie_send_boot_cmd(struct mwifiex_adapter *adapter, struct sk_buff *skb)
 static int mwifiex_pcie_init_fw_port(struct mwifiex_adapter *adapter)
 {
 	struct pcie_service_card *card = adapter->card;
+	struct pci_dev *pdev = card->dev;
+	struct pci_dev *parent_pdev = pci_upstream_bridge(pdev);
 	const struct mwifiex_pcie_card_reg *reg = card->pcie.reg;
 	int tx_wrap = card->txbd_wrptr & reg->tx_wrap_mask;
 
+	/* Trigger a function level reset of the PCI bridge device, this makes
+	 * the firmware of PCIe 88W8897 cards stop reporting a fixed LTR value
+	 * that prevents the system from entering package C10 and S0ix powersaving
+	 * states.
+	 * We need to do it here because it must happen after firmware
+	 * initialization and this function is called after that is done.
+	 */
+	if (card->quirks & QUIRK_DO_FLR_ON_BRIDGE)
+		pci_reset_function(parent_pdev);
+
 	/* Write the RX ring read pointer in to reg->rx_rdptr */
 	if (mwifiex_write_reg(adapter, reg->rx_rdptr, card->rxbd_rdptr |
 			      tx_wrap)) {
diff --git a/drivers/net/wireless/marvell/mwifiex/pcie_quirks.c b/drivers/net/wireless/marvell/mwifiex/pcie_quirks.c
index dd6d21f1dbfd..99b024ecbade 100644
--- a/drivers/net/wireless/marvell/mwifiex/pcie_quirks.c
+++ b/drivers/net/wireless/marvell/mwifiex/pcie_quirks.c
@@ -13,7 +13,9 @@ static const struct dmi_system_id mwifiex_quirk_table[] = {
 			DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Microsoft Corporation"),
 			DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Surface Pro 4"),
 		},
-		.driver_data = (void *)QUIRK_FW_RST_D3COLD,
+		.driver_data = (void *)(QUIRK_FW_RST_D3COLD |
+					QUIRK_DO_FLR_ON_BRIDGE |
+					QUIRK_NO_BRIDGE_D3),
 	},
 	{
 		.ident = "Surface Pro 5",
@@ -22,7 +24,9 @@ static const struct dmi_system_id mwifiex_quirk_table[] = {
 			DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Microsoft Corporation"),
 			DMI_EXACT_MATCH(DMI_PRODUCT_SKU, "Surface_Pro_1796"),
 		},
-		.driver_data = (void *)QUIRK_FW_RST_D3COLD,
+		.driver_data = (void *)(QUIRK_FW_RST_D3COLD |
+					QUIRK_DO_FLR_ON_BRIDGE |
+					QUIRK_NO_BRIDGE_D3),
 	},
 	{
 		.ident = "Surface Pro 5 (LTE)",
@@ -31,7 +35,9 @@ static const struct dmi_system_id mwifiex_quirk_table[] = {
 			DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Microsoft Corporation"),
 			DMI_EXACT_MATCH(DMI_PRODUCT_SKU, "Surface_Pro_1807"),
 		},
-		.driver_data = (void *)QUIRK_FW_RST_D3COLD,
+		.driver_data = (void *)(QUIRK_FW_RST_D3COLD |
+					QUIRK_DO_FLR_ON_BRIDGE |
+					QUIRK_NO_BRIDGE_D3),
 	},
 	{
 		.ident = "Surface Pro 6",
@@ -39,7 +45,9 @@ static const struct dmi_system_id mwifiex_quirk_table[] = {
 			DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Microsoft Corporation"),
 			DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Surface Pro 6"),
 		},
-		.driver_data = (void *)QUIRK_FW_RST_D3COLD,
+		.driver_data = (void *)(QUIRK_FW_RST_D3COLD |
+					QUIRK_DO_FLR_ON_BRIDGE |
+					QUIRK_NO_BRIDGE_D3),
 	},
 	{
 		.ident = "Surface Book 1",
@@ -47,7 +55,9 @@ static const struct dmi_system_id mwifiex_quirk_table[] = {
 			DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Microsoft Corporation"),
 			DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Surface Book"),
 		},
-		.driver_data = (void *)QUIRK_FW_RST_D3COLD,
+		.driver_data = (void *)(QUIRK_FW_RST_D3COLD |
+					QUIRK_DO_FLR_ON_BRIDGE |
+					QUIRK_NO_BRIDGE_D3),
 	},
 	{
 		.ident = "Surface Book 2",
@@ -55,7 +65,9 @@ static const struct dmi_system_id mwifiex_quirk_table[] = {
 			DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Microsoft Corporation"),
 			DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Surface Book 2"),
 		},
-		.driver_data = (void *)QUIRK_FW_RST_D3COLD,
+		.driver_data = (void *)(QUIRK_FW_RST_D3COLD |
+					QUIRK_DO_FLR_ON_BRIDGE |
+					QUIRK_NO_BRIDGE_D3),
 	},
 	{
 		.ident = "Surface Laptop 1",
@@ -63,7 +75,9 @@ static const struct dmi_system_id mwifiex_quirk_table[] = {
 			DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Microsoft Corporation"),
 			DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Surface Laptop"),
 		},
-		.driver_data = (void *)QUIRK_FW_RST_D3COLD,
+		.driver_data = (void *)(QUIRK_FW_RST_D3COLD |
+					QUIRK_DO_FLR_ON_BRIDGE |
+					QUIRK_NO_BRIDGE_D3),
 	},
 	{
 		.ident = "Surface Laptop 2",
@@ -71,7 +85,9 @@ static const struct dmi_system_id mwifiex_quirk_table[] = {
 			DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Microsoft Corporation"),
 			DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Surface Laptop 2"),
 		},
-		.driver_data = (void *)QUIRK_FW_RST_D3COLD,
+		.driver_data = (void *)(QUIRK_FW_RST_D3COLD |
+					QUIRK_DO_FLR_ON_BRIDGE |
+					QUIRK_NO_BRIDGE_D3),
 	},
 	{}
 };
@@ -89,6 +105,11 @@ void mwifiex_initialize_quirks(struct pcie_service_card *card)
 		dev_info(&pdev->dev, "no quirks enabled\n");
 	if (card->quirks & QUIRK_FW_RST_D3COLD)
 		dev_info(&pdev->dev, "quirk reset_d3cold enabled\n");
+	if (card->quirks & QUIRK_DO_FLR_ON_BRIDGE)
+		dev_info(&pdev->dev, "quirk do_flr_on_bridge enabled\n");
+	if (card->quirks & QUIRK_NO_BRIDGE_D3)
+		dev_info(&pdev->dev,
+			 "quirk no_brigde_d3 enabled\n");
 }
 
 static void mwifiex_pcie_set_power_d3cold(struct pci_dev *pdev)
diff --git a/drivers/net/wireless/marvell/mwifiex/pcie_quirks.h b/drivers/net/wireless/marvell/mwifiex/pcie_quirks.h
index d6ff964aec5b..c14eb56eb911 100644
--- a/drivers/net/wireless/marvell/mwifiex/pcie_quirks.h
+++ b/drivers/net/wireless/marvell/mwifiex/pcie_quirks.h
@@ -4,6 +4,8 @@
 #include "pcie.h"
 
 #define QUIRK_FW_RST_D3COLD	BIT(0)
+#define QUIRK_DO_FLR_ON_BRIDGE	BIT(1)
+#define QUIRK_NO_BRIDGE_D3	BIT(2)
 
 void mwifiex_initialize_quirks(struct pcie_service_card *card);
 int mwifiex_pcie_reset_d3cold_quirk(struct pci_dev *pdev);
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 57ddcc59af30..497cbadd2c6c 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -507,6 +507,9 @@ static void pci_device_shutdown(struct device *dev)
 	struct pci_dev *pci_dev = to_pci_dev(dev);
 	struct pci_driver *drv = pci_dev->driver;
 
+	if (pci_dev->no_shutdown)
+		return;
+
 	pm_runtime_resume(dev);
 
 	if (drv && drv->shutdown)
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 9232b0fcb945..a77a48220803 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -6142,3 +6142,39 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x9a2d, dpc_log_size);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x9a2f, dpc_log_size);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x9a31, dpc_log_size);
 #endif
+
+static const struct dmi_system_id no_shutdown_dmi_table[] = {
+	/*
+	 * Systems on which some devices should not be touched during shutdown.
+	 */
+	{
+		.ident = "Microsoft Surface Pro 9",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Microsoft Corporation"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Surface Pro 9"),
+		},
+	},
+	{
+		.ident = "Microsoft Surface Laptop 5",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Microsoft Corporation"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Surface Laptop 5"),
+		},
+	},
+	{}
+};
+
+static void quirk_no_shutdown(struct pci_dev *dev)
+{
+	if (!dmi_check_system(no_shutdown_dmi_table))
+		return;
+
+	dev->no_shutdown = 1;
+	pci_info(dev, "disabling shutdown ops for [%04x:%04x]\n",
+		 dev->vendor, dev->device);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x461e, quirk_no_shutdown);  // Thunderbolt 4 USB Controller
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x461f, quirk_no_shutdown);  // Thunderbolt 4 PCI Express Root Port
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x462f, quirk_no_shutdown);  // Thunderbolt 4 PCI Express Root Port
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x466d, quirk_no_shutdown);  // Thunderbolt 4 NHI
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x46a8, quirk_no_shutdown);  // GPU
diff --git a/drivers/platform/surface/Kconfig b/drivers/platform/surface/Kconfig
index b629e82af97c..68656e8f309e 100644
--- a/drivers/platform/surface/Kconfig
+++ b/drivers/platform/surface/Kconfig
@@ -149,6 +149,13 @@ config SURFACE_AGGREGATOR_TABLET_SWITCH
 	  Select M or Y here, if you want to provide tablet-mode switch input
 	  events on the Surface Pro 8, Surface Pro X, and Surface Laptop Studio.
 
+config SURFACE_BOOK1_DGPU_SWITCH
+	tristate "Surface Book 1 dGPU Switch Driver"
+	depends on SYSFS
+	help
+	  This driver provides a sysfs switch to set the power-state of the
+	  discrete GPU found on the Microsoft Surface Book 1.
+
 config SURFACE_DTX
 	tristate "Surface DTX (Detachment System) Driver"
 	depends on SURFACE_AGGREGATOR
diff --git a/drivers/platform/surface/Makefile b/drivers/platform/surface/Makefile
index 53344330939b..7efcd0cdb532 100644
--- a/drivers/platform/surface/Makefile
+++ b/drivers/platform/surface/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_SURFACE_AGGREGATOR_CDEV)	+= surface_aggregator_cdev.o
 obj-$(CONFIG_SURFACE_AGGREGATOR_HUB)	+= surface_aggregator_hub.o
 obj-$(CONFIG_SURFACE_AGGREGATOR_REGISTRY) += surface_aggregator_registry.o
 obj-$(CONFIG_SURFACE_AGGREGATOR_TABLET_SWITCH) += surface_aggregator_tabletsw.o
+obj-$(CONFIG_SURFACE_BOOK1_DGPU_SWITCH) += surfacebook1_dgpu_switch.o
 obj-$(CONFIG_SURFACE_DTX)		+= surface_dtx.o
 obj-$(CONFIG_SURFACE_GPE)		+= surface_gpe.o
 obj-$(CONFIG_SURFACE_HOTPLUG)		+= surface_hotplug.o
diff --git a/drivers/platform/surface/surface3-wmi.c b/drivers/platform/surface/surface3-wmi.c
index ca4602bcc7de..490b9731068a 100644
--- a/drivers/platform/surface/surface3-wmi.c
+++ b/drivers/platform/surface/surface3-wmi.c
@@ -37,6 +37,13 @@ static const struct dmi_system_id surface3_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "Surface 3"),
 		},
 	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
+			DMI_MATCH(DMI_SYS_VENDOR, "OEMB"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "OEMB"),
+		},
+	},
 #endif
 	{ }
 };
diff --git a/drivers/platform/surface/surface_aggregator_registry.c b/drivers/platform/surface/surface_aggregator_registry.c
index 296f72d52e6a..0fe5be539652 100644
--- a/drivers/platform/surface/surface_aggregator_registry.c
+++ b/drivers/platform/surface/surface_aggregator_registry.c
@@ -305,7 +305,7 @@ static const struct software_node *ssam_node_group_sp9[] = {
 	&ssam_node_bat_ac,
 	&ssam_node_bat_main,
 	&ssam_node_tmp_pprof,
-	/* TODO: Tablet mode switch (via POS subsystem) */
+	&ssam_node_pos_tablet_switch,
 	&ssam_node_hid_kip_keyboard,
 	&ssam_node_hid_kip_penstash,
 	&ssam_node_hid_kip_touchpad,
diff --git a/drivers/platform/surface/surface_aggregator_tabletsw.c b/drivers/platform/surface/surface_aggregator_tabletsw.c
index a18e9fc7896b..c0a1a5869246 100644
--- a/drivers/platform/surface/surface_aggregator_tabletsw.c
+++ b/drivers/platform/surface/surface_aggregator_tabletsw.c
@@ -20,16 +20,23 @@
 
 struct ssam_tablet_sw;
 
+struct ssam_tablet_sw_state {
+	u32 source;
+	u32 state;
+};
+
 struct ssam_tablet_sw_ops {
-	int (*get_state)(struct ssam_tablet_sw *sw, u32 *state);
-	const char *(*state_name)(struct ssam_tablet_sw *sw, u32 state);
-	bool (*state_is_tablet_mode)(struct ssam_tablet_sw *sw, u32 state);
+	int (*get_state)(struct ssam_tablet_sw *sw, struct ssam_tablet_sw_state *state);
+	const char *(*state_name)(struct ssam_tablet_sw *sw,
+				  const struct ssam_tablet_sw_state *state);
+	bool (*state_is_tablet_mode)(struct ssam_tablet_sw *sw,
+				     const struct ssam_tablet_sw_state *state);
 };
 
 struct ssam_tablet_sw {
 	struct ssam_device *sdev;
 
-	u32 state;
+	struct ssam_tablet_sw_state state;
 	struct work_struct update_work;
 	struct input_dev *mode_switch;
 
@@ -45,9 +52,11 @@ struct ssam_tablet_sw_desc {
 
 	struct {
 		u32 (*notify)(struct ssam_event_notifier *nf, const struct ssam_event *event);
-		int (*get_state)(struct ssam_tablet_sw *sw, u32 *state);
-		const char *(*state_name)(struct ssam_tablet_sw *sw, u32 state);
-		bool (*state_is_tablet_mode)(struct ssam_tablet_sw *sw, u32 state);
+		int (*get_state)(struct ssam_tablet_sw *sw, struct ssam_tablet_sw_state *state);
+		const char *(*state_name)(struct ssam_tablet_sw *sw,
+					  const struct ssam_tablet_sw_state *state);
+		bool (*state_is_tablet_mode)(struct ssam_tablet_sw *sw,
+					     const struct ssam_tablet_sw_state *state);
 	} ops;
 
 	struct {
@@ -61,7 +70,7 @@ struct ssam_tablet_sw_desc {
 static ssize_t state_show(struct device *dev, struct device_attribute *attr, char *buf)
 {
 	struct ssam_tablet_sw *sw = dev_get_drvdata(dev);
-	const char *state = sw->ops.state_name(sw, sw->state);
+	const char *state = sw->ops.state_name(sw, &sw->state);
 
 	return sysfs_emit(buf, "%s\n", state);
 }
@@ -79,19 +88,19 @@ static const struct attribute_group ssam_tablet_sw_group = {
 static void ssam_tablet_sw_update_workfn(struct work_struct *work)
 {
 	struct ssam_tablet_sw *sw = container_of(work, struct ssam_tablet_sw, update_work);
+	struct ssam_tablet_sw_state state;
 	int tablet, status;
-	u32 state;
 
 	status = sw->ops.get_state(sw, &state);
 	if (status)
 		return;
 
-	if (sw->state == state)
+	if (sw->state.source == state.source && sw->state.state == state.state)
 		return;
 	sw->state = state;
 
 	/* Send SW_TABLET_MODE event. */
-	tablet = sw->ops.state_is_tablet_mode(sw, state);
+	tablet = sw->ops.state_is_tablet_mode(sw, &state);
 	input_report_switch(sw->mode_switch, SW_TABLET_MODE, tablet);
 	input_sync(sw->mode_switch);
 }
@@ -146,7 +155,7 @@ static int ssam_tablet_sw_probe(struct ssam_device *sdev)
 	sw->mode_switch->id.bustype = BUS_HOST;
 	sw->mode_switch->dev.parent = &sdev->dev;
 
-	tablet = sw->ops.state_is_tablet_mode(sw, sw->state);
+	tablet = sw->ops.state_is_tablet_mode(sw, &sw->state);
 	input_set_capability(sw->mode_switch, EV_SW, SW_TABLET_MODE);
 	input_report_switch(sw->mode_switch, SW_TABLET_MODE, tablet);
 
@@ -204,9 +213,10 @@ enum ssam_kip_cover_state {
 	SSAM_KIP_COVER_STATE_BOOK          = 0x06,
 };
 
-static const char *ssam_kip_cover_state_name(struct ssam_tablet_sw *sw, u32 state)
+static const char *ssam_kip_cover_state_name(struct ssam_tablet_sw *sw,
+					     const struct ssam_tablet_sw_state *state)
 {
-	switch (state) {
+	switch (state->state) {
 	case SSAM_KIP_COVER_STATE_DISCONNECTED:
 		return "disconnected";
 
@@ -226,14 +236,15 @@ static const char *ssam_kip_cover_state_name(struct ssam_tablet_sw *sw, u32 stat
 		return "book";
 
 	default:
-		dev_warn(&sw->sdev->dev, "unknown KIP cover state: %u\n", state);
+		dev_warn(&sw->sdev->dev, "unknown KIP cover state: %u\n", state->state);
 		return "<unknown>";
 	}
 }
 
-static bool ssam_kip_cover_state_is_tablet_mode(struct ssam_tablet_sw *sw, u32 state)
+static bool ssam_kip_cover_state_is_tablet_mode(struct ssam_tablet_sw *sw,
+						const struct ssam_tablet_sw_state *state)
 {
-	switch (state) {
+	switch (state->state) {
 	case SSAM_KIP_COVER_STATE_DISCONNECTED:
 	case SSAM_KIP_COVER_STATE_FOLDED_CANVAS:
 	case SSAM_KIP_COVER_STATE_FOLDED_BACK:
@@ -245,7 +256,7 @@ static bool ssam_kip_cover_state_is_tablet_mode(struct ssam_tablet_sw *sw, u32 s
 		return false;
 
 	default:
-		dev_warn(&sw->sdev->dev, "unknown KIP cover state: %d\n", sw->state);
+		dev_warn(&sw->sdev->dev, "unknown KIP cover state: %d\n", state->state);
 		return true;
 	}
 }
@@ -257,7 +268,7 @@ SSAM_DEFINE_SYNC_REQUEST_R(__ssam_kip_get_cover_state, u8, {
 	.instance_id     = 0x00,
 });
 
-static int ssam_kip_get_cover_state(struct ssam_tablet_sw *sw, u32 *state)
+static int ssam_kip_get_cover_state(struct ssam_tablet_sw *sw, struct ssam_tablet_sw_state *state)
 {
 	int status;
 	u8 raw;
@@ -268,7 +279,8 @@ static int ssam_kip_get_cover_state(struct ssam_tablet_sw *sw, u32 *state)
 		return status;
 	}
 
-	*state = raw;
+	state->source = 0;	/* Unused for KIP switch. */
+	state->state = raw;
 	return 0;
 }
 
@@ -317,11 +329,25 @@ MODULE_PARM_DESC(tablet_mode_in_slate_state, "Enable tablet mode in slate device
 #define SSAM_EVENT_POS_CID_POSTURE_CHANGED	0x03
 #define SSAM_POS_MAX_SOURCES			4
 
-enum ssam_pos_state {
-	SSAM_POS_POSTURE_LID_CLOSED = 0x00,
-	SSAM_POS_POSTURE_LAPTOP     = 0x01,
-	SSAM_POS_POSTURE_SLATE      = 0x02,
-	SSAM_POS_POSTURE_TABLET     = 0x03,
+enum ssam_pos_source_id {
+	SSAM_POS_SOURCE_COVER = 0x00,
+	SSAM_POS_SOURCE_SLS   = 0x03,
+};
+
+enum ssam_pos_state_cover {
+	SSAM_POS_COVER_DISCONNECTED  = 0x01,
+	SSAM_POS_COVER_CLOSED        = 0x02,
+	SSAM_POS_COVER_LAPTOP        = 0x03,
+	SSAM_POS_COVER_FOLDED_CANVAS = 0x04,
+	SSAM_POS_COVER_FOLDED_BACK   = 0x05,
+	SSAM_POS_COVER_BOOK          = 0x06,
+};
+
+enum ssam_pos_state_sls {
+	SSAM_POS_SLS_LID_CLOSED = 0x00,
+	SSAM_POS_SLS_LAPTOP     = 0x01,
+	SSAM_POS_SLS_SLATE      = 0x02,
+	SSAM_POS_SLS_TABLET     = 0x03,
 };
 
 struct ssam_sources_list {
@@ -329,42 +355,120 @@ struct ssam_sources_list {
 	__le32 id[SSAM_POS_MAX_SOURCES];
 } __packed;
 
-static const char *ssam_pos_state_name(struct ssam_tablet_sw *sw, u32 state)
+static const char *ssam_pos_state_name_cover(struct ssam_tablet_sw *sw, u32 state)
 {
 	switch (state) {
-	case SSAM_POS_POSTURE_LID_CLOSED:
+	case SSAM_POS_COVER_DISCONNECTED:
+		return "disconnected";
+
+	case SSAM_POS_COVER_CLOSED:
 		return "closed";
 
-	case SSAM_POS_POSTURE_LAPTOP:
+	case SSAM_POS_COVER_LAPTOP:
 		return "laptop";
 
-	case SSAM_POS_POSTURE_SLATE:
+	case SSAM_POS_COVER_FOLDED_CANVAS:
+		return "folded-canvas";
+
+	case SSAM_POS_COVER_FOLDED_BACK:
+		return "folded-back";
+
+	case SSAM_POS_COVER_BOOK:
+		return "book";
+
+	default:
+		dev_warn(&sw->sdev->dev, "unknown device posture for type-cover: %u\n", state);
+		return "<unknown>";
+	}
+}
+
+static const char *ssam_pos_state_name_sls(struct ssam_tablet_sw *sw, u32 state)
+{
+	switch (state) {
+	case SSAM_POS_SLS_LID_CLOSED:
+		return "closed";
+
+	case SSAM_POS_SLS_LAPTOP:
+		return "laptop";
+
+	case SSAM_POS_SLS_SLATE:
 		return "slate";
 
-	case SSAM_POS_POSTURE_TABLET:
+	case SSAM_POS_SLS_TABLET:
 		return "tablet";
 
 	default:
-		dev_warn(&sw->sdev->dev, "unknown device posture: %u\n", state);
+		dev_warn(&sw->sdev->dev, "unknown device posture for SLS: %u\n", state);
+		return "<unknown>";
+	}
+}
+
+static const char *ssam_pos_state_name(struct ssam_tablet_sw *sw,
+				       const struct ssam_tablet_sw_state *state)
+{
+	switch (state->source) {
+	case SSAM_POS_SOURCE_COVER:
+		return ssam_pos_state_name_cover(sw, state->state);
+
+	case SSAM_POS_SOURCE_SLS:
+		return ssam_pos_state_name_sls(sw, state->state);
+
+	default:
+		dev_warn(&sw->sdev->dev, "unknown device posture source: %u\n", state->source);
 		return "<unknown>";
 	}
 }
 
-static bool ssam_pos_state_is_tablet_mode(struct ssam_tablet_sw *sw, u32 state)
+static bool ssam_pos_state_is_tablet_mode_cover(struct ssam_tablet_sw *sw, u32 state)
+{
+	switch (state) {
+	case SSAM_POS_COVER_DISCONNECTED:
+	case SSAM_POS_COVER_FOLDED_CANVAS:
+	case SSAM_POS_COVER_FOLDED_BACK:
+	case SSAM_POS_COVER_BOOK:
+		return true;
+
+	case SSAM_POS_COVER_CLOSED:
+	case SSAM_POS_COVER_LAPTOP:
+		return false;
+
+	default:
+		dev_warn(&sw->sdev->dev, "unknown device posture for type-cover: %u\n", state);
+		return true;
+	}
+}
+
+static bool ssam_pos_state_is_tablet_mode_sls(struct ssam_tablet_sw *sw, u32 state)
 {
 	switch (state) {
-	case SSAM_POS_POSTURE_LAPTOP:
-	case SSAM_POS_POSTURE_LID_CLOSED:
+	case SSAM_POS_SLS_LAPTOP:
+	case SSAM_POS_SLS_LID_CLOSED:
 		return false;
 
-	case SSAM_POS_POSTURE_SLATE:
+	case SSAM_POS_SLS_SLATE:
 		return tablet_mode_in_slate_state;
 
-	case SSAM_POS_POSTURE_TABLET:
+	case SSAM_POS_SLS_TABLET:
+		return true;
+
+	default:
+		dev_warn(&sw->sdev->dev, "unknown device posture for SLS: %u\n", state);
 		return true;
+	}
+}
+
+static bool ssam_pos_state_is_tablet_mode(struct ssam_tablet_sw *sw,
+					  const struct ssam_tablet_sw_state *state)
+{
+	switch (state->source) {
+	case SSAM_POS_SOURCE_COVER:
+		return ssam_pos_state_is_tablet_mode_cover(sw, state->state);
+
+	case SSAM_POS_SOURCE_SLS:
+		return ssam_pos_state_is_tablet_mode_sls(sw, state->state);
 
 	default:
-		dev_warn(&sw->sdev->dev, "unknown device posture: %u\n", state);
+		dev_warn(&sw->sdev->dev, "unknown device posture source: %u\n", state->source);
 		return true;
 	}
 }
@@ -455,9 +559,10 @@ static int ssam_pos_get_posture_for_source(struct ssam_tablet_sw *sw, u32 source
 	return 0;
 }
 
-static int ssam_pos_get_posture(struct ssam_tablet_sw *sw, u32 *state)
+static int ssam_pos_get_posture(struct ssam_tablet_sw *sw, struct ssam_tablet_sw_state *state)
 {
 	u32 source_id;
+	u32 source_state;
 	int status;
 
 	status = ssam_pos_get_source(sw, &source_id);
@@ -466,13 +571,15 @@ static int ssam_pos_get_posture(struct ssam_tablet_sw *sw, u32 *state)
 		return status;
 	}
 
-	status = ssam_pos_get_posture_for_source(sw, source_id, state);
+	status = ssam_pos_get_posture_for_source(sw, source_id, &source_state);
 	if (status) {
 		dev_err(&sw->sdev->dev, "failed to get posture value for source %u: %d\n",
 			source_id, status);
 		return status;
 	}
 
+	state->source = source_id;
+	state->state = source_state;
 	return 0;
 }
 
diff --git a/drivers/platform/surface/surface_gpe.c b/drivers/platform/surface/surface_gpe.c
index c219b840d491..69c4352e8406 100644
--- a/drivers/platform/surface/surface_gpe.c
+++ b/drivers/platform/surface/surface_gpe.c
@@ -41,6 +41,11 @@ static const struct property_entry lid_device_props_l4F[] = {
 	{},
 };
 
+static const struct property_entry lid_device_props_l52[] = {
+	PROPERTY_ENTRY_U32("gpe", 0x52),
+	{},
+};
+
 static const struct property_entry lid_device_props_l57[] = {
 	PROPERTY_ENTRY_U32("gpe", 0x57),
 	{},
@@ -107,6 +112,18 @@ static const struct dmi_system_id dmi_lid_device_table[] = {
 		},
 		.driver_data = (void *)lid_device_props_l4B,
 	},
+	{
+		/*
+		 * We match for SKU here due to product name clash with the ARM
+		 * version.
+		 */
+		.ident = "Surface Pro 9",
+		.matches = {
+			DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Microsoft Corporation"),
+			DMI_EXACT_MATCH(DMI_PRODUCT_SKU, "Surface_Pro_9_2038"),
+		},
+		.driver_data = (void *)lid_device_props_l52,
+	},
 	{
 		.ident = "Surface Book 1",
 		.matches = {
diff --git a/drivers/platform/surface/surfacebook1_dgpu_switch.c b/drivers/platform/surface/surfacebook1_dgpu_switch.c
new file mode 100644
index 000000000000..8b816ed8f35c
--- /dev/null
+++ b/drivers/platform/surface/surfacebook1_dgpu_switch.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/acpi.h>
+#include <linux/platform_device.h>
+
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
+
+
+static const guid_t dgpu_sw_guid = GUID_INIT(0x6fd05c69, 0xcde3, 0x49f4,
+	0x95, 0xed, 0xab, 0x16, 0x65, 0x49, 0x80, 0x35);
+
+#define DGPUSW_ACPI_PATH_DSM	"\\_SB_.PCI0.LPCB.EC0_.VGBI"
+#define DGPUSW_ACPI_PATH_HGON	"\\_SB_.PCI0.RP05.HGON"
+#define DGPUSW_ACPI_PATH_HGOF	"\\_SB_.PCI0.RP05.HGOF"
+
+
+static int sb1_dgpu_sw_dsmcall(void)
+{
+	union acpi_object *ret;
+	acpi_handle handle;
+	acpi_status status;
+
+	status = acpi_get_handle(NULL, DGPUSW_ACPI_PATH_DSM, &handle);
+	if (status)
+		return -EINVAL;
+
+	ret = acpi_evaluate_dsm_typed(handle, &dgpu_sw_guid, 1, 1, NULL, ACPI_TYPE_BUFFER);
+	if (!ret)
+		return -EINVAL;
+
+	ACPI_FREE(ret);
+	return 0;
+}
+
+static int sb1_dgpu_sw_hgon(void)
+{
+	struct acpi_buffer buf = {ACPI_ALLOCATE_BUFFER, NULL};
+	acpi_status status;
+
+	status = acpi_evaluate_object(NULL, DGPUSW_ACPI_PATH_HGON, NULL, &buf);
+	if (status) {
+		pr_err("failed to run HGON: %d\n", status);
+		return -EINVAL;
+	}
+
+	if (buf.pointer)
+		ACPI_FREE(buf.pointer);
+
+	pr_info("turned-on dGPU via HGON\n");
+	return 0;
+}
+
+static int sb1_dgpu_sw_hgof(void)
+{
+	struct acpi_buffer buf = {ACPI_ALLOCATE_BUFFER, NULL};
+	acpi_status status;
+
+	status = acpi_evaluate_object(NULL, DGPUSW_ACPI_PATH_HGOF, NULL, &buf);
+	if (status) {
+		pr_err("failed to run HGOF: %d\n", status);
+		return -EINVAL;
+	}
+
+	if (buf.pointer)
+		ACPI_FREE(buf.pointer);
+
+	pr_info("turned-off dGPU via HGOF\n");
+	return 0;
+}
+
+
+static ssize_t dgpu_dsmcall_store(struct device *dev, struct device_attribute *attr,
+				  const char *buf, size_t len)
+{
+	int status, value;
+
+	status = kstrtoint(buf, 0, &value);
+	if (status < 0)
+		return status;
+
+	if (value != 1)
+		return -EINVAL;
+
+	status = sb1_dgpu_sw_dsmcall();
+
+	return status < 0 ? status : len;
+}
+
+static ssize_t dgpu_power_store(struct device *dev, struct device_attribute *attr,
+				const char *buf, size_t len)
+{
+	bool power;
+	int status;
+
+	status = kstrtobool(buf, &power);
+	if (status < 0)
+		return status;
+
+	if (power)
+		status = sb1_dgpu_sw_hgon();
+	else
+		status = sb1_dgpu_sw_hgof();
+
+	return status < 0 ? status : len;
+}
+
+static DEVICE_ATTR_WO(dgpu_dsmcall);
+static DEVICE_ATTR_WO(dgpu_power);
+
+static struct attribute *sb1_dgpu_sw_attrs[] = {
+	&dev_attr_dgpu_dsmcall.attr,
+	&dev_attr_dgpu_power.attr,
+	NULL,
+};
+
+static const struct attribute_group sb1_dgpu_sw_attr_group = {
+	.attrs = sb1_dgpu_sw_attrs,
+};
+
+
+static int sb1_dgpu_sw_probe(struct platform_device *pdev)
+{
+	return sysfs_create_group(&pdev->dev.kobj, &sb1_dgpu_sw_attr_group);
+}
+
+static int sb1_dgpu_sw_remove(struct platform_device *pdev)
+{
+	sysfs_remove_group(&pdev->dev.kobj, &sb1_dgpu_sw_attr_group);
+	return 0;
+}
+
+/*
+ * The dGPU power seems to be actually handled by MSHW0040. However, that is
+ * also the power-/volume-button device with a mainline driver. So let's use
+ * MSHW0041 instead for now, which seems to be the LTCH (latch/DTX) device.
+ */
+static const struct acpi_device_id sb1_dgpu_sw_match[] = {
+	{ "MSHW0041", },
+	{ },
+};
+MODULE_DEVICE_TABLE(acpi, sb1_dgpu_sw_match);
+
+static struct platform_driver sb1_dgpu_sw = {
+	.probe = sb1_dgpu_sw_probe,
+	.remove = sb1_dgpu_sw_remove,
+	.driver = {
+		.name = "surfacebook1_dgpu_switch",
+		.acpi_match_table = sb1_dgpu_sw_match,
+		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
+	},
+};
+module_platform_driver(sb1_dgpu_sw);
+
+MODULE_AUTHOR("Maximilian Luz <luzmaximilian@gmail.com>");
+MODULE_DESCRIPTION("Discrete GPU Power-Switch for Surface Book 1");
+MODULE_LICENSE("GPL");
diff --git a/drivers/platform/surface/surfacepro3_button.c b/drivers/platform/surface/surfacepro3_button.c
index 2755601f979c..4240c98ca226 100644
--- a/drivers/platform/surface/surfacepro3_button.c
+++ b/drivers/platform/surface/surfacepro3_button.c
@@ -149,7 +149,8 @@ static int surface_button_resume(struct device *dev)
 /*
  * Surface Pro 4 and Surface Book 2 / Surface Pro 2017 use the same device
  * ID (MSHW0040) for the power/volume buttons. Make sure this is the right
- * device by checking for the _DSM method and OEM Platform Revision.
+ * device by checking for the _DSM method and OEM Platform Revision DSM
+ * function.
  *
  * Returns true if the driver should bind to this device, i.e. the device is
  * either MSWH0028 (Pro 3) or MSHW0040 on a Pro 4 or Book 1.
@@ -157,30 +158,11 @@ static int surface_button_resume(struct device *dev)
 static bool surface_button_check_MSHW0040(struct acpi_device *dev)
 {
 	acpi_handle handle = dev->handle;
-	union acpi_object *result;
-	u64 oem_platform_rev = 0;	// valid revisions are nonzero
-
-	// get OEM platform revision
-	result = acpi_evaluate_dsm_typed(handle, &MSHW0040_DSM_UUID,
-					 MSHW0040_DSM_REVISION,
-					 MSHW0040_DSM_GET_OMPR,
-					 NULL, ACPI_TYPE_INTEGER);
-
-	/*
-	 * If evaluating the _DSM fails, the method is not present. This means
-	 * that we have either MSHW0028 or MSHW0040 on Pro 4 or Book 1, so we
-	 * should use this driver. We use revision 0 indicating it is
-	 * unavailable.
-	 */
-
-	if (result) {
-		oem_platform_rev = result->integer.value;
-		ACPI_FREE(result);
-	}
-
-	dev_dbg(&dev->dev, "OEM Platform Revision %llu\n", oem_platform_rev);
 
-	return oem_platform_rev == 0;
+	// make sure that OEM platform revision DSM call does not exist
+	return !acpi_check_dsm(handle, &MSHW0040_DSM_UUID,
+			       MSHW0040_DSM_REVISION,
+			       BIT(MSHW0040_DSM_GET_OMPR));
 }
 
 
diff --git a/drivers/platform/x86/intel/int3472/discrete.c b/drivers/platform/x86/intel/int3472/discrete.c
index f064da74f50a..2064b3bbe530 100644
--- a/drivers/platform/x86/intel/int3472/discrete.c
+++ b/drivers/platform/x86/intel/int3472/discrete.c
@@ -98,6 +98,9 @@ static int skl_int3472_map_gpio_to_sensor(struct int3472_discrete_device *int347
 {
 	const struct int3472_sensor_config *sensor_config;
 	char *path = agpio->resource_source.string_ptr;
+	const struct acpi_device_id ov7251_ids[] = {
+		{ "INT347E" },
+	};
 	struct gpiod_lookup *table_entry;
 	struct acpi_device *adev;
 	acpi_handle handle;
@@ -120,6 +123,17 @@ static int skl_int3472_map_gpio_to_sensor(struct int3472_discrete_device *int347
 		}
 	}
 
+	/*
+	 * In addition to the function remap table we need to bulk remap the
+	 * "reset" GPIO for the OmniVision 7251 sensor, as the driver for that
+	 * expects its only GPIO pin to be called "enable" (and to have the
+	 * opposite polarity).
+	 */
+	if (!strcmp(func, "reset") && !acpi_match_device_ids(int3472->sensor, ov7251_ids)) {
+		func = "enable";
+		polarity = GPIO_ACTIVE_HIGH;
+	}
+
 	/* Functions mapped to NULL should not be mapped to the sensor */
 	if (!func)
 		return 0;
diff --git a/drivers/platform/x86/intel/int3472/tps68470.c b/drivers/platform/x86/intel/int3472/tps68470.c
index 5b8d1a9620a5..2a7d01d3abc8 100644
--- a/drivers/platform/x86/intel/int3472/tps68470.c
+++ b/drivers/platform/x86/intel/int3472/tps68470.c
@@ -17,7 +17,7 @@
 #define DESIGNED_FOR_CHROMEOS		1
 #define DESIGNED_FOR_WINDOWS		2
 
-#define TPS68470_WIN_MFD_CELL_COUNT	3
+#define TPS68470_WIN_MFD_CELL_COUNT	4
 
 static const struct mfd_cell tps68470_cros[] = {
 	{ .name = "tps68470-gpio" },
@@ -46,6 +46,13 @@ static int tps68470_chip_init(struct device *dev, struct regmap *regmap)
 		return ret;
 	}
 
+	/* Enable I2C daisy chain */
+	ret = regmap_write(regmap, TPS68470_REG_S_I2C_CTL, 0x03);
+	if (ret) {
+		dev_err(dev, "Failed to enable i2c daisy chain\n");
+		return ret;
+	}
+
 	dev_info(dev, "TPS68470 REVID: 0x%02x\n", version);
 
 	return 0;
@@ -193,7 +200,8 @@ static int skl_int3472_tps68470_probe(struct i2c_client *client)
 		cells[1].name = "tps68470-regulator";
 		cells[1].platform_data = (void *)board_data->tps68470_regulator_pdata;
 		cells[1].pdata_size = sizeof(struct tps68470_regulator_platform_data);
-		cells[2].name = "tps68470-gpio";
+		cells[2].name = "tps68470-led";
+		cells[3].name = "tps68470-gpio";
 
 		for (i = 0; i < board_data->n_gpiod_lookups; i++)
 			gpiod_add_lookup_table(board_data->tps68470_gpio_lookup_tables[i]);
diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
index 934b3d997702..2c6604c6e8e1 100644
--- a/drivers/usb/core/quirks.c
+++ b/drivers/usb/core/quirks.c
@@ -220,6 +220,9 @@ static const struct usb_device_id usb_quirk_list[] = {
 	/* Microsoft Surface Dock Ethernet (RTL8153 GigE) */
 	{ USB_DEVICE(0x045e, 0x07c6), .driver_info = USB_QUIRK_NO_LPM },
 
+	/* Microsoft Surface Go 3 Type-Cover */
+	{ USB_DEVICE(0x045e, 0x09b5), .driver_info = USB_QUIRK_DELAY_INIT },
+
 	/* Cherry Stream G230 2.0 (G85-231) and 3.0 (G85-232) */
 	{ USB_DEVICE(0x046a, 0x0023), .driver_info = USB_QUIRK_RESET_RESUME },
 
diff --git a/include/linux/mfd/tps68470.h b/include/linux/mfd/tps68470.h
index 7807fa329db0..2d2abb25b944 100644
--- a/include/linux/mfd/tps68470.h
+++ b/include/linux/mfd/tps68470.h
@@ -34,6 +34,7 @@
 #define TPS68470_REG_SGPO		0x22
 #define TPS68470_REG_GPDI		0x26
 #define TPS68470_REG_GPDO		0x27
+#define TPS68470_REG_ILEDCTL		0x28
 #define TPS68470_REG_VCMVAL		0x3C
 #define TPS68470_REG_VAUX1VAL		0x3D
 #define TPS68470_REG_VAUX2VAL		0x3E
@@ -94,4 +95,8 @@
 #define TPS68470_GPIO_MODE_OUT_CMOS	2
 #define TPS68470_GPIO_MODE_OUT_ODRAIN	3
 
+#define TPS68470_ILEDCTL_ENA		BIT(2)
+#define TPS68470_ILEDCTL_ENB		BIT(6)
+#define TPS68470_ILEDCTL_CTRLB		GENMASK(5, 4)
+
 #endif /* __LINUX_MFD_TPS68470_H */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index a5dda515fcd1..69f6fc707ae5 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -464,6 +464,7 @@ struct pci_dev {
 	unsigned int	no_vf_scan:1;		/* Don't scan for VFs after IOV enablement */
 	unsigned int	no_command_memory:1;	/* No PCI_COMMAND_MEMORY */
 	unsigned int	rom_bar_overlap:1;	/* ROM BAR disable broken */
+	unsigned int	no_shutdown:1;		/* Do not touch device on shutdown */
 	pci_dev_flags_t dev_flags;
 	atomic_t	enable_cnt;	/* pci_enable_device has been called */
 
diff --git a/include/linux/surface_aggregator/device.h b/include/linux/surface_aggregator/device.h
index df81043b9e71..42b249b4c24b 100644
--- a/include/linux/surface_aggregator/device.h
+++ b/include/linux/surface_aggregator/device.h
@@ -243,11 +243,7 @@ static inline bool is_ssam_device(struct device *d)
  * Return: Returns the pointer to the &struct ssam_device_driver wrapping the
  * given device driver @d.
  */
-static inline
-struct ssam_device_driver *to_ssam_device_driver(struct device_driver *d)
-{
-	return container_of(d, struct ssam_device_driver, driver);
-}
+#define to_ssam_device_driver(d)	container_of_const(d, struct ssam_device_driver, driver)
 
 const struct ssam_device_id *ssam_device_id_match(const struct ssam_device_id *table,
 						  const struct ssam_device_uid uid);
diff --git a/sound/soc/codecs/rt5645.c b/sound/soc/codecs/rt5645.c
index 7c7cbb6362ea..81a8ff40e86e 100644
--- a/sound/soc/codecs/rt5645.c
+++ b/sound/soc/codecs/rt5645.c
@@ -3717,6 +3717,15 @@ static const struct dmi_system_id dmi_platform_data[] = {
 		},
 		.driver_data = (void *)&intel_braswell_platform_data,
 	},
+	{
+		.ident = "Microsoft Surface 3",
+		.matches = {
+			DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
+			DMI_MATCH(DMI_SYS_VENDOR, "OEMB"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "OEMB"),
+		},
+		.driver_data = (void *)&intel_braswell_platform_data,
+	},
 	{
 		/*
 		 * Match for the GPDwin which unfortunately uses somewhat
diff --git a/sound/soc/intel/common/soc-acpi-intel-cht-match.c b/sound/soc/intel/common/soc-acpi-intel-cht-match.c
index cdcbf04b8832..958305779b12 100644
--- a/sound/soc/intel/common/soc-acpi-intel-cht-match.c
+++ b/sound/soc/intel/common/soc-acpi-intel-cht-match.c
@@ -27,6 +27,14 @@ static const struct dmi_system_id cht_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "Surface 3"),
 		},
 	},
+	{
+		.callback = cht_surface_quirk_cb,
+		.matches = {
+			DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
+			DMI_MATCH(DMI_SYS_VENDOR, "OEMB"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "OEMB"),
+		},
+	},
 	{ }
 };
 
-- 
2.41.0

From 7445b6718ae1b5157f2eb2183a55c5c28193c04f Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Sat, 22 Apr 2023 11:46:46 +0200
Subject: [PATCH 12/12] zstd: import 1.5.5

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 include/linux/zstd.h                          |    2 +-
 include/linux/zstd_errors.h                   |   23 +-
 include/linux/zstd_lib.h                      |  703 +++++--
 lib/zstd/Makefile                             |    2 +-
 lib/zstd/common/allocations.h                 |   56 +
 lib/zstd/common/bits.h                        |  149 ++
 lib/zstd/common/bitstream.h                   |   53 +-
 lib/zstd/common/compiler.h                    |   14 +-
 lib/zstd/common/cpu.h                         |    3 +-
 lib/zstd/common/debug.c                       |    3 +-
 lib/zstd/common/debug.h                       |    3 +-
 lib/zstd/common/entropy_common.c              |   42 +-
 lib/zstd/common/error_private.c               |   12 +-
 lib/zstd/common/error_private.h               |    3 +-
 lib/zstd/common/fse.h                         |   89 +-
 lib/zstd/common/fse_decompress.c              |   94 +-
 lib/zstd/common/huf.h                         |  222 +--
 lib/zstd/common/mem.h                         |    2 +-
 lib/zstd/common/portability_macros.h          |   26 +-
 lib/zstd/common/zstd_common.c                 |   38 +-
 lib/zstd/common/zstd_deps.h                   |    2 +-
 lib/zstd/common/zstd_internal.h               |   99 +-
 lib/zstd/compress/clevels.h                   |    3 +-
 lib/zstd/compress/fse_compress.c              |   59 +-
 lib/zstd/compress/hist.c                      |    3 +-
 lib/zstd/compress/hist.h                      |    3 +-
 lib/zstd/compress/huf_compress.c              |  372 ++--
 lib/zstd/compress/zstd_compress.c             | 1762 ++++++++++++-----
 lib/zstd/compress/zstd_compress_internal.h    |  333 +++-
 lib/zstd/compress/zstd_compress_literals.c    |  155 +-
 lib/zstd/compress/zstd_compress_literals.h    |   25 +-
 lib/zstd/compress/zstd_compress_sequences.c   |    7 +-
 lib/zstd/compress/zstd_compress_sequences.h   |    3 +-
 lib/zstd/compress/zstd_compress_superblock.c  |   47 +-
 lib/zstd/compress/zstd_compress_superblock.h  |    3 +-
 lib/zstd/compress/zstd_cwksp.h                |  149 +-
 lib/zstd/compress/zstd_double_fast.c          |  129 +-
 lib/zstd/compress/zstd_double_fast.h          |    6 +-
 lib/zstd/compress/zstd_fast.c                 |  582 ++++--
 lib/zstd/compress/zstd_fast.h                 |    6 +-
 lib/zstd/compress/zstd_lazy.c                 |  518 ++---
 lib/zstd/compress/zstd_lazy.h                 |    7 +-
 lib/zstd/compress/zstd_ldm.c                  |   11 +-
 lib/zstd/compress/zstd_ldm.h                  |    3 +-
 lib/zstd/compress/zstd_ldm_geartab.h          |    3 +-
 lib/zstd/compress/zstd_opt.c                  |  187 +-
 lib/zstd/compress/zstd_opt.h                  |    3 +-
 lib/zstd/decompress/huf_decompress.c          |  731 ++++---
 lib/zstd/decompress/zstd_ddict.c              |    9 +-
 lib/zstd/decompress/zstd_ddict.h              |    3 +-
 lib/zstd/decompress/zstd_decompress.c         |  269 ++-
 lib/zstd/decompress/zstd_decompress_block.c   |  283 ++-
 lib/zstd/decompress/zstd_decompress_block.h   |    8 +-
 .../decompress/zstd_decompress_internal.h     |    7 +-
 lib/zstd/decompress_sources.h                 |    2 +-
 lib/zstd/zstd_common_module.c                 |    5 +-
 lib/zstd/zstd_compress_module.c               |    2 +-
 lib/zstd/zstd_decompress_module.c             |    4 +-
 58 files changed, 4731 insertions(+), 2611 deletions(-)
 create mode 100644 lib/zstd/common/allocations.h
 create mode 100644 lib/zstd/common/bits.h

diff --git a/include/linux/zstd.h b/include/linux/zstd.h
index 113408eef6ec..f109d49f43f8 100644
--- a/include/linux/zstd.h
+++ b/include/linux/zstd.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h
index 58b6dd45a969..6d5cf55f0bf3 100644
--- a/include/linux/zstd_errors.h
+++ b/include/linux/zstd_errors.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -17,8 +18,17 @@
 
 
 /* =====   ZSTDERRORLIB_API : control library symbols visibility   ===== */
-#define ZSTDERRORLIB_VISIBILITY 
-#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY
+#define ZSTDERRORLIB_VISIBLE 
+
+#ifndef ZSTDERRORLIB_HIDDEN
+#  if (__GNUC__ >= 4) && !defined(__MINGW32__)
+#    define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden")))
+#  else
+#    define ZSTDERRORLIB_HIDDEN
+#  endif
+#endif
+
+#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE
 
 /*-*********************************************
  *  Error codes list
@@ -43,14 +53,17 @@ typedef enum {
   ZSTD_error_frameParameter_windowTooLarge = 16,
   ZSTD_error_corruption_detected = 20,
   ZSTD_error_checksum_wrong      = 22,
+  ZSTD_error_literals_headerWrong = 24,
   ZSTD_error_dictionary_corrupted      = 30,
   ZSTD_error_dictionary_wrong          = 32,
   ZSTD_error_dictionaryCreation_failed = 34,
   ZSTD_error_parameter_unsupported   = 40,
+  ZSTD_error_parameter_combination_unsupported = 41,
   ZSTD_error_parameter_outOfBound    = 42,
   ZSTD_error_tableLog_tooLarge       = 44,
   ZSTD_error_maxSymbolValue_tooLarge = 46,
   ZSTD_error_maxSymbolValue_tooSmall = 48,
+  ZSTD_error_stabilityCondition_notRespected = 50,
   ZSTD_error_stage_wrong       = 60,
   ZSTD_error_init_missing      = 62,
   ZSTD_error_memory_allocation = 64,
@@ -58,11 +71,15 @@ typedef enum {
   ZSTD_error_dstSize_tooSmall = 70,
   ZSTD_error_srcSize_wrong    = 72,
   ZSTD_error_dstBuffer_null   = 74,
+  ZSTD_error_noForwardProgress_destFull = 80,
+  ZSTD_error_noForwardProgress_inputEmpty = 82,
   /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
   ZSTD_error_frameIndex_tooLarge = 100,
   ZSTD_error_seekableIO          = 102,
   ZSTD_error_dstBuffer_wrong     = 104,
   ZSTD_error_srcBuffer_wrong     = 105,
+  ZSTD_error_sequenceProducer_failed = 106,
+  ZSTD_error_externalSequences_invalid = 107,
   ZSTD_error_maxCode = 120  /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
 } ZSTD_ErrorCode;
 
diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h
index 79d55465d5c1..738fe8ea4ead 100644
--- a/include/linux/zstd_lib.h
+++ b/include/linux/zstd_lib.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -11,23 +12,42 @@
 #ifndef ZSTD_H_235446
 #define ZSTD_H_235446
 
-/* ======   Dependency   ======*/
+/* ======   Dependencies   ======*/
 #include <linux/limits.h>   /* INT_MAX */
 #include <linux/types.h>   /* size_t */
 
 
 /* =====   ZSTDLIB_API : control library symbols visibility   ===== */
-#ifndef ZSTDLIB_VISIBLE
+#define ZSTDLIB_VISIBLE 
+
+#ifndef ZSTDLIB_HIDDEN
 #  if (__GNUC__ >= 4) && !defined(__MINGW32__)
-#    define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default")))
 #    define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden")))
 #  else
-#    define ZSTDLIB_VISIBLE
 #    define ZSTDLIB_HIDDEN
 #  endif
 #endif
+
 #define ZSTDLIB_API ZSTDLIB_VISIBLE
 
+/* Deprecation warnings :
+ * Should these warnings be a problem, it is generally possible to disable them,
+ * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual.
+ * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS.
+ */
+#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS
+#  define ZSTD_DEPRECATED(message) /* disable deprecation warnings */
+#else
+#  if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__)
+#    define ZSTD_DEPRECATED(message) __attribute__((deprecated(message)))
+#  elif (__GNUC__ >= 3)
+#    define ZSTD_DEPRECATED(message) __attribute__((deprecated))
+#  else
+#    pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler")
+#    define ZSTD_DEPRECATED(message)
+#  endif
+#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */
+
 
 /* *****************************************************************************
   Introduction
@@ -65,7 +85,7 @@
 /*------   Version   ------*/
 #define ZSTD_VERSION_MAJOR    1
 #define ZSTD_VERSION_MINOR    5
-#define ZSTD_VERSION_RELEASE  2
+#define ZSTD_VERSION_RELEASE  5
 #define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
 
 /*! ZSTD_versionNumber() :
@@ -107,7 +127,8 @@ ZSTDLIB_API const char* ZSTD_versionString(void);
 ***************************************/
 /*! ZSTD_compress() :
  *  Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
- *  Hint : compression runs faster if `dstCapacity` >=  `ZSTD_compressBound(srcSize)`.
+ *  NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have
+ *        enough space to successfully compress the data.
  *  @return : compressed size written into `dst` (<= `dstCapacity),
  *            or an error code if it fails (which can be tested using ZSTD_isError()). */
 ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity,
@@ -156,7 +177,9 @@ ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t
  *  "empty", "unknown" and "error" results to the same return value (0),
  *  while ZSTD_getFrameContentSize() gives them separate return values.
  * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */
-ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
+ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize")
+ZSTDLIB_API
+unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
 
 /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+
  * `src` should point to the start of a ZSTD frame or skippable frame.
@@ -168,8 +191,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize)
 
 
 /*======  Helper functions  ======*/
-#define ZSTD_COMPRESSBOUND(srcSize)   ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
-ZSTDLIB_API size_t      ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
+/* ZSTD_compressBound() :
+ * maximum compressed size in worst case single-pass scenario.
+ * When invoking `ZSTD_compress()` or any other one-pass compression function,
+ * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize)
+ * as it eliminates one potential failure scenario,
+ * aka not enough room in dst buffer to write the compressed frame.
+ * Note : ZSTD_compressBound() itself can fail, if @srcSize > ZSTD_MAX_INPUT_SIZE .
+ *        In which case, ZSTD_compressBound() will return an error code
+ *        which can be tested using ZSTD_isError().
+ *
+ * ZSTD_COMPRESSBOUND() :
+ * same as ZSTD_compressBound(), but as a macro.
+ * It can be used to produce constants, which can be useful for static allocation,
+ * for example to size a static array on stack.
+ * Will produce constant value 0 if srcSize too large.
+ */
+#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00LLU : 0xFF00FF00U)
+#define ZSTD_COMPRESSBOUND(srcSize)   (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
+ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
+/* ZSTD_isError() :
+ * Most ZSTD_* functions returning a size_t value can be tested for error,
+ * using ZSTD_isError().
+ * @return 1 if error, 0 otherwise
+ */
 ZSTDLIB_API unsigned    ZSTD_isError(size_t code);          /*!< tells if a `size_t` function result is an error code */
 ZSTDLIB_API const char* ZSTD_getErrorName(size_t code);     /*!< provides readable string from an error code */
 ZSTDLIB_API int         ZSTD_minCLevel(void);               /*!< minimum negative compression level allowed, requires v1.4.0+ */
@@ -412,6 +457,9 @@ typedef enum {
      * ZSTD_c_validateSequences
      * ZSTD_c_useBlockSplitter
      * ZSTD_c_useRowMatchFinder
+     * ZSTD_c_prefetchCDictTables
+     * ZSTD_c_enableSeqProducerFallback
+     * ZSTD_c_maxBlockSize
      * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
      * note : never ever use experimentalParam? names directly;
      *        also, the enums values themselves are unstable and can still change.
@@ -430,7 +478,11 @@ typedef enum {
      ZSTD_c_experimentalParam12=1009,
      ZSTD_c_experimentalParam13=1010,
      ZSTD_c_experimentalParam14=1011,
-     ZSTD_c_experimentalParam15=1012
+     ZSTD_c_experimentalParam15=1012,
+     ZSTD_c_experimentalParam16=1013,
+     ZSTD_c_experimentalParam17=1014,
+     ZSTD_c_experimentalParam18=1015,
+     ZSTD_c_experimentalParam19=1016
 } ZSTD_cParameter;
 
 typedef struct {
@@ -493,7 +545,7 @@ typedef enum {
  *                  They will be used to compress next frame.
  *                  Resetting session never fails.
  *  - The parameters : changes all parameters back to "default".
- *                  This removes any reference to any dictionary too.
+ *                  This also removes any reference to any dictionary or external sequence producer.
  *                  Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing)
  *                  otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError())
  *  - Both : similar to resetting the session, followed by resetting parameters.
@@ -506,7 +558,8 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset);
  *  Should cctx hold data from a previously unfinished frame, everything about it is forgotten.
  *  - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
  *  - The function is always blocking, returns when compression is completed.
- *  Hint : compression runs faster if `dstCapacity` >=  `ZSTD_compressBound(srcSize)`.
+ *  NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have
+ *        enough space to successfully compress the data, though it is possible it fails for other reasons.
  * @return : compressed size written into `dst` (<= `dstCapacity),
  *           or an error code if it fails (which can be tested using ZSTD_isError()).
  */
@@ -543,13 +596,15 @@ typedef enum {
      * ZSTD_d_stableOutBuffer
      * ZSTD_d_forceIgnoreChecksum
      * ZSTD_d_refMultipleDDicts
+     * ZSTD_d_disableHuffmanAssembly
      * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
      * note : never ever use experimentalParam? names directly
      */
      ZSTD_d_experimentalParam1=1000,
      ZSTD_d_experimentalParam2=1001,
      ZSTD_d_experimentalParam3=1002,
-     ZSTD_d_experimentalParam4=1003
+     ZSTD_d_experimentalParam4=1003,
+     ZSTD_d_experimentalParam5=1004
 
 } ZSTD_dParameter;
 
@@ -728,8 +783,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void);   /*< recommended size for output
  * This following is a legacy streaming API, available since v1.0+ .
  * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2().
  * It is redundant, but remains fully supported.
- * Streaming in combination with advanced parameters and dictionary compression
- * can only be used through the new API.
  ******************************************************************************/
 
 /*!
@@ -738,6 +791,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void);   /*< recommended size for output
  *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
  *     ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
  *     ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ *
+ * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API
+ * to compress with a dictionary.
  */
 ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel);
 /*!
@@ -788,13 +844,31 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds);  /* accept NULL pointer
 
 /*===== Streaming decompression functions =====*/
 
-/* This function is redundant with the advanced API and equivalent to:
+/*! ZSTD_initDStream() :
+ * Initialize/reset DStream state for new decompression operation.
+ * Call before new decompression operation using same DStream.
  *
+ * Note : This function is redundant with the advanced API and equivalent to:
  *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
  *     ZSTD_DCtx_refDDict(zds, NULL);
  */
 ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);
 
+/*! ZSTD_decompressStream() :
+ * Streaming decompression function.
+ * Call repetitively to consume full input updating it as necessary.
+ * Function will update both input and output `pos` fields exposing current state via these fields:
+ * - `input.pos < input.size`, some input remaining and caller should provide remaining input
+ *   on the next call.
+ * - `output.pos < output.size`, decoder finished and flushed all remaining buffers.
+ * - `output.pos == output.size`, potentially uncflushed data present in the internal buffers,
+ *   call ZSTD_decompressStream() again to flush remaining data to output.
+ * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX.
+ *
+ * @return : 0 when a frame is completely decoded and fully flushed,
+ *           or an error code, which can be tested using ZSTD_isError(),
+ *           or any other value > 0, which means there is some decoding or flushing to do to complete current frame.
+ */
 ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
 
 ZSTDLIB_API size_t ZSTD_DStreamInSize(void);    /*!< recommended size for input buffer */
@@ -913,7 +987,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
  *  If @return == 0, the dictID could not be decoded.
  *  This could for one of the following reasons :
  *  - The frame does not require a dictionary to be decoded (most common case).
- *  - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information.
+ *  - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information.
  *    Note : this use case also happens when using a non-conformant dictionary.
  *  - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
  *  - This is not a Zstandard frame.
@@ -925,9 +999,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
  * Advanced dictionary and prefix API (Requires v1.4.0+)
  *
  * This API allows dictionaries to be used with ZSTD_compress2(),
- * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and
- * only reset with the context is reset with ZSTD_reset_parameters or
- * ZSTD_reset_session_and_parameters. Prefixes are single-use.
+ * ZSTD_compressStream2(), and ZSTD_decompressDCtx().
+ * Dictionaries are sticky, they remain valid when same context is re-used,
+ * they only reset when the context is reset
+ * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters.
+ * In contrast, Prefixes are single-use.
  ******************************************************************************/
 
 
@@ -937,8 +1013,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
  * @result : 0, or an error code (which can be tested with ZSTD_isError()).
  *  Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary,
  *           meaning "return to no-dictionary mode".
- *  Note 1 : Dictionary is sticky, it will be used for all future compressed frames.
- *           To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters).
+ *  Note 1 : Dictionary is sticky, it will be used for all future compressed frames,
+ *           until parameters are reset, a new dictionary is loaded, or the dictionary
+ *           is explicitly invalidated by loading a NULL dictionary.
  *  Note 2 : Loading a dictionary involves building tables.
  *           It's also a CPU consuming operation, with non-negligible impact on latency.
  *           Tables are dependent on compression parameters, and for this reason,
@@ -947,11 +1024,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
  *           Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead.
  *           In such a case, dictionary buffer must outlive its users.
  *  Note 4 : Use ZSTD_CCtx_loadDictionary_advanced()
- *           to precisely select how dictionary content must be interpreted. */
+ *           to precisely select how dictionary content must be interpreted.
+ *  Note 5 : This method does not benefit from LDM (long distance mode).
+ *           If you want to employ LDM on some large dictionary content,
+ *           prefer employing ZSTD_CCtx_refPrefix() described below.
+ */
 ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
 
 /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+
- *  Reference a prepared dictionary, to be used for all next compressed frames.
+ *  Reference a prepared dictionary, to be used for all future compressed frames.
  *  Note that compression parameters are enforced from within CDict,
  *  and supersede any compression parameter previously set within CCtx.
  *  The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs.
@@ -970,6 +1051,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
  *  Decompression will need same prefix to properly regenerate data.
  *  Compressing with a prefix is similar in outcome as performing a diff and compressing it,
  *  but performs much faster, especially during decompression (compression speed is tunable with compression level).
+ *  This method is compatible with LDM (long distance mode).
  * @result : 0, or an error code (which can be tested with ZSTD_isError()).
  *  Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
  *  Note 1 : Prefix buffer is referenced. It **must** outlive compression.
@@ -986,9 +1068,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
                                  const void* prefix, size_t prefixSize);
 
 /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+
- *  Create an internal DDict from dict buffer,
- *  to be used to decompress next frames.
- *  The dictionary remains valid for all future frames, until explicitly invalidated.
+ *  Create an internal DDict from dict buffer, to be used to decompress all future frames.
+ *  The dictionary remains valid for all future frames, until explicitly invalidated, or
+ *  a new dictionary is loaded.
  * @result : 0, or an error code (which can be tested with ZSTD_isError()).
  *  Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary,
  *            meaning "return to no-dictionary mode".
@@ -1012,9 +1094,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s
  *  The memory for the table is allocated on the first call to refDDict, and can be
  *  freed with ZSTD_freeDCtx().
  *
+ *  If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary
+ *  will be managed, and referencing a dictionary effectively "discards" any previous one.
+ *
  * @result : 0, or an error code (which can be tested with ZSTD_isError()).
- *  Note 1 : Currently, only one dictionary can be managed.
- *           Referencing a new dictionary effectively "discards" any previous one.
  *  Special: referencing a NULL DDict means "return to no-dictionary mode".
  *  Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx.
  */
@@ -1071,24 +1154,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
 #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE
 #endif
 
-/* Deprecation warnings :
- * Should these warnings be a problem, it is generally possible to disable them,
- * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual.
- * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS.
- */
-#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS
-#  define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API  /* disable deprecation warnings */
-#else
-#  if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__)
-#    define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message)))
-#  elif (__GNUC__ >= 3)
-#    define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated))
-#  else
-#    pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler")
-#    define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API
-#  endif
-#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */
-
 /* **************************************************************************************
  *   experimental API (static linking only)
  ****************************************************************************************
@@ -1123,6 +1188,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
 #define ZSTD_TARGETLENGTH_MIN     0   /* note : comparing this constant to an unsigned results in a tautological test */
 #define ZSTD_STRATEGY_MIN        ZSTD_fast
 #define ZSTD_STRATEGY_MAX        ZSTD_btultra2
+#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */
 
 
 #define ZSTD_OVERLAPLOG_MIN       0
@@ -1303,7 +1369,7 @@ typedef enum {
 } ZSTD_paramSwitch_e;
 
 /* *************************************
-*  Frame size functions
+*  Frame header and size functions
 ***************************************/
 
 /*! ZSTD_findDecompressedSize() :
@@ -1350,29 +1416,109 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size
  *           or an error code (if srcSize is too small) */
 ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
 
+typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;
+typedef struct {
+    unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
+    unsigned long long windowSize;       /* can be very large, up to <= frameContentSize */
+    unsigned blockSizeMax;
+    ZSTD_frameType_e frameType;          /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
+    unsigned headerSize;
+    unsigned dictID;
+    unsigned checksumFlag;
+    unsigned _reserved1;
+    unsigned _reserved2;
+} ZSTD_frameHeader;
+
+/*! ZSTD_getFrameHeader() :
+ *  decode Frame Header, or requires larger `srcSize`.
+ * @return : 0, `zfhPtr` is correctly filled,
+ *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
+ *           or an error code, which can be tested using ZSTD_isError() */
+ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize);   /*< doesn't consume input */
+/*! ZSTD_getFrameHeader_advanced() :
+ *  same as ZSTD_getFrameHeader(),
+ *  with added capability to select a format (like ZSTD_f_zstd1_magicless) */
+ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format);
+
+/*! ZSTD_decompressionMargin() :
+ * Zstd supports in-place decompression, where the input and output buffers overlap.
+ * In this case, the output buffer must be at least (Margin + Output_Size) bytes large,
+ * and the input buffer must be at the end of the output buffer.
+ *
+ *  _______________________ Output Buffer ________________________
+ * |                                                              |
+ * |                                        ____ Input Buffer ____|
+ * |                                       |                      |
+ * v                                       v                      v
+ * |---------------------------------------|-----------|----------|
+ * ^                                                   ^          ^
+ * |___________________ Output_Size ___________________|_ Margin _|
+ *
+ * NOTE: See also ZSTD_DECOMPRESSION_MARGIN().
+ * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or
+ * ZSTD_decompressDCtx().
+ * NOTE: This function supports multi-frame input.
+ *
+ * @param src The compressed frame(s)
+ * @param srcSize The size of the compressed frame(s)
+ * @returns The decompression margin or an error that can be checked with ZSTD_isError().
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize);
+
+/*! ZSTD_DECOMPRESS_MARGIN() :
+ * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from
+ * the compressed frame, compute it from the original size and the blockSizeLog.
+ * See ZSTD_decompressionMargin() for details.
+ *
+ * WARNING: This macro does not support multi-frame input, the input must be a single
+ * zstd frame. If you need that support use the function, or implement it yourself.
+ *
+ * @param originalSize The original uncompressed size of the data.
+ * @param blockSize    The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX).
+ *                     Unless you explicitly set the windowLog smaller than
+ *                     ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX.
+ */
+#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)(                                              \
+        ZSTD_FRAMEHEADERSIZE_MAX                                                              /* Frame header */ + \
+        4                                                                                         /* checksum */ + \
+        ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \
+        (blockSize)                                                                    /* One block of margin */   \
+    ))
+
 typedef enum {
   ZSTD_sf_noBlockDelimiters = 0,         /* Representation of ZSTD_Sequence has no block delimiters, sequences only */
   ZSTD_sf_explicitBlockDelimiters = 1    /* Representation of ZSTD_Sequence contains explicit block delimiters */
 } ZSTD_sequenceFormat_e;
 
+/*! ZSTD_sequenceBound() :
+ * `srcSize` : size of the input buffer
+ *  @return : upper-bound for the number of sequences that can be generated
+ *            from a buffer of srcSize bytes
+ *
+ *  note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence).
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize);
+
 /*! ZSTD_generateSequences() :
- * Generate sequences using ZSTD_compress2, given a source buffer.
+ * Generate sequences using ZSTD_compress2(), given a source buffer.
  *
  * Each block will end with a dummy sequence
  * with offset == 0, matchLength == 0, and litLength == length of last literals.
  * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0)
  * simply acts as a block delimiter.
  *
- * zc can be used to insert custom compression params.
- * This function invokes ZSTD_compress2
+ * @zc can be used to insert custom compression params.
+ * This function invokes ZSTD_compress2().
  *
  * The output of this function can be fed into ZSTD_compressSequences() with CCtx
  * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters
  * @return : number of sequences generated
  */
 
-ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
-                                          size_t outSeqsSize, const void* src, size_t srcSize);
+ZSTDLIB_STATIC_API size_t
+ZSTD_generateSequences( ZSTD_CCtx* zc,
+                        ZSTD_Sequence* outSeqs, size_t outSeqsSize,
+                        const void* src, size_t srcSize);
 
 /*! ZSTD_mergeBlockDelimiters() :
  * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals
@@ -1388,7 +1534,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o
 ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize);
 
 /*! ZSTD_compressSequences() :
- * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst.
+ * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst.
+ * @src contains the entire input (not just the literals).
+ * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals
  * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.)
  * The entire source is compressed into a single frame.
  *
@@ -1413,11 +1561,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si
  * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused.
  * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly,
  *         and cannot emit an RLE block that disagrees with the repcode history
- * @return : final compressed size or a ZSTD error.
+ * @return : final compressed size, or a ZSTD error code.
  */
-ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize,
-                                  const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
-                                  const void* src, size_t srcSize);
+ZSTDLIB_STATIC_API size_t
+ZSTD_compressSequences( ZSTD_CCtx* cctx, void* dst, size_t dstSize,
+                        const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
+                        const void* src, size_t srcSize);
 
 
 /*! ZSTD_writeSkippableFrame() :
@@ -1481,8 +1630,11 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size);
  *  and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter().
  *  Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits.
  *
- *  Note 2 : only single-threaded compression is supported.
+ *  Note : only single-threaded compression is supported.
  *  ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
+ *
+ *  Note 2 : ZSTD_estimateCCtxSize* functions are not compatible with the Block-Level Sequence Producer API at this time.
+ *  Size estimates assume that no external sequence producer is registered.
  */
 ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
 ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams);
@@ -1501,7 +1653,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void);
  *  or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
  *  Note : if streaming is init with function ZSTD_init?Stream_usingDict(),
  *         an internal ?Dict will be created, which additional size is not estimated here.
- *         In this case, get total size by adding ZSTD_estimate?DictSize */
+ *         In this case, get total size by adding ZSTD_estimate?DictSize
+ *  Note 2 : only single-threaded compression is supported.
+ *  ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
+ *  Note 3 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time.
+ *  Size estimates assume that no external sequence producer is registered.
+ */
 ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
 ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams);
 ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params);
@@ -1649,22 +1806,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params);
  *  This function never fails (wide contract) */
 ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize);
 
+/*! ZSTD_CCtx_setCParams() :
+ *  Set all parameters provided within @p cparams into the working @p cctx.
+ *  Note : if modifying parameters during compression (MT mode only),
+ *         note that changes to the .windowLog parameter will be ignored.
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()).
+ *         On failure, no parameters are updated.
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams);
+
+/*! ZSTD_CCtx_setFParams() :
+ *  Set all parameters provided within @p fparams into the working @p cctx.
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()).
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams);
+
+/*! ZSTD_CCtx_setParams() :
+ *  Set all parameters provided within @p params into the working @p cctx.
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()).
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params);
+
 /*! ZSTD_compress_advanced() :
  *  Note : this function is now DEPRECATED.
  *         It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters.
  *  This prototype will generate compilation warnings. */
 ZSTD_DEPRECATED("use ZSTD_compress2")
+ZSTDLIB_STATIC_API
 size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx,
-                                          void* dst, size_t dstCapacity,
-                                    const void* src, size_t srcSize,
-                                    const void* dict,size_t dictSize,
-                                          ZSTD_parameters params);
+                              void* dst, size_t dstCapacity,
+                        const void* src, size_t srcSize,
+                        const void* dict,size_t dictSize,
+                              ZSTD_parameters params);
 
 /*! ZSTD_compress_usingCDict_advanced() :
  *  Note : this function is now DEPRECATED.
  *         It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters.
  *  This prototype will generate compilation warnings. */
 ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary")
+ZSTDLIB_STATIC_API
 size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
                                               void* dst, size_t dstCapacity,
                                         const void* src, size_t srcSize,
@@ -1808,13 +1988,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
  * Experimental parameter.
  * Default is 0 == disabled. Set to 1 to enable.
  *
- * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same
- * between calls, except for the modifications that zstd makes to pos (the
- * caller must not modify pos). This is checked by the compressor, and
- * compression will fail if it ever changes. This means the only flush
- * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end
- * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos)
- * MUST not be modified during compression or you will get data corruption.
+ * Tells the compressor that input data presented with ZSTD_inBuffer
+ * will ALWAYS be the same between calls.
+ * Technically, the @src pointer must never be changed,
+ * and the @pos field can only be updated by zstd.
+ * However, it's possible to increase the @size field,
+ * allowing scenarios where more data can be appended after compressions starts.
+ * These conditions are checked by the compressor,
+ * and compression will fail if they are not respected.
+ * Also, data in the ZSTD_inBuffer within the range [src, src + pos)
+ * MUST not be modified during compression or it will result in data corruption.
  *
  * When this flag is enabled zstd won't allocate an input window buffer,
  * because the user guarantees it can reference the ZSTD_inBuffer until
@@ -1822,18 +2005,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
  * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also
  * avoid the memcpy() from the input buffer to the input window buffer.
  *
- * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used.
- * That means this flag cannot be used with ZSTD_compressStream().
- *
  * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using
  * this flag is ALWAYS memory safe, and will never access out-of-bounds
- * memory. However, compression WILL fail if you violate the preconditions.
+ * memory. However, compression WILL fail if conditions are not respected.
  *
- * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST
- * not be modified during compression or you will get data corruption. This
- * is because zstd needs to reference data in the ZSTD_inBuffer to find
+ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST
+ * not be modified during compression or it will result in data corruption.
+ * This is because zstd needs to reference data in the ZSTD_inBuffer to find
  * matches. Normally zstd maintains its own window buffer for this purpose,
- * but passing this flag tells zstd to use the user provided buffer.
+ * but passing this flag tells zstd to rely on user provided buffer instead.
  */
 #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9
 
@@ -1878,7 +2058,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
  * Without validation, providing a sequence that does not conform to the zstd spec will cause
  * undefined behavior, and may produce a corrupted block.
  *
- * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for
+ * With validation enabled, if sequence is invalid (see doc/zstd_compression_format.md for
  * specifics regarding offset/matchlength requirements) then the function will bail out and
  * return an error.
  *
@@ -1928,6 +2108,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
  */
 #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15
 
+/* ZSTD_c_prefetchCDictTables
+ * Controlled with ZSTD_paramSwitch_e enum. Default is ZSTD_ps_auto.
+ *
+ * In some situations, zstd uses CDict tables in-place rather than copying them
+ * into the working context. (See docs on ZSTD_dictAttachPref_e above for details).
+ * In such situations, compression speed is seriously impacted when CDict tables are
+ * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables
+ * when they are used in-place.
+ *
+ * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit.
+ * For sufficiently large inputs, zstd will by default memcpy() CDict tables
+ * into the working context, so there is no need to prefetch. This parameter is
+ * targeted at a middle range of input sizes, where a prefetch is cheap enough to be
+ * useful but memcpy() is too expensive. The exact range of input sizes where this
+ * makes sense is best determined by careful experimentation.
+ *
+ * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable,
+ * but in the future zstd may conditionally enable this feature via an auto-detection
+ * heuristic for cold CDicts.
+ * Use ZSTD_ps_disable to opt out of prefetching under any circumstances.
+ */
+#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16
+
+/* ZSTD_c_enableSeqProducerFallback
+ * Allowed values are 0 (disable) and 1 (enable). The default setting is 0.
+ *
+ * Controls whether zstd will fall back to an internal sequence producer if an
+ * external sequence producer is registered and returns an error code. This fallback
+ * is block-by-block: the internal sequence producer will only be called for blocks
+ * where the external sequence producer returns an error code. Fallback parsing will
+ * follow any other cParam settings, such as compression level, the same as in a
+ * normal (fully-internal) compression operation.
+ *
+ * The user is strongly encouraged to read the full Block-Level Sequence Producer API
+ * documentation (below) before setting this parameter. */
+#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17
+
+/* ZSTD_c_maxBlockSize
+ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB).
+ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default.
+ *
+ * This parameter can be used to set an upper bound on the blocksize
+ * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper
+ * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make
+ * compressBound() inaccurate). Only currently meant to be used for testing.
+ *
+ */
+#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18
+
+/* ZSTD_c_searchForExternalRepcodes
+ * This parameter affects how zstd parses external sequences, such as sequences
+ * provided through the compressSequences() API or from an external block-level
+ * sequence producer.
+ *
+ * If set to ZSTD_ps_enable, the library will check for repeated offsets in
+ * external sequences, even if those repcodes are not explicitly indicated in
+ * the "rep" field. Note that this is the only way to exploit repcode matches
+ * while using compressSequences() or an external sequence producer, since zstd
+ * currently ignores the "rep" field of external sequences.
+ *
+ * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in
+ * external sequences, regardless of whether the "rep" field has been set. This
+ * reduces sequence compression overhead by about 25% while sacrificing some
+ * compression ratio.
+ *
+ * The default value is ZSTD_ps_auto, for which the library will enable/disable
+ * based on compression level.
+ *
+ * Note: for now, this param only has an effect if ZSTD_c_blockDelimiters is
+ * set to ZSTD_sf_explicitBlockDelimiters. That may change in the future.
+ */
+#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19
+
 /*! ZSTD_CCtx_getParameter() :
  *  Get the requested compression parameter value, selected by enum ZSTD_cParameter,
  *  and store it into int* value.
@@ -2084,7 +2337,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
  * in the range [dst, dst + pos) MUST not be modified during decompression
  * or you will get data corruption.
  *
- * When this flags is enabled zstd won't allocate an output buffer, because
+ * When this flag is enabled zstd won't allocate an output buffer, because
  * it can write directly to the ZSTD_outBuffer, but it will still allocate
  * an input buffer large enough to fit any compressed block. This will also
  * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer.
@@ -2137,6 +2390,17 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
  */
 #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4
 
+/* ZSTD_d_disableHuffmanAssembly
+ * Set to 1 to disable the Huffman assembly implementation.
+ * The default value is 0, which allows zstd to use the Huffman assembly
+ * implementation if available.
+ *
+ * This parameter can be used to disable Huffman assembly at runtime.
+ * If you want to disable it at compile time you can define the macro
+ * ZSTD_DISABLE_ASM.
+ */
+#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5
+
 
 /*! ZSTD_DCtx_setFormat() :
  *  This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter().
@@ -2145,6 +2409,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
  *  such ZSTD_f_zstd1_magicless for example.
  * @return : 0, or an error code (which can be tested using ZSTD_isError()). */
 ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead")
+ZSTDLIB_STATIC_API
 size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format);
 
 /*! ZSTD_decompressStream_simpleArgs() :
@@ -2181,6 +2446,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs (
  * This prototype will generate compilation warnings.
  */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
                          int compressionLevel,
                          unsigned long long pledgedSrcSize);
@@ -2198,17 +2464,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
  * This prototype will generate compilation warnings.
  */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
                      const void* dict, size_t dictSize,
                            int compressionLevel);
 
 /*! ZSTD_initCStream_advanced() :
- * This function is DEPRECATED, and is approximately equivalent to:
+ * This function is DEPRECATED, and is equivalent to:
  *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
- *     // Pseudocode: Set each zstd parameter and leave the rest as-is.
- *     for ((param, value) : params) {
- *         ZSTD_CCtx_setParameter(zcs, param, value);
- *     }
+ *     ZSTD_CCtx_setParams(zcs, params);
  *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
  *     ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
  *
@@ -2218,6 +2482,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
  * This prototype will generate compilation warnings.
  */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
                     const void* dict, size_t dictSize,
                           ZSTD_parameters params,
@@ -2232,15 +2497,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
  * This prototype will generate compilation warnings.
  */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);
 
 /*! ZSTD_initCStream_usingCDict_advanced() :
- *   This function is DEPRECATED, and is approximately equivalent to:
+ *   This function is DEPRECATED, and is equivalent to:
  *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
- *     // Pseudocode: Set each zstd frame parameter and leave the rest as-is.
- *     for ((fParam, value) : fParams) {
- *         ZSTD_CCtx_setParameter(zcs, fParam, value);
- *     }
+ *     ZSTD_CCtx_setFParams(zcs, fParams);
  *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
  *     ZSTD_CCtx_refCDict(zcs, cdict);
  *
@@ -2250,6 +2513,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);
  * This prototype will generate compilation warnings.
  */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
                                const ZSTD_CDict* cdict,
                                      ZSTD_frameParameters fParams,
@@ -2274,6 +2538,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
  *  This prototype will generate compilation warnings.
  */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);
 
 
@@ -2319,8 +2584,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx);
  *     ZSTD_DCtx_loadDictionary(zds, dict, dictSize);
  *
  * note: no dictionary will be used if dict == NULL or dictSize < 8
- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
  */
+ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions")
 ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize);
 
 /*!
@@ -2330,27 +2595,185 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo
  *     ZSTD_DCtx_refDDict(zds, ddict);
  *
  * note : ddict is referenced, it must outlive decompression session
- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
  */
+ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions")
 ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict);
 
-/*!
- * This function is deprecated, and is equivalent to:
- *
- *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
- *
- * re-use decompression parameters from previous init; saves dictionary loading
- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
- */
-ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
+
+/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API *********************
+ *
+ * *** OVERVIEW ***
+ * The Block-Level Sequence Producer API allows users to provide their own custom
+ * sequence producer which libzstd invokes to process each block. The produced list
+ * of sequences (literals and matches) is then post-processed by libzstd to produce
+ * valid compressed blocks.
+ *
+ * This block-level offload API is a more granular complement of the existing
+ * frame-level offload API compressSequences() (introduced in v1.5.1). It offers
+ * an easier migration story for applications already integrated with libzstd: the
+ * user application continues to invoke the same compression functions
+ * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits
+ * from the specific advantages of the external sequence producer. For example,
+ * the sequence producer could be tuned to take advantage of known characteristics
+ * of the input, to offer better speed / ratio, or could leverage hardware
+ * acceleration not available within libzstd itself.
+ *
+ * See contrib/externalSequenceProducer for an example program employing the
+ * Block-Level Sequence Producer API.
+ *
+ * *** USAGE ***
+ * The user is responsible for implementing a function of type
+ * ZSTD_sequenceProducer_F. For each block, zstd will pass the following
+ * arguments to the user-provided function:
+ *
+ *   - sequenceProducerState: a pointer to a user-managed state for the sequence
+ *     producer.
+ *
+ *   - outSeqs, outSeqsCapacity: an output buffer for the sequence producer.
+ *     outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). The memory
+ *     backing outSeqs is managed by the CCtx.
+ *
+ *   - src, srcSize: an input buffer for the sequence producer to parse.
+ *     srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX.
+ *
+ *   - dict, dictSize: a history buffer, which may be empty, which the sequence
+ *     producer may reference as it parses the src buffer. Currently, zstd will
+ *     always pass dictSize == 0 into external sequence producers, but this will
+ *     change in the future.
+ *
+ *   - compressionLevel: a signed integer representing the zstd compression level
+ *     set by the user for the current operation. The sequence producer may choose
+ *     to use this information to change its compression strategy and speed/ratio
+ *     tradeoff. Note: the compression level does not reflect zstd parameters set
+ *     through the advanced API.
+ *
+ *   - windowSize: a size_t representing the maximum allowed offset for external
+ *     sequences. Note that sequence offsets are sometimes allowed to exceed the
+ *     windowSize if a dictionary is present, see doc/zstd_compression_format.md
+ *     for details.
+ *
+ * The user-provided function shall return a size_t representing the number of
+ * sequences written to outSeqs. This return value will be treated as an error
+ * code if it is greater than outSeqsCapacity. The return value must be non-zero
+ * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided
+ * for convenience, but any value greater than outSeqsCapacity will be treated as
+ * an error code.
+ *
+ * If the user-provided function does not return an error code, the sequences
+ * written to outSeqs must be a valid parse of the src buffer. Data corruption may
+ * occur if the parse is not valid. A parse is defined to be valid if the
+ * following conditions hold:
+ *   - The sum of matchLengths and literalLengths must equal srcSize.
+ *   - All sequences in the parse, except for the final sequence, must have
+ *     matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have
+ *     matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0.
+ *   - All offsets must respect the windowSize parameter as specified in
+ *     doc/zstd_compression_format.md.
+ *   - If the final sequence has matchLength == 0, it must also have offset == 0.
+ *
+ * zstd will only validate these conditions (and fail compression if they do not
+ * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence
+ * validation has a performance cost.
+ *
+ * If the user-provided function returns an error, zstd will either fall back
+ * to an internal sequence producer or fail the compression operation. The user can
+ * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback
+ * cParam. Fallback compression will follow any other cParam settings, such as
+ * compression level, the same as in a normal compression operation.
+ *
+ * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F
+ * function by calling
+ *         ZSTD_registerSequenceProducer(cctx,
+ *                                       sequenceProducerState,
+ *                                       sequenceProducer)
+ * This setting will persist until the next parameter reset of the CCtx.
+ *
+ * The sequenceProducerState must be initialized by the user before calling
+ * ZSTD_registerSequenceProducer(). The user is responsible for destroying the
+ * sequenceProducerState.
+ *
+ * *** LIMITATIONS ***
+ * This API is compatible with all zstd compression APIs which respect advanced parameters.
+ * However, there are three limitations:
+ *
+ * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported.
+ * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level
+ * external sequence producer.
+ *   - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some
+ *     cases (see its documentation for details). Users must explicitly set
+ *     ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external
+ *     sequence producer is registered.
+ *   - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default
+ *     whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should
+ *     check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence
+ *     Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog).
+ *
+ * Second, history buffers are not currently supported. Concretely, zstd will always pass
+ * dictSize == 0 to the external sequence producer (for now). This has two implications:
+ *   - Dictionaries are not currently supported. Compression will *not* fail if the user
+ *     references a dictionary, but the dictionary won't have any effect.
+ *   - Stream history is not currently supported. All advanced compression APIs, including
+ *     streaming APIs, work with external sequence producers, but each block is treated as
+ *     an independent chunk without history from previous blocks.
+ *
+ * Third, multi-threading within a single compression is not currently supported. In other words,
+ * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered.
+ * Multi-threading across compressions is fine: simply create one CCtx per thread.
+ *
+ * Long-term, we plan to overcome all three limitations. There is no technical blocker to
+ * overcoming them. It is purely a question of engineering effort.
+ */
+
+#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1))
+
+typedef size_t ZSTD_sequenceProducer_F (
+  void* sequenceProducerState,
+  ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
+  const void* src, size_t srcSize,
+  const void* dict, size_t dictSize,
+  int compressionLevel,
+  size_t windowSize
+);
+
+/*! ZSTD_registerSequenceProducer() :
+ * Instruct zstd to use a block-level external sequence producer function.
+ *
+ * The sequenceProducerState must be initialized by the caller, and the caller is
+ * responsible for managing its lifetime. This parameter is sticky across
+ * compressions. It will remain set until the user explicitly resets compression
+ * parameters.
+ *
+ * Sequence producer registration is considered to be an "advanced parameter",
+ * part of the "advanced API". This means it will only have an effect on compression
+ * APIs which respect advanced parameters, such as compress2() and compressStream2().
+ * Older compression APIs such as compressCCtx(), which predate the introduction of
+ * "advanced parameters", will ignore any external sequence producer setting.
+ *
+ * The sequence producer can be "cleared" by registering a NULL function pointer. This
+ * removes all limitations described above in the "LIMITATIONS" section of the API docs.
+ *
+ * The user is strongly encouraged to read the full API documentation (above) before
+ * calling this function. */
+ZSTDLIB_STATIC_API void
+ZSTD_registerSequenceProducer(
+  ZSTD_CCtx* cctx,
+  void* sequenceProducerState,
+  ZSTD_sequenceProducer_F* sequenceProducer
+);
 
 
 /* *******************************************************************
-*  Buffer-less and synchronous inner streaming functions
+*  Buffer-less and synchronous inner streaming functions (DEPRECATED)
 *
-*  This is an advanced API, giving full control over buffer management, for users which need direct control over memory.
-*  But it's also a complex one, with several restrictions, documented below.
-*  Prefer normal streaming API for an easier experience.
+*  This API is deprecated, and will be removed in a future version.
+*  It allows streaming (de)compression with user allocated buffers.
+*  However, it is hard to use, and not as well tested as the rest of
+*  our API.
+*
+*  Please use the normal streaming API instead: ZSTD_compressStream2,
+*  and ZSTD_decompressStream.
+*  If there is functionality that you need, but it doesn't provide,
+*  please open an issue on our GitHub.
 ********************************************************************* */
 
 /*
@@ -2362,7 +2785,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
 
   Start by initializing a context.
   Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression.
-  It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx()
 
   Then, consume your input using ZSTD_compressContinue().
   There are some important considerations to keep in mind when using this advanced function :
@@ -2384,18 +2806,28 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
 */
 
 /*=====   Buffer-less streaming compression functions  =====*/
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel);
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */
-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*<  note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */
 
+ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.")
+ZSTDLIB_STATIC_API
+size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*<  note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */
+
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 
 /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */
 ZSTD_DEPRECATED("use advanced API to access custom parameters")
+ZSTDLIB_STATIC_API
 size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */
 ZSTD_DEPRECATED("use advanced API to access custom parameters")
+ZSTDLIB_STATIC_API
 size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize);   /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */
 /*
   Buffer-less streaming decompression (synchronous mode)
@@ -2408,8 +2840,8 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
   Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough.
   Data fragment must be large enough to ensure successful decoding.
  `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough.
-  @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled.
-           >0 : `srcSize` is too small, please provide at least @result bytes on next attempt.
+  result  : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled.
+           >0 : `srcSize` is too small, please provide at least result bytes on next attempt.
            errorCode, which can be tested using ZSTD_isError().
 
   It fills a ZSTD_frameHeader structure with important information to correctly decode the frame,
@@ -2428,7 +2860,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
 
   The most memory efficient way is to use a round buffer of sufficient size.
   Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(),
-  which can @return an error code if required value is too large for current system (in 32-bits mode).
+  which can return an error code if required value is too large for current system (in 32-bits mode).
   In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one,
   up to the moment there is not enough room left in the buffer to guarantee decoding another full block,
   which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`.
@@ -2448,7 +2880,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
   ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
   ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail.
 
- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
+  result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
   It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item.
   It can also be an error code, which can be tested with ZSTD_isError().
 
@@ -2471,27 +2903,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
 */
 
 /*=====   Buffer-less streaming decompression functions  =====*/
-typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;
-typedef struct {
-    unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
-    unsigned long long windowSize;       /* can be very large, up to <= frameContentSize */
-    unsigned blockSizeMax;
-    ZSTD_frameType_e frameType;          /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
-    unsigned headerSize;
-    unsigned dictID;
-    unsigned checksumFlag;
-} ZSTD_frameHeader;
 
-/*! ZSTD_getFrameHeader() :
- *  decode Frame Header, or requires larger `srcSize`.
- * @return : 0, `zfhPtr` is correctly filled,
- *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
- *           or an error code, which can be tested using ZSTD_isError() */
-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize);   /*< doesn't consume input */
-/*! ZSTD_getFrameHeader_advanced() :
- *  same as ZSTD_getFrameHeader(),
- *  with added capability to select a format (like ZSTD_f_zstd1_magicless) */
-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format);
 ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize);  /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */
 
 ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx);
@@ -2502,6 +2914,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
 ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 
 /* misc */
+ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.")
 ZSTDLIB_STATIC_API void   ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx);
 typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e;
 ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
@@ -2509,11 +2922,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
 
 
 
-/* ============================ */
-/*       Block level API       */
-/* ============================ */
+/* ========================================= */
+/*       Block level API (DEPRECATED)       */
+/* ========================================= */
 
 /*!
+
+    This API is deprecated in favor of the regular compression API.
+    You can get the frame header down to 2 bytes by setting:
+      - ZSTD_c_format = ZSTD_f_zstd1_magicless
+      - ZSTD_c_contentSizeFlag = 0
+      - ZSTD_c_checksumFlag = 0
+      - ZSTD_c_dictIDFlag = 0
+
+    This API is not as well tested as our normal API, so we recommend not using it.
+    We will be removing it in a future version. If the normal API doesn't provide
+    the functionality you need, please open a GitHub issue.
+
     Block functions produce and decode raw zstd blocks, without frame metadata.
     Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes).
     But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes.
@@ -2524,7 +2949,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
     - It is necessary to init context before starting
       + compression : any ZSTD_compressBegin*() variant, including with dictionary
       + decompression : any ZSTD_decompressBegin*() variant, including with dictionary
-      + copyCCtx() and copyDCtx() can be used too
     - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB
       + If input is larger than a block size, it's necessary to split input data into multiple blocks
       + For inputs larger than a single block, consider using regular ZSTD_compress() instead.
@@ -2541,11 +2965,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
 */
 
 /*=====   Raw zstd block functions  =====*/
+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize   (const ZSTD_CCtx* cctx);
+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressBlock  (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_insertBlock    (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize);  /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */
 
-
 #endif   /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */
 
diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile
index 20f08c644b71..464c410b2768 100644
--- a/lib/zstd/Makefile
+++ b/lib/zstd/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 # ################################################################
-# Copyright (c) Facebook, Inc.
+# Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h
new file mode 100644
index 000000000000..05adbbeccaa9
--- /dev/null
+++ b/lib/zstd/common/allocations.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* This file provides custom allocation primitives
+ */
+
+#define ZSTD_DEPS_NEED_MALLOC
+#include "zstd_deps.h"   /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
+
+#include "mem.h" /* MEM_STATIC */
+#define ZSTD_STATIC_LINKING_ONLY
+#include <linux/zstd.h> /* ZSTD_customMem */
+
+#ifndef ZSTD_ALLOCATIONS_H
+#define ZSTD_ALLOCATIONS_H
+
+/* custom memory allocation functions */
+
+MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem)
+{
+    if (customMem.customAlloc)
+        return customMem.customAlloc(customMem.opaque, size);
+    return ZSTD_malloc(size);
+}
+
+MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem)
+{
+    if (customMem.customAlloc) {
+        /* calloc implemented as malloc+memset;
+         * not as efficient as calloc, but next best guess for custom malloc */
+        void* const ptr = customMem.customAlloc(customMem.opaque, size);
+        ZSTD_memset(ptr, 0, size);
+        return ptr;
+    }
+    return ZSTD_calloc(1, size);
+}
+
+MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem)
+{
+    if (ptr!=NULL) {
+        if (customMem.customFree)
+            customMem.customFree(customMem.opaque, ptr);
+        else
+            ZSTD_free(ptr);
+    }
+}
+
+#endif /* ZSTD_ALLOCATIONS_H */
diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h
new file mode 100644
index 000000000000..aa3487ec4b6a
--- /dev/null
+++ b/lib/zstd/common/bits.h
@@ -0,0 +1,149 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_BITS_H
+#define ZSTD_BITS_H
+
+#include "mem.h"
+
+MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val)
+{
+    assert(val != 0);
+    {
+        static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3,
+                                                30, 22, 20, 15, 25, 17, 4, 8,
+                                                31, 27, 13, 23, 21, 19, 16, 7,
+                                                26, 12, 18, 6, 11, 5, 10, 9};
+        return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27];
+    }
+}
+
+MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val)
+{
+    assert(val != 0);
+#   if (__GNUC__ >= 4)
+        return (unsigned)__builtin_ctz(val);
+#   else
+        return ZSTD_countTrailingZeros32_fallback(val);
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) {
+    assert(val != 0);
+    {
+        static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29,
+                                            11, 14, 16, 18, 22, 25, 3, 30,
+                                            8, 12, 20, 28, 15, 17, 24, 7,
+                                            19, 27, 23, 6, 26, 5, 4, 31};
+        val |= val >> 1;
+        val |= val >> 2;
+        val |= val >> 4;
+        val |= val >> 8;
+        val |= val >> 16;
+        return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27];
+    }
+}
+
+MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val)
+{
+    assert(val != 0);
+#   if (__GNUC__ >= 4)
+        return (unsigned)__builtin_clz(val);
+#   else
+        return ZSTD_countLeadingZeros32_fallback(val);
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val)
+{
+    assert(val != 0);
+#   if (__GNUC__ >= 4) && defined(__LP64__)
+        return (unsigned)__builtin_ctzll(val);
+#   else
+        {
+            U32 mostSignificantWord = (U32)(val >> 32);
+            U32 leastSignificantWord = (U32)val;
+            if (leastSignificantWord == 0) {
+                return 32 + ZSTD_countTrailingZeros32(mostSignificantWord);
+            } else {
+                return ZSTD_countTrailingZeros32(leastSignificantWord);
+            }
+        }
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val)
+{
+    assert(val != 0);
+#   if (__GNUC__ >= 4)
+        return (unsigned)(__builtin_clzll(val));
+#   else
+        {
+            U32 mostSignificantWord = (U32)(val >> 32);
+            U32 leastSignificantWord = (U32)val;
+            if (mostSignificantWord == 0) {
+                return 32 + ZSTD_countLeadingZeros32(leastSignificantWord);
+            } else {
+                return ZSTD_countLeadingZeros32(mostSignificantWord);
+            }
+        }
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val)
+{
+    if (MEM_isLittleEndian()) {
+        if (MEM_64bits()) {
+            return ZSTD_countTrailingZeros64((U64)val) >> 3;
+        } else {
+            return ZSTD_countTrailingZeros32((U32)val) >> 3;
+        }
+    } else {  /* Big Endian CPU */
+        if (MEM_64bits()) {
+            return ZSTD_countLeadingZeros64((U64)val) >> 3;
+        } else {
+            return ZSTD_countLeadingZeros32((U32)val) >> 3;
+        }
+    }
+}
+
+MEM_STATIC unsigned ZSTD_highbit32(U32 val)   /* compress, dictBuilder, decodeCorpus */
+{
+    assert(val != 0);
+    return 31 - ZSTD_countLeadingZeros32(val);
+}
+
+/* ZSTD_rotateRight_*():
+ * Rotates a bitfield to the right by "count" bits.
+ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
+ */
+MEM_STATIC
+U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
+    assert(count < 64);
+    count &= 0x3F; /* for fickle pattern recognition */
+    return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
+}
+
+MEM_STATIC
+U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
+    assert(count < 32);
+    count &= 0x1F; /* for fickle pattern recognition */
+    return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
+}
+
+MEM_STATIC
+U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
+    assert(count < 16);
+    count &= 0x0F; /* for fickle pattern recognition */
+    return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
+}
+
+#endif /* ZSTD_BITS_H */
diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h
index feef3a1b1d60..444dc4f85c64 100644
--- a/lib/zstd/common/bitstream.h
+++ b/lib/zstd/common/bitstream.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /* ******************************************************************
  * bitstream
  * Part of FSE library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * You can contact the author at :
  * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -27,6 +28,7 @@
 #include "compiler.h"       /* UNLIKELY() */
 #include "debug.h"          /* assert(), DEBUGLOG(), RAWLOG() */
 #include "error_private.h"  /* error codes and messages */
+#include "bits.h"           /* ZSTD_highbit32 */
 
 
 /*=========================================
@@ -122,33 +124,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC);
 MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
 /* faster, but works only if nbBits >= 1 */
 
-
-
-/*-**************************************************************
-*  Internal functions
-****************************************************************/
-MEM_STATIC unsigned BIT_highbit32 (U32 val)
-{
-    assert(val != 0);
-    {
-#   if (__GNUC__ >= 3)   /* Use GCC Intrinsic */
-        return __builtin_clz (val) ^ 31;
-#   else   /* Software version */
-        static const unsigned DeBruijnClz[32] = { 0,  9,  1, 10, 13, 21,  2, 29,
-                                                 11, 14, 16, 18, 22, 25,  3, 30,
-                                                  8, 12, 20, 28, 15, 17, 24,  7,
-                                                 19, 27, 23,  6, 26,  5,  4, 31 };
-        U32 v = val;
-        v |= v >> 1;
-        v |= v >> 2;
-        v |= v >> 4;
-        v |= v >> 8;
-        v |= v >> 16;
-        return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];
-#   endif
-    }
-}
-
 /*=====    Local Constants   =====*/
 static const unsigned BIT_mask[] = {
     0,          1,         3,         7,         0xF,       0x1F,
@@ -178,6 +153,12 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC,
     return 0;
 }
 
+MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
+{
+    assert(nbBits < BIT_MASK_SIZE);
+    return bitContainer & BIT_mask[nbBits];
+}
+
 /*! BIT_addBits() :
  *  can add up to 31 bits into `bitC`.
  *  Note : does not check for register overflow ! */
@@ -187,7 +168,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC,
     DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32);
     assert(nbBits < BIT_MASK_SIZE);
     assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
-    bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos;
+    bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos;
     bitC->bitPos += nbBits;
 }
 
@@ -266,7 +247,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si
         bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer);
         bitD->bitContainer = MEM_readLEST(bitD->ptr);
         { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
-          bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;  /* ensures bitsConsumed is always set */
+          bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;  /* ensures bitsConsumed is always set */
           if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ }
     } else {
         bitD->ptr   = bitD->start;
@@ -294,7 +275,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si
         default: break;
         }
         {   BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
-            bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
+            bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
             if (lastByte == 0) return ERROR(corruption_detected);  /* endMark not present */
         }
         bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8;
@@ -325,12 +306,6 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c
 #endif
 }
 
-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
-{
-    assert(nbBits < BIT_MASK_SIZE);
-    return bitContainer & BIT_mask[nbBits];
-}
-
 /*! BIT_lookBits() :
  *  Provides next n bits from local register.
  *  local register is not modified.
@@ -377,7 +352,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned n
 }
 
 /*! BIT_readBitsFast() :
- *  unsafe version; only works only if nbBits >= 1 */
+ *  unsafe version; only works if nbBits >= 1 */
 MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits)
 {
     size_t const value = BIT_lookBitsFast(bitD, nbBits);
@@ -408,7 +383,7 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD)
  *  This function is safe, it guarantees it will not read beyond src buffer.
  * @return : status of `BIT_DStream_t` internal register.
  *           when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */
-MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
+MEM_STATIC FORCE_INLINE_ATTR BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
 {
     if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* overflow detected, like end of stream */
         return BIT_DStream_overflow;
diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h
index c42d39faf9bd..c437e0975575 100644
--- a/lib/zstd/common/compiler.h
+++ b/lib/zstd/common/compiler.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -179,6 +180,17 @@
 *  Sanitizer
 *****************************************************************/
 
+/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an
+ * abundance of caution, disable our custom poisoning on mingw. */
+#ifdef __MINGW32__
+#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE
+#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1
+#endif
+#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE
+#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1
+#endif
+#endif
+
 
 
 #endif /* ZSTD_COMPILER_H */
diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h
index 0db7b42407ee..d8319a2bef4c 100644
--- a/lib/zstd/common/cpu.h
+++ b/lib/zstd/common/cpu.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c
index bb863c9ea616..e56ff6464e91 100644
--- a/lib/zstd/common/debug.c
+++ b/lib/zstd/common/debug.c
@@ -1,7 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /* ******************************************************************
  * debug
  * Part of FSE library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * You can contact the author at :
  * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h
index 6dd88d1fbd02..da0dbfc614b8 100644
--- a/lib/zstd/common/debug.h
+++ b/lib/zstd/common/debug.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /* ******************************************************************
  * debug
  * Part of FSE library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * You can contact the author at :
  * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c
index fef67056f052..6cdd82233fb5 100644
--- a/lib/zstd/common/entropy_common.c
+++ b/lib/zstd/common/entropy_common.c
@@ -1,6 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /* ******************************************************************
  * Common functions of New Generation Entropy library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  *  You can contact the author at :
  *  - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -19,8 +20,8 @@
 #include "error_private.h"       /* ERR_*, ERROR */
 #define FSE_STATIC_LINKING_ONLY  /* FSE_MIN_TABLELOG */
 #include "fse.h"
-#define HUF_STATIC_LINKING_ONLY  /* HUF_TABLELOG_ABSOLUTEMAX */
 #include "huf.h"
+#include "bits.h"                /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */
 
 
 /*===   Version   ===*/
@@ -38,23 +39,6 @@ const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); }
 /*-**************************************************************
 *  FSE NCount encoding-decoding
 ****************************************************************/
-static U32 FSE_ctz(U32 val)
-{
-    assert(val != 0);
-    {
-#   if (__GNUC__ >= 3)   /* GCC Intrinsic */
-        return __builtin_ctz(val);
-#   else   /* Software version */
-        U32 count = 0;
-        while ((val & 1) == 0) {
-            val >>= 1;
-            ++count;
-        }
-        return count;
-#   endif
-    }
-}
-
 FORCE_INLINE_TEMPLATE
 size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
                            const void* headerBuffer, size_t hbSize)
@@ -102,7 +86,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne
              * repeat.
              * Avoid UB by setting the high bit to 1.
              */
-            int repeats = FSE_ctz(~bitStream | 0x80000000) >> 1;
+            int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1;
             while (repeats >= 12) {
                 charnum += 3 * 12;
                 if (LIKELY(ip <= iend-7)) {
@@ -113,7 +97,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne
                     ip = iend - 4;
                 }
                 bitStream = MEM_readLE32(ip) >> bitCount;
-                repeats = FSE_ctz(~bitStream | 0x80000000) >> 1;
+                repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1;
             }
             charnum += 3 * repeats;
             bitStream >>= 2 * repeats;
@@ -178,7 +162,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne
                  * know that threshold > 1.
                  */
                 if (remaining <= 1) break;
-                nbBits = BIT_highbit32(remaining) + 1;
+                nbBits = ZSTD_highbit32(remaining) + 1;
                 threshold = 1 << (nbBits - 1);
             }
             if (charnum >= maxSV1) break;
@@ -253,7 +237,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
                      const void* src, size_t srcSize)
 {
     U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
-    return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0);
+    return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0);
 }
 
 FORCE_INLINE_TEMPLATE size_t
@@ -301,14 +285,14 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats,
     if (weightTotal == 0) return ERROR(corruption_detected);
 
     /* get last non-null symbol weight (implied, total must be 2^n) */
-    {   U32 const tableLog = BIT_highbit32(weightTotal) + 1;
+    {   U32 const tableLog = ZSTD_highbit32(weightTotal) + 1;
         if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
         *tableLogPtr = tableLog;
         /* determine last weight */
         {   U32 const total = 1 << tableLog;
             U32 const rest = total - weightTotal;
-            U32 const verif = 1 << BIT_highbit32(rest);
-            U32 const lastWeight = BIT_highbit32(rest) + 1;
+            U32 const verif = 1 << ZSTD_highbit32(rest);
+            U32 const lastWeight = ZSTD_highbit32(rest) + 1;
             if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
             huffWeight[oSize] = (BYTE)lastWeight;
             rankStats[lastWeight]++;
@@ -345,13 +329,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats,
                      U32* nbSymbolsPtr, U32* tableLogPtr,
                      const void* src, size_t srcSize,
                      void* workSpace, size_t wkspSize,
-                     int bmi2)
+                     int flags)
 {
 #if DYNAMIC_BMI2
-    if (bmi2) {
+    if (flags & HUF_flags_bmi2) {
         return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize);
     }
 #endif
-    (void)bmi2;
+    (void)flags;
     return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize);
 }
diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c
index 6d1135f8c373..a4062d30d170 100644
--- a/lib/zstd/common/error_private.c
+++ b/lib/zstd/common/error_private.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum code)
     case PREFIX(version_unsupported): return "Version not supported";
     case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter";
     case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding";
-    case PREFIX(corruption_detected): return "Corrupted block detected";
+    case PREFIX(corruption_detected): return "Data corruption detected";
     case PREFIX(checksum_wrong): return "Restored data doesn't match checksum";
+    case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification";
     case PREFIX(parameter_unsupported): return "Unsupported parameter";
+    case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters";
     case PREFIX(parameter_outOfBound): return "Parameter is out of bound";
     case PREFIX(init_missing): return "Context should be init first";
     case PREFIX(memory_allocation): return "Allocation error : not enough memory";
@@ -38,17 +41,22 @@ const char* ERR_getErrorString(ERR_enum code)
     case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported";
     case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large";
     case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small";
+    case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected";
     case PREFIX(dictionary_corrupted): return "Dictionary is corrupted";
     case PREFIX(dictionary_wrong): return "Dictionary mismatch";
     case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples";
     case PREFIX(dstSize_tooSmall): return "Destination buffer is too small";
     case PREFIX(srcSize_wrong): return "Src size is incorrect";
     case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer";
+    case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full";
+    case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty";
         /* following error codes are not stable and may be removed or changed in a future version */
     case PREFIX(frameIndex_tooLarge): return "Frame index is too large";
     case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking";
     case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong";
     case PREFIX(srcBuffer_wrong): return "Source buffer is wrong";
+    case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code";
+    case PREFIX(externalSequences_invalid): return "External sequences are not valid";
     case PREFIX(maxCode):
     default: return notErrorCode;
     }
diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h
index ca5101e542fa..9a4699a38a88 100644
--- a/lib/zstd/common/error_private.h
+++ b/lib/zstd/common/error_private.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h
index 4507043b2287..c4e25a219142 100644
--- a/lib/zstd/common/fse.h
+++ b/lib/zstd/common/fse.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /* ******************************************************************
  * FSE : Finite State Entropy codec
  * Public Prototypes declaration
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * You can contact the author at :
  * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -50,34 +51,6 @@
 FSE_PUBLIC_API unsigned FSE_versionNumber(void);   /*< library version number; to be used when checking dll version */
 
 
-/*-****************************************
-*  FSE simple functions
-******************************************/
-/*! FSE_compress() :
-    Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'.
-    'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize).
-    @return : size of compressed data (<= dstCapacity).
-    Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
-                     if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead.
-                     if FSE_isError(return), compression failed (more details using FSE_getErrorName())
-*/
-FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity,
-                             const void* src, size_t srcSize);
-
-/*! FSE_decompress():
-    Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
-    into already allocated destination buffer 'dst', of size 'dstCapacity'.
-    @return : size of regenerated data (<= maxDstSize),
-              or an error code, which can be tested using FSE_isError() .
-
-    ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!!
-    Why ? : making this distinction requires a header.
-    Header management is intentionally delegated to the user layer, which can better manage special cases.
-*/
-FSE_PUBLIC_API size_t FSE_decompress(void* dst,  size_t dstCapacity,
-                               const void* cSrc, size_t cSrcSize);
-
-
 /*-*****************************************
 *  Tool functions
 ******************************************/
@@ -88,20 +61,6 @@ FSE_PUBLIC_API unsigned    FSE_isError(size_t code);        /* tells if a return
 FSE_PUBLIC_API const char* FSE_getErrorName(size_t code);   /* provides error code string (useful for debugging) */
 
 
-/*-*****************************************
-*  FSE advanced functions
-******************************************/
-/*! FSE_compress2() :
-    Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog'
-    Both parameters can be defined as '0' to mean : use default value
-    @return : size of compressed data
-    Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!!
-                     if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression.
-                     if FSE_isError(return), it's an error code.
-*/
-FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
-
-
 /*-*****************************************
 *  FSE detailed API
 ******************************************/
@@ -161,8 +120,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize,
 /*! Constructor and Destructor of FSE_CTable.
     Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */
 typedef unsigned FSE_CTable;   /* don't allocate that. It's only meant to be more restrictive than void* */
-FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog);
-FSE_PUBLIC_API void        FSE_freeCTable (FSE_CTable* ct);
 
 /*! FSE_buildCTable():
     Builds `ct`, which must be already allocated, using FSE_createCTable().
@@ -238,23 +195,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter,
                            unsigned* maxSymbolValuePtr, unsigned* tableLogPtr,
                            const void* rBuffer, size_t rBuffSize, int bmi2);
 
-/*! Constructor and Destructor of FSE_DTable.
-    Note that its size depends on 'tableLog' */
 typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
-FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog);
-FSE_PUBLIC_API void        FSE_freeDTable(FSE_DTable* dt);
-
-/*! FSE_buildDTable():
-    Builds 'dt', which must be already allocated, using FSE_createDTable().
-    return : 0, or an errorCode, which can be tested using FSE_isError() */
-FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
-
-/*! FSE_decompress_usingDTable():
-    Decompress compressed source `cSrc` of size `cSrcSize` using `dt`
-    into `dst` which must be already allocated.
-    @return : size of regenerated data (necessarily <= `dstCapacity`),
-              or an errorCode, which can be tested using FSE_isError() */
-FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt);
 
 /*!
 Tutorial :
@@ -317,16 +258,6 @@ If there is an error, the function will return an error code, which can be teste
 unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
 /*< same as FSE_optimalTableLog(), which used `minus==2` */
 
-/* FSE_compress_wksp() :
- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
- * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable.
- */
-#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue)   ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) )
-size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
-
-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits);
-/*< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */
-
 size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue);
 /*< build a fake FSE_CTable, designed to compress always the same symbolValue */
 
@@ -344,19 +275,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi
 FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
 /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */
 
-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
-/*< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */
-
-size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
-/*< build a fake FSE_DTable, designed to always generate the same symbolValue */
-
-#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1)
+#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1)
 #define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned))
-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize);
-/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */
-
 size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2);
-/*< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. */
+/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`.
+ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */
 
 typedef enum {
    FSE_repeat_none,  /*< Cannot use the previous table */
@@ -552,7 +475,7 @@ MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePt
 
 /* FSE_getMaxNbBits() :
  * Approximate maximum cost of a symbol, in bits.
- * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2)
+ * Fractional get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2)
  * note 1 : assume symbolValue is valid (<= maxSymbolValue)
  * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */
 MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue)
diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c
index a0d06095be83..45cf457f31ef 100644
--- a/lib/zstd/common/fse_decompress.c
+++ b/lib/zstd/common/fse_decompress.c
@@ -1,6 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /* ******************************************************************
  * FSE : Finite State Entropy decoder
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  *  You can contact the author at :
  *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -24,6 +25,7 @@
 #include "error_private.h"
 #define ZSTD_DEPS_NEED_MALLOC
 #include "zstd_deps.h"
+#include "bits.h"       /* ZSTD_highbit32 */
 
 
 /* **************************************************************
@@ -55,19 +57,6 @@
 #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
 #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
 
-
-/* Function templates */
-FSE_DTable* FSE_createDTable (unsigned tableLog)
-{
-    if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
-    return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) );
-}
-
-void FSE_freeDTable (FSE_DTable* dt)
-{
-    ZSTD_free(dt);
-}
-
 static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
 {
     void* const tdPtr = dt+1;   /* because *dt is unsigned, 32-bits aligned on 32-bits */
@@ -127,10 +116,10 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo
             }
         }
         /* Now we spread those positions across the table.
-         * The benefit of doing it in two stages is that we avoid the the
+         * The benefit of doing it in two stages is that we avoid the
          * variable size inner loop, which caused lots of branch misses.
          * Now we can run through all the positions without any branch misses.
-         * We unroll the loop twice, since that is what emperically worked best.
+         * We unroll the loop twice, since that is what empirically worked best.
          */
         {
             size_t position = 0;
@@ -166,7 +155,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo
         for (u=0; u<tableSize; u++) {
             FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
             U32 const nextState = symbolNext[symbol]++;
-            tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+            tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
             tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
     }   }
 
@@ -184,49 +173,6 @@ size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsi
 /*-*******************************************************
 *  Decompression (Byte symbols)
 *********************************************************/
-size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
-{
-    void* ptr = dt;
-    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
-    void* dPtr = dt + 1;
-    FSE_decode_t* const cell = (FSE_decode_t*)dPtr;
-
-    DTableH->tableLog = 0;
-    DTableH->fastMode = 0;
-
-    cell->newState = 0;
-    cell->symbol = symbolValue;
-    cell->nbBits = 0;
-
-    return 0;
-}
-
-
-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
-{
-    void* ptr = dt;
-    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
-    void* dPtr = dt + 1;
-    FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr;
-    const unsigned tableSize = 1 << nbBits;
-    const unsigned tableMask = tableSize - 1;
-    const unsigned maxSV1 = tableMask+1;
-    unsigned s;
-
-    /* Sanity checks */
-    if (nbBits < 1) return ERROR(GENERIC);         /* min size */
-
-    /* Build Decoding Table */
-    DTableH->tableLog = (U16)nbBits;
-    DTableH->fastMode = 1;
-    for (s=0; s<maxSV1; s++) {
-        dinfo[s].newState = 0;
-        dinfo[s].symbol = (BYTE)s;
-        dinfo[s].nbBits = (BYTE)nbBits;
-    }
-
-    return 0;
-}
 
 FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
           void* dst, size_t maxDstSize,
@@ -290,26 +236,6 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
     return op-ostart;
 }
 
-
-size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
-                            const void* cSrc, size_t cSrcSize,
-                            const FSE_DTable* dt)
-{
-    const void* ptr = dt;
-    const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr;
-    const U32 fastMode = DTableH->fastMode;
-
-    /* select fast mode (static) */
-    if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
-    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
-}
-
-
-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize)
-{
-    return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0);
-}
-
 typedef struct {
     short ncount[FSE_MAX_SYMBOL_VALUE + 1];
     FSE_DTable dtable[1]; /* Dynamically sized */
@@ -342,7 +268,8 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body(
     }
 
     if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge);
-    workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog);
+    assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize);
+    workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);
     wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);
 
     CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) );
@@ -382,9 +309,4 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc,
     return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize);
 }
 
-
-typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
-
-
-
 #endif   /* FSE_COMMONDEFS_ONLY */
diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h
index 5042ff870308..8e7943092ed1 100644
--- a/lib/zstd/common/huf.h
+++ b/lib/zstd/common/huf.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /* ******************************************************************
  * huff0 huffman codec,
  * part of Finite State Entropy library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * You can contact the author at :
  * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -18,99 +19,22 @@
 
 /* *** Dependencies *** */
 #include "zstd_deps.h"    /* size_t */
-
-
-/* *** library symbols visibility *** */
-/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual,
- *        HUF symbols remain "private" (internal symbols for library only).
- *        Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */
-#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4)
-#  define HUF_PUBLIC_API __attribute__ ((visibility ("default")))
-#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1)   /* Visual expected */
-#  define HUF_PUBLIC_API __declspec(dllexport)
-#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1)
-#  define HUF_PUBLIC_API __declspec(dllimport)  /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */
-#else
-#  define HUF_PUBLIC_API
-#endif
-
-
-/* ========================== */
-/* ***  simple functions  *** */
-/* ========================== */
-
-/* HUF_compress() :
- *  Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'.
- * 'dst' buffer must be already allocated.
- *  Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize).
- * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB.
- * @return : size of compressed data (<= `dstCapacity`).
- *  Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
- *                   if HUF_isError(return), compression failed (more details using HUF_getErrorName())
- */
-HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity,
-                             const void* src, size_t srcSize);
-
-/* HUF_decompress() :
- *  Decompress HUF data from buffer 'cSrc', of size 'cSrcSize',
- *  into already allocated buffer 'dst', of minimum size 'dstSize'.
- * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data.
- *  Note : in contrast with FSE, HUF_decompress can regenerate
- *         RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data,
- *         because it knows size to regenerate (originalSize).
- * @return : size of regenerated data (== originalSize),
- *           or an error code, which can be tested using HUF_isError()
- */
-HUF_PUBLIC_API size_t HUF_decompress(void* dst,  size_t originalSize,
-                               const void* cSrc, size_t cSrcSize);
+#include "mem.h"          /* U32 */
+#define FSE_STATIC_LINKING_ONLY
+#include "fse.h"
 
 
 /* ***   Tool functions *** */
-#define HUF_BLOCKSIZE_MAX (128 * 1024)                  /*< maximum input size for a single block compressed with HUF_compress */
-HUF_PUBLIC_API size_t HUF_compressBound(size_t size);   /*< maximum compressed size (worst case) */
+#define HUF_BLOCKSIZE_MAX (128 * 1024)   /*< maximum input size for a single block compressed with HUF_compress */
+size_t HUF_compressBound(size_t size);   /*< maximum compressed size (worst case) */
 
 /* Error Management */
-HUF_PUBLIC_API unsigned    HUF_isError(size_t code);       /*< tells if a return value is an error code */
-HUF_PUBLIC_API const char* HUF_getErrorName(size_t code);  /*< provides error code string (useful for debugging) */
+unsigned    HUF_isError(size_t code);       /*< tells if a return value is an error code */
+const char* HUF_getErrorName(size_t code);  /*< provides error code string (useful for debugging) */
 
 
-/* ***   Advanced function   *** */
-
-/* HUF_compress2() :
- *  Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`.
- * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX .
- * `tableLog` must be `<= HUF_TABLELOG_MAX` . */
-HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity,
-                               const void* src, size_t srcSize,
-                               unsigned maxSymbolValue, unsigned tableLog);
-
-/* HUF_compress4X_wksp() :
- *  Same as HUF_compress2(), but uses externally allocated `workSpace`.
- * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */
 #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */)
 #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64))
-HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity,
-                                     const void* src, size_t srcSize,
-                                     unsigned maxSymbolValue, unsigned tableLog,
-                                     void* workSpace, size_t wkspSize);
-
-#endif   /* HUF_H_298734234 */
-
-/* ******************************************************************
- *  WARNING !!
- *  The following section contains advanced and experimental definitions
- *  which shall never be used in the context of a dynamic library,
- *  because they are not guaranteed to remain stable in the future.
- *  Only consider them in association with static linking.
- * *****************************************************************/
-#if !defined(HUF_H_HUF_STATIC_LINKING_ONLY)
-#define HUF_H_HUF_STATIC_LINKING_ONLY
-
-/* *** Dependencies *** */
-#include "mem.h"   /* U32 */
-#define FSE_STATIC_LINKING_ONLY
-#include "fse.h"
-
 
 /* *** Constants *** */
 #define HUF_TABLELOG_MAX      12      /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */
@@ -151,25 +75,49 @@ typedef U32 HUF_DTable;
 /* ****************************************
 *  Advanced decompression functions
 ******************************************/
-size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< single-symbol decoder */
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< double-symbols decoder */
-#endif
 
-size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< decodes RLE and uncompressed */
-size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< considers RLE and uncompressed as errors */
-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< considers RLE and uncompressed as errors */
-size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< single-symbol decoder */
-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /*< single-symbol decoder */
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< double-symbols decoder */
-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /*< double-symbols decoder */
-#endif
+/*
+ * Huffman flags bitset.
+ * For all flags, 0 is the default value.
+ */
+typedef enum {
+    /*
+     * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime.
+     * Otherwise: Ignored.
+     */
+    HUF_flags_bmi2 = (1 << 0),
+    /*
+     * If set: Test possible table depths to find the one that produces the smallest header + encoded size.
+     * If unset: Use heuristic to find the table depth.
+     */
+    HUF_flags_optimalDepth = (1 << 1),
+    /*
+     * If set: If the previous table can encode the input, always reuse the previous table.
+     * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output.
+     */
+    HUF_flags_preferRepeat = (1 << 2),
+    /*
+     * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress.
+     * If unset: Always histogram the entire input.
+     */
+    HUF_flags_suspectUncompressible = (1 << 3),
+    /*
+     * If set: Don't use assembly implementations
+     * If unset: Allow using assembly implementations
+     */
+    HUF_flags_disableAsm = (1 << 4),
+    /*
+     * If set: Don't use the fast decoding loop, always use the fallback decoding loop.
+     * If unset: Use the fast decoding loop when possible.
+     */
+    HUF_flags_disableFast = (1 << 5)
+} HUF_flags_e;
 
 
 /* ****************************************
  *  HUF detailed API
  * ****************************************/
+#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra
 
 /*! HUF_compress() does the following:
  *  1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h")
@@ -182,12 +130,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
  *  For example, it's possible to compress several blocks using the same 'CTable',
  *  or to save and regenerate 'CTable' using external methods.
  */
-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits);   /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */
-size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog);
+unsigned HUF_minTableLog(unsigned symbolCardinality);
+unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue);
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace,
+ size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */
 size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize);
-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2);
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags);
 size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);
 int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);
 
@@ -196,6 +144,7 @@ typedef enum {
    HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */
    HUF_repeat_valid  /*< Can use the previous table and it is assumed to be valid */
  } HUF_repeat;
+
 /* HUF_compress4X_repeat() :
  *  Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
  *  If it uses hufTable it does not modify hufTable or repeat.
@@ -206,13 +155,13 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize,
                        const void* src, size_t srcSize,
                        unsigned maxSymbolValue, unsigned tableLog,
                        void* workSpace, size_t wkspSize,    /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
-                       HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible);
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int flags);
 
 /* HUF_buildCTable_wksp() :
  *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
  * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE.
  */
-#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1)
+#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192)
 #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned))
 size_t HUF_buildCTable_wksp (HUF_CElt* tree,
                        const unsigned* count, U32 maxSymbolValue, U32 maxNbBits,
@@ -238,7 +187,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize,
                           U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
                           const void* src, size_t srcSize,
                           void* workspace, size_t wkspSize,
-                          int bmi2);
+                          int flags);
 
 /* HUF_readCTable() :
  *  Loading a CTable saved with HUF_writeCTable() */
@@ -276,32 +225,12 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize);
 #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9))
 #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))
 
-#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize);
-size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
-#endif
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize);
-size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
-#endif
-
-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
-#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
-#endif
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
-#endif
-
 
 /* ====================== */
 /* single stream variants */
 /* ====================== */
 
-size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);  /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */
-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2);
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags);
 /* HUF_compress1X_repeat() :
  *  Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
  *  If it uses hufTable it does not modify hufTable or repeat.
@@ -312,47 +241,28 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
                        const void* src, size_t srcSize,
                        unsigned maxSymbolValue, unsigned tableLog,
                        void* workSpace, size_t wkspSize,   /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
-                       HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible);
-
-size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
-#endif
-
-size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
-size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);
-#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< single-symbol decoder */
-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /*< single-symbol decoder */
-#endif
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< double-symbols decoder */
-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /*< double-symbols decoder */
-#endif
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int flags);
 
-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);   /*< automatic selection of sing or double symbol decoder, based on DTable */
-#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
-#endif
+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
 #ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);   /*< double-symbols decoder */
 #endif
 
 /* BMI2 variants.
  * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0.
  */
-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags);
 #ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
 #endif
-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags);
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
 #ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2);
+size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags);
 #endif
 #ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2);
+size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags);
 #endif
 
-#endif /* HUF_STATIC_LINKING_ONLY */
+#endif   /* HUF_H_298734234 */
 
diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h
index 1d9cc03924ca..a7231822b6e3 100644
--- a/lib/zstd/common/mem.h
+++ b/lib/zstd/common/mem.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h
index 0e3b2c0a527d..7ede8cf1ffe5 100644
--- a/lib/zstd/common/portability_macros.h
+++ b/lib/zstd/common/portability_macros.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -12,7 +13,7 @@
 #define ZSTD_PORTABILITY_MACROS_H
 
 /*
- * This header file contains macro defintions to support portability.
+ * This header file contains macro definitions to support portability.
  * This header is shared between C and ASM code, so it MUST only
  * contain macro definitions. It MUST not contain any C code.
  *
@@ -65,7 +66,7 @@
 #endif
 
 /*
- * Only enable assembly for GNUC comptabile compilers,
+ * Only enable assembly for GNUC compatible compilers,
  * because other platforms may not support GAS assembly syntax.
  *
  * Only enable assembly for Linux / MacOS, other platforms may
@@ -90,4 +91,23 @@
  */
 #define ZSTD_ENABLE_ASM_X86_64_BMI2 0
 
+/*
+ * For x86 ELF targets, add .note.gnu.property section for Intel CET in
+ * assembly sources when CET is enabled.
+ *
+ * Additionally, any function that may be called indirectly must begin
+ * with ZSTD_CET_ENDBRANCH.
+ */
+#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \
+    && defined(__has_include)
+# if __has_include(<cet.h>)
+#  include <cet.h>
+#  define ZSTD_CET_ENDBRANCH _CET_ENDBR
+# endif
+#endif
+
+#ifndef ZSTD_CET_ENDBRANCH
+# define ZSTD_CET_ENDBRANCH
+#endif
+
 #endif /* ZSTD_PORTABILITY_MACROS_H */
diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c
index 3d7e35b309b5..44b95b25344a 100644
--- a/lib/zstd/common/zstd_common.c
+++ b/lib/zstd/common/zstd_common.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,7 +15,6 @@
 *  Dependencies
 ***************************************/
 #define ZSTD_DEPS_NEED_MALLOC
-#include "zstd_deps.h"   /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
 #include "error_private.h"
 #include "zstd_internal.h"
 
@@ -47,37 +47,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); }
 /*! ZSTD_getErrorString() :
  *  provides error code string from enum */
 const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); }
-
-
-
-/*=**************************************************************
-*  Custom allocator
-****************************************************************/
-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem)
-{
-    if (customMem.customAlloc)
-        return customMem.customAlloc(customMem.opaque, size);
-    return ZSTD_malloc(size);
-}
-
-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem)
-{
-    if (customMem.customAlloc) {
-        /* calloc implemented as malloc+memset;
-         * not as efficient as calloc, but next best guess for custom malloc */
-        void* const ptr = customMem.customAlloc(customMem.opaque, size);
-        ZSTD_memset(ptr, 0, size);
-        return ptr;
-    }
-    return ZSTD_calloc(1, size);
-}
-
-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem)
-{
-    if (ptr!=NULL) {
-        if (customMem.customFree)
-            customMem.customFree(customMem.opaque, ptr);
-        else
-            ZSTD_free(ptr);
-    }
-}
diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h
index f06df065dec0..670c5fa2a952 100644
--- a/lib/zstd/common/zstd_deps.h
+++ b/lib/zstd/common/zstd_deps.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h
index 93305d9b41bb..7f023e4d4774 100644
--- a/lib/zstd/common/zstd_internal.h
+++ b/lib/zstd/common/zstd_internal.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -28,7 +29,6 @@
 #include <linux/zstd.h>
 #define FSE_STATIC_LINKING_ONLY
 #include "fse.h"
-#define HUF_STATIC_LINKING_ONLY
 #include "huf.h"
 #include <linux/xxhash.h>                /* XXH_reset, update, digest */
 #define ZSTD_TRACE 0
@@ -83,9 +83,9 @@ typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e;
 #define ZSTD_FRAMECHECKSUMSIZE 4
 
 #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */)   /* for a non-null block */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */)   /* for a non-null block */
+#define MIN_LITERALS_FOR_4_STREAMS 6
 
-#define HufLog 12
 typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e;
 
 #define LONGNBSEQ 0x7F00
@@ -93,6 +93,7 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy
 #define MINMATCH 3
 
 #define Litbits  8
+#define LitHufLog 11
 #define MaxLit ((1<<Litbits) - 1)
 #define MaxML   52
 #define MaxLL   35
@@ -103,6 +104,8 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy
 #define LLFSELog    9
 #define OffFSELog   8
 #define MaxFSELog  MAX(MAX(MLFSELog, LLFSELog), OffFSELog)
+#define MaxMLBits 16
+#define MaxLLBits 16
 
 #define ZSTD_MAX_HUF_HEADER_SIZE 128 /* header + <= 127 byte tree description */
 /* Each table cannot take more than #symbols * FSELog bits */
@@ -225,12 +228,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
          * one COPY16() in the first call. Then, do two calls per loop since
          * at that point it is more likely to have a high trip count.
          */
-#ifdef __aarch64__
-        do {
-            COPY16(op, ip);
-        }
-        while (op < oend);
-#else
         ZSTD_copy16(op, ip);
         if (16 >= length) return;
         op += 16;
@@ -240,7 +237,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
             COPY16(op, ip);
         }
         while (op < oend);
-#endif
     }
 }
 
@@ -289,11 +285,11 @@ typedef enum {
 typedef struct {
     seqDef* sequencesStart;
     seqDef* sequences;      /* ptr to end of sequences */
-    BYTE* litStart;
-    BYTE* lit;              /* ptr to end of literals */
-    BYTE* llCode;
-    BYTE* mlCode;
-    BYTE* ofCode;
+    BYTE*  litStart;
+    BYTE*  lit;             /* ptr to end of literals */
+    BYTE*  llCode;
+    BYTE*  mlCode;
+    BYTE*  ofCode;
     size_t maxNbSeq;
     size_t maxNbLit;
 
@@ -301,8 +297,8 @@ typedef struct {
      * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment
      * the existing value of the litLength or matchLength by 0x10000.
      */
-    ZSTD_longLengthType_e   longLengthType;
-    U32                     longLengthPos;  /* Index of the sequence to apply long length modification to */
+    ZSTD_longLengthType_e longLengthType;
+    U32                   longLengthPos;  /* Index of the sequence to apply long length modification to */
 } seqStore_t;
 
 typedef struct {
@@ -321,10 +317,10 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore
     seqLen.matchLength = seq->mlBase + MINMATCH;
     if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) {
         if (seqStore->longLengthType == ZSTD_llt_literalLength) {
-            seqLen.litLength += 0xFFFF;
+            seqLen.litLength += 0x10000;
         }
         if (seqStore->longLengthType == ZSTD_llt_matchLength) {
-            seqLen.matchLength += 0xFFFF;
+            seqLen.matchLength += 0x10000;
         }
     }
     return seqLen;
@@ -337,72 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore
  *          `decompressedBound != ZSTD_CONTENTSIZE_ERROR`
  */
 typedef struct {
+    size_t nbBlocks;
     size_t compressedSize;
     unsigned long long decompressedBound;
 } ZSTD_frameSizeInfo;   /* decompress & legacy */
 
 const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx);   /* compress & dictBuilder */
-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr);   /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */
-
-/* custom memory allocation functions */
-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem);
-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem);
-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem);
-
-
-MEM_STATIC U32 ZSTD_highbit32(U32 val)   /* compress, dictBuilder, decodeCorpus */
-{
-    assert(val != 0);
-    {
-#   if (__GNUC__ >= 3)   /* GCC Intrinsic */
-        return __builtin_clz (val) ^ 31;
-#   else   /* Software version */
-        static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
-        U32 v = val;
-        v |= v >> 1;
-        v |= v >> 2;
-        v |= v >> 4;
-        v |= v >> 8;
-        v |= v >> 16;
-        return DeBruijnClz[(v * 0x07C4ACDDU) >> 27];
-#   endif
-    }
-}
-
-/*
- * Counts the number of trailing zeros of a `size_t`.
- * Most compilers should support CTZ as a builtin. A backup
- * implementation is provided if the builtin isn't supported, but
- * it may not be terribly efficient.
- */
-MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val)
-{
-    if (MEM_64bits()) {
-#       if (__GNUC__ >= 4)
-            return __builtin_ctzll((U64)val);
-#       else
-            static const int DeBruijnBytePos[64] = {  0,  1,  2,  7,  3, 13,  8, 19,
-                                                      4, 25, 14, 28,  9, 34, 20, 56,
-                                                      5, 17, 26, 54, 15, 41, 29, 43,
-                                                      10, 31, 38, 35, 21, 45, 49, 57,
-                                                      63,  6, 12, 18, 24, 27, 33, 55,
-                                                      16, 53, 40, 42, 30, 37, 44, 48,
-                                                      62, 11, 23, 32, 52, 39, 36, 47,
-                                                      61, 22, 51, 46, 60, 50, 59, 58 };
-            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
-#       endif
-    } else { /* 32 bits */
-#       if (__GNUC__ >= 3)
-            return __builtin_ctz((U32)val);
-#       else
-            static const int DeBruijnBytePos[32] = {  0,  1, 28,  2, 29, 14, 24,  3,
-                                                     30, 22, 20, 15, 25, 17,  4,  8,
-                                                     31, 27, 13, 23, 21, 19, 16,  7,
-                                                     26, 12, 18,  6, 11,  5, 10,  9 };
-            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
-#       endif
-    }
-}
+int ZSTD_seqToCodes(const seqStore_t* seqStorePtr);   /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */
 
 
 /* ZSTD_invalidateRepCodes() :
diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h
index d9a76112ec3a..6ab8be6532ef 100644
--- a/lib/zstd/compress/clevels.h
+++ b/lib/zstd/compress/clevels.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c
index ec5b1ca6d71a..e46ca6621b48 100644
--- a/lib/zstd/compress/fse_compress.c
+++ b/lib/zstd/compress/fse_compress.c
@@ -1,6 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /* ******************************************************************
  * FSE : Finite State Entropy encoder
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  *  You can contact the author at :
  *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -26,6 +27,7 @@
 #define ZSTD_DEPS_NEED_MALLOC
 #define ZSTD_DEPS_NEED_MATH64
 #include "../common/zstd_deps.h"  /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */
+#include "../common/bits.h" /* ZSTD_highbit32 */
 
 
 /* **************************************************************
@@ -90,7 +92,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
     assert(tableLog < 16);   /* required for threshold strategy to work */
 
     /* For explanations on how to distribute symbol values over the table :
-     * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+     * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
 
      #ifdef __clang_analyzer__
      ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize);   /* useless initialization, just to keep scan-build happy */
@@ -191,7 +193,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
                 break;
             default :
                 assert(normalizedCounter[s] > 1);
-                {   U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1);
+                {   U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1);
                     U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut;
                     symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
                     symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]);
@@ -342,21 +344,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize,
 *  FSE Compression Code
 ****************************************************************/
 
-FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog)
-{
-    size_t size;
-    if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
-    size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
-    return (FSE_CTable*)ZSTD_malloc(size);
-}
-
-void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); }
-
 /* provides the minimum logSize to safely represent a distribution */
 static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
 {
-    U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1;
-    U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2;
+    U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1;
+    U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2;
     U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
     assert(srcSize > 1); /* Not supported, RLE should be used instead */
     return minBits;
@@ -364,7 +356,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
 
 unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus)
 {
-    U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus;
+    U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus;
     U32 tableLog = maxTableLog;
     U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue);
     assert(srcSize > 1); /* Not supported, RLE should be used instead */
@@ -532,40 +524,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
     return tableLog;
 }
 
-
-/* fake FSE_CTable, for raw (uncompressed) input */
-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits)
-{
-    const unsigned tableSize = 1 << nbBits;
-    const unsigned tableMask = tableSize - 1;
-    const unsigned maxSymbolValue = tableMask;
-    void* const ptr = ct;
-    U16* const tableU16 = ( (U16*) ptr) + 2;
-    void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1);   /* assumption : tableLog >= 1 */
-    FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
-    unsigned s;
-
-    /* Sanity checks */
-    if (nbBits < 1) return ERROR(GENERIC);             /* min size */
-
-    /* header */
-    tableU16[-2] = (U16) nbBits;
-    tableU16[-1] = (U16) maxSymbolValue;
-
-    /* Build table */
-    for (s=0; s<tableSize; s++)
-        tableU16[s] = (U16)(tableSize + s);
-
-    /* Build Symbol Transformation Table */
-    {   const U32 deltaNbBits = (nbBits << 16) - (1 << nbBits);
-        for (s=0; s<=maxSymbolValue; s++) {
-            symbolTT[s].deltaNbBits = deltaNbBits;
-            symbolTT[s].deltaFindState = s-1;
-    }   }
-
-    return 0;
-}
-
 /* fake FSE_CTable, for rle input (always same symbol) */
 size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue)
 {
@@ -664,5 +622,4 @@ size_t FSE_compress_usingCTable (void* dst, size_t dstSize,
 
 size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }
 
-
 #endif   /* FSE_COMMONDEFS_ONLY */
diff --git a/lib/zstd/compress/hist.c b/lib/zstd/compress/hist.c
index 3ddc6dfb6894..0b12587cc14b 100644
--- a/lib/zstd/compress/hist.c
+++ b/lib/zstd/compress/hist.c
@@ -1,7 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /* ******************************************************************
  * hist : Histogram functions
  * part of Finite State Entropy project
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  *  You can contact the author at :
  *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
diff --git a/lib/zstd/compress/hist.h b/lib/zstd/compress/hist.h
index fc1830abc9c6..f7687b0fc20a 100644
--- a/lib/zstd/compress/hist.h
+++ b/lib/zstd/compress/hist.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /* ******************************************************************
  * hist : Histogram functions
  * part of Finite State Entropy project
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  *  You can contact the author at :
  *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
diff --git a/lib/zstd/compress/huf_compress.c b/lib/zstd/compress/huf_compress.c
index 74ef0db47621..83241abafe35 100644
--- a/lib/zstd/compress/huf_compress.c
+++ b/lib/zstd/compress/huf_compress.c
@@ -1,6 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /* ******************************************************************
  * Huffman encoder, part of New Generation Entropy library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  *  You can contact the author at :
  *  - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -26,9 +27,9 @@
 #include "hist.h"
 #define FSE_STATIC_LINKING_ONLY   /* FSE_optimalTableLog_internal */
 #include "../common/fse.h"        /* header compression */
-#define HUF_STATIC_LINKING_ONLY
 #include "../common/huf.h"
 #include "../common/error_private.h"
+#include "../common/bits.h"       /* ZSTD_highbit32 */
 
 
 /* **************************************************************
@@ -39,13 +40,67 @@
 
 
 /* **************************************************************
-*  Utils
+*  Required declarations
 ****************************************************************/
-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+typedef struct nodeElt_s {
+    U32 count;
+    U16 parent;
+    BYTE byte;
+    BYTE nbBits;
+} nodeElt;
+
+
+/* **************************************************************
+*  Debug Traces
+****************************************************************/
+
+#if DEBUGLEVEL >= 2
+
+static size_t showU32(const U32* arr, size_t size)
+{
+    size_t u;
+    for (u=0; u<size; u++) {
+        RAWLOG(6, " %u", arr[u]); (void)arr;
+    }
+    RAWLOG(6, " \n");
+    return size;
+}
+
+static size_t HUF_getNbBits(HUF_CElt elt);
+
+static size_t showCTableBits(const HUF_CElt* ctable, size_t size)
 {
-    return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
+    size_t u;
+    for (u=0; u<size; u++) {
+        RAWLOG(6, " %zu", HUF_getNbBits(ctable[u])); (void)ctable;
+    }
+    RAWLOG(6, " \n");
+    return size;
+
 }
 
+static size_t showHNodeSymbols(const nodeElt* hnode, size_t size)
+{
+    size_t u;
+    for (u=0; u<size; u++) {
+        RAWLOG(6, " %u", hnode[u].byte); (void)hnode;
+    }
+    RAWLOG(6, " \n");
+    return size;
+}
+
+static size_t showHNodeBits(const nodeElt* hnode, size_t size)
+{
+    size_t u;
+    for (u=0; u<size; u++) {
+        RAWLOG(6, " %u", hnode[u].nbBits); (void)hnode;
+    }
+    RAWLOG(6, " \n");
+    return size;
+}
+
+#endif
+
 
 /* *******************************************************
 *  HUF : Huffman block compression
@@ -86,7 +141,10 @@ typedef struct {
     S16 norm[HUF_TABLELOG_MAX+1];
 } HUF_CompressWeightsWksp;
 
-static size_t HUF_compressWeights(void* dst, size_t dstSize, const void* weightTable, size_t wtSize, void* workspace, size_t workspaceSize)
+static size_t
+HUF_compressWeights(void* dst, size_t dstSize,
+              const void* weightTable, size_t wtSize,
+                    void* workspace, size_t workspaceSize)
 {
     BYTE* const ostart = (BYTE*) dst;
     BYTE* op = ostart;
@@ -137,7 +195,7 @@ static size_t HUF_getNbBitsFast(HUF_CElt elt)
 
 static size_t HUF_getValue(HUF_CElt elt)
 {
-    return elt & ~0xFF;
+    return elt & ~(size_t)0xFF;
 }
 
 static size_t HUF_getValueFast(HUF_CElt elt)
@@ -175,6 +233,8 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
     U32 n;
     HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));
 
+    HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE >= sizeof(HUF_WriteCTableWksp));
+
     /* check conditions */
     if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC);
     if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
@@ -204,16 +264,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
     return ((maxSymbolValue+1)/2) + 1;
 }
 
-/*! HUF_writeCTable() :
-    `CTable` : Huffman tree to save, using huf representation.
-    @return : size of saved CTable */
-size_t HUF_writeCTable (void* dst, size_t maxDstSize,
-                        const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog)
-{
-    HUF_WriteCTableWksp wksp;
-    return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp));
-}
-
 
 size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights)
 {
@@ -269,68 +319,64 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
 
 U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue)
 {
-    const HUF_CElt* ct = CTable + 1;
+    const HUF_CElt* const ct = CTable + 1;
     assert(symbolValue <= HUF_SYMBOLVALUE_MAX);
     return (U32)HUF_getNbBits(ct[symbolValue]);
 }
 
 
-typedef struct nodeElt_s {
-    U32 count;
-    U16 parent;
-    BYTE byte;
-    BYTE nbBits;
-} nodeElt;
-
 /*
  * HUF_setMaxHeight():
- * Enforces maxNbBits on the Huffman tree described in huffNode.
+ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode.
  *
- * It sets all nodes with nbBits > maxNbBits to be maxNbBits. Then it adjusts
- * the tree to so that it is a valid canonical Huffman tree.
+ * It attempts to convert all nodes with nbBits > @targetNbBits
+ * to employ @targetNbBits instead. Then it adjusts the tree
+ * so that it remains a valid canonical Huffman tree.
  *
  * @pre               The sum of the ranks of each symbol == 2^largestBits,
  *                    where largestBits == huffNode[lastNonNull].nbBits.
  * @post              The sum of the ranks of each symbol == 2^largestBits,
- *                    where largestBits is the return value <= maxNbBits.
+ *                    where largestBits is the return value (expected <= targetNbBits).
  *
- * @param huffNode    The Huffman tree modified in place to enforce maxNbBits.
+ * @param huffNode    The Huffman tree modified in place to enforce targetNbBits.
+ *                    It's presumed sorted, from most frequent to rarest symbol.
  * @param lastNonNull The symbol with the lowest count in the Huffman tree.
- * @param maxNbBits   The maximum allowed number of bits, which the Huffman tree
+ * @param targetNbBits  The allowed number of bits, which the Huffman tree
  *                    may not respect. After this function the Huffman tree will
- *                    respect maxNbBits.
- * @return            The maximum number of bits of the Huffman tree after adjustment,
- *                    necessarily no more than maxNbBits.
+ *                    respect targetNbBits.
+ * @return            The maximum number of bits of the Huffman tree after adjustment.
  */
-static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
+static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits)
 {
     const U32 largestBits = huffNode[lastNonNull].nbBits;
-    /* early exit : no elt > maxNbBits, so the tree is already valid. */
-    if (largestBits <= maxNbBits) return largestBits;
+    /* early exit : no elt > targetNbBits, so the tree is already valid. */
+    if (largestBits <= targetNbBits) return largestBits;
+
+    DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits);
 
     /* there are several too large elements (at least >= 2) */
     {   int totalCost = 0;
-        const U32 baseCost = 1 << (largestBits - maxNbBits);
+        const U32 baseCost = 1 << (largestBits - targetNbBits);
         int n = (int)lastNonNull;
 
-        /* Adjust any ranks > maxNbBits to maxNbBits.
+        /* Adjust any ranks > targetNbBits to targetNbBits.
          * Compute totalCost, which is how far the sum of the ranks is
          * we are over 2^largestBits after adjust the offending ranks.
          */
-        while (huffNode[n].nbBits > maxNbBits) {
+        while (huffNode[n].nbBits > targetNbBits) {
             totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits));
-            huffNode[n].nbBits = (BYTE)maxNbBits;
+            huffNode[n].nbBits = (BYTE)targetNbBits;
             n--;
         }
-        /* n stops at huffNode[n].nbBits <= maxNbBits */
-        assert(huffNode[n].nbBits <= maxNbBits);
-        /* n end at index of smallest symbol using < maxNbBits */
-        while (huffNode[n].nbBits == maxNbBits) --n;
+        /* n stops at huffNode[n].nbBits <= targetNbBits */
+        assert(huffNode[n].nbBits <= targetNbBits);
+        /* n end at index of smallest symbol using < targetNbBits */
+        while (huffNode[n].nbBits == targetNbBits) --n;
 
-        /* renorm totalCost from 2^largestBits to 2^maxNbBits
+        /* renorm totalCost from 2^largestBits to 2^targetNbBits
          * note : totalCost is necessarily a multiple of baseCost */
-        assert((totalCost & (baseCost - 1)) == 0);
-        totalCost >>= (largestBits - maxNbBits);
+        assert(((U32)totalCost & (baseCost - 1)) == 0);
+        totalCost >>= (largestBits - targetNbBits);
         assert(totalCost > 0);
 
         /* repay normalized cost */
@@ -339,19 +385,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
 
             /* Get pos of last (smallest = lowest cum. count) symbol per rank */
             ZSTD_memset(rankLast, 0xF0, sizeof(rankLast));
-            {   U32 currentNbBits = maxNbBits;
+            {   U32 currentNbBits = targetNbBits;
                 int pos;
                 for (pos=n ; pos >= 0; pos--) {
                     if (huffNode[pos].nbBits >= currentNbBits) continue;
-                    currentNbBits = huffNode[pos].nbBits;   /* < maxNbBits */
-                    rankLast[maxNbBits-currentNbBits] = (U32)pos;
+                    currentNbBits = huffNode[pos].nbBits;   /* < targetNbBits */
+                    rankLast[targetNbBits-currentNbBits] = (U32)pos;
             }   }
 
             while (totalCost > 0) {
                 /* Try to reduce the next power of 2 above totalCost because we
                  * gain back half the rank.
                  */
-                U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1;
+                U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1;
                 for ( ; nBitsToDecrease > 1; nBitsToDecrease--) {
                     U32 const highPos = rankLast[nBitsToDecrease];
                     U32 const lowPos = rankLast[nBitsToDecrease-1];
@@ -391,7 +437,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
                     rankLast[nBitsToDecrease] = noSymbol;
                 else {
                     rankLast[nBitsToDecrease]--;
-                    if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease)
+                    if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease)
                         rankLast[nBitsToDecrease] = noSymbol;   /* this rank is now empty */
                 }
             }   /* while (totalCost > 0) */
@@ -403,11 +449,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
              * TODO.
              */
             while (totalCost < 0) {  /* Sometimes, cost correction overshoot */
-                /* special case : no rank 1 symbol (using maxNbBits-1);
-                 * let's create one from largest rank 0 (using maxNbBits).
+                /* special case : no rank 1 symbol (using targetNbBits-1);
+                 * let's create one from largest rank 0 (using targetNbBits).
                  */
                 if (rankLast[1] == noSymbol) {
-                    while (huffNode[n].nbBits == maxNbBits) n--;
+                    while (huffNode[n].nbBits == targetNbBits) n--;
                     huffNode[n+1].nbBits--;
                     assert(n >= 0);
                     rankLast[1] = (U32)(n+1);
@@ -421,7 +467,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
         }   /* repay normalized cost */
     }   /* there are several too large elements (at least >= 2) */
 
-    return maxNbBits;
+    return targetNbBits;
 }
 
 typedef struct {
@@ -429,7 +475,7 @@ typedef struct {
     U16 curr;
 } rankPos;
 
-typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32];
+typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)];
 
 /* Number of buckets available for HUF_sort() */
 #define RANK_POSITION_TABLE_SIZE 192
@@ -448,8 +494,8 @@ typedef struct {
  * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing.
  */
 #define RANK_POSITION_MAX_COUNT_LOG 32
-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */
-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */
+#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */)
+#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */)
 
 /* Return the appropriate bucket index for a given count. See definition of
  * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy.
@@ -457,7 +503,7 @@ typedef struct {
 static U32 HUF_getIndex(U32 const count) {
     return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF)
         ? count
-        : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN;
+        : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN;
 }
 
 /* Helper swap function for HUF_quickSortPartition() */
@@ -580,7 +626,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy
 
     /* Sort each bucket. */
     for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) {
-        U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base;
+        int const bucketSize = rankPosition[n].curr - rankPosition[n].base;
         U32 const bucketStartIdx = rankPosition[n].base;
         if (bucketSize > 1) {
             assert(bucketStartIdx < maxSymbolValue1);
@@ -591,6 +637,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy
     assert(HUF_isSorted(huffNode, maxSymbolValue1));
 }
 
+
 /* HUF_buildCTable_wksp() :
  *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
  *  `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables).
@@ -611,6 +658,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
     int lowS, lowN;
     int nodeNb = STARTNODE;
     int n, nodeRoot;
+    DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1);
     /* init for parents */
     nonNullRank = (int)maxSymbolValue;
     while(huffNode[nonNullRank].count == 0) nonNullRank--;
@@ -637,6 +685,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
     for (n=0; n<=nonNullRank; n++)
         huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
 
+    DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1));
+
     return nonNullRank;
 }
 
@@ -674,28 +724,36 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i
     CTable[0] = maxNbBits;
 }
 
-size_t HUF_buildCTable_wksp (HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
+size_t
+HUF_buildCTable_wksp(HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits,
+                     void* workSpace, size_t wkspSize)
 {
-    HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32));
+    HUF_buildCTable_wksp_tables* const wksp_tables =
+        (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32));
     nodeElt* const huffNode0 = wksp_tables->huffNodeTbl;
     nodeElt* const huffNode = huffNode0+1;
     int nonNullRank;
 
+    HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables));
+
+    DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1);
+
     /* safety checks */
     if (wkspSize < sizeof(HUF_buildCTable_wksp_tables))
-      return ERROR(workSpace_tooSmall);
+        return ERROR(workSpace_tooSmall);
     if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT;
     if (maxSymbolValue > HUF_SYMBOLVALUE_MAX)
-      return ERROR(maxSymbolValue_tooLarge);
+        return ERROR(maxSymbolValue_tooLarge);
     ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable));
 
     /* sort, decreasing order */
     HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition);
+    DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1));
 
     /* build tree */
     nonNullRank = HUF_buildTree(huffNode, maxSymbolValue);
 
-    /* enforce maxTableLog */
+    /* determine and enforce maxTableLog */
     maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits);
     if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC);   /* check fit into table */
 
@@ -804,7 +862,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id
 #if DEBUGLEVEL >= 1
     {
         size_t const nbBits = HUF_getNbBits(elt);
-        size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1;
+        size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1;
         (void)dirtyBits;
         /* Middle bits are 0. */
         assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0);
@@ -884,7 +942,7 @@ static size_t HUF_closeCStream(HUF_CStream_t* bitC)
     {
         size_t const nbBits = bitC->bitPos[0] & 0xFF;
         if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
-        return (bitC->ptr - bitC->startPtr) + (nbBits > 0);
+        return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0);
     }
 }
 
@@ -1045,9 +1103,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize,
 static size_t
 HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
                               const void* src, size_t srcSize,
-                              const HUF_CElt* CTable, const int bmi2)
+                              const HUF_CElt* CTable, const int flags)
 {
-    if (bmi2) {
+    if (flags & HUF_flags_bmi2) {
         return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable);
     }
     return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable);
@@ -1058,28 +1116,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
 static size_t
 HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
                               const void* src, size_t srcSize,
-                              const HUF_CElt* CTable, const int bmi2)
+                              const HUF_CElt* CTable, const int flags)
 {
-    (void)bmi2;
+    (void)flags;
     return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
 }
 
 #endif
 
-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
-{
-    return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
-}
-
-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2)
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags)
 {
-    return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
+    return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags);
 }
 
 static size_t
 HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
                               const void* src, size_t srcSize,
-                              const HUF_CElt* CTable, int bmi2)
+                              const HUF_CElt* CTable, int flags)
 {
     size_t const segmentSize = (srcSize+3)/4;   /* first 3 segments */
     const BYTE* ip = (const BYTE*) src;
@@ -1093,7 +1146,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
     op += 6;   /* jumpTable */
 
     assert(op <= oend);
-    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) );
         if (cSize == 0 || cSize > 65535) return 0;
         MEM_writeLE16(ostart, (U16)cSize);
         op += cSize;
@@ -1101,7 +1154,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
 
     ip += segmentSize;
     assert(op <= oend);
-    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) );
         if (cSize == 0 || cSize > 65535) return 0;
         MEM_writeLE16(ostart+2, (U16)cSize);
         op += cSize;
@@ -1109,7 +1162,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
 
     ip += segmentSize;
     assert(op <= oend);
-    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) );
         if (cSize == 0 || cSize > 65535) return 0;
         MEM_writeLE16(ostart+4, (U16)cSize);
         op += cSize;
@@ -1118,7 +1171,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
     ip += segmentSize;
     assert(op <= oend);
     assert(ip <= iend);
-    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) );
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) );
         if (cSize == 0 || cSize > 65535) return 0;
         op += cSize;
     }
@@ -1126,14 +1179,9 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
     return (size_t)(op-ostart);
 }
 
-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags)
 {
-    return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
-}
-
-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2)
-{
-    return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
+    return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags);
 }
 
 typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;
@@ -1141,11 +1189,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;
 static size_t HUF_compressCTable_internal(
                 BYTE* const ostart, BYTE* op, BYTE* const oend,
                 const void* src, size_t srcSize,
-                HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2)
+                HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags)
 {
     size_t const cSize = (nbStreams==HUF_singleStream) ?
-                         HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) :
-                         HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2);
+                         HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) :
+                         HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags);
     if (HUF_isError(cSize)) { return cSize; }
     if (cSize==0) { return 0; }   /* uncompressible */
     op += cSize;
@@ -1168,6 +1216,79 @@ typedef struct {
 #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096
 #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10  /* Must be >= 2 */
 
+unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue)
+{
+    unsigned cardinality = 0;
+    unsigned i;
+
+    for (i = 0; i < maxSymbolValue + 1; i++) {
+        if (count[i] != 0) cardinality += 1;
+    }
+
+    return cardinality;
+}
+
+unsigned HUF_minTableLog(unsigned symbolCardinality)
+{
+    U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1;
+    return minBitsSymbols;
+}
+
+unsigned HUF_optimalTableLog(
+            unsigned maxTableLog,
+            size_t srcSize,
+            unsigned maxSymbolValue,
+            void* workSpace, size_t wkspSize,
+            HUF_CElt* table,
+      const unsigned* count,
+            int flags)
+{
+    assert(srcSize > 1); /* Not supported, RLE should be used instead */
+    assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables));
+
+    if (!(flags & HUF_flags_optimalDepth)) {
+        /* cheap evaluation, based on FSE */
+        return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
+    }
+
+    {   BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp);
+        size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp);
+        size_t maxBits, hSize, newSize;
+        const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue);
+        const unsigned minTableLog = HUF_minTableLog(symbolCardinality);
+        size_t optSize = ((size_t) ~0) - 1;
+        unsigned optLog = maxTableLog, optLogGuess;
+
+        DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize);
+
+        /* Search until size increases */
+        for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) {
+            DEBUGLOG(7, "checking for huffLog=%u", optLogGuess);
+            maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize);
+            if (ERR_isError(maxBits)) continue;
+
+            if (maxBits < optLogGuess && optLogGuess > minTableLog) break;
+
+            hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize);
+
+            if (ERR_isError(hSize)) continue;
+
+            newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize;
+
+            if (newSize > optSize + 1) {
+                break;
+            }
+
+            if (newSize < optSize) {
+                optSize = newSize;
+                optLog = optLogGuess;
+            }
+        }
+        assert(optLog <= HUF_TABLELOG_MAX);
+        return optLog;
+    }
+}
+
 /* HUF_compress_internal() :
  * `workSpace_align4` must be aligned on 4-bytes boundaries,
  * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */
@@ -1177,14 +1298,14 @@ HUF_compress_internal (void* dst, size_t dstSize,
                        unsigned maxSymbolValue, unsigned huffLog,
                        HUF_nbStreams_e nbStreams,
                        void* workSpace, size_t wkspSize,
-                       HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat,
-                 const int bmi2, unsigned suspectUncompressible)
+                       HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags)
 {
     HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t));
     BYTE* const ostart = (BYTE*)dst;
     BYTE* const oend = ostart + dstSize;
     BYTE* op = ostart;
 
+    DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize);
     HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE);
 
     /* checks & inits */
@@ -1198,16 +1319,17 @@ HUF_compress_internal (void* dst, size_t dstSize,
     if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT;
 
     /* Heuristic : If old table is valid, use it for small inputs */
-    if (preferRepeat && repeat && *repeat == HUF_repeat_valid) {
+    if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) {
         return HUF_compressCTable_internal(ostart, op, oend,
                                            src, srcSize,
-                                           nbStreams, oldHufTable, bmi2);
+                                           nbStreams, oldHufTable, flags);
     }
 
     /* If uncompressible data is suspected, do a smaller sampling first */
     DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2);
-    if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) {
+    if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) {
         size_t largestTotal = 0;
+        DEBUGLOG(5, "input suspected incompressible : sampling to check");
         {   unsigned maxSymbolValueBegin = maxSymbolValue;
             CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) );
             largestTotal += largestBegin;
@@ -1224,6 +1346,7 @@ HUF_compress_internal (void* dst, size_t dstSize,
         if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; }   /* single symbol, rle */
         if (largest <= (srcSize >> 7)+4) return 0;   /* heuristic : probably not compressible enough */
     }
+    DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1));
 
     /* Check validity of previous table */
     if ( repeat
@@ -1232,19 +1355,20 @@ HUF_compress_internal (void* dst, size_t dstSize,
         *repeat = HUF_repeat_none;
     }
     /* Heuristic : use existing table for small inputs */
-    if (preferRepeat && repeat && *repeat != HUF_repeat_none) {
+    if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) {
         return HUF_compressCTable_internal(ostart, op, oend,
                                            src, srcSize,
-                                           nbStreams, oldHufTable, bmi2);
+                                           nbStreams, oldHufTable, flags);
     }
 
     /* Build Huffman Tree */
-    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
+    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags);
     {   size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count,
                                             maxSymbolValue, huffLog,
                                             &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp));
         CHECK_F(maxBits);
         huffLog = (U32)maxBits;
+        DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1));
     }
     /* Zero unused symbols in CTable, so we can check it for validity */
     {
@@ -1263,7 +1387,7 @@ HUF_compress_internal (void* dst, size_t dstSize,
             if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) {
                 return HUF_compressCTable_internal(ostart, op, oend,
                                                    src, srcSize,
-                                                   nbStreams, oldHufTable, bmi2);
+                                                   nbStreams, oldHufTable, flags);
         }   }
 
         /* Use the new huffman table */
@@ -1275,46 +1399,20 @@ HUF_compress_internal (void* dst, size_t dstSize,
     }
     return HUF_compressCTable_internal(ostart, op, oend,
                                        src, srcSize,
-                                       nbStreams, table->CTable, bmi2);
-}
-
-
-size_t HUF_compress1X_wksp (void* dst, size_t dstSize,
-                      const void* src, size_t srcSize,
-                      unsigned maxSymbolValue, unsigned huffLog,
-                      void* workSpace, size_t wkspSize)
-{
-    return HUF_compress_internal(dst, dstSize, src, srcSize,
-                                 maxSymbolValue, huffLog, HUF_singleStream,
-                                 workSpace, wkspSize,
-                                 NULL, NULL, 0, 0 /*bmi2*/, 0);
+                                       nbStreams, table->CTable, flags);
 }
 
 size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
                       const void* src, size_t srcSize,
                       unsigned maxSymbolValue, unsigned huffLog,
                       void* workSpace, size_t wkspSize,
-                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat,
-                      int bmi2, unsigned suspectUncompressible)
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int flags)
 {
+    DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize);
     return HUF_compress_internal(dst, dstSize, src, srcSize,
                                  maxSymbolValue, huffLog, HUF_singleStream,
                                  workSpace, wkspSize, hufTable,
-                                 repeat, preferRepeat, bmi2, suspectUncompressible);
-}
-
-/* HUF_compress4X_repeat():
- * compress input using 4 streams.
- * provide workspace to generate compression tables */
-size_t HUF_compress4X_wksp (void* dst, size_t dstSize,
-                      const void* src, size_t srcSize,
-                      unsigned maxSymbolValue, unsigned huffLog,
-                      void* workSpace, size_t wkspSize)
-{
-    return HUF_compress_internal(dst, dstSize, src, srcSize,
-                                 maxSymbolValue, huffLog, HUF_fourStreams,
-                                 workSpace, wkspSize,
-                                 NULL, NULL, 0, 0 /*bmi2*/, 0);
+                                 repeat, flags);
 }
 
 /* HUF_compress4X_repeat():
@@ -1325,11 +1423,11 @@ size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
                       const void* src, size_t srcSize,
                       unsigned maxSymbolValue, unsigned huffLog,
                       void* workSpace, size_t wkspSize,
-                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible)
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int flags)
 {
+    DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize);
     return HUF_compress_internal(dst, dstSize, src, srcSize,
                                  maxSymbolValue, huffLog, HUF_fourStreams,
                                  workSpace, wkspSize,
-                                 hufTable, repeat, preferRepeat, bmi2, suspectUncompressible);
+                                 hufTable, repeat, flags);
 }
-
diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c
index f620cafca633..c1c316e9e289 100644
--- a/lib/zstd/compress/zstd_compress.c
+++ b/lib/zstd/compress/zstd_compress.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -11,12 +12,12 @@
 /*-*************************************
 *  Dependencies
 ***************************************/
+#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */
 #include "../common/zstd_deps.h"  /* INT_MAX, ZSTD_memset, ZSTD_memcpy */
 #include "../common/mem.h"
 #include "hist.h"           /* HIST_countFast_wksp */
 #define FSE_STATIC_LINKING_ONLY   /* FSE_encodeSymbol */
 #include "../common/fse.h"
-#define HUF_STATIC_LINKING_ONLY
 #include "../common/huf.h"
 #include "zstd_compress_internal.h"
 #include "zstd_compress_sequences.h"
@@ -27,6 +28,7 @@
 #include "zstd_opt.h"
 #include "zstd_ldm.h"
 #include "zstd_compress_superblock.h"
+#include  "../common/bits.h"      /* ZSTD_highbit32, ZSTD_rotateRight_U64 */
 
 /* ***************************************************************
 *  Tuning parameters
@@ -55,14 +57,17 @@
 *  Helper functions
 ***************************************/
 /* ZSTD_compressBound()
- * Note that the result from this function is only compatible with the "normal"
- * full-block strategy.
- * When there are a lot of small blocks due to frequent flush in streaming mode
- * the overhead of headers can make the compressed data to be larger than the
- * return value of ZSTD_compressBound().
+ * Note that the result from this function is only valid for
+ * the one-pass compression functions.
+ * When employing the streaming mode,
+ * if flushes are frequently altering the size of blocks,
+ * the overhead from block headers can make the compressed data larger
+ * than the return value of ZSTD_compressBound().
  */
 size_t ZSTD_compressBound(size_t srcSize) {
-    return ZSTD_COMPRESSBOUND(srcSize);
+    size_t const r = ZSTD_COMPRESSBOUND(srcSize);
+    if (r==0) return ERROR(srcSize_wrong);
+    return r;
 }
 
 
@@ -171,12 +176,9 @@ size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx)
     if (cctx==NULL) return 0;   /* support free on NULL */
     RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
                     "not compatible with static CCtx");
-    {
-        int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx);
+    {   int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx);
         ZSTD_freeCCtxContent(cctx);
-        if (!cctxInWorkspace) {
-            ZSTD_customFree(cctx, cctx->customMem);
-        }
+        if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem);
     }
     return 0;
 }
@@ -257,9 +259,9 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy,
     return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder));
 }
 
-/* Returns 1 if compression parameters are such that we should
+/* Returns ZSTD_ps_enable if compression parameters are such that we should
  * enable long distance matching (wlog >= 27, strategy >= btopt).
- * Returns 0 otherwise.
+ * Returns ZSTD_ps_disable otherwise.
  */
 static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode,
                                  const ZSTD_compressionParameters* const cParams) {
@@ -267,6 +269,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode,
     return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable;
 }
 
+static int ZSTD_resolveExternalSequenceValidation(int mode) {
+    return mode;
+}
+
+/* Resolves maxBlockSize to the default if no value is present. */
+static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) {
+    if (maxBlockSize == 0) {
+        return ZSTD_BLOCKSIZE_MAX;
+    } else {
+        return maxBlockSize;
+    }
+}
+
+static ZSTD_paramSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_paramSwitch_e value, int cLevel) {
+    if (value != ZSTD_ps_auto) return value;
+    if (cLevel < 10) {
+        return ZSTD_ps_disable;
+    } else {
+        return ZSTD_ps_enable;
+    }
+}
+
+/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged.
+ * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */
+static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) {
+    return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast;
+}
+
 static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
         ZSTD_compressionParameters cParams)
 {
@@ -284,6 +314,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
     }
     cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams);
     cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams);
+    cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences);
+    cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize);
+    cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes,
+                                                                             cctxParams.compressionLevel);
     assert(!ZSTD_checkCParams(cParams));
     return cctxParams;
 }
@@ -329,10 +363,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel)
 #define ZSTD_NO_CLEVEL 0
 
 /*
- * Initializes the cctxParams from params and compressionLevel.
+ * Initializes `cctxParams` from `params` and `compressionLevel`.
  * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL.
  */
-static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel)
+static void
+ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams,
+                        const ZSTD_parameters* params,
+                              int compressionLevel)
 {
     assert(!ZSTD_checkCParams(params->cParams));
     ZSTD_memset(cctxParams, 0, sizeof(*cctxParams));
@@ -345,6 +382,9 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par
     cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, &params->cParams);
     cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, &params->cParams);
     cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, &params->cParams);
+    cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences);
+    cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize);
+    cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel);
     DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d",
                 cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm);
 }
@@ -359,7 +399,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete
 
 /*
  * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone.
- * @param param Validated zstd parameters.
+ * @param params Validated zstd parameters.
  */
 static void ZSTD_CCtxParams_setZstdParams(
         ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params)
@@ -455,8 +495,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
         return bounds;
 
     case ZSTD_c_enableLongDistanceMatching:
-        bounds.lowerBound = 0;
-        bounds.upperBound = 1;
+        bounds.lowerBound = (int)ZSTD_ps_auto;
+        bounds.upperBound = (int)ZSTD_ps_disable;
         return bounds;
 
     case ZSTD_c_ldmHashLog:
@@ -549,6 +589,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
         bounds.upperBound = 1;
         return bounds;
 
+    case ZSTD_c_prefetchCDictTables:
+        bounds.lowerBound = (int)ZSTD_ps_auto;
+        bounds.upperBound = (int)ZSTD_ps_disable;
+        return bounds;
+
+    case ZSTD_c_enableSeqProducerFallback:
+        bounds.lowerBound = 0;
+        bounds.upperBound = 1;
+        return bounds;
+
+    case ZSTD_c_maxBlockSize:
+        bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN;
+        bounds.upperBound = ZSTD_BLOCKSIZE_MAX;
+        return bounds;
+
+    case ZSTD_c_searchForExternalRepcodes:
+        bounds.lowerBound = (int)ZSTD_ps_auto;
+        bounds.upperBound = (int)ZSTD_ps_disable;
+        return bounds;
+
     default:
         bounds.error = ERROR(parameter_unsupported);
         return bounds;
@@ -613,6 +673,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
     case ZSTD_c_useBlockSplitter:
     case ZSTD_c_useRowMatchFinder:
     case ZSTD_c_deterministicRefPrefix:
+    case ZSTD_c_prefetchCDictTables:
+    case ZSTD_c_enableSeqProducerFallback:
+    case ZSTD_c_maxBlockSize:
+    case ZSTD_c_searchForExternalRepcodes:
     default:
         return 0;
     }
@@ -625,7 +689,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
         if (ZSTD_isUpdateAuthorized(param)) {
             cctx->cParamsChanged = 1;
         } else {
-            RETURN_ERROR(stage_wrong, "can only set params in ctx init stage");
+            RETURN_ERROR(stage_wrong, "can only set params in cctx init stage");
     }   }
 
     switch(param)
@@ -668,6 +732,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
     case ZSTD_c_useBlockSplitter:
     case ZSTD_c_useRowMatchFinder:
     case ZSTD_c_deterministicRefPrefix:
+    case ZSTD_c_prefetchCDictTables:
+    case ZSTD_c_enableSeqProducerFallback:
+    case ZSTD_c_maxBlockSize:
+    case ZSTD_c_searchForExternalRepcodes:
         break;
 
     default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
@@ -723,12 +791,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
     case ZSTD_c_minMatch :
         if (value!=0)   /* 0 => use default */
             BOUNDCHECK(ZSTD_c_minMatch, value);
-        CCtxParams->cParams.minMatch = value;
+        CCtxParams->cParams.minMatch = (U32)value;
         return CCtxParams->cParams.minMatch;
 
     case ZSTD_c_targetLength :
         BOUNDCHECK(ZSTD_c_targetLength, value);
-        CCtxParams->cParams.targetLength = value;
+        CCtxParams->cParams.targetLength = (U32)value;
         return CCtxParams->cParams.targetLength;
 
     case ZSTD_c_strategy :
@@ -741,12 +809,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
         /* Content size written in frame header _when known_ (default:1) */
         DEBUGLOG(4, "set content size flag = %u", (value!=0));
         CCtxParams->fParams.contentSizeFlag = value != 0;
-        return CCtxParams->fParams.contentSizeFlag;
+        return (size_t)CCtxParams->fParams.contentSizeFlag;
 
     case ZSTD_c_checksumFlag :
         /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */
         CCtxParams->fParams.checksumFlag = value != 0;
-        return CCtxParams->fParams.checksumFlag;
+        return (size_t)CCtxParams->fParams.checksumFlag;
 
     case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */
         DEBUGLOG(4, "set dictIDFlag = %u", (value!=0));
@@ -755,18 +823,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
 
     case ZSTD_c_forceMaxWindow :
         CCtxParams->forceWindow = (value != 0);
-        return CCtxParams->forceWindow;
+        return (size_t)CCtxParams->forceWindow;
 
     case ZSTD_c_forceAttachDict : {
         const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value;
-        BOUNDCHECK(ZSTD_c_forceAttachDict, pref);
+        BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref);
         CCtxParams->attachDictPref = pref;
         return CCtxParams->attachDictPref;
     }
 
     case ZSTD_c_literalCompressionMode : {
         const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value;
-        BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm);
+        BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm);
         CCtxParams->literalCompressionMode = lcm;
         return CCtxParams->literalCompressionMode;
     }
@@ -789,47 +857,48 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
 
     case ZSTD_c_enableDedicatedDictSearch :
         CCtxParams->enableDedicatedDictSearch = (value!=0);
-        return CCtxParams->enableDedicatedDictSearch;
+        return (size_t)CCtxParams->enableDedicatedDictSearch;
 
     case ZSTD_c_enableLongDistanceMatching :
+        BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value);
         CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value;
         return CCtxParams->ldmParams.enableLdm;
 
     case ZSTD_c_ldmHashLog :
         if (value!=0)   /* 0 ==> auto */
             BOUNDCHECK(ZSTD_c_ldmHashLog, value);
-        CCtxParams->ldmParams.hashLog = value;
+        CCtxParams->ldmParams.hashLog = (U32)value;
         return CCtxParams->ldmParams.hashLog;
 
     case ZSTD_c_ldmMinMatch :
         if (value!=0)   /* 0 ==> default */
             BOUNDCHECK(ZSTD_c_ldmMinMatch, value);
-        CCtxParams->ldmParams.minMatchLength = value;
+        CCtxParams->ldmParams.minMatchLength = (U32)value;
         return CCtxParams->ldmParams.minMatchLength;
 
     case ZSTD_c_ldmBucketSizeLog :
         if (value!=0)   /* 0 ==> default */
             BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value);
-        CCtxParams->ldmParams.bucketSizeLog = value;
+        CCtxParams->ldmParams.bucketSizeLog = (U32)value;
         return CCtxParams->ldmParams.bucketSizeLog;
 
     case ZSTD_c_ldmHashRateLog :
         if (value!=0)   /* 0 ==> default */
             BOUNDCHECK(ZSTD_c_ldmHashRateLog, value);
-        CCtxParams->ldmParams.hashRateLog = value;
+        CCtxParams->ldmParams.hashRateLog = (U32)value;
         return CCtxParams->ldmParams.hashRateLog;
 
     case ZSTD_c_targetCBlockSize :
         if (value!=0)   /* 0 ==> default */
             BOUNDCHECK(ZSTD_c_targetCBlockSize, value);
-        CCtxParams->targetCBlockSize = value;
+        CCtxParams->targetCBlockSize = (U32)value;
         return CCtxParams->targetCBlockSize;
 
     case ZSTD_c_srcSizeHint :
         if (value!=0)    /* 0 ==> default */
             BOUNDCHECK(ZSTD_c_srcSizeHint, value);
         CCtxParams->srcSizeHint = value;
-        return CCtxParams->srcSizeHint;
+        return (size_t)CCtxParams->srcSizeHint;
 
     case ZSTD_c_stableInBuffer:
         BOUNDCHECK(ZSTD_c_stableInBuffer, value);
@@ -866,6 +935,27 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
         CCtxParams->deterministicRefPrefix = !!value;
         return CCtxParams->deterministicRefPrefix;
 
+    case ZSTD_c_prefetchCDictTables:
+        BOUNDCHECK(ZSTD_c_prefetchCDictTables, value);
+        CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value;
+        return CCtxParams->prefetchCDictTables;
+
+    case ZSTD_c_enableSeqProducerFallback:
+        BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value);
+        CCtxParams->enableMatchFinderFallback = value;
+        return CCtxParams->enableMatchFinderFallback;
+
+    case ZSTD_c_maxBlockSize:
+        if (value!=0)    /* 0 ==> default */
+            BOUNDCHECK(ZSTD_c_maxBlockSize, value);
+        CCtxParams->maxBlockSize = value;
+        return CCtxParams->maxBlockSize;
+
+    case ZSTD_c_searchForExternalRepcodes:
+        BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value);
+        CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value;
+        return CCtxParams->searchForExternalRepcodes;
+
     default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
     }
 }
@@ -980,6 +1070,18 @@ size_t ZSTD_CCtxParams_getParameter(
     case ZSTD_c_deterministicRefPrefix:
         *value = (int)CCtxParams->deterministicRefPrefix;
         break;
+    case ZSTD_c_prefetchCDictTables:
+        *value = (int)CCtxParams->prefetchCDictTables;
+        break;
+    case ZSTD_c_enableSeqProducerFallback:
+        *value = CCtxParams->enableMatchFinderFallback;
+        break;
+    case ZSTD_c_maxBlockSize:
+        *value = (int)CCtxParams->maxBlockSize;
+        break;
+    case ZSTD_c_searchForExternalRepcodes:
+        *value = (int)CCtxParams->searchForExternalRepcodes;
+        break;
     default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
     }
     return 0;
@@ -1006,9 +1108,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams(
     return 0;
 }
 
+size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams)
+{
+    ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */);
+    DEBUGLOG(4, "ZSTD_CCtx_setCParams");
+    /* only update if all parameters are valid */
+    FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, cparams.windowLog), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, cparams.chainLog), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, cparams.hashLog), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, cparams.searchLog), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, cparams.minMatch), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, cparams.targetLength), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, cparams.strategy), "");
+    return 0;
+}
+
+size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams)
+{
+    ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */);
+    DEBUGLOG(4, "ZSTD_CCtx_setFParams");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), "");
+    return 0;
+}
+
+size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params)
+{
+    DEBUGLOG(4, "ZSTD_CCtx_setParams");
+    /* First check cParams, because we want to update all or none. */
+    FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), "");
+    /* Next set fParams, because this could fail if the cctx isn't in init stage. */
+    FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), "");
+    /* Finally set cParams, which should succeed. */
+    FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), "");
+    return 0;
+}
+
 size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize)
 {
-    DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize);
+    DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize);
     RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
                     "Can't set pledgedSrcSize when not in init stage.");
     cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1;
@@ -1024,9 +1164,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams(
         ZSTD_compressionParameters* cParams);
 
 /*
- * Initializes the local dict using the requested parameters.
- * NOTE: This does not use the pledged src size, because it may be used for more
- * than one compression.
+ * Initializes the local dictionary using requested parameters.
+ * NOTE: Initialization does not employ the pledged src size,
+ * because the dictionary may be used for multiple compressions.
  */
 static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
 {
@@ -1039,8 +1179,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
         return 0;
     }
     if (dl->cdict != NULL) {
-        assert(cctx->cdict == dl->cdict);
         /* Local dictionary already initialized. */
+        assert(cctx->cdict == dl->cdict);
         return 0;
     }
     assert(dl->dictSize > 0);
@@ -1060,26 +1200,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
 }
 
 size_t ZSTD_CCtx_loadDictionary_advanced(
-        ZSTD_CCtx* cctx, const void* dict, size_t dictSize,
-        ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType)
+        ZSTD_CCtx* cctx,
+        const void* dict, size_t dictSize,
+        ZSTD_dictLoadMethod_e dictLoadMethod,
+        ZSTD_dictContentType_e dictContentType)
 {
-    RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
-                    "Can't load a dictionary when ctx is not in init stage.");
     DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize);
-    ZSTD_clearAllDicts(cctx);  /* in case one already exists */
-    if (dict == NULL || dictSize == 0)  /* no dictionary mode */
+    RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+                    "Can't load a dictionary when cctx is not in init stage.");
+    ZSTD_clearAllDicts(cctx);  /* erase any previously set dictionary */
+    if (dict == NULL || dictSize == 0)  /* no dictionary */
         return 0;
     if (dictLoadMethod == ZSTD_dlm_byRef) {
         cctx->localDict.dict = dict;
     } else {
+        /* copy dictionary content inside CCtx to own its lifetime */
         void* dictBuffer;
         RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
-                        "no malloc for static CCtx");
+                        "static CCtx can't allocate for an internal copy of dictionary");
         dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem);
-        RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!");
+        RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation,
+                        "allocation failed for dictionary content");
         ZSTD_memcpy(dictBuffer, dict, dictSize);
-        cctx->localDict.dictBuffer = dictBuffer;
-        cctx->localDict.dict = dictBuffer;
+        cctx->localDict.dictBuffer = dictBuffer;  /* owned ptr to free */
+        cctx->localDict.dict = dictBuffer;        /* read-only reference */
     }
     cctx->localDict.dictSize = dictSize;
     cctx->localDict.dictContentType = dictContentType;
@@ -1149,8 +1293,9 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset)
     if ( (reset == ZSTD_reset_parameters)
       || (reset == ZSTD_reset_session_and_parameters) ) {
         RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
-                        "Can't reset parameters only when not in init stage.");
+                        "Reset parameters is only possible during init stage.");
         ZSTD_clearAllDicts(cctx);
+        ZSTD_memset(&cctx->externalMatchCtx, 0, sizeof(cctx->externalMatchCtx));
         return ZSTD_CCtxParams_reset(&cctx->requestedParams);
     }
     return 0;
@@ -1247,7 +1392,8 @@ static ZSTD_compressionParameters
 ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
                             unsigned long long srcSize,
                             size_t dictSize,
-                            ZSTD_cParamMode_e mode)
+                            ZSTD_cParamMode_e mode,
+                            ZSTD_paramSwitch_e useRowMatchFinder)
 {
     const U64 minSrcSize = 513; /* (1<<9) + 1 */
     const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1);
@@ -1281,8 +1427,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
     }
 
     /* resize windowLog if input is small enough, to use less memory */
-    if ( (srcSize < maxWindowResize)
-      && (dictSize < maxWindowResize) )  {
+    if ( (srcSize <= maxWindowResize)
+      && (dictSize <= maxWindowResize) )  {
         U32 const tSize = (U32)(srcSize + dictSize);
         static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN;
         U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN :
@@ -1300,6 +1446,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
     if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN)
         cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN;  /* minimum wlog required for valid frame header */
 
+    /* We can't use more than 32 bits of hash in total, so that means that we require:
+     * (hashLog + 8) <= 32 && (chainLog + 8) <= 32
+     */
+    if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) {
+        U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS;
+        if (cPar.hashLog > maxShortCacheHashLog) {
+            cPar.hashLog = maxShortCacheHashLog;
+        }
+        if (cPar.chainLog > maxShortCacheHashLog) {
+            cPar.chainLog = maxShortCacheHashLog;
+        }
+    }
+
+
+    /* At this point, we aren't 100% sure if we are using the row match finder.
+     * Unless it is explicitly disabled, conservatively assume that it is enabled.
+     * In this case it will only be disabled for small sources, so shrinking the
+     * hash log a little bit shouldn't result in any ratio loss.
+     */
+    if (useRowMatchFinder == ZSTD_ps_auto)
+        useRowMatchFinder = ZSTD_ps_enable;
+
+    /* We can't hash more than 32-bits in total. So that means that we require:
+     * (hashLog - rowLog + 8) <= 32
+     */
+    if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) {
+        /* Switch to 32-entry rows if searchLog is 5 (or more) */
+        U32 const rowLog = BOUNDED(4, cPar.searchLog, 6);
+        U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS;
+        U32 const maxHashLog = maxRowHashLog + rowLog;
+        assert(cPar.hashLog >= rowLog);
+        if (cPar.hashLog > maxHashLog) {
+            cPar.hashLog = maxHashLog;
+        }
+    }
+
     return cPar;
 }
 
@@ -1310,7 +1492,7 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar,
 {
     cPar = ZSTD_clampCParams(cPar);   /* resulting cPar is necessarily valid (all parameters within range) */
     if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN;
-    return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown);
+    return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto);
 }
 
 static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode);
@@ -1341,7 +1523,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
     ZSTD_overrideCParams(&cParams, &CCtxParams->cParams);
     assert(!ZSTD_checkCParams(cParams));
     /* srcSizeHint == 0 means 0 */
-    return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode);
+    return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder);
 }
 
 static size_t
@@ -1370,7 +1552,7 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
       + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t))
       + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
     size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)
-                                            ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16))
+                                            ? ZSTD_cwksp_aligned_alloc_size(hSize)
                                             : 0;
     size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt))
                                 ? optPotentialSpace
@@ -1386,6 +1568,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
     return tableSpace + optSpace + slackSpace + lazyAdditionalSpace;
 }
 
+/* Helper function for calculating memory requirements.
+ * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */
+static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) {
+    U32 const divider = (minMatch==3 || useSequenceProducer) ? 3 : 4;
+    return blockSize / divider;
+}
+
 static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
         const ZSTD_compressionParameters* cParams,
         const ldmParams_t* ldmParams,
@@ -1393,12 +1582,13 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
         const ZSTD_paramSwitch_e useRowMatchFinder,
         const size_t buffInSize,
         const size_t buffOutSize,
-        const U64 pledgedSrcSize)
+        const U64 pledgedSrcSize,
+        int useSequenceProducer,
+        size_t maxBlockSize)
 {
     size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize);
-    size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
-    U32    const divider = (cParams->minMatch==3) ? 3 : 4;
-    size_t const maxNbSeq = blockSize / divider;
+    size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize);
+    size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer);
     size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize)
                             + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef))
                             + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE));
@@ -1417,6 +1607,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
 
     size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0;
 
+    size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
+    size_t const externalSeqSpace = useSequenceProducer
+        ? ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence))
+        : 0;
+
     size_t const neededSpace =
         cctxSpace +
         entropySpace +
@@ -1425,7 +1620,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
         ldmSeqSpace +
         matchStateSize +
         tokenSpace +
-        bufferSpace;
+        bufferSpace +
+        externalSeqSpace;
 
     DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace);
     return neededSpace;
@@ -1443,7 +1639,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
      * be needed. However, we still allocate two 0-sized buffers, which can
      * take space under ASAN. */
     return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
-        &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN);
+        &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize);
 }
 
 size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams)
@@ -1493,7 +1689,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
     RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
     {   ZSTD_compressionParameters const cParams =
                 ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
-        size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
+        size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog);
         size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered)
                 ? ((size_t)1 << cParams.windowLog) + blockSize
                 : 0;
@@ -1504,7 +1700,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
 
         return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
             &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize,
-            ZSTD_CONTENTSIZE_UNKNOWN);
+            ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize);
     }
 }
 
@@ -1637,6 +1833,19 @@ typedef enum {
     ZSTD_resetTarget_CCtx
 } ZSTD_resetTarget_e;
 
+/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */
+static U64 ZSTD_bitmix(U64 val, U64 len) {
+    val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24);
+    val *= 0x9FB21C651E98DF25ULL;
+    val ^= (val >> 35) + len ;
+    val *= 0x9FB21C651E98DF25ULL;
+    return val ^ (val >> 28);
+}
+
+/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */
+static void ZSTD_advanceHashSalt(ZSTD_matchState_t* ms) {
+    ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4);
+}
 
 static size_t
 ZSTD_reset_matchState(ZSTD_matchState_t* ms,
@@ -1664,6 +1873,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
     }
 
     ms->hashLog3 = hashLog3;
+    ms->lazySkipping = 0;
 
     ZSTD_invalidateMatchState(ms);
 
@@ -1685,6 +1895,27 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
         ZSTD_cwksp_clean_tables(ws);
     }
 
+    if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) {
+        /* Row match finder needs an additional table of hashes ("tags") */
+        size_t const tagTableSize = hSize;
+        /* We want to generate a new salt in case we reset a Cctx, but we always want to use
+         * 0 when we reset a Cdict */
+        if(forWho == ZSTD_resetTarget_CCtx) {
+            ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize);
+            ZSTD_advanceHashSalt(ms);
+        } else {
+            /* When we are not salting we want to always memset the memory */
+            ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
+            ZSTD_memset(ms->tagTable, 0, tagTableSize);
+            ms->hashSalt = 0;
+        }
+        {   /* Switch to 32-entry rows if searchLog is 5 (or more) */
+            U32 const rowLog = BOUNDED(4, cParams->searchLog, 6);
+            assert(cParams->hashLog >= rowLog);
+            ms->rowHashLog = cParams->hashLog - rowLog;
+        }
+    }
+
     /* opt parser space */
     if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
         DEBUGLOG(4, "reserving optimal parser space");
@@ -1696,19 +1927,6 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
         ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
     }
 
-    if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) {
-        {   /* Row match finder needs an additional table of hashes ("tags") */
-            size_t const tagTableSize = hSize*sizeof(U16);
-            ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
-            if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize);
-        }
-        {   /* Switch to 32-entry rows if searchLog is 5 (or more) */
-            U32 const rowLog = BOUNDED(4, cParams->searchLog, 6);
-            assert(cParams->hashLog >= rowLog);
-            ms->rowHashLog = cParams->hashLog - rowLog;
-        }
-    }
-
     ms->cParams = *cParams;
 
     RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation,
@@ -1768,6 +1986,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
     assert(params->useRowMatchFinder != ZSTD_ps_auto);
     assert(params->useBlockSplitter != ZSTD_ps_auto);
     assert(params->ldmParams.enableLdm != ZSTD_ps_auto);
+    assert(params->maxBlockSize != 0);
     if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
         /* Adjust long distance matching parameters */
         ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, &params->cParams);
@@ -1776,9 +1995,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
     }
 
     {   size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize));
-        size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
-        U32    const divider = (params->cParams.minMatch==3) ? 3 : 4;
-        size_t const maxNbSeq = blockSize / divider;
+        size_t const blockSize = MIN(params->maxBlockSize, windowSize);
+        size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, params->useSequenceProducer);
         size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered)
                 ? ZSTD_compressBound(blockSize) + 1
                 : 0;
@@ -1795,7 +2013,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
         size_t const neededSpace =
             ZSTD_estimateCCtxSize_usingCCtxParams_internal(
                 &params->cParams, &params->ldmParams, zc->staticSize != 0, params->useRowMatchFinder,
-                buffInSize, buffOutSize, pledgedSrcSize);
+                buffInSize, buffOutSize, pledgedSrcSize, params->useSequenceProducer, params->maxBlockSize);
         int resizeWorkspace;
 
         FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!");
@@ -1838,6 +2056,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
 
         /* init params */
         zc->blockState.matchState.cParams = params->cParams;
+        zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable;
         zc->pledgedSrcSizePlusOne = pledgedSrcSize+1;
         zc->consumedSrcSize = 0;
         zc->producedCSize = 0;
@@ -1854,13 +2073,46 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
 
         ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock);
 
+        FORWARD_IF_ERROR(ZSTD_reset_matchState(
+                &zc->blockState.matchState,
+                ws,
+                &params->cParams,
+                params->useRowMatchFinder,
+                crp,
+                needsIndexReset,
+                ZSTD_resetTarget_CCtx), "");
+
+        zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef));
+
+        /* ldm hash table */
+        if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
+            /* TODO: avoid memset? */
+            size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog;
+            zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t));
+            ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t));
+            zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq));
+            zc->maxNbLdmSequences = maxNbLdmSeq;
+
+            ZSTD_window_init(&zc->ldmState.window);
+            zc->ldmState.loadedDictEnd = 0;
+        }
+
+        /* reserve space for block-level external sequences */
+        if (params->useSequenceProducer) {
+            size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
+            zc->externalMatchCtx.seqBufferCapacity = maxNbExternalSeq;
+            zc->externalMatchCtx.seqBuffer =
+                (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence));
+        }
+
+        /* buffers */
+
         /* ZSTD_wildcopy() is used to copy into the literals buffer,
          * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes.
          */
         zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH);
         zc->seqStore.maxNbLit = blockSize;
 
-        /* buffers */
         zc->bufferedPolicy = zbuff;
         zc->inBuffSize = buffInSize;
         zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize);
@@ -1883,32 +2135,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
         zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
         zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
         zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
-        zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef));
-
-        FORWARD_IF_ERROR(ZSTD_reset_matchState(
-            &zc->blockState.matchState,
-            ws,
-            &params->cParams,
-            params->useRowMatchFinder,
-            crp,
-            needsIndexReset,
-            ZSTD_resetTarget_CCtx), "");
-
-        /* ldm hash table */
-        if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
-            /* TODO: avoid memset? */
-            size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog;
-            zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t));
-            ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t));
-            zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq));
-            zc->maxNbLdmSequences = maxNbLdmSeq;
-
-            ZSTD_window_init(&zc->ldmState.window);
-            zc->ldmState.loadedDictEnd = 0;
-        }
 
         DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws));
-        assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace));
+        assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace));
 
         zc->initialized = 1;
 
@@ -1980,7 +2209,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
         }
 
         params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize,
-                                                     cdict->dictContentSize, ZSTD_cpm_attachDict);
+                                                     cdict->dictContentSize, ZSTD_cpm_attachDict,
+                                                     params.useRowMatchFinder);
         params.cParams.windowLog = windowLog;
         params.useRowMatchFinder = cdict->useRowMatchFinder;    /* cdict overrides */
         FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, &params, pledgedSrcSize,
@@ -2019,6 +2249,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
     return 0;
 }
 
+static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize,
+                                        ZSTD_compressionParameters const* cParams) {
+    if (ZSTD_CDictIndicesAreTagged(cParams)){
+        /* Remove tags from the CDict table if they are present.
+         * See docs on "short cache" in zstd_compress_internal.h for context. */
+        size_t i;
+        for (i = 0; i < tableSize; i++) {
+            U32 const taggedIndex = src[i];
+            U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS;
+            dst[i] = index;
+        }
+    } else {
+        ZSTD_memcpy(dst, src, tableSize * sizeof(U32));
+    }
+}
+
 static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
                             const ZSTD_CDict* cdict,
                             ZSTD_CCtx_params params,
@@ -2054,21 +2300,23 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
                                                             : 0;
         size_t const hSize =  (size_t)1 << cdict_cParams->hashLog;
 
-        ZSTD_memcpy(cctx->blockState.matchState.hashTable,
-               cdict->matchState.hashTable,
-               hSize * sizeof(U32));
+        ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable,
+                                cdict->matchState.hashTable,
+                                hSize, cdict_cParams);
+
         /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */
         if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) {
-            ZSTD_memcpy(cctx->blockState.matchState.chainTable,
-               cdict->matchState.chainTable,
-               chainSize * sizeof(U32));
+            ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable,
+                                    cdict->matchState.chainTable,
+                                    chainSize, cdict_cParams);
         }
         /* copy tag table */
         if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) {
-            size_t const tagTableSize = hSize*sizeof(U16);
+            size_t const tagTableSize = hSize;
             ZSTD_memcpy(cctx->blockState.matchState.tagTable,
-                cdict->matchState.tagTable,
-                tagTableSize);
+                        cdict->matchState.tagTable,
+                        tagTableSize);
+            cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt;
         }
     }
 
@@ -2147,6 +2395,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx,
         params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter;
         params.ldmParams = srcCCtx->appliedParams.ldmParams;
         params.fParams = fParams;
+        params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize;
         ZSTD_resetCCtx_internal(dstCCtx, &params, pledgedSrcSize,
                                 /* loadedDictSize */ 0,
                                 ZSTDcrp_leaveDirty, zbuff);
@@ -2294,7 +2543,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par
 
 /* See doc/zstd_compression_format.md for detailed format description */
 
-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
+int ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
 {
     const seqDef* const sequences = seqStorePtr->sequencesStart;
     BYTE* const llCodeTable = seqStorePtr->llCode;
@@ -2302,18 +2551,24 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
     BYTE* const mlCodeTable = seqStorePtr->mlCode;
     U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
     U32 u;
+    int longOffsets = 0;
     assert(nbSeq <= seqStorePtr->maxNbSeq);
     for (u=0; u<nbSeq; u++) {
         U32 const llv = sequences[u].litLength;
+        U32 const ofCode = ZSTD_highbit32(sequences[u].offBase);
         U32 const mlv = sequences[u].mlBase;
         llCodeTable[u] = (BYTE)ZSTD_LLcode(llv);
-        ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offBase);
+        ofCodeTable[u] = (BYTE)ofCode;
         mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv);
+        assert(!(MEM_64bits() && ofCode >= STREAM_ACCUMULATOR_MIN));
+        if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN)
+            longOffsets = 1;
     }
     if (seqStorePtr->longLengthType==ZSTD_llt_literalLength)
         llCodeTable[seqStorePtr->longLengthPos] = MaxLL;
     if (seqStorePtr->longLengthType==ZSTD_llt_matchLength)
         mlCodeTable[seqStorePtr->longLengthPos] = MaxML;
+    return longOffsets;
 }
 
 /* ZSTD_useTargetCBlockSize():
@@ -2347,6 +2602,7 @@ typedef struct {
     U32 MLtype;
     size_t size;
     size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */
+    int longOffsets;
 } ZSTD_symbolEncodingTypeStats_t;
 
 /* ZSTD_buildSequencesStatistics():
@@ -2357,11 +2613,13 @@ typedef struct {
  * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32)
  */
 static ZSTD_symbolEncodingTypeStats_t
-ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
-                        const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy,
-                              BYTE* dst, const BYTE* const dstEnd,
-                              ZSTD_strategy strategy, unsigned* countWorkspace,
-                              void* entropyWorkspace, size_t entropyWkspSize) {
+ZSTD_buildSequencesStatistics(
+                const seqStore_t* seqStorePtr, size_t nbSeq,
+                const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy,
+                      BYTE* dst, const BYTE* const dstEnd,
+                      ZSTD_strategy strategy, unsigned* countWorkspace,
+                      void* entropyWorkspace, size_t entropyWkspSize)
+{
     BYTE* const ostart = dst;
     const BYTE* const oend = dstEnd;
     BYTE* op = ostart;
@@ -2375,7 +2633,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
 
     stats.lastCountSize = 0;
     /* convert length/distances into codes */
-    ZSTD_seqToCodes(seqStorePtr);
+    stats.longOffsets = ZSTD_seqToCodes(seqStorePtr);
     assert(op <= oend);
     assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */
     /* build CTable for Literal Lengths */
@@ -2480,22 +2738,22 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
  */
 #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20
 MEM_STATIC size_t
-ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
-                          const ZSTD_entropyCTables_t* prevEntropy,
-                                ZSTD_entropyCTables_t* nextEntropy,
-                          const ZSTD_CCtx_params* cctxParams,
-                                void* dst, size_t dstCapacity,
-                                void* entropyWorkspace, size_t entropyWkspSize,
-                          const int bmi2)
+ZSTD_entropyCompressSeqStore_internal(
+                        const seqStore_t* seqStorePtr,
+                        const ZSTD_entropyCTables_t* prevEntropy,
+                              ZSTD_entropyCTables_t* nextEntropy,
+                        const ZSTD_CCtx_params* cctxParams,
+                              void* dst, size_t dstCapacity,
+                              void* entropyWorkspace, size_t entropyWkspSize,
+                        const int bmi2)
 {
-    const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
     ZSTD_strategy const strategy = cctxParams->cParams.strategy;
     unsigned* count = (unsigned*)entropyWorkspace;
     FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable;
     FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable;
     FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable;
     const seqDef* const sequences = seqStorePtr->sequencesStart;
-    const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
+    const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
     const BYTE* const ofCodeTable = seqStorePtr->ofCode;
     const BYTE* const llCodeTable = seqStorePtr->llCode;
     const BYTE* const mlCodeTable = seqStorePtr->mlCode;
@@ -2503,29 +2761,31 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
     BYTE* const oend = ostart + dstCapacity;
     BYTE* op = ostart;
     size_t lastCountSize;
+    int longOffsets = 0;
 
     entropyWorkspace = count + (MaxSeq + 1);
     entropyWkspSize -= (MaxSeq + 1) * sizeof(*count);
 
-    DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq);
+    DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity);
     ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
     assert(entropyWkspSize >= HUF_WORKSPACE_SIZE);
 
     /* Compress literals */
     {   const BYTE* const literals = seqStorePtr->litStart;
-        size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart;
-        size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart;
+        size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+        size_t const numLiterals = (size_t)(seqStorePtr->lit - seqStorePtr->litStart);
         /* Base suspicion of uncompressibility on ratio of literals to sequences */
         unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO);
         size_t const litSize = (size_t)(seqStorePtr->lit - literals);
+
         size_t const cSize = ZSTD_compressLiterals(
-                                    &prevEntropy->huf, &nextEntropy->huf,
-                                    cctxParams->cParams.strategy,
-                                    ZSTD_literalsCompressionIsDisabled(cctxParams),
                                     op, dstCapacity,
                                     literals, litSize,
                                     entropyWorkspace, entropyWkspSize,
-                                    bmi2, suspectUncompressible);
+                                    &prevEntropy->huf, &nextEntropy->huf,
+                                    cctxParams->cParams.strategy,
+                                    ZSTD_literalsCompressionIsDisabled(cctxParams),
+                                    suspectUncompressible, bmi2);
         FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed");
         assert(cSize <= dstCapacity);
         op += cSize;
@@ -2551,11 +2811,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
         ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse));
         return (size_t)(op - ostart);
     }
-    {
-        ZSTD_symbolEncodingTypeStats_t stats;
-        BYTE* seqHead = op++;
+    {   BYTE* const seqHead = op++;
         /* build stats for sequences */
-        stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq,
+        const ZSTD_symbolEncodingTypeStats_t stats =
+                ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq,
                                              &prevEntropy->fse, &nextEntropy->fse,
                                               op, oend,
                                               strategy, count,
@@ -2564,6 +2823,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
         *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2));
         lastCountSize = stats.lastCountSize;
         op += stats.size;
+        longOffsets = stats.longOffsets;
     }
 
     {   size_t const bitstreamSize = ZSTD_encodeSequences(
@@ -2598,14 +2858,15 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
 }
 
 MEM_STATIC size_t
-ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr,
-                       const ZSTD_entropyCTables_t* prevEntropy,
-                             ZSTD_entropyCTables_t* nextEntropy,
-                       const ZSTD_CCtx_params* cctxParams,
-                             void* dst, size_t dstCapacity,
-                             size_t srcSize,
-                             void* entropyWorkspace, size_t entropyWkspSize,
-                             int bmi2)
+ZSTD_entropyCompressSeqStore(
+                    const seqStore_t* seqStorePtr,
+                    const ZSTD_entropyCTables_t* prevEntropy,
+                          ZSTD_entropyCTables_t* nextEntropy,
+                    const ZSTD_CCtx_params* cctxParams,
+                          void* dst, size_t dstCapacity,
+                          size_t srcSize,
+                          void* entropyWorkspace, size_t entropyWkspSize,
+                          int bmi2)
 {
     size_t const cSize = ZSTD_entropyCompressSeqStore_internal(
                             seqStorePtr, prevEntropy, nextEntropy, cctxParams,
@@ -2615,15 +2876,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr,
     /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block.
      * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block.
      */
-    if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity))
+    if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) {
+        DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity);
         return 0;  /* block not compressed */
+    }
     FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed");
 
     /* Check compressibility */
     {   size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy);
         if (cSize >= maxCSize) return 0;  /* block not compressed */
     }
-    DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize);
+    DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize);
+    /* libzstd decoder before  > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly.
+     * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above.
+     */
+    assert(cSize < ZSTD_BLOCKSIZE_MAX);
     return cSize;
 }
 
@@ -2718,6 +2985,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr)
     ssPtr->longLengthType = ZSTD_llt_none;
 }
 
+/* ZSTD_postProcessSequenceProducerResult() :
+ * Validates and post-processes sequences obtained through the external matchfinder API:
+ *   - Checks whether nbExternalSeqs represents an error condition.
+ *   - Appends a block delimiter to outSeqs if one is not already present.
+ *     See zstd.h for context regarding block delimiters.
+ * Returns the number of sequences after post-processing, or an error code. */
+static size_t ZSTD_postProcessSequenceProducerResult(
+    ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize
+) {
+    RETURN_ERROR_IF(
+        nbExternalSeqs > outSeqsCapacity,
+        sequenceProducer_failed,
+        "External sequence producer returned error code %lu",
+        (unsigned long)nbExternalSeqs
+    );
+
+    RETURN_ERROR_IF(
+        nbExternalSeqs == 0 && srcSize > 0,
+        sequenceProducer_failed,
+        "Got zero sequences from external sequence producer for a non-empty src buffer!"
+    );
+
+    if (srcSize == 0) {
+        ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence));
+        return 1;
+    }
+
+    {
+        ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1];
+
+        /* We can return early if lastSeq is already a block delimiter. */
+        if (lastSeq.offset == 0 && lastSeq.matchLength == 0) {
+            return nbExternalSeqs;
+        }
+
+        /* This error condition is only possible if the external matchfinder
+         * produced an invalid parse, by definition of ZSTD_sequenceBound(). */
+        RETURN_ERROR_IF(
+            nbExternalSeqs == outSeqsCapacity,
+            sequenceProducer_failed,
+            "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!"
+        );
+
+        /* lastSeq is not a block delimiter, so we need to append one. */
+        ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence));
+        return nbExternalSeqs + 1;
+    }
+}
+
+/* ZSTD_fastSequenceLengthSum() :
+ * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*.
+ * Similar to another function in zstd_compress.c (determine_blockSize),
+ * except it doesn't check for a block delimiter to end summation.
+ * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P).
+ * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. */
+static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) {
+    size_t matchLenSum, litLenSum, i;
+    matchLenSum = 0;
+    litLenSum = 0;
+    for (i = 0; i < seqBufSize; i++) {
+        litLenSum += seqBuf[i].litLength;
+        matchLenSum += seqBuf[i].matchLength;
+    }
+    return litLenSum + matchLenSum;
+}
+
 typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e;
 
 static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
@@ -2727,7 +3060,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
     assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
     /* Assert that we have correctly flushed the ctx params into the ms's copy */
     ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams);
-    if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) {
+    /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding
+     * additional 1. We need to revisit and change this logic to be more consistent */
+    if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) {
         if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) {
             ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize);
         } else {
@@ -2763,6 +3098,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
         }
         if (zc->externSeqStore.pos < zc->externSeqStore.size) {
             assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable);
+
+            /* External matchfinder + LDM is technically possible, just not implemented yet.
+             * We need to revisit soon and implement it. */
+            RETURN_ERROR_IF(
+                zc->appliedParams.useSequenceProducer,
+                parameter_combination_unsupported,
+                "Long-distance matching with external sequence producer enabled is not currently supported."
+            );
+
             /* Updates ldmSeqStore.pos */
             lastLLSize =
                 ZSTD_ldm_blockCompress(&zc->externSeqStore,
@@ -2774,6 +3118,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
         } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) {
             rawSeqStore_t ldmSeqStore = kNullRawSeqStore;
 
+            /* External matchfinder + LDM is technically possible, just not implemented yet.
+             * We need to revisit soon and implement it. */
+            RETURN_ERROR_IF(
+                zc->appliedParams.useSequenceProducer,
+                parameter_combination_unsupported,
+                "Long-distance matching with external sequence producer enabled is not currently supported."
+            );
+
             ldmSeqStore.seq = zc->ldmSequences;
             ldmSeqStore.capacity = zc->maxNbLdmSequences;
             /* Updates ldmSeqStore.size */
@@ -2788,7 +3140,68 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
                                        zc->appliedParams.useRowMatchFinder,
                                        src, srcSize);
             assert(ldmSeqStore.pos == ldmSeqStore.size);
-        } else {   /* not long range mode */
+        } else if (zc->appliedParams.useSequenceProducer) {
+            assert(
+                zc->externalMatchCtx.seqBufferCapacity >= ZSTD_sequenceBound(srcSize)
+            );
+            assert(zc->externalMatchCtx.mFinder != NULL);
+
+            {   U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog;
+
+                size_t const nbExternalSeqs = (zc->externalMatchCtx.mFinder)(
+                    zc->externalMatchCtx.mState,
+                    zc->externalMatchCtx.seqBuffer,
+                    zc->externalMatchCtx.seqBufferCapacity,
+                    src, srcSize,
+                    NULL, 0,  /* dict and dictSize, currently not supported */
+                    zc->appliedParams.compressionLevel,
+                    windowSize
+                );
+
+                size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult(
+                    zc->externalMatchCtx.seqBuffer,
+                    nbExternalSeqs,
+                    zc->externalMatchCtx.seqBufferCapacity,
+                    srcSize
+                );
+
+                /* Return early if there is no error, since we don't need to worry about last literals */
+                if (!ZSTD_isError(nbPostProcessedSeqs)) {
+                    ZSTD_sequencePosition seqPos = {0,0,0};
+                    size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs);
+                    RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!");
+                    FORWARD_IF_ERROR(
+                        ZSTD_copySequencesToSeqStoreExplicitBlockDelim(
+                            zc, &seqPos,
+                            zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs,
+                            src, srcSize,
+                            zc->appliedParams.searchForExternalRepcodes
+                        ),
+                        "Failed to copy external sequences to seqStore!"
+                    );
+                    ms->ldmSeqStore = NULL;
+                    DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs);
+                    return ZSTDbss_compress;
+                }
+
+                /* Propagate the error if fallback is disabled */
+                if (!zc->appliedParams.enableMatchFinderFallback) {
+                    return nbPostProcessedSeqs;
+                }
+
+                /* Fallback to software matchfinder */
+                {   ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy,
+                                                                                            zc->appliedParams.useRowMatchFinder,
+                                                                                            dictMode);
+                    ms->ldmSeqStore = NULL;
+                    DEBUGLOG(
+                        5,
+                        "External sequence producer returned error code %lu. Falling back to internal parser.",
+                        (unsigned long)nbExternalSeqs
+                    );
+                    lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);
+            }   }
+        } else {   /* not long range mode and no external matchfinder */
             ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy,
                                                                                     zc->appliedParams.useRowMatchFinder,
                                                                                     dictMode);
@@ -2849,7 +3262,7 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
         /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode
            so we provide seqStoreSeqs[i].offset - 1 */
         ZSTD_updateRep(updatedRepcodes.rep,
-                       seqStoreSeqs[i].offBase - 1,
+                       seqStoreSeqs[i].offBase,
                        seqStoreSeqs[i].litLength == 0);
         literalsRead += outSeqs[i].litLength;
     }
@@ -2865,6 +3278,10 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
     zc->seqCollector.seqIndex += seqStoreSeqSize;
 }
 
+size_t ZSTD_sequenceBound(size_t srcSize) {
+    return (srcSize / ZSTD_MINMATCH_MIN) + 1;
+}
+
 size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
                               size_t outSeqsSize, const void* src, size_t srcSize)
 {
@@ -2910,19 +3327,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) {
     const size_t unrollMask = unrollSize - 1;
     const size_t prefixLength = length & unrollMask;
     size_t i;
-    size_t u;
     if (length == 1) return 1;
     /* Check if prefix is RLE first before using unrolled loop */
     if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) {
         return 0;
     }
     for (i = prefixLength; i != length; i += unrollSize) {
+        size_t u;
         for (u = 0; u < unrollSize; u += sizeof(size_t)) {
             if (MEM_readST(ip + i + u) != valueST) {
                 return 0;
-            }
-        }
-    }
+    }   }   }
     return 1;
 }
 
@@ -2938,7 +3353,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore)
     return nbSeqs < 4 && nbLits < 10;
 }
 
-static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs)
+static void
+ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs)
 {
     ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock;
     bs->prevCBlock = bs->nextCBlock;
@@ -2946,7 +3362,9 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c
 }
 
 /* Writes the block header */
-static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) {
+static void
+writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock)
+{
     U32 const cBlockHeader = cSize == 1 ?
                         lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) :
                         lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
@@ -2959,13 +3377,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB
  *  Stores literals block type (raw, rle, compressed, repeat) and
  *  huffman description table to hufMetadata.
  *  Requires ENTROPY_WORKSPACE_SIZE workspace
- *  @return : size of huffman description table or error code */
-static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize,
-                                            const ZSTD_hufCTables_t* prevHuf,
-                                                  ZSTD_hufCTables_t* nextHuf,
-                                                  ZSTD_hufCTablesMetadata_t* hufMetadata,
-                                                  const int literalsCompressionIsDisabled,
-                                                  void* workspace, size_t wkspSize)
+ * @return : size of huffman description table, or an error code
+ */
+static size_t
+ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize,
+                               const ZSTD_hufCTables_t* prevHuf,
+                                     ZSTD_hufCTables_t* nextHuf,
+                                     ZSTD_hufCTablesMetadata_t* hufMetadata,
+                               const int literalsCompressionIsDisabled,
+                                     void* workspace, size_t wkspSize,
+                                     int hufFlags)
 {
     BYTE* const wkspStart = (BYTE*)workspace;
     BYTE* const wkspEnd = wkspStart + wkspSize;
@@ -2973,9 +3394,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
     unsigned* const countWksp = (unsigned*)workspace;
     const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned);
     BYTE* const nodeWksp = countWkspStart + countWkspSize;
-    const size_t nodeWkspSize = wkspEnd-nodeWksp;
+    const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp);
     unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX;
-    unsigned huffLog = HUF_TABLELOG_DEFAULT;
+    unsigned huffLog = LitHufLog;
     HUF_repeat repeat = prevHuf->repeatMode;
     DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize);
 
@@ -2990,73 +3411,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
 
     /* small ? don't even attempt compression (speed opt) */
 #ifndef COMPRESS_LITERALS_SIZE_MIN
-#define COMPRESS_LITERALS_SIZE_MIN 63
+# define COMPRESS_LITERALS_SIZE_MIN 63  /* heuristic */
 #endif
     {   size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
         if (srcSize <= minLitSize) {
             DEBUGLOG(5, "set_basic - too small");
             hufMetadata->hType = set_basic;
             return 0;
-        }
-    }
+    }   }
 
     /* Scan input and build symbol stats */
-    {   size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize);
+    {   size_t const largest =
+            HIST_count_wksp (countWksp, &maxSymbolValue,
+                            (const BYTE*)src, srcSize,
+                            workspace, wkspSize);
         FORWARD_IF_ERROR(largest, "HIST_count_wksp failed");
         if (largest == srcSize) {
+            /* only one literal symbol */
             DEBUGLOG(5, "set_rle");
             hufMetadata->hType = set_rle;
             return 0;
         }
         if (largest <= (srcSize >> 7)+4) {
+            /* heuristic: likely not compressible */
             DEBUGLOG(5, "set_basic - no gain");
             hufMetadata->hType = set_basic;
             return 0;
-        }
-    }
+    }   }
 
     /* Validate the previous Huffman table */
-    if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) {
+    if (repeat == HUF_repeat_check
+      && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) {
         repeat = HUF_repeat_none;
     }
 
     /* Build Huffman Tree */
     ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable));
-    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
+    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags);
+    assert(huffLog <= LitHufLog);
     {   size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp,
                                                     maxSymbolValue, huffLog,
                                                     nodeWksp, nodeWkspSize);
         FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp");
         huffLog = (U32)maxBits;
-        {   /* Build and write the CTable */
-            size_t const newCSize = HUF_estimateCompressedSize(
-                    (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue);
-            size_t const hSize = HUF_writeCTable_wksp(
-                    hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer),
-                    (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog,
-                    nodeWksp, nodeWkspSize);
-            /* Check against repeating the previous CTable */
-            if (repeat != HUF_repeat_none) {
-                size_t const oldCSize = HUF_estimateCompressedSize(
-                        (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue);
-                if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) {
-                    DEBUGLOG(5, "set_repeat - smaller");
-                    ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
-                    hufMetadata->hType = set_repeat;
-                    return 0;
-                }
-            }
-            if (newCSize + hSize >= srcSize) {
-                DEBUGLOG(5, "set_basic - no gains");
+    }
+    {   /* Build and write the CTable */
+        size_t const newCSize = HUF_estimateCompressedSize(
+                (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue);
+        size_t const hSize = HUF_writeCTable_wksp(
+                hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer),
+                (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog,
+                nodeWksp, nodeWkspSize);
+        /* Check against repeating the previous CTable */
+        if (repeat != HUF_repeat_none) {
+            size_t const oldCSize = HUF_estimateCompressedSize(
+                    (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue);
+            if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) {
+                DEBUGLOG(5, "set_repeat - smaller");
                 ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
-                hufMetadata->hType = set_basic;
+                hufMetadata->hType = set_repeat;
                 return 0;
-            }
-            DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize);
-            hufMetadata->hType = set_compressed;
-            nextHuf->repeatMode = HUF_repeat_check;
-            return hSize;
+        }   }
+        if (newCSize + hSize >= srcSize) {
+            DEBUGLOG(5, "set_basic - no gains");
+            ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+            hufMetadata->hType = set_basic;
+            return 0;
         }
+        DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize);
+        hufMetadata->hType = set_compressed;
+        nextHuf->repeatMode = HUF_repeat_check;
+        return hSize;
     }
 }
 
@@ -3066,8 +3491,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
  * and updates nextEntropy to the appropriate repeatMode.
  */
 static ZSTD_symbolEncodingTypeStats_t
-ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) {
-    ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0};
+ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy)
+{
+    ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0};
     nextEntropy->litlength_repeatMode = FSE_repeat_none;
     nextEntropy->offcode_repeatMode = FSE_repeat_none;
     nextEntropy->matchlength_repeatMode = FSE_repeat_none;
@@ -3078,16 +3504,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) {
  *  Builds entropy for the sequences.
  *  Stores symbol compression modes and fse table to fseMetadata.
  *  Requires ENTROPY_WORKSPACE_SIZE wksp.
- *  @return : size of fse tables or error code */
-static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr,
-                                              const ZSTD_fseCTables_t* prevEntropy,
-                                                    ZSTD_fseCTables_t* nextEntropy,
-                                              const ZSTD_CCtx_params* cctxParams,
-                                                    ZSTD_fseCTablesMetadata_t* fseMetadata,
-                                                    void* workspace, size_t wkspSize)
+ * @return : size of fse tables or error code */
+static size_t
+ZSTD_buildBlockEntropyStats_sequences(
+                const seqStore_t* seqStorePtr,
+                const ZSTD_fseCTables_t* prevEntropy,
+                      ZSTD_fseCTables_t* nextEntropy,
+                const ZSTD_CCtx_params* cctxParams,
+                      ZSTD_fseCTablesMetadata_t* fseMetadata,
+                      void* workspace, size_t wkspSize)
 {
     ZSTD_strategy const strategy = cctxParams->cParams.strategy;
-    size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
+    size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
     BYTE* const ostart = fseMetadata->fseTablesBuffer;
     BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer);
     BYTE* op = ostart;
@@ -3114,23 +3542,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr,
 /* ZSTD_buildBlockEntropyStats() :
  *  Builds entropy for the block.
  *  Requires workspace size ENTROPY_WORKSPACE_SIZE
- *
- *  @return : 0 on success or error code
+ * @return : 0 on success, or an error code
+ *  Note : also employed in superblock
  */
-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr,
-                             const ZSTD_entropyCTables_t* prevEntropy,
-                                   ZSTD_entropyCTables_t* nextEntropy,
-                             const ZSTD_CCtx_params* cctxParams,
-                                   ZSTD_entropyCTablesMetadata_t* entropyMetadata,
-                                   void* workspace, size_t wkspSize)
-{
-    size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart;
+size_t ZSTD_buildBlockEntropyStats(
+            const seqStore_t* seqStorePtr,
+            const ZSTD_entropyCTables_t* prevEntropy,
+                  ZSTD_entropyCTables_t* nextEntropy,
+            const ZSTD_CCtx_params* cctxParams,
+                  ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                  void* workspace, size_t wkspSize)
+{
+    size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart);
+    int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD);
+    int const hufFlags = huf_useOptDepth ? HUF_flags_optimalDepth : 0;
+
     entropyMetadata->hufMetadata.hufDesSize =
         ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize,
                                             &prevEntropy->huf, &nextEntropy->huf,
                                             &entropyMetadata->hufMetadata,
                                             ZSTD_literalsCompressionIsDisabled(cctxParams),
-                                            workspace, wkspSize);
+                                            workspace, wkspSize, hufFlags);
+
     FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed");
     entropyMetadata->fseMetadata.fseTablesSize =
         ZSTD_buildBlockEntropyStats_sequences(seqStorePtr,
@@ -3143,11 +3576,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr,
 }
 
 /* Returns the size estimate for the literals section (header + content) of a block */
-static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize,
-                                                const ZSTD_hufCTables_t* huf,
-                                                const ZSTD_hufCTablesMetadata_t* hufMetadata,
-                                                void* workspace, size_t wkspSize,
-                                                int writeEntropy)
+static size_t
+ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize,
+                               const ZSTD_hufCTables_t* huf,
+                               const ZSTD_hufCTablesMetadata_t* hufMetadata,
+                               void* workspace, size_t wkspSize,
+                               int writeEntropy)
 {
     unsigned* const countWksp = (unsigned*)workspace;
     unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX;
@@ -3169,12 +3603,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz
 }
 
 /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */
-static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type,
-                        const BYTE* codeTable, size_t nbSeq, unsigned maxCode,
-                        const FSE_CTable* fseCTable,
-                        const U8* additionalBits,
-                        short const* defaultNorm, U32 defaultNormLog, U32 defaultMax,
-                        void* workspace, size_t wkspSize)
+static size_t
+ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type,
+                    const BYTE* codeTable, size_t nbSeq, unsigned maxCode,
+                    const FSE_CTable* fseCTable,
+                    const U8* additionalBits,
+                    short const* defaultNorm, U32 defaultNormLog, U32 defaultMax,
+                    void* workspace, size_t wkspSize)
 {
     unsigned* const countWksp = (unsigned*)workspace;
     const BYTE* ctp = codeTable;
@@ -3206,99 +3641,107 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type,
 }
 
 /* Returns the size estimate for the sequences section (header + content) of a block */
-static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable,
-                                                  const BYTE* llCodeTable,
-                                                  const BYTE* mlCodeTable,
-                                                  size_t nbSeq,
-                                                  const ZSTD_fseCTables_t* fseTables,
-                                                  const ZSTD_fseCTablesMetadata_t* fseMetadata,
-                                                  void* workspace, size_t wkspSize,
-                                                  int writeEntropy)
+static size_t
+ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable,
+                                 const BYTE* llCodeTable,
+                                 const BYTE* mlCodeTable,
+                                 size_t nbSeq,
+                                 const ZSTD_fseCTables_t* fseTables,
+                                 const ZSTD_fseCTablesMetadata_t* fseMetadata,
+                                 void* workspace, size_t wkspSize,
+                                 int writeEntropy)
 {
     size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ);
     size_t cSeqSizeEstimate = 0;
     cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff,
-                                         fseTables->offcodeCTable, NULL,
-                                         OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
-                                         workspace, wkspSize);
+                                    fseTables->offcodeCTable, NULL,
+                                    OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
+                                    workspace, wkspSize);
     cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL,
-                                         fseTables->litlengthCTable, LL_bits,
-                                         LL_defaultNorm, LL_defaultNormLog, MaxLL,
-                                         workspace, wkspSize);
+                                    fseTables->litlengthCTable, LL_bits,
+                                    LL_defaultNorm, LL_defaultNormLog, MaxLL,
+                                    workspace, wkspSize);
     cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML,
-                                         fseTables->matchlengthCTable, ML_bits,
-                                         ML_defaultNorm, ML_defaultNormLog, MaxML,
-                                         workspace, wkspSize);
+                                    fseTables->matchlengthCTable, ML_bits,
+                                    ML_defaultNorm, ML_defaultNormLog, MaxML,
+                                    workspace, wkspSize);
     if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize;
     return cSeqSizeEstimate + sequencesSectionHeaderSize;
 }
 
 /* Returns the size estimate for a given stream of literals, of, ll, ml */
-static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize,
-                                     const BYTE* ofCodeTable,
-                                     const BYTE* llCodeTable,
-                                     const BYTE* mlCodeTable,
-                                     size_t nbSeq,
-                                     const ZSTD_entropyCTables_t* entropy,
-                                     const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
-                                     void* workspace, size_t wkspSize,
-                                     int writeLitEntropy, int writeSeqEntropy) {
+static size_t
+ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize,
+                       const BYTE* ofCodeTable,
+                       const BYTE* llCodeTable,
+                       const BYTE* mlCodeTable,
+                       size_t nbSeq,
+                       const ZSTD_entropyCTables_t* entropy,
+                       const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                       void* workspace, size_t wkspSize,
+                       int writeLitEntropy, int writeSeqEntropy)
+{
     size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize,
-                                                         &entropy->huf, &entropyMetadata->hufMetadata,
-                                                         workspace, wkspSize, writeLitEntropy);
+                                    &entropy->huf, &entropyMetadata->hufMetadata,
+                                    workspace, wkspSize, writeLitEntropy);
     size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
-                                                         nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
-                                                         workspace, wkspSize, writeSeqEntropy);
+                                    nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
+                                    workspace, wkspSize, writeSeqEntropy);
     return seqSize + literalsSize + ZSTD_blockHeaderSize;
 }
 
 /* Builds entropy statistics and uses them for blocksize estimation.
  *
- * Returns the estimated compressed size of the seqStore, or a zstd error.
+ * @return: estimated compressed size of the seqStore, or a zstd error.
  */
-static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) {
-    ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata;
+static size_t
+ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc)
+{
+    ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata;
     DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()");
     FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore,
                     &zc->blockState.prevCBlock->entropy,
                     &zc->blockState.nextCBlock->entropy,
                     &zc->appliedParams,
                     entropyMetadata,
-                    zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), "");
-    return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart),
+                    zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE), "");
+    return ZSTD_estimateBlockSize(
+                    seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart),
                     seqStore->ofCode, seqStore->llCode, seqStore->mlCode,
                     (size_t)(seqStore->sequences - seqStore->sequencesStart),
-                    &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE,
+                    &zc->blockState.nextCBlock->entropy,
+                    entropyMetadata,
+                    zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE,
                     (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1);
 }
 
 /* Returns literals bytes represented in a seqStore */
-static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) {
+static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore)
+{
     size_t literalsBytes = 0;
-    size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart;
+    size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart);
     size_t i;
     for (i = 0; i < nbSeqs; ++i) {
-        seqDef seq = seqStore->sequencesStart[i];
+        seqDef const seq = seqStore->sequencesStart[i];
         literalsBytes += seq.litLength;
         if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) {
             literalsBytes += 0x10000;
-        }
-    }
+    }   }
     return literalsBytes;
 }
 
 /* Returns match bytes represented in a seqStore */
-static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) {
+static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore)
+{
     size_t matchBytes = 0;
-    size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart;
+    size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart);
     size_t i;
     for (i = 0; i < nbSeqs; ++i) {
         seqDef seq = seqStore->sequencesStart[i];
         matchBytes += seq.mlBase + MINMATCH;
         if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) {
             matchBytes += 0x10000;
-        }
-    }
+    }   }
     return matchBytes;
 }
 
@@ -3307,15 +3750,12 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) {
  */
 static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
                                const seqStore_t* originalSeqStore,
-                                     size_t startIdx, size_t endIdx) {
-    BYTE* const litEnd = originalSeqStore->lit;
-    size_t literalsBytes;
-    size_t literalsBytesPreceding = 0;
-
+                                     size_t startIdx, size_t endIdx)
+{
     *resultSeqStore = *originalSeqStore;
     if (startIdx > 0) {
         resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx;
-        literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
+        resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
     }
 
     /* Move longLengthPos into the correct position if necessary */
@@ -3328,13 +3768,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
     }
     resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx;
     resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx;
-    literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
-    resultSeqStore->litStart += literalsBytesPreceding;
     if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) {
         /* This accounts for possible last literals if the derived chunk reaches the end of the block */
-        resultSeqStore->lit = litEnd;
+        assert(resultSeqStore->lit == originalSeqStore->lit);
     } else {
-        resultSeqStore->lit = resultSeqStore->litStart+literalsBytes;
+        size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
+        resultSeqStore->lit = resultSeqStore->litStart + literalsBytes;
     }
     resultSeqStore->llCode += startIdx;
     resultSeqStore->mlCode += startIdx;
@@ -3342,20 +3781,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
 }
 
 /*
- * Returns the raw offset represented by the combination of offCode, ll0, and repcode history.
- * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq().
+ * Returns the raw offset represented by the combination of offBase, ll0, and repcode history.
+ * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq().
  */
 static U32
-ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0)
-{
-    U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0;  /* [ 0 - 3 ] */
-    assert(STORED_IS_REPCODE(offCode));
-    if (adjustedOffCode == ZSTD_REP_NUM) {
-        /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */
-        assert(rep[0] > 0);
+ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0)
+{
+    U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0;  /* [ 0 - 3 ] */
+    assert(OFFBASE_IS_REPCODE(offBase));
+    if (adjustedRepCode == ZSTD_REP_NUM) {
+        assert(ll0);
+        /* litlength == 0 and offCode == 2 implies selection of first repcode - 1
+         * This is only valid if it results in a valid offset value, aka > 0.
+         * Note : it may happen that `rep[0]==1` in exceptional circumstances.
+         * In which case this function will return 0, which is an invalid offset.
+         * It's not an issue though, since this value will be
+         * compared and discarded within ZSTD_seqStore_resolveOffCodes().
+         */
         return rep[0] - 1;
     }
-    return rep[adjustedOffCode];
+    return rep[adjustedRepCode];
 }
 
 /*
@@ -3371,30 +3816,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c
  *        1-3 : repcode 1-3
  *        4+ : real_offset+3
  */
-static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes,
-                                          seqStore_t* const seqStore, U32 const nbSeq) {
+static void
+ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes,
+                        const seqStore_t* const seqStore, U32 const nbSeq)
+{
     U32 idx = 0;
+    U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? seqStore->longLengthPos : nbSeq;
     for (; idx < nbSeq; ++idx) {
         seqDef* const seq = seqStore->sequencesStart + idx;
-        U32 const ll0 = (seq->litLength == 0);
-        U32 const offCode = OFFBASE_TO_STORED(seq->offBase);
-        assert(seq->offBase > 0);
-        if (STORED_IS_REPCODE(offCode)) {
-            U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0);
-            U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0);
+        U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx);
+        U32 const offBase = seq->offBase;
+        assert(offBase > 0);
+        if (OFFBASE_IS_REPCODE(offBase)) {
+            U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0);
+            U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0);
             /* Adjust simulated decompression repcode history if we come across a mismatch. Replace
              * the repcode with the offset it actually references, determined by the compression
              * repcode history.
              */
             if (dRawOffset != cRawOffset) {
-                seq->offBase = cRawOffset + ZSTD_REP_NUM;
+                seq->offBase = OFFSET_TO_OFFBASE(cRawOffset);
             }
         }
         /* Compression repcode history is always updated with values directly from the unmodified seqStore.
          * Decompression repcode history may use modified seq->offset value taken from compression repcode history.
          */
-        ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0);
-        ZSTD_updateRep(cRepcodes->rep, offCode, ll0);
+        ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0);
+        ZSTD_updateRep(cRepcodes->rep, offBase, ll0);
     }
 }
 
@@ -3404,10 +3852,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_
  * Returns the total size of that block (including header) or a ZSTD error code.
  */
 static size_t
-ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore,
+ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc,
+                            const seqStore_t* const seqStore,
                                   repcodes_t* const dRep, repcodes_t* const cRep,
                                   void* dst, size_t dstCapacity,
-                                  const void* src, size_t srcSize,
+                            const void* src, size_t srcSize,
                                   U32 lastBlock, U32 isPartition)
 {
     const U32 rleMaxLength = 25;
@@ -3481,45 +3930,49 @@ typedef struct {
 
 /* Helper function to perform the recursive search for block splits.
  * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half.
- * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then
- * we do not recurse.
+ * If advantageous to split, then we recurse down the two sub-blocks.
+ * If not, or if an error occurred in estimation, then we do not recurse.
  *
- * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING.
+ * Note: The recursion depth is capped by a heuristic minimum number of sequences,
+ * defined by MIN_SEQUENCES_BLOCK_SPLITTING.
  * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING).
  * In practice, recursion depth usually doesn't go beyond 4.
  *
- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize
+ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS.
+ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize
  * maximum of 128 KB, this value is actually impossible to reach.
  */
 static void
 ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx,
                              ZSTD_CCtx* zc, const seqStore_t* origSeqStore)
 {
-    seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk;
-    seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore;
-    seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore;
+    seqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk;
+    seqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore;
+    seqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore;
     size_t estimatedOriginalSize;
     size_t estimatedFirstHalfSize;
     size_t estimatedSecondHalfSize;
     size_t midIdx = (startIdx + endIdx)/2;
 
+    DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx);
+    assert(endIdx >= startIdx);
     if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) {
-        DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences");
+        DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx);
         return;
     }
-    DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx);
     ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx);
     ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx);
     ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx);
     estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc);
     estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc);
     estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc);
-    DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu",
+    DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu",
              estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize);
     if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) {
         return;
     }
     if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) {
+        DEBUGLOG(5, "split decided at seqNb:%zu", midIdx);
         ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore);
         splits->splitLocations[splits->idx] = (U32)midIdx;
         splits->idx++;
@@ -3527,14 +3980,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end
     }
 }
 
-/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio.
+/* Base recursive function.
+ * Populates a table with intra-block partition indices that can improve compression ratio.
  *
- * Returns the number of splits made (which equals the size of the partition table - 1).
+ * @return: number of splits made (which equals the size of the partition table - 1).
  */
-static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) {
-    seqStoreSplits splits = {partitions, 0};
+static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq)
+{
+    seqStoreSplits splits;
+    splits.splitLocations = partitions;
+    splits.idx = 0;
     if (nbSeq <= 4) {
-        DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split");
+        DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq);
         /* Refuse to try and split anything with less than 4 sequences */
         return 0;
     }
@@ -3550,18 +4007,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq)
  * Returns combined size of all blocks (which includes headers), or a ZSTD error code.
  */
 static size_t
-ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity,
-                                       const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq)
+ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc,
+                                    void* dst, size_t dstCapacity,
+                              const void* src, size_t blockSize,
+                                    U32 lastBlock, U32 nbSeq)
 {
     size_t cSize = 0;
     const BYTE* ip = (const BYTE*)src;
     BYTE* op = (BYTE*)dst;
     size_t i = 0;
     size_t srcBytesTotal = 0;
-    U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */
-    seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore;
-    seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore;
-    size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq);
+    U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */
+    seqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore;
+    seqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore;
+    size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq);
 
     /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history
      * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two
@@ -3583,30 +4042,31 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac
     ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
     ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t));
 
-    DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
+    DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
                 (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit,
                 (unsigned)zc->blockState.matchState.nextToUpdate);
 
     if (numSplits == 0) {
-        size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore,
-                                                                   &dRep, &cRep,
-                                                                    op, dstCapacity,
-                                                                    ip, blockSize,
-                                                                    lastBlock, 0 /* isPartition */);
+        size_t cSizeSingleBlock =
+            ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore,
+                                            &dRep, &cRep,
+                                            op, dstCapacity,
+                                            ip, blockSize,
+                                            lastBlock, 0 /* isPartition */);
         FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!");
         DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits");
-        assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize);
+        assert(zc->blockSize <= ZSTD_BLOCKSIZE_MAX);
+        assert(cSizeSingleBlock <= zc->blockSize + ZSTD_blockHeaderSize);
         return cSizeSingleBlock;
     }
 
     ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]);
     for (i = 0; i <= numSplits; ++i) {
-        size_t srcBytes;
         size_t cSizeChunk;
         U32 const lastPartition = (i == numSplits);
         U32 lastBlockEntireSrc = 0;
 
-        srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore);
+        size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore);
         srcBytesTotal += srcBytes;
         if (lastPartition) {
             /* This is the final partition, need to account for possible last literals */
@@ -3621,7 +4081,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac
                                                        op, dstCapacity,
                                                        ip, srcBytes,
                                                        lastBlockEntireSrc, 1 /* isPartition */);
-        DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk);
+        DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size",
+                    ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk);
         FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!");
 
         ip += srcBytes;
@@ -3629,10 +4090,10 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac
         dstCapacity -= cSizeChunk;
         cSize += cSizeChunk;
         *currSeqStore = *nextSeqStore;
-        assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize);
+        assert(cSizeChunk <= zc->blockSize + ZSTD_blockHeaderSize);
     }
-    /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes
-     * for the next block.
+    /* cRep and dRep may have diverged during the compression.
+     * If so, we use the dRep repcodes for the next block.
      */
     ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t));
     return cSize;
@@ -3643,8 +4104,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
                               void* dst, size_t dstCapacity,
                               const void* src, size_t srcSize, U32 lastBlock)
 {
-    const BYTE* ip = (const BYTE*)src;
-    BYTE* op = (BYTE*)dst;
     U32 nbSeq;
     size_t cSize;
     DEBUGLOG(4, "ZSTD_compressBlock_splitBlock");
@@ -3655,7 +4114,7 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
         if (bss == ZSTDbss_noCompress) {
             if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
                 zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
-            cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock);
+            cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock);
             FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
             DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block");
             return cSize;
@@ -3673,9 +4132,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
                             void* dst, size_t dstCapacity,
                             const void* src, size_t srcSize, U32 frame)
 {
-    /* This the upper bound for the length of an rle block.
-     * This isn't the actual upper bound. Finding the real threshold
-     * needs further investigation.
+    /* This is an estimated upper bound for the length of an rle block.
+     * This isn't the actual upper bound.
+     * Finding the real threshold needs further investigation.
      */
     const U32 rleMaxLength = 25;
     size_t cSize;
@@ -3767,10 +4226,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
          *   * cSize >= blockBound(srcSize): We have expanded the block too much so
          *     emit an uncompressed block.
          */
-        {
-            size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock);
+        {   size_t const cSize =
+                ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock);
             if (cSize != ERROR(dstSize_tooSmall)) {
-                size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy);
+                size_t const maxCSize =
+                    srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy);
                 FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed");
                 if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) {
                     ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
@@ -3778,7 +4238,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
                 }
             }
         }
-    }
+    } /* if (bss == ZSTDbss_compress)*/
 
     DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()");
     /* Superblock compression failed, attempt to emit a single no compress block.
@@ -3836,7 +4296,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms,
 *   All blocks will be terminated, all input will be consumed.
 *   Function will issue an error if there is not enough `dstCapacity` to hold the compressed content.
 *   Frame is supposed already started (header already produced)
-*   @return : compressed size, or an error code
+*  @return : compressed size, or an error code
 */
 static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
                                      void* dst, size_t dstCapacity,
@@ -3860,7 +4320,9 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
         ZSTD_matchState_t* const ms = &cctx->blockState.matchState;
         U32 const lastBlock = lastFrameChunk & (blockSize >= remaining);
 
-        RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE,
+        /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding
+         * additional 1. We need to revisit and change this logic to be more consistent */
+        RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1,
                         dstSize_tooSmall,
                         "not enough space to store compressed block");
         if (remaining < blockSize) blockSize = remaining;
@@ -3899,7 +4361,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
                     MEM_writeLE24(op, cBlockHeader);
                     cSize += ZSTD_blockHeaderSize;
                 }
-            }
+            }  /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/
 
 
             ip += blockSize;
@@ -4078,31 +4540,51 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
     }
 }
 
-size_t ZSTD_compressContinue (ZSTD_CCtx* cctx,
-                              void* dst, size_t dstCapacity,
-                        const void* src, size_t srcSize)
+size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx,
+                                        void* dst, size_t dstCapacity,
+                                  const void* src, size_t srcSize)
 {
     DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize);
     return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */);
 }
 
+/* NOTE: Must just wrap ZSTD_compressContinue_public() */
+size_t ZSTD_compressContinue(ZSTD_CCtx* cctx,
+                             void* dst, size_t dstCapacity,
+                       const void* src, size_t srcSize)
+{
+    return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize);
+}
 
-size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx)
+static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx)
 {
     ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams;
     assert(!ZSTD_checkCParams(cParams));
-    return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog);
+    return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog);
 }
 
-size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+/* NOTE: Must just wrap ZSTD_getBlockSize_deprecated() */
+size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx)
+{
+    return ZSTD_getBlockSize_deprecated(cctx);
+}
+
+/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */
+size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
 {
     DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize);
-    { size_t const blockSizeMax = ZSTD_getBlockSize(cctx);
+    { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx);
       RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); }
 
     return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */);
 }
 
+/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */
+size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize);
+}
+
 /*! ZSTD_loadDictionaryContent() :
  *  @return : 0, or an error code
  */
@@ -4111,25 +4593,36 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
                                          ZSTD_cwksp* ws,
                                          ZSTD_CCtx_params const* params,
                                          const void* src, size_t srcSize,
-                                         ZSTD_dictTableLoadMethod_e dtlm)
+                                         ZSTD_dictTableLoadMethod_e dtlm,
+                                         ZSTD_tableFillPurpose_e tfp)
 {
     const BYTE* ip = (const BYTE*) src;
     const BYTE* const iend = ip + srcSize;
     int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL;
 
-    /* Assert that we the ms params match the params we're being given */
+    /* Assert that the ms params match the params we're being given */
     ZSTD_assertEqualCParams(params->cParams, ms->cParams);
 
-    if (srcSize > ZSTD_CHUNKSIZE_MAX) {
+    {   /* Ensure large dictionaries can't cause index overflow */
+
         /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX.
          * Dictionaries right at the edge will immediately trigger overflow
          * correction, but I don't want to insert extra constraints here.
          */
-        U32 const maxDictSize = ZSTD_CURRENT_MAX - 1;
-        /* We must have cleared our windows when our source is this large. */
-        assert(ZSTD_window_isEmpty(ms->window));
-        if (loadLdmDict)
-            assert(ZSTD_window_isEmpty(ls->window));
+        U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX;
+
+        int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(&params->cParams);
+        if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) {
+            /* Some dictionary matchfinders in zstd use "short cache",
+             * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each
+             * CDict hashtable entry as a tag rather than as part of an index.
+             * When short cache is used, we need to truncate the dictionary
+             * so that its indices don't overlap with the tag. */
+            U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX;
+            maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize);
+            assert(!loadLdmDict);
+        }
+
         /* If the dictionary is too large, only load the suffix of the dictionary. */
         if (srcSize > maxDictSize) {
             ip = iend - maxDictSize;
@@ -4138,30 +4631,46 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
         }
     }
 
-    DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder);
+    if (srcSize > ZSTD_CHUNKSIZE_MAX) {
+        /* We must have cleared our windows when our source is this large. */
+        assert(ZSTD_window_isEmpty(ms->window));
+        if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window));
+    }
     ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0);
-    ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
-    ms->forceNonContiguous = params->deterministicRefPrefix;
 
-    if (loadLdmDict) {
+    DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder);
+
+    if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */
         ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0);
         ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base);
+        ZSTD_ldm_fillHashTable(ls, ip, iend, &params->ldmParams);
+    }
+
+    /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */
+    if (params->cParams.strategy < ZSTD_btultra) {
+        U32 maxDictSize = 8U << MIN(MAX(params->cParams.hashLog, params->cParams.chainLog), 28);
+        if (srcSize > maxDictSize) {
+            ip = iend - maxDictSize;
+            src = ip;
+            srcSize = maxDictSize;
+        }
     }
 
+    ms->nextToUpdate = (U32)(ip - ms->window.base);
+    ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
+    ms->forceNonContiguous = params->deterministicRefPrefix;
+
     if (srcSize <= HASH_READ_SIZE) return 0;
 
     ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend);
 
-    if (loadLdmDict)
-        ZSTD_ldm_fillHashTable(ls, ip, iend, &params->ldmParams);
-
     switch(params->cParams.strategy)
     {
     case ZSTD_fast:
-        ZSTD_fillHashTable(ms, iend, dtlm);
+        ZSTD_fillHashTable(ms, iend, dtlm, tfp);
         break;
     case ZSTD_dfast:
-        ZSTD_fillDoubleHashTable(ms, iend, dtlm);
+        ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp);
         break;
 
     case ZSTD_greedy:
@@ -4174,7 +4683,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
         } else {
             assert(params->useRowMatchFinder != ZSTD_ps_auto);
             if (params->useRowMatchFinder == ZSTD_ps_enable) {
-                size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16);
+                size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog);
                 ZSTD_memset(ms->tagTable, 0, tagTableSize);
                 ZSTD_row_update(ms, iend-HASH_READ_SIZE);
                 DEBUGLOG(4, "Using row-based hash table for lazy dict");
@@ -4327,6 +4836,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
                                       ZSTD_CCtx_params const* params,
                                       const void* dict, size_t dictSize,
                                       ZSTD_dictTableLoadMethod_e dtlm,
+                                      ZSTD_tableFillPurpose_e tfp,
                                       void* workspace)
 {
     const BYTE* dictPtr = (const BYTE*)dict;
@@ -4345,7 +4855,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
     {
         size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
         FORWARD_IF_ERROR(ZSTD_loadDictionaryContent(
-            ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), "");
+            ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), "");
     }
     return dictID;
 }
@@ -4361,6 +4871,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
                          const void* dict, size_t dictSize,
                                ZSTD_dictContentType_e dictContentType,
                                ZSTD_dictTableLoadMethod_e dtlm,
+                               ZSTD_tableFillPurpose_e tfp,
                                void* workspace)
 {
     DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize);
@@ -4373,13 +4884,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
 
     /* dict restricted modes */
     if (dictContentType == ZSTD_dct_rawContent)
-        return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm);
+        return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp);
 
     if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) {
         if (dictContentType == ZSTD_dct_auto) {
             DEBUGLOG(4, "raw content dictionary detected");
             return ZSTD_loadDictionaryContent(
-                ms, ls, ws, params, dict, dictSize, dtlm);
+                ms, ls, ws, params, dict, dictSize, dtlm, tfp);
         }
         RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, "");
         assert(0);   /* impossible */
@@ -4387,13 +4898,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
 
     /* dict as full zstd dictionary */
     return ZSTD_loadZstdDictionary(
-        bs, ms, ws, params, dict, dictSize, dtlm, workspace);
+        bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace);
 }
 
 #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB)
 #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL)
 
 /*! ZSTD_compressBegin_internal() :
+ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both
  * @return : 0, or an error code */
 static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
                                     const void* dict, size_t dictSize,
@@ -4426,11 +4938,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
                         cctx->blockState.prevCBlock, &cctx->blockState.matchState,
                         &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent,
                         cdict->dictContentSize, cdict->dictContentType, dtlm,
-                        cctx->entropyWorkspace)
+                        ZSTD_tfp_forCCtx, cctx->entropyWorkspace)
               : ZSTD_compress_insertDictionary(
                         cctx->blockState.prevCBlock, &cctx->blockState.matchState,
                         &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize,
-                        dictContentType, dtlm, cctx->entropyWorkspace);
+                        dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace);
         FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
         assert(dictID <= UINT_MAX);
         cctx->dictID = (U32)dictID;
@@ -4471,11 +4983,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx,
                                             &cctxParams, pledgedSrcSize);
 }
 
-size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
+static size_t
+ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
 {
     ZSTD_CCtx_params cctxParams;
-    {
-        ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict);
+    {   ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict);
         ZSTD_CCtxParams_init_internal(&cctxParams, &params, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel);
     }
     DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize);
@@ -4483,9 +4995,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di
                                        &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered);
 }
 
+size_t
+ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
+{
+    return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel);
+}
+
 size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel)
 {
-    return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel);
+    return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel);
 }
 
 
@@ -4537,9 +5055,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize)
     (void)extraCSize;
 }
 
-size_t ZSTD_compressEnd (ZSTD_CCtx* cctx,
-                         void* dst, size_t dstCapacity,
-                   const void* src, size_t srcSize)
+size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize)
 {
     size_t endResult;
     size_t const cSize = ZSTD_compressContinue_internal(cctx,
@@ -4563,6 +5081,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx,
     return cSize + endResult;
 }
 
+/* NOTE: Must just wrap ZSTD_compressEnd_public() */
+size_t ZSTD_compressEnd(ZSTD_CCtx* cctx,
+                        void* dst, size_t dstCapacity,
+                  const void* src, size_t srcSize)
+{
+    return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize);
+}
+
 size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx,
                                void* dst, size_t dstCapacity,
                          const void* src, size_t srcSize,
@@ -4591,7 +5117,7 @@ size_t ZSTD_compress_advanced_internal(
     FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx,
                          dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL,
                          params, srcSize, ZSTDb_not_buffered) , "");
-    return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+    return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize);
 }
 
 size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx,
@@ -4709,7 +5235,7 @@ static size_t ZSTD_initCDict_internal(
         {   size_t const dictID = ZSTD_compress_insertDictionary(
                     &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace,
                     &params, cdict->dictContent, cdict->dictContentSize,
-                    dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace);
+                    dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace);
             FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
             assert(dictID <= (size_t)(U32)-1);
             cdict->dictID = (U32)dictID;
@@ -4906,6 +5432,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict(
     params.cParams = cParams;
     params.useRowMatchFinder = useRowMatchFinder;
     cdict->useRowMatchFinder = useRowMatchFinder;
+    cdict->compressionLevel = ZSTD_NO_CLEVEL;
 
     if (ZSTD_isError( ZSTD_initCDict_internal(cdict,
                                               dict, dictSize,
@@ -4985,12 +5512,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced(
 
 /* ZSTD_compressBegin_usingCDict() :
  * cdict must be != NULL */
-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
+size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
 {
     ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
     return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN);
 }
 
+size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
+{
+    return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict);
+}
+
 /*! ZSTD_compress_usingCDict_internal():
  * Implementation of various ZSTD_compress_usingCDict* functions.
  */
@@ -5000,7 +5532,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx,
                                 const ZSTD_CDict* cdict, ZSTD_frameParameters fParams)
 {
     FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */
-    return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+    return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize);
 }
 
 /*! ZSTD_compress_usingCDict_advanced():
@@ -5197,30 +5729,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel)
 
 static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx)
 {
-    size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos;
-    if (hintInSize==0) hintInSize = cctx->blockSize;
-    return hintInSize;
+    if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) {
+        return cctx->blockSize - cctx->stableIn_notConsumed;
+    }
+    assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered);
+    {   size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos;
+        if (hintInSize==0) hintInSize = cctx->blockSize;
+        return hintInSize;
+    }
 }
 
 /* ZSTD_compressStream_generic():
  *  internal function for all *compressStream*() variants
- *  non-static, because can be called from zstdmt_compress.c
- * @return : hint size for next input */
+ * @return : hint size for next input to complete ongoing block */
 static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                                           ZSTD_outBuffer* output,
                                           ZSTD_inBuffer* input,
                                           ZSTD_EndDirective const flushMode)
 {
-    const char* const istart = (const char*)input->src;
-    const char* const iend = input->size != 0 ? istart + input->size : istart;
-    const char* ip = input->pos != 0 ? istart + input->pos : istart;
-    char* const ostart = (char*)output->dst;
-    char* const oend = output->size != 0 ? ostart + output->size : ostart;
-    char* op = output->pos != 0 ? ostart + output->pos : ostart;
+    const char* const istart = (assert(input != NULL), (const char*)input->src);
+    const char* const iend = (istart != NULL) ? istart + input->size : istart;
+    const char* ip = (istart != NULL) ? istart + input->pos : istart;
+    char* const ostart = (assert(output != NULL), (char*)output->dst);
+    char* const oend = (ostart != NULL) ? ostart + output->size : ostart;
+    char* op = (ostart != NULL) ? ostart + output->pos : ostart;
     U32 someMoreWork = 1;
 
     /* check expectations */
-    DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode);
+    DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos);
+    assert(zcs != NULL);
+    if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) {
+        assert(input->pos >= zcs->stableIn_notConsumed);
+        input->pos -= zcs->stableIn_notConsumed;
+        ip -= zcs->stableIn_notConsumed;
+        zcs->stableIn_notConsumed = 0;
+    }
     if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) {
         assert(zcs->inBuff != NULL);
         assert(zcs->inBuffSize > 0);
@@ -5229,8 +5772,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
         assert(zcs->outBuff !=  NULL);
         assert(zcs->outBuffSize > 0);
     }
-    assert(output->pos <= output->size);
+    if (input->src == NULL) assert(input->size == 0);
     assert(input->pos <= input->size);
+    if (output->dst == NULL) assert(output->size == 0);
+    assert(output->pos <= output->size);
     assert((U32)flushMode <= (U32)ZSTD_e_end);
 
     while (someMoreWork) {
@@ -5245,7 +5790,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                 || zcs->appliedParams.outBufferMode == ZSTD_bm_stable)  /* OR we are allowed to return dstSizeTooSmall */
               && (zcs->inBuffPos == 0) ) {
                 /* shortcut to compression pass directly into output buffer */
-                size_t const cSize = ZSTD_compressEnd(zcs,
+                size_t const cSize = ZSTD_compressEnd_public(zcs,
                                                 op, oend-op, ip, iend-ip);
                 DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize);
                 FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed");
@@ -5262,8 +5807,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                                         zcs->inBuff + zcs->inBuffPos, toLoad,
                                         ip, iend-ip);
                 zcs->inBuffPos += loaded;
-                if (loaded != 0)
-                    ip += loaded;
+                if (ip) ip += loaded;
                 if ( (flushMode == ZSTD_e_continue)
                   && (zcs->inBuffPos < zcs->inBuffTarget) ) {
                     /* not enough input to fill full block : stop here */
@@ -5274,6 +5818,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                     /* empty */
                     someMoreWork = 0; break;
                 }
+            } else {
+                assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable);
+                if ( (flushMode == ZSTD_e_continue)
+                  && ( (size_t)(iend - ip) < zcs->blockSize) ) {
+                    /* can't compress a full block : stop here */
+                    zcs->stableIn_notConsumed = (size_t)(iend - ip);
+                    ip = iend;  /* pretend to have consumed input */
+                    someMoreWork = 0; break;
+                }
+                if ( (flushMode == ZSTD_e_flush)
+                  && (ip == iend) ) {
+                    /* empty */
+                    someMoreWork = 0; break;
+                }
             }
             /* compress current block (note : this stage cannot be stopped in the middle) */
             DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode);
@@ -5281,9 +5839,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                 void* cDst;
                 size_t cSize;
                 size_t oSize = oend-op;
-                size_t const iSize = inputBuffered
-                    ? zcs->inBuffPos - zcs->inToCompress
-                    : MIN((size_t)(iend - ip), zcs->blockSize);
+                size_t const iSize = inputBuffered ? zcs->inBuffPos - zcs->inToCompress
+                                                   : MIN((size_t)(iend - ip), zcs->blockSize);
                 if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable)
                     cDst = op;   /* compress into output buffer, to skip flush stage */
                 else
@@ -5291,9 +5848,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                 if (inputBuffered) {
                     unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend);
                     cSize = lastBlock ?
-                            ZSTD_compressEnd(zcs, cDst, oSize,
+                            ZSTD_compressEnd_public(zcs, cDst, oSize,
                                         zcs->inBuff + zcs->inToCompress, iSize) :
-                            ZSTD_compressContinue(zcs, cDst, oSize,
+                            ZSTD_compressContinue_public(zcs, cDst, oSize,
                                         zcs->inBuff + zcs->inToCompress, iSize);
                     FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed");
                     zcs->frameEnded = lastBlock;
@@ -5306,19 +5863,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                     if (!lastBlock)
                         assert(zcs->inBuffTarget <= zcs->inBuffSize);
                     zcs->inToCompress = zcs->inBuffPos;
-                } else {
-                    unsigned const lastBlock = (ip + iSize == iend);
-                    assert(flushMode == ZSTD_e_end /* Already validated */);
+                } else { /* !inputBuffered, hence ZSTD_bm_stable */
+                    unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend);
                     cSize = lastBlock ?
-                            ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) :
-                            ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize);
+                            ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) :
+                            ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize);
                     /* Consume the input prior to error checking to mirror buffered mode. */
-                    if (iSize > 0)
-                        ip += iSize;
+                    if (ip) ip += iSize;
                     FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed");
                     zcs->frameEnded = lastBlock;
-                    if (lastBlock)
-                        assert(ip == iend);
+                    if (lastBlock) assert(ip == iend);
                 }
                 if (cDst == op) {  /* no need to flush */
                     op += cSize;
@@ -5388,8 +5942,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf
 /* After a compression call set the expected input/output buffer.
  * This is validated at the start of the next compression call.
  */
-static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input)
+static void
+ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input)
 {
+    DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)");
     if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) {
         cctx->expectedInBuffer = *input;
     }
@@ -5408,22 +5964,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx,
 {
     if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) {
         ZSTD_inBuffer const expect = cctx->expectedInBuffer;
-        if (expect.src != input->src || expect.pos != input->pos || expect.size != input->size)
-            RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer enabled but input differs!");
-        if (endOp != ZSTD_e_end)
-            RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer can only be used with ZSTD_e_end!");
+        if (expect.src != input->src || expect.pos != input->pos)
+            RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!");
     }
+    (void)endOp;
     if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) {
         size_t const outBufferSize = output->size - output->pos;
         if (cctx->expectedOutBufferSize != outBufferSize)
-            RETURN_ERROR(dstBuffer_wrong, "ZSTD_c_stableOutBuffer enabled but output size differs!");
+            RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!");
     }
     return 0;
 }
 
 static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
                                              ZSTD_EndDirective endOp,
-                                             size_t inSize) {
+                                             size_t inSize)
+{
     ZSTD_CCtx_params params = cctx->requestedParams;
     ZSTD_prefixDict const prefixDict = cctx->prefixDict;
     FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */
@@ -5437,9 +5993,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
         params.compressionLevel = cctx->cdict->compressionLevel;
     }
     DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage");
-    if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1;  /* auto-fix pledgedSrcSize */
-    {
-        size_t const dictSize = prefixDict.dict
+    if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1;  /* auto-determine pledgedSrcSize */
+
+    {   size_t const dictSize = prefixDict.dict
                 ? prefixDict.dictSize
                 : (cctx->cdict ? cctx->cdict->dictContentSize : 0);
         ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, &params, cctx->pledgedSrcSizePlusOne - 1);
@@ -5451,6 +6007,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
     params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, &params.cParams);
     params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, &params.cParams);
     params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, &params.cParams);
+    params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences);
+    params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize);
+    params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel);
 
     {   U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1;
         assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
@@ -5477,6 +6036,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
     return 0;
 }
 
+/* @return provides a minimum amount of data remaining to be flushed from internal buffers
+ */
 size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
                              ZSTD_outBuffer* output,
                              ZSTD_inBuffer* input,
@@ -5491,8 +6052,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
 
     /* transparent initialization stage */
     if (cctx->streamStage == zcss_init) {
-        FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, input->size), "CompressStream2 initialization failed");
-        ZSTD_setBufferExpectations(cctx, output, input);    /* Set initial buffer expectations now that we've initialized */
+        size_t const inputSize = input->size - input->pos;  /* no obligation to start from pos==0 */
+        size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed;
+        if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */
+          && (endOp == ZSTD_e_continue)                             /* no flush requested, more input to come */
+          && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) {              /* not even reached one block yet */
+            if (cctx->stableIn_notConsumed) {  /* not the first time */
+                /* check stable source guarantees */
+                RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer");
+                RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos");
+            }
+            /* pretend input was consumed, to give a sense forward progress */
+            input->pos = input->size;
+            /* save stable inBuffer, for later control, and flush/end */
+            cctx->expectedInBuffer = *input;
+            /* but actually input wasn't consumed, so keep track of position from where compression shall resume */
+            cctx->stableIn_notConsumed += inputSize;
+            /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */
+            return ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format);  /* at least some header to produce */
+        }
+        FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed");
+        ZSTD_setBufferExpectations(cctx, output, input);   /* Set initial buffer expectations now that we've initialized */
     }
     /* end of transparent initialization stage */
 
@@ -5510,13 +6090,20 @@ size_t ZSTD_compressStream2_simpleArgs (
                       const void* src, size_t srcSize, size_t* srcPos,
                             ZSTD_EndDirective endOp)
 {
-    ZSTD_outBuffer output = { dst, dstCapacity, *dstPos };
-    ZSTD_inBuffer  input  = { src, srcSize, *srcPos };
+    ZSTD_outBuffer output;
+    ZSTD_inBuffer  input;
+    output.dst = dst;
+    output.size = dstCapacity;
+    output.pos = *dstPos;
+    input.src = src;
+    input.size = srcSize;
+    input.pos = *srcPos;
     /* ZSTD_compressStream2() will check validity of dstPos and srcPos */
-    size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp);
-    *dstPos = output.pos;
-    *srcPos = input.pos;
-    return cErr;
+    {   size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp);
+        *dstPos = output.pos;
+        *srcPos = input.pos;
+        return cErr;
+    }
 }
 
 size_t ZSTD_compress2(ZSTD_CCtx* cctx,
@@ -5539,6 +6126,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx,
         /* Reset to the original values. */
         cctx->requestedParams.inBufferMode = originalInBufferMode;
         cctx->requestedParams.outBufferMode = originalOutBufferMode;
+
         FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed");
         if (result != 0) {  /* compression not completed, due to lack of output space */
             assert(oPos == dstCapacity);
@@ -5549,64 +6137,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx,
     }
 }
 
-typedef struct {
-    U32 idx;             /* Index in array of ZSTD_Sequence */
-    U32 posInSequence;   /* Position within sequence at idx */
-    size_t posInSrc;        /* Number of bytes given by sequences provided so far */
-} ZSTD_sequencePosition;
-
 /* ZSTD_validateSequence() :
  * @offCode : is presumed to follow format required by ZSTD_storeSeq()
  * @returns a ZSTD error code if sequence is not valid
  */
 static size_t
-ZSTD_validateSequence(U32 offCode, U32 matchLength,
-                      size_t posInSrc, U32 windowLog, size_t dictSize)
+ZSTD_validateSequence(U32 offCode, U32 matchLength, U32 minMatch,
+                      size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer)
 {
-    U32 const windowSize = 1 << windowLog;
+    U32 const windowSize = 1u << windowLog;
     /* posInSrc represents the amount of data the decoder would decode up to this point.
      * As long as the amount of data decoded is less than or equal to window size, offsets may be
      * larger than the total length of output decoded in order to reference the dict, even larger than
      * window size. After output surpasses windowSize, we're limited to windowSize offsets again.
      */
     size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize;
-    RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!");
-    RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small");
+    size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 3 : 4;
+    RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!");
+    /* Validate maxNbSeq is large enough for the given matchLength and minMatch */
+    RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch");
     return 0;
 }
 
 /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */
-static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0)
+static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0)
 {
-    U32 offCode = STORE_OFFSET(rawOffset);
+    U32 offBase = OFFSET_TO_OFFBASE(rawOffset);
 
     if (!ll0 && rawOffset == rep[0]) {
-        offCode = STORE_REPCODE_1;
+        offBase = REPCODE1_TO_OFFBASE;
     } else if (rawOffset == rep[1]) {
-        offCode = STORE_REPCODE(2 - ll0);
+        offBase = REPCODE_TO_OFFBASE(2 - ll0);
     } else if (rawOffset == rep[2]) {
-        offCode = STORE_REPCODE(3 - ll0);
+        offBase = REPCODE_TO_OFFBASE(3 - ll0);
     } else if (ll0 && rawOffset == rep[0] - 1) {
-        offCode = STORE_REPCODE_3;
+        offBase = REPCODE3_TO_OFFBASE;
     }
-    return offCode;
+    return offBase;
 }
 
-/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of
- * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter.
- */
-static size_t
+size_t
 ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
                                               ZSTD_sequencePosition* seqPos,
                                         const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
-                                        const void* src, size_t blockSize)
+                                        const void* src, size_t blockSize,
+                                        ZSTD_paramSwitch_e externalRepSearch)
 {
     U32 idx = seqPos->idx;
+    U32 const startIdx = idx;
     BYTE const* ip = (BYTE const*)(src);
     const BYTE* const iend = ip + blockSize;
     repcodes_t updatedRepcodes;
     U32 dictSize;
 
+    DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreExplicitBlockDelim (blockSize = %zu)", blockSize);
+
     if (cctx->cdict) {
         dictSize = (U32)cctx->cdict->dictContentSize;
     } else if (cctx->prefixDict.dict) {
@@ -5615,25 +6200,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
         dictSize = 0;
     }
     ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t));
-    for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) {
+    for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) {
         U32 const litLength = inSeqs[idx].litLength;
-        U32 const ll0 = (litLength == 0);
         U32 const matchLength = inSeqs[idx].matchLength;
-        U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0);
-        ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0);
+        U32 offBase;
 
-        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength);
+        if (externalRepSearch == ZSTD_ps_disable) {
+            offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset);
+        } else {
+            U32 const ll0 = (litLength == 0);
+            offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0);
+            ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0);
+        }
+
+        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength);
         if (cctx->appliedParams.validateSequences) {
             seqPos->posInSrc += litLength + matchLength;
-            FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc,
-                                                cctx->appliedParams.cParams.windowLog, dictSize),
+            FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc,
+                                                cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer),
                                                 "Sequence validation failed");
         }
-        RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation,
+        RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid,
                         "Not enough memory allocated. Try adjusting ZSTD_c_minMatch.");
-        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength);
+        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength);
         ip += matchLength + litLength;
     }
+
+    /* If we skipped repcode search while parsing, we need to update repcodes now */
+    assert(externalRepSearch != ZSTD_ps_auto);
+    assert(idx >= startIdx);
+    if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) {
+        U32* const rep = updatedRepcodes.rep;
+        U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */
+
+        if (lastSeqIdx >= startIdx + 2) {
+            rep[2] = inSeqs[lastSeqIdx - 2].offset;
+            rep[1] = inSeqs[lastSeqIdx - 1].offset;
+            rep[0] = inSeqs[lastSeqIdx].offset;
+        } else if (lastSeqIdx == startIdx + 1) {
+            rep[2] = rep[0];
+            rep[1] = inSeqs[lastSeqIdx - 1].offset;
+            rep[0] = inSeqs[lastSeqIdx].offset;
+        } else {
+            assert(lastSeqIdx == startIdx);
+            rep[2] = rep[1];
+            rep[1] = rep[0];
+            rep[0] = inSeqs[lastSeqIdx].offset;
+        }
+    }
+
     ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t));
 
     if (inSeqs[idx].litLength) {
@@ -5642,26 +6257,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
         ip += inSeqs[idx].litLength;
         seqPos->posInSrc += inSeqs[idx].litLength;
     }
-    RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!");
+    RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!");
     seqPos->idx = idx+1;
     return 0;
 }
 
-/* Returns the number of bytes to move the current read position back by. Only non-zero
- * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something
- * went wrong.
- *
- * This function will attempt to scan through blockSize bytes represented by the sequences
- * in inSeqs, storing any (partial) sequences.
- *
- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to
- * avoid splitting a match, or to avoid splitting a match such that it would produce a match
- * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block.
- */
-static size_t
+size_t
 ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
                                    const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
-                                   const void* src, size_t blockSize)
+                                   const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch)
 {
     U32 idx = seqPos->idx;
     U32 startPosInSequence = seqPos->posInSequence;
@@ -5673,6 +6277,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
     U32 bytesAdjustment = 0;
     U32 finalMatchSplit = 0;
 
+    /* TODO(embg) support fast parsing mode in noBlockDelim mode */
+    (void)externalRepSearch;
+
     if (cctx->cdict) {
         dictSize = cctx->cdict->dictContentSize;
     } else if (cctx->prefixDict.dict) {
@@ -5680,7 +6287,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
     } else {
         dictSize = 0;
     }
-    DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize);
+    DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreNoBlockDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize);
     DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength);
     ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t));
     while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) {
@@ -5688,7 +6295,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
         U32 litLength = currSeq.litLength;
         U32 matchLength = currSeq.matchLength;
         U32 const rawOffset = currSeq.offset;
-        U32 offCode;
+        U32 offBase;
 
         /* Modify the sequence depending on where endPosInSequence lies */
         if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) {
@@ -5702,7 +6309,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
             /* Move to the next sequence */
             endPosInSequence -= currSeq.litLength + currSeq.matchLength;
             startPosInSequence = 0;
-            idx++;
         } else {
             /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence
                does not reach the end of the match. So, we have to split the sequence */
@@ -5742,21 +6348,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
         }
         /* Check if this offset can be represented with a repcode */
         {   U32 const ll0 = (litLength == 0);
-            offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0);
-            ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0);
+            offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0);
+            ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0);
         }
 
         if (cctx->appliedParams.validateSequences) {
             seqPos->posInSrc += litLength + matchLength;
-            FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc,
-                                                   cctx->appliedParams.cParams.windowLog, dictSize),
+            FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc,
+                                                   cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer),
                                                    "Sequence validation failed");
         }
-        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength);
-        RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation,
+        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength);
+        RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid,
                         "Not enough memory allocated. Try adjusting ZSTD_c_minMatch.");
-        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength);
+        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength);
         ip += matchLength + litLength;
+        if (!finalMatchSplit)
+            idx++; /* Next Sequence */
     }
     DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength);
     assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength);
@@ -5779,7 +6387,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
 
 typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
                                        const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
-                                       const void* src, size_t blockSize);
+                                       const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
 static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode)
 {
     ZSTD_sequenceCopier sequenceCopier = NULL;
@@ -5793,6 +6401,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode)
     return sequenceCopier;
 }
 
+/* Discover the size of next block by searching for the delimiter.
+ * Note that a block delimiter **must** exist in this mode,
+ * otherwise it's an input error.
+ * The block size retrieved will be later compared to ensure it remains within bounds */
+static size_t
+blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos)
+{
+    int end = 0;
+    size_t blockSize = 0;
+    size_t spos = seqPos.idx;
+    DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize);
+    assert(spos <= inSeqsSize);
+    while (spos < inSeqsSize) {
+        end = (inSeqs[spos].offset == 0);
+        blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength;
+        if (end) {
+            if (inSeqs[spos].matchLength != 0)
+                RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0");
+            break;
+        }
+        spos++;
+    }
+    if (!end)
+        RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter");
+    return blockSize;
+}
+
+/* More a "target" block size */
+static size_t blockSize_noDelimiter(size_t blockSize, size_t remaining)
+{
+    int const lastBlock = (remaining <= blockSize);
+    return lastBlock ? remaining : blockSize;
+}
+
+static size_t determine_blockSize(ZSTD_sequenceFormat_e mode,
+                           size_t blockSize, size_t remaining,
+                     const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos)
+{
+    DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining);
+    if (mode == ZSTD_sf_noBlockDelimiters)
+        return blockSize_noDelimiter(blockSize, remaining);
+    {   size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos);
+        FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters");
+        if (explicitBlockSize > blockSize)
+            RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block");
+        if (explicitBlockSize > remaining)
+            RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source");
+        return explicitBlockSize;
+    }
+}
+
 /* Compress, block-by-block, all of the sequences given.
  *
  * Returns the cumulative size of all compressed blocks (including their headers),
@@ -5805,9 +6464,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
                           const void* src, size_t srcSize)
 {
     size_t cSize = 0;
-    U32 lastBlock;
-    size_t blockSize;
-    size_t compressedSeqsSize;
     size_t remaining = srcSize;
     ZSTD_sequencePosition seqPos = {0, 0, 0};
 
@@ -5827,22 +6483,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
     }
 
     while (remaining) {
+        size_t compressedSeqsSize;
         size_t cBlockSize;
         size_t additionalByteAdjustment;
-        lastBlock = remaining <= cctx->blockSize;
-        blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize;
+        size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters,
+                                        cctx->blockSize, remaining,
+                                        inSeqs, inSeqsSize, seqPos);
+        U32 const lastBlock = (blockSize == remaining);
+        FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size");
+        assert(blockSize <= remaining);
         ZSTD_resetSeqStore(&cctx->seqStore);
-        DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize);
+        DEBUGLOG(5, "Working on new block. Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize);
 
-        additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize);
+        additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes);
         FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy");
         blockSize -= additionalByteAdjustment;
 
         /* If blocks are too small, emit as a nocompress block */
-        if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) {
+        /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding
+         * additional 1. We need to revisit and change this logic to be more consistent */
+        if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) {
             cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
             FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed");
-            DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize);
+            DEBUGLOG(5, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize);
             cSize += cBlockSize;
             ip += blockSize;
             op += cBlockSize;
@@ -5851,6 +6514,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
             continue;
         }
 
+        RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block");
         compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore,
                                 &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy,
                                 &cctx->appliedParams,
@@ -5859,11 +6523,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
                                 cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
                                 cctx->bmi2);
         FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed");
-        DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize);
+        DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize);
 
         if (!cctx->isFirstBlock &&
             ZSTD_maybeRLE(&cctx->seqStore) &&
-            ZSTD_isRLE((BYTE const*)src, srcSize)) {
+            ZSTD_isRLE(ip, blockSize)) {
             /* We don't want to emit our first block as a RLE even if it qualifies because
             * doing so will cause the decoder (cli only) to throw a "should consume all input error."
             * This is only an issue for zstd <= v1.4.3
@@ -5874,12 +6538,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
         if (compressedSeqsSize == 0) {
             /* ZSTD_noCompressBlock writes the block header as well */
             cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
-            FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed");
-            DEBUGLOG(4, "Writing out nocompress block, size: %zu", cBlockSize);
+            FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed");
+            DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize);
         } else if (compressedSeqsSize == 1) {
             cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock);
-            FORWARD_IF_ERROR(cBlockSize, "RLE compress block failed");
-            DEBUGLOG(4, "Writing out RLE block, size: %zu", cBlockSize);
+            FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed");
+            DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize);
         } else {
             U32 cBlockHeader;
             /* Error checking and repcodes update */
@@ -5891,11 +6555,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
             cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3);
             MEM_writeLE24(op, cBlockHeader);
             cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize;
-            DEBUGLOG(4, "Writing out compressed block, size: %zu", cBlockSize);
+            DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize);
         }
 
         cSize += cBlockSize;
-        DEBUGLOG(4, "cSize running total: %zu", cSize);
 
         if (lastBlock) {
             break;
@@ -5906,12 +6569,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
             dstCapacity -= cBlockSize;
             cctx->isFirstBlock = 0;
         }
+        DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity);
     }
 
+    DEBUGLOG(4, "cSize final total: %zu", cSize);
     return cSize;
 }
 
-size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity,
+size_t ZSTD_compressSequences(ZSTD_CCtx* cctx,
+                              void* dst, size_t dstCapacity,
                               const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
                               const void* src, size_t srcSize)
 {
@@ -5921,7 +6587,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci
     size_t frameHeaderSize = 0;
 
     /* Transparent initialization stage, same as compressStream2() */
-    DEBUGLOG(3, "ZSTD_compressSequences()");
+    DEBUGLOG(4, "ZSTD_compressSequences (dstCapacity=%zu)", dstCapacity);
     assert(cctx != NULL);
     FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed");
     /* Begin writing output, starting with frame header */
@@ -5949,26 +6615,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci
         cSize += 4;
     }
 
-    DEBUGLOG(3, "Final compressed size: %zu", cSize);
+    DEBUGLOG(4, "Final compressed size: %zu", cSize);
     return cSize;
 }
 
 /*======   Finalize   ======*/
 
+static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs)
+{
+    const ZSTD_inBuffer nullInput = { NULL, 0, 0 };
+    const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable);
+    return stableInput ? zcs->expectedInBuffer : nullInput;
+}
+
 /*! ZSTD_flushStream() :
  * @return : amount of data remaining to flush */
 size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
 {
-    ZSTD_inBuffer input = { NULL, 0, 0 };
+    ZSTD_inBuffer input = inBuffer_forEndFlush(zcs);
+    input.size = input.pos; /* do not ingest more input during flush */
     return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush);
 }
 
 
 size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
 {
-    ZSTD_inBuffer input = { NULL, 0, 0 };
+    ZSTD_inBuffer input = inBuffer_forEndFlush(zcs);
     size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end);
-    FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed");
+    FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed");
     if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush;   /* minimal estimation */
     /* single thread mode : attempt to calculate remaining to flush more precisely */
     {   size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE;
@@ -6090,7 +6764,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel,
             cp.targetLength = (unsigned)(-clampedCompressionLevel);
         }
         /* refine parameters based on srcSize & dictSize */
-        return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode);
+        return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto);
     }
 }
 
@@ -6125,3 +6799,21 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH
     if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN;
     return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown);
 }
+
+void ZSTD_registerSequenceProducer(
+    ZSTD_CCtx* zc, void* mState,
+    ZSTD_sequenceProducer_F* mFinder
+) {
+    if (mFinder != NULL) {
+        ZSTD_externalMatchCtx emctx;
+        emctx.mState = mState;
+        emctx.mFinder = mFinder;
+        emctx.seqBuffer = NULL;
+        emctx.seqBufferCapacity = 0;
+        zc->externalMatchCtx = emctx;
+        zc->requestedParams.useSequenceProducer = 1;
+    } else {
+        ZSTD_memset(&zc->externalMatchCtx, 0, sizeof(zc->externalMatchCtx));
+        zc->requestedParams.useSequenceProducer = 0;
+    }
+}
diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h
index 71697a11ae30..899f5e2de8e9 100644
--- a/lib/zstd/compress/zstd_compress_internal.h
+++ b/lib/zstd/compress/zstd_compress_internal.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -20,6 +21,7 @@
 ***************************************/
 #include "../common/zstd_internal.h"
 #include "zstd_cwksp.h"
+#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */
 
 
 /*-*************************************
@@ -111,12 +113,13 @@ typedef struct {
 /* ZSTD_buildBlockEntropyStats() :
  *  Builds entropy for the block.
  *  @return : 0 on success or error code */
-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr,
-                             const ZSTD_entropyCTables_t* prevEntropy,
-                                   ZSTD_entropyCTables_t* nextEntropy,
-                             const ZSTD_CCtx_params* cctxParams,
-                                   ZSTD_entropyCTablesMetadata_t* entropyMetadata,
-                                   void* workspace, size_t wkspSize);
+size_t ZSTD_buildBlockEntropyStats(
+                    const seqStore_t* seqStorePtr,
+                    const ZSTD_entropyCTables_t* prevEntropy,
+                          ZSTD_entropyCTables_t* nextEntropy,
+                    const ZSTD_CCtx_params* cctxParams,
+                          ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                          void* workspace, size_t wkspSize);
 
 /* *******************************
 *  Compression internals structs *
@@ -142,6 +145,12 @@ typedef struct {
   size_t capacity;      /* The capacity starting from `seq` pointer */
 } rawSeqStore_t;
 
+typedef struct {
+    U32 idx;            /* Index in array of ZSTD_Sequence */
+    U32 posInSequence;  /* Position within sequence at idx */
+    size_t posInSrc;    /* Number of bytes given by sequences provided so far */
+} ZSTD_sequencePosition;
+
 UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0};
 
 typedef struct {
@@ -212,8 +221,10 @@ struct ZSTD_matchState_t {
     U32 hashLog3;           /* dispatch table for matches of len==3 : larger == faster, more memory */
 
     U32 rowHashLog;                          /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/
-    U16* tagTable;                           /* For row-based matchFinder: A row-based table containing the hashes and head index. */
+    BYTE* tagTable;                          /* For row-based matchFinder: A row-based table containing the hashes and head index. */
     U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */
+    U64 hashSalt;                            /* For row-based matchFinder: salts the hash for re-use of tag table */
+    U32 hashSaltEntropy;                     /* For row-based matchFinder: collects entropy for salt generation */
 
     U32* hashTable;
     U32* hashTable3;
@@ -228,6 +239,18 @@ struct ZSTD_matchState_t {
     const ZSTD_matchState_t* dictMatchState;
     ZSTD_compressionParameters cParams;
     const rawSeqStore_t* ldmSeqStore;
+
+    /* Controls prefetching in some dictMatchState matchfinders.
+     * This behavior is controlled from the cctx ms.
+     * This parameter has no effect in the cdict ms. */
+    int prefetchCDictTables;
+
+    /* When == 0, lazy match finders insert every position.
+     * When != 0, lazy match finders only insert positions they search.
+     * This allows them to skip much faster over incompressible data,
+     * at a small cost to compression ratio.
+     */
+    int lazySkipping;
 };
 
 typedef struct {
@@ -324,6 +347,24 @@ struct ZSTD_CCtx_params_s {
 
     /* Internal use, for createCCtxParams() and freeCCtxParams() only */
     ZSTD_customMem customMem;
+
+    /* Controls prefetching in some dictMatchState matchfinders */
+    ZSTD_paramSwitch_e prefetchCDictTables;
+
+    /* Controls whether zstd will fall back to an internal matchfinder
+     * if the external matchfinder returns an error code. */
+    int enableMatchFinderFallback;
+
+    /* Indicates whether an external matchfinder has been referenced.
+     * Users can't set this externally.
+     * It is set internally in ZSTD_registerSequenceProducer(). */
+    int useSequenceProducer;
+
+    /* Adjust the max block size*/
+    size_t maxBlockSize;
+
+    /* Controls repcode search in external sequence parsing */
+    ZSTD_paramSwitch_e searchForExternalRepcodes;
 };  /* typedef'd to ZSTD_CCtx_params within "zstd.h" */
 
 #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2))
@@ -355,6 +396,14 @@ typedef struct {
     ZSTD_entropyCTablesMetadata_t entropyMetadata;
 } ZSTD_blockSplitCtx;
 
+/* Context for block-level external matchfinder API */
+typedef struct {
+  void* mState;
+  ZSTD_sequenceProducer_F* mFinder;
+  ZSTD_Sequence* seqBuffer;
+  size_t seqBufferCapacity;
+} ZSTD_externalMatchCtx;
+
 struct ZSTD_CCtx_s {
     ZSTD_compressionStage_e stage;
     int cParamsChanged;                  /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */
@@ -404,6 +453,7 @@ struct ZSTD_CCtx_s {
 
     /* Stable in/out buffer verification */
     ZSTD_inBuffer expectedInBuffer;
+    size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */
     size_t expectedOutBufferSize;
 
     /* Dictionary */
@@ -417,9 +467,13 @@ struct ZSTD_CCtx_s {
 
     /* Workspace for block splitter */
     ZSTD_blockSplitCtx blockSplitCtx;
+
+    /* Workspace for external matchfinder */
+    ZSTD_externalMatchCtx externalMatchCtx;
 };
 
 typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e;
+typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e;
 
 typedef enum {
     ZSTD_noDict = 0,
@@ -441,7 +495,7 @@ typedef enum {
                                  * In this mode we take both the source size and the dictionary size
                                  * into account when selecting and adjusting the parameters.
                                  */
-    ZSTD_cpm_unknown = 3,       /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams.
+    ZSTD_cpm_unknown = 3        /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams.
                                  * We don't know what these parameters are for. We default to the legacy
                                  * behavior of taking both the source size and the dict size into account
                                  * when selecting and adjusting parameters.
@@ -500,9 +554,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value)
 /* ZSTD_noCompressBlock() :
  * Writes uncompressed block to dst buffer from given src.
  * Returns the size of the block */
-MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock)
+MEM_STATIC size_t
+ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock)
 {
     U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3);
+    DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity);
     RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity,
                     dstSize_tooSmall, "dst buf too small for uncompressed block");
     MEM_writeLE24(dst, cBlockHeader24);
@@ -510,7 +566,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi
     return ZSTD_blockHeaderSize + srcSize;
 }
 
-MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock)
+MEM_STATIC size_t
+ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock)
 {
     BYTE* const op = (BYTE*)dst;
     U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3);
@@ -529,7 +586,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat)
 {
     U32 const minlog = (strat>=ZSTD_btultra) ? (U32)(strat) - 1 : 6;
     ZSTD_STATIC_ASSERT(ZSTD_btultra == 8);
-    assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat));
+    assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat));
     return (srcSize >> minlog) + 2;
 }
 
@@ -565,29 +622,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con
     while (ip < iend) *op++ = *ip++;
 }
 
-#define ZSTD_REP_MOVE     (ZSTD_REP_NUM-1)
-#define STORE_REPCODE_1 STORE_REPCODE(1)
-#define STORE_REPCODE_2 STORE_REPCODE(2)
-#define STORE_REPCODE_3 STORE_REPCODE(3)
-#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1)
-#define STORE_OFFSET(o)  (assert((o)>0), o + ZSTD_REP_MOVE)
-#define STORED_IS_OFFSET(o)  ((o) > ZSTD_REP_MOVE)
-#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE)
-#define STORED_OFFSET(o)  (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE)
-#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1)  /* returns ID 1,2,3 */
-#define STORED_TO_OFFBASE(o) ((o)+1)
-#define OFFBASE_TO_STORED(o) ((o)-1)
+
+#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1)
+#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2)
+#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3)
+#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */
+#define OFFSET_TO_OFFBASE(o)  (assert((o)>0), o + ZSTD_REP_NUM)
+#define OFFBASE_IS_OFFSET(o)  ((o) > ZSTD_REP_NUM)
+#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM)
+#define OFFBASE_TO_OFFSET(o)  (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM)
+#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o))  /* returns ID 1,2,3 */
 
 /*! ZSTD_storeSeq() :
- *  Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t.
- *  @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET().
+ *  Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t.
+ *  @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE().
  *  @matchLength : must be >= MINMATCH
- *  Allowed to overread literals up to litLimit.
+ *  Allowed to over-read literals up to litLimit.
 */
 HINT_INLINE UNUSED_ATTR void
 ZSTD_storeSeq(seqStore_t* seqStorePtr,
               size_t litLength, const BYTE* literals, const BYTE* litLimit,
-              U32 offBase_minus1,
+              U32 offBase,
               size_t matchLength)
 {
     BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH;
@@ -596,8 +651,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
     static const BYTE* g_start = NULL;
     if (g_start==NULL) g_start = (const BYTE*)literals;  /* note : index only works for compression within a single segment */
     {   U32 const pos = (U32)((const BYTE*)literals - g_start);
-        DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u",
-               pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1);
+        DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u",
+               pos, (U32)litLength, (U32)matchLength, (U32)offBase);
     }
 #endif
     assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq);
@@ -607,9 +662,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
     assert(literals + litLength <= litLimit);
     if (litEnd <= litLimit_w) {
         /* Common case we can use wildcopy.
-	 * First copy 16 bytes, because literals are likely short.
-	 */
-        assert(WILDCOPY_OVERLENGTH >= 16);
+         * First copy 16 bytes, because literals are likely short.
+         */
+        ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16);
         ZSTD_copy16(seqStorePtr->lit, literals);
         if (litLength > 16) {
             ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap);
@@ -628,7 +683,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
     seqStorePtr->sequences[0].litLength = (U16)litLength;
 
     /* match offset */
-    seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1);
+    seqStorePtr->sequences[0].offBase = offBase;
 
     /* match Length */
     assert(matchLength >= MINMATCH);
@@ -646,17 +701,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
 
 /* ZSTD_updateRep() :
  * updates in-place @rep (array of repeat offsets)
- * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq()
+ * @offBase : sum-type, using numeric representation of ZSTD_storeSeq()
  */
 MEM_STATIC void
-ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0)
+ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0)
 {
-    if (STORED_IS_OFFSET(offBase_minus1)) {  /* full offset */
+    if (OFFBASE_IS_OFFSET(offBase)) {  /* full offset */
         rep[2] = rep[1];
         rep[1] = rep[0];
-        rep[0] = STORED_OFFSET(offBase_minus1);
+        rep[0] = OFFBASE_TO_OFFSET(offBase);
     } else {   /* repcode */
-        U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0;
+        U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0;
         if (repCode > 0) {  /* note : if repCode==0, no change */
             U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
             rep[2] = (repCode >= 2) ? rep[1] : rep[2];
@@ -673,11 +728,11 @@ typedef struct repcodes_s {
 } repcodes_t;
 
 MEM_STATIC repcodes_t
-ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0)
+ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0)
 {
     repcodes_t newReps;
     ZSTD_memcpy(&newReps, rep, sizeof(newReps));
-    ZSTD_updateRep(newReps.rep, offBase_minus1, ll0);
+    ZSTD_updateRep(newReps.rep, offBase, ll0);
     return newReps;
 }
 
@@ -685,59 +740,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0
 /*-*************************************
 *  Match length counter
 ***************************************/
-static unsigned ZSTD_NbCommonBytes (size_t val)
-{
-    if (MEM_isLittleEndian()) {
-        if (MEM_64bits()) {
-#       if (__GNUC__ >= 4)
-            return (__builtin_ctzll((U64)val) >> 3);
-#       else
-            static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2,
-                                                     0, 3, 1, 3, 1, 4, 2, 7,
-                                                     0, 2, 3, 6, 1, 5, 3, 5,
-                                                     1, 3, 4, 4, 2, 5, 6, 7,
-                                                     7, 0, 1, 2, 3, 3, 4, 6,
-                                                     2, 6, 5, 5, 3, 4, 5, 6,
-                                                     7, 1, 2, 4, 6, 4, 4, 5,
-                                                     7, 2, 6, 5, 7, 6, 7, 7 };
-            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
-#       endif
-        } else { /* 32 bits */
-#       if (__GNUC__ >= 3)
-            return (__builtin_ctz((U32)val) >> 3);
-#       else
-            static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0,
-                                                     3, 2, 2, 1, 3, 2, 0, 1,
-                                                     3, 3, 1, 2, 2, 2, 2, 0,
-                                                     3, 1, 2, 0, 1, 0, 1, 1 };
-            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
-#       endif
-        }
-    } else {  /* Big Endian CPU */
-        if (MEM_64bits()) {
-#       if (__GNUC__ >= 4)
-            return (__builtin_clzll(val) >> 3);
-#       else
-            unsigned r;
-            const unsigned n32 = sizeof(size_t)*4;   /* calculate this way due to compiler complaining in 32-bits mode */
-            if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
-            if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
-            r += (!val);
-            return r;
-#       endif
-        } else { /* 32 bits */
-#       if (__GNUC__ >= 3)
-            return (__builtin_clz((U32)val) >> 3);
-#       else
-            unsigned r;
-            if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
-            r += (!val);
-            return r;
-#       endif
-    }   }
-}
-
-
 MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit)
 {
     const BYTE* const pStart = pIn;
@@ -783,32 +785,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match,
  *  Hashes
  ***************************************/
 static const U32 prime3bytes = 506832829U;
-static U32    ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes)  >> (32-h) ; }
-MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */
+static U32    ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s)  >> (32-h) ; }
+MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */
+MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); }
 
 static const U32 prime4bytes = 2654435761U;
-static U32    ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; }
-static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); }
+static U32    ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; }
+static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); }
+static size_t ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); }
 
 static const U64 prime5bytes = 889523592379ULL;
-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u  << (64-40)) * prime5bytes) >> (64-h)) ; }
-static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); }
+static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u  << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; }
+static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); }
+static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); }
 
 static const U64 prime6bytes = 227718039650203ULL;
-static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u  << (64-48)) * prime6bytes) >> (64-h)) ; }
-static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
+static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u  << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; }
+static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); }
+static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); }
 
 static const U64 prime7bytes = 58295818150454627ULL;
-static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u  << (64-56)) * prime7bytes) >> (64-h)) ; }
-static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); }
+static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u  << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; }
+static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); }
+static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); }
 
 static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
-static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
-static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
+static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes)  ^ s) >> (64-h)) ; }
+static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); }
+static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); }
+
 
 MEM_STATIC FORCE_INLINE_ATTR
 size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
 {
+    /* Although some of these hashes do support hBits up to 64, some do not.
+     * To be on the safe side, always avoid hBits > 32. */
+    assert(hBits <= 32);
+
     switch(mls)
     {
     default:
@@ -820,6 +833,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
     }
 }
 
+MEM_STATIC FORCE_INLINE_ATTR
+size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) {
+    /* Although some of these hashes do support hBits up to 64, some do not.
+     * To be on the safe side, always avoid hBits > 32. */
+    assert(hBits <= 32);
+
+    switch(mls)
+    {
+        default:
+        case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt);
+        case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt);
+        case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt);
+        case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt);
+        case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt);
+    }
+}
+
+
 /* ZSTD_ipow() :
  * Return base^exponent.
  */
@@ -1167,10 +1198,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window,
                     (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd);
         assert(blockEndIdx >= loadedDictEnd);
 
-        if (blockEndIdx > loadedDictEnd + maxDist) {
+        if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) {
             /* On reaching window size, dictionaries are invalidated.
              * For simplification, if window size is reached anywhere within next block,
              * the dictionary is invalidated for the full block.
+             *
+             * We also have to invalidate the dictionary if ZSTD_window_update() has detected
+             * non-contiguous segments, which means that loadedDictEnd != window->dictLimit.
+             * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use
+             * dictMatchState, so setting it to NULL is not a problem.
              */
             DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)");
             *loadedDictEndPtr = 0;
@@ -1302,6 +1338,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max)
 
 #endif
 
+/* Short Cache */
+
+/* Normally, zstd matchfinders follow this flow:
+ *     1. Compute hash at ip
+ *     2. Load index from hashTable[hash]
+ *     3. Check if *ip == *(base + index)
+ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss.
+ *
+ * Short cache is an optimization which allows us to avoid step 3 most of the time
+ * when the data doesn't actually match. With short cache, the flow becomes:
+ *     1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip.
+ *     2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works.
+ *     3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue.
+ *
+ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to
+ * dictMatchState matchfinders.
+ */
+#define ZSTD_SHORT_CACHE_TAG_BITS 8
+#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1)
+
+/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable.
+ * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. */
+MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) {
+    size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
+    U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK);
+    assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0);
+    hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag;
+}
+
+/* Helper function for short cache matchfinders.
+ * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. */
+MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) {
+    U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK;
+    U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK;
+    return tag1 == tag2;
+}
 
 
 /* ===============================================================
@@ -1396,4 +1468,51 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat);
  */
 void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize);
 
+/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of
+ * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter.
+ * Note that the block delimiter must include the last literals of the block.
+ */
+size_t
+ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
+                                              ZSTD_sequencePosition* seqPos,
+                                        const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
+                                        const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
+
+/* Returns the number of bytes to move the current read position back by.
+ * Only non-zero if we ended up splitting a sequence.
+ * Otherwise, it may return a ZSTD error if something went wrong.
+ *
+ * This function will attempt to scan through blockSize bytes
+ * represented by the sequences in @inSeqs,
+ * storing any (partial) sequences.
+ *
+ * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to
+ * avoid splitting a match, or to avoid splitting a match such that it would produce a match
+ * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block.
+ */
+size_t
+ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
+                                   const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
+                                   const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
+
+
+/* ===============================================================
+ * Deprecated definitions that are still used internally to avoid
+ * deprecation warnings. These functions are exactly equivalent to
+ * their public variants, but avoid the deprecation warnings.
+ * =============================================================== */
+
+size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
+
+size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx,
+                                    void* dst, size_t dstCapacity,
+                              const void* src, size_t srcSize);
+
+size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize);
+
+size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
 #endif /* ZSTD_COMPRESS_H */
diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c
index 52b0a8059aba..3e9ea46a670a 100644
--- a/lib/zstd/compress/zstd_compress_literals.c
+++ b/lib/zstd/compress/zstd_compress_literals.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -13,11 +14,36 @@
  ***************************************/
 #include "zstd_compress_literals.h"
 
+
+/* **************************************************************
+*  Debug Traces
+****************************************************************/
+#if DEBUGLEVEL >= 2
+
+static size_t showHexa(const void* src, size_t srcSize)
+{
+    const BYTE* const ip = (const BYTE*)src;
+    size_t u;
+    for (u=0; u<srcSize; u++) {
+        RAWLOG(5, " %02X", ip[u]); (void)ip;
+    }
+    RAWLOG(5, " \n");
+    return srcSize;
+}
+
+#endif
+
+
+/* **************************************************************
+*  Literals compression - special cases
+****************************************************************/
 size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
 {
     BYTE* const ostart = (BYTE*)dst;
     U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);
 
+    DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity);
+
     RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, "");
 
     switch(flSize)
@@ -36,16 +62,30 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src,
     }
 
     ZSTD_memcpy(ostart + flSize, src, srcSize);
-    DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize));
+    DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize));
     return srcSize + flSize;
 }
 
+static int allBytesIdentical(const void* src, size_t srcSize)
+{
+    assert(srcSize >= 1);
+    assert(src != NULL);
+    {   const BYTE b = ((const BYTE*)src)[0];
+        size_t p;
+        for (p=1; p<srcSize; p++) {
+            if (((const BYTE*)src)[p] != b) return 0;
+        }
+        return 1;
+    }
+}
+
 size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
 {
     BYTE* const ostart = (BYTE*)dst;
     U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);
 
-    (void)dstCapacity;  /* dstCapacity already guaranteed to be >=4, hence large enough */
+    assert(dstCapacity >= 4); (void)dstCapacity;
+    assert(allBytesIdentical(src, srcSize));
 
     switch(flSize)
     {
@@ -63,28 +103,51 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void*
     }
 
     ostart[flSize] = *(const BYTE*)src;
-    DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1);
+    DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1);
     return flSize+1;
 }
 
-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
-                              ZSTD_hufCTables_t* nextHuf,
-                              ZSTD_strategy strategy, int disableLiteralCompression,
-                              void* dst, size_t dstCapacity,
-                        const void* src, size_t srcSize,
-                              void* entropyWorkspace, size_t entropyWorkspaceSize,
-                        const int bmi2,
-                        unsigned suspectUncompressible)
+/* ZSTD_minLiteralsToCompress() :
+ * returns minimal amount of literals
+ * for literal compression to even be attempted.
+ * Minimum is made tighter as compression strategy increases.
+ */
+static size_t
+ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat)
+{
+    assert((int)strategy >= 0);
+    assert((int)strategy <= 9);
+    /* btultra2 : min 8 bytes;
+     * then 2x larger for each successive compression strategy
+     * max threshold 64 bytes */
+    {   int const shift = MIN(9-(int)strategy, 3);
+        size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift;
+        DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc);
+        return mintc;
+    }
+}
+
+size_t ZSTD_compressLiterals (
+                  void* dst, size_t dstCapacity,
+            const void* src, size_t srcSize,
+                  void* entropyWorkspace, size_t entropyWorkspaceSize,
+            const ZSTD_hufCTables_t* prevHuf,
+                  ZSTD_hufCTables_t* nextHuf,
+                  ZSTD_strategy strategy,
+                  int disableLiteralCompression,
+                  int suspectUncompressible,
+                  int bmi2)
 {
-    size_t const minGain = ZSTD_minGain(srcSize, strategy);
     size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);
     BYTE*  const ostart = (BYTE*)dst;
     U32 singleStream = srcSize < 256;
     symbolEncodingType_e hType = set_compressed;
     size_t cLitSize;
 
-    DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)",
-                disableLiteralCompression, (U32)srcSize);
+    DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)",
+                disableLiteralCompression, (U32)srcSize, dstCapacity);
+
+    DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize));
 
     /* Prepare nextEntropy assuming reusing the existing table */
     ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
@@ -92,40 +155,51 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
     if (disableLiteralCompression)
         return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
 
-    /* small ? don't even attempt compression (speed opt) */
-#   define COMPRESS_LITERALS_SIZE_MIN 63
-    {   size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
-        if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
-    }
+    /* if too small, don't even attempt compression (speed opt) */
+    if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode))
+        return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
 
     RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression");
     {   HUF_repeat repeat = prevHuf->repeatMode;
-        int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0;
+        int const flags = 0
+            | (bmi2 ? HUF_flags_bmi2 : 0)
+            | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0)
+            | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0)
+            | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0);
+
+        typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int);
+        huf_compress_f huf_compress;
         if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1;
-        cLitSize = singleStream ?
-            HUF_compress1X_repeat(
-                ostart+lhSize, dstCapacity-lhSize, src, srcSize,
-                HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
-                (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) :
-            HUF_compress4X_repeat(
-                ostart+lhSize, dstCapacity-lhSize, src, srcSize,
-                HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
-                (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible);
+        huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat;
+        cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize,
+                                src, srcSize,
+                                HUF_SYMBOLVALUE_MAX, LitHufLog,
+                                entropyWorkspace, entropyWorkspaceSize,
+                                (HUF_CElt*)nextHuf->CTable,
+                                &repeat, flags);
+        DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize);
         if (repeat != HUF_repeat_none) {
             /* reused the existing table */
-            DEBUGLOG(5, "Reusing previous huffman table");
+            DEBUGLOG(5, "reusing statistics from previous huffman block");
             hType = set_repeat;
         }
     }
 
-    if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) {
-        ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
-        return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
-    }
+    {   size_t const minGain = ZSTD_minGain(srcSize, strategy);
+        if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) {
+            ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+            return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+    }   }
     if (cLitSize==1) {
-        ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
-        return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
-    }
+        /* A return value of 1 signals that the alphabet consists of a single symbol.
+         * However, in some rare circumstances, it could be the compressed size (a single byte).
+         * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`.
+         * (it's also necessary to not generate statistics).
+         * Therefore, in such a case, actively check that all bytes are identical. */
+        if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) {
+            ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+            return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
+    }   }
 
     if (hType == set_compressed) {
         /* using a newly constructed table */
@@ -136,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
     switch(lhSize)
     {
     case 3: /* 2 - 2 - 10 - 10 */
-        {   U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14);
+        if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
+        {   U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14);
             MEM_writeLE24(ostart, lhc);
             break;
         }
     case 4: /* 2 - 2 - 14 - 14 */
+        assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
         {   U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18);
             MEM_writeLE32(ostart, lhc);
             break;
         }
     case 5: /* 2 - 2 - 18 - 18 */
+        assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
         {   U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22);
             MEM_writeLE32(ostart, lhc);
             ostart[4] = (BYTE)(cLitSize >> 10);
diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h
index 9775fb97cb70..a2a85d6b69e5 100644
--- a/lib/zstd/compress/zstd_compress_literals.h
+++ b/lib/zstd/compress/zstd_compress_literals.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -16,16 +17,24 @@
 
 size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 
+/* ZSTD_compressRleLiteralsBlock() :
+ * Conditions :
+ * - All bytes in @src are identical
+ * - dstCapacity >= 4 */
 size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 
-/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */
-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
-                              ZSTD_hufCTables_t* nextHuf,
-                              ZSTD_strategy strategy, int disableLiteralCompression,
-                              void* dst, size_t dstCapacity,
+/* ZSTD_compressLiterals():
+ * @entropyWorkspace: must be aligned on 4-bytes boundaries
+ * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE
+ * @suspectUncompressible: sampling checks, to potentially skip huffman coding
+ */
+size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity,
                         const void* src, size_t srcSize,
                               void* entropyWorkspace, size_t entropyWorkspaceSize,
-                        const int bmi2,
-                        unsigned suspectUncompressible);
+                        const ZSTD_hufCTables_t* prevHuf,
+                              ZSTD_hufCTables_t* nextHuf,
+                              ZSTD_strategy strategy, int disableLiteralCompression,
+                              int suspectUncompressible,
+                              int bmi2);
 
 #endif /* ZSTD_COMPRESS_LITERALS_H */
diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c
index 21ddc1b37acf..5c028c78d889 100644
--- a/lib/zstd/compress/zstd_compress_sequences.c
+++ b/lib/zstd/compress/zstd_compress_sequences.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -58,7 +59,7 @@ static unsigned ZSTD_useLowProbCount(size_t const nbSeq)
 {
     /* Heuristic: This should cover most blocks <= 16K and
      * start to fade out after 16K to about 32K depending on
-     * comprssibility.
+     * compressibility.
      */
     return nbSeq >= 2048;
 }
@@ -166,7 +167,7 @@ ZSTD_selectEncodingType(
     if (mostFrequent == nbSeq) {
         *repeatMode = FSE_repeat_none;
         if (isDefaultAllowed && nbSeq <= 2) {
-            /* Prefer set_basic over set_rle when there are 2 or less symbols,
+            /* Prefer set_basic over set_rle when there are 2 or fewer symbols,
              * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol.
              * If basic encoding isn't possible, always choose RLE.
              */
diff --git a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h
index 7991364c2f71..7fe6f4ff5cf2 100644
--- a/lib/zstd/compress/zstd_compress_sequences.h
+++ b/lib/zstd/compress/zstd_compress_sequences.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c
index 17d836cc84e8..dbacbaf72733 100644
--- a/lib/zstd/compress/zstd_compress_superblock.c
+++ b/lib/zstd/compress/zstd_compress_superblock.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -36,13 +37,14 @@
  *      If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block
  *      and the following sub-blocks' literals sections will be Treeless_Literals_Block.
  *  @return : compressed size of literals section of a sub-block
- *            Or 0 if it unable to compress.
+ *            Or 0 if unable to compress.
  *            Or error code */
-static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
-                                    const ZSTD_hufCTablesMetadata_t* hufMetadata,
-                                    const BYTE* literals, size_t litSize,
-                                    void* dst, size_t dstSize,
-                                    const int bmi2, int writeEntropy, int* entropyWritten)
+static size_t
+ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
+                              const ZSTD_hufCTablesMetadata_t* hufMetadata,
+                              const BYTE* literals, size_t litSize,
+                              void* dst, size_t dstSize,
+                              const int bmi2, int writeEntropy, int* entropyWritten)
 {
     size_t const header = writeEntropy ? 200 : 0;
     size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header));
@@ -53,8 +55,6 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
     symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat;
     size_t cLitSize = 0;
 
-    (void)bmi2; /* TODO bmi2... */
-
     DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy);
 
     *entropyWritten = 0;
@@ -76,9 +76,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
         DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize);
     }
 
-    /* TODO bmi2 */
-    {   const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable)
-                                          : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable);
+    {   int const flags = bmi2 ? HUF_flags_bmi2 : 0;
+        const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable, flags)
+                                          : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable, flags);
         op += cSize;
         cLitSize += cSize;
         if (cSize == 0 || ERR_isError(cSize)) {
@@ -126,7 +126,11 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
     return op-ostart;
 }
 
-static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) {
+static size_t
+ZSTD_seqDecompressedSize(seqStore_t const* seqStore,
+                   const seqDef* sequences, size_t nbSeq,
+                         size_t litSize, int lastSequence)
+{
     const seqDef* const sstart = sequences;
     const seqDef* const send = sequences + nbSeq;
     const seqDef* sp = sstart;
@@ -156,13 +160,14 @@ static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef*
  *  @return : compressed size of sequences section of a sub-block
  *            Or 0 if it is unable to compress
  *            Or error code. */
-static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
-                                              const ZSTD_fseCTablesMetadata_t* fseMetadata,
-                                              const seqDef* sequences, size_t nbSeq,
-                                              const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
-                                              const ZSTD_CCtx_params* cctxParams,
-                                              void* dst, size_t dstCapacity,
-                                              const int bmi2, int writeEntropy, int* entropyWritten)
+static size_t
+ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
+                                const ZSTD_fseCTablesMetadata_t* fseMetadata,
+                                const seqDef* sequences, size_t nbSeq,
+                                const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
+                                const ZSTD_CCtx_params* cctxParams,
+                                void* dst, size_t dstCapacity,
+                                const int bmi2, int writeEntropy, int* entropyWritten)
 {
     const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
     BYTE* const ostart = (BYTE*)dst;
@@ -539,7 +544,7 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
             repcodes_t rep;
             ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep));
             for (seq = sstart; seq < sp; ++seq) {
-                ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0);
+                ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0);
             }
             ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep));
         }
diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h
index 224ece79546e..826bbc9e029b 100644
--- a/lib/zstd/compress/zstd_compress_superblock.h
+++ b/lib/zstd/compress/zstd_compress_superblock.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h
index 349fc923c355..65ea53b62844 100644
--- a/lib/zstd/compress/zstd_cwksp.h
+++ b/lib/zstd/compress/zstd_cwksp.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,7 +15,9 @@
 /*-*************************************
 *  Dependencies
 ***************************************/
+#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customFree */
 #include "../common/zstd_internal.h"
+#include "../common/portability_macros.h"
 
 
 /*-*************************************
@@ -41,8 +44,9 @@
 ***************************************/
 typedef enum {
     ZSTD_cwksp_alloc_objects,
-    ZSTD_cwksp_alloc_buffers,
-    ZSTD_cwksp_alloc_aligned
+    ZSTD_cwksp_alloc_aligned_init_once,
+    ZSTD_cwksp_alloc_aligned,
+    ZSTD_cwksp_alloc_buffers
 } ZSTD_cwksp_alloc_phase_e;
 
 /*
@@ -95,8 +99,8 @@ typedef enum {
  *
  * Workspace Layout:
  *
- * [                        ... workspace ...                         ]
- * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers]
+ * [                        ... workspace ...                           ]
+ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once]
  *
  * The various objects that live in the workspace are divided into the
  * following categories, and are allocated separately:
@@ -120,9 +124,18 @@ typedef enum {
  *   uint32_t arrays, all of whose values are between 0 and (nextSrc - base).
  *   Their sizes depend on the cparams. These tables are 64-byte aligned.
  *
- * - Aligned: these buffers are used for various purposes that require 4 byte
- *   alignment, but don't require any initialization before they're used. These
- *   buffers are each aligned to 64 bytes.
+ * - Init once: these buffers require to be initialized at least once before
+ *   use. They should be used when we want to skip memory initialization
+ *   while not triggering memory checkers (like Valgrind) when reading from
+ *   from this memory without writing to it first.
+ *   These buffers should be used carefully as they might contain data
+ *   from previous compressions.
+ *   Buffers are aligned to 64 bytes.
+ *
+ * - Aligned: these buffers don't require any initialization before they're
+ *   used. The user of the buffer should make sure they write into a buffer
+ *   location before reading from it.
+ *   Buffers are aligned to 64 bytes.
  *
  * - Buffers: these buffers are used for various purposes that don't require
  *   any alignment or initialization before they're used. This means they can
@@ -134,8 +147,9 @@ typedef enum {
  * correctly packed into the workspace buffer. That order is:
  *
  * 1. Objects
- * 2. Buffers
- * 3. Aligned/Tables
+ * 2. Init once / Tables
+ * 3. Aligned / Tables
+ * 4. Buffers / Tables
  *
  * Attempts to reserve objects of different types out of order will fail.
  */
@@ -147,6 +161,7 @@ typedef struct {
     void* tableEnd;
     void* tableValidEnd;
     void* allocStart;
+    void* initOnceStart;
 
     BYTE allocFailed;
     int workspaceOversizedDuration;
@@ -159,6 +174,7 @@ typedef struct {
 ***************************************/
 
 MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws);
+MEM_STATIC void*  ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws);
 
 MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) {
     (void)ws;
@@ -168,6 +184,8 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) {
     assert(ws->tableEnd <= ws->allocStart);
     assert(ws->tableValidEnd <= ws->allocStart);
     assert(ws->allocStart <= ws->workspaceEnd);
+    assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws));
+    assert(ws->workspace <= ws->initOnceStart);
 }
 
 /*
@@ -210,14 +228,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) {
  * for internal purposes (currently only alignment).
  */
 MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) {
-    /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes
-     * to align the beginning of tables section, as well as another n_2=[0, 63] bytes
-     * to align the beginning of the aligned section.
-     *
-     * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and
-     * aligneds being sized in multiples of 64 bytes.
+    /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES
+     * bytes to align the beginning of tables section and end of buffers;
      */
-    size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES;
+    size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2;
     return slackSpace;
 }
 
@@ -230,10 +244,18 @@ MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignByt
     size_t const alignBytesMask = alignBytes - 1;
     size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask;
     assert((alignBytes & alignBytesMask) == 0);
-    assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES);
+    assert(bytes < alignBytes);
     return bytes;
 }
 
+/*
+ * Returns the initial value for allocStart which is used to determine the position from
+ * which we can allocate from the end of the workspace.
+ */
+MEM_STATIC void*  ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) {
+    return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1));
+}
+
 /*
  * Internal function. Do not use directly.
  * Reserves the given number of bytes within the aligned/buffer segment of the wksp,
@@ -274,27 +296,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase
 {
     assert(phase >= ws->phase);
     if (phase > ws->phase) {
-        /* Going from allocating objects to allocating buffers */
-        if (ws->phase < ZSTD_cwksp_alloc_buffers &&
-                phase >= ZSTD_cwksp_alloc_buffers) {
+        /* Going from allocating objects to allocating initOnce / tables */
+        if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once &&
+            phase >= ZSTD_cwksp_alloc_aligned_init_once) {
             ws->tableValidEnd = ws->objectEnd;
-        }
+            ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws);
 
-        /* Going from allocating buffers to allocating aligneds/tables */
-        if (ws->phase < ZSTD_cwksp_alloc_aligned &&
-                phase >= ZSTD_cwksp_alloc_aligned) {
-            {   /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */
-                size_t const bytesToAlign =
-                    ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES);
-                DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign);
-                ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */
-                RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign),
-                                memory_allocation, "aligned phase - alignment initial allocation failed!");
-            }
             {   /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */
-                void* const alloc = ws->objectEnd;
+                void *const alloc = ws->objectEnd;
                 size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES);
-                void* const objectEnd = (BYTE*)alloc + bytesToAlign;
+                void *const objectEnd = (BYTE *) alloc + bytesToAlign;
                 DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign);
                 RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation,
                                 "table phase - alignment initial allocation failed!");
@@ -302,7 +313,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase
                 ws->tableEnd = objectEnd;  /* table area starts being empty */
                 if (ws->tableValidEnd < ws->tableEnd) {
                     ws->tableValidEnd = ws->tableEnd;
-        }   }   }
+                }
+            }
+        }
         ws->phase = phase;
         ZSTD_cwksp_assert_internal_consistency(ws);
     }
@@ -314,7 +327,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase
  */
 MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr)
 {
-    return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd);
+    return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd);
 }
 
 /*
@@ -343,6 +356,33 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes)
     return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers);
 }
 
+/*
+ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes).
+ * This memory has been initialized at least once in the past.
+ * This doesn't mean it has been initialized this time, and it might contain data from previous
+ * operations.
+ * The main usage is for algorithms that might need read access into uninitialized memory.
+ * The algorithm must maintain safety under these conditions and must make sure it doesn't
+ * leak any of the past data (directly or in side channels).
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes)
+{
+    size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES);
+    void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once);
+    assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0);
+    if(ptr && ptr < ws->initOnceStart) {
+        /* We assume the memory following the current allocation is either:
+         * 1. Not usable as initOnce memory (end of workspace)
+         * 2. Another initOnce buffer that has been allocated before (and so was previously memset)
+         * 3. An ASAN redzone, in which case we don't want to write on it
+         * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart.
+         * Note that we assume here that MSAN and ASAN cannot run in the same time. */
+        ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes));
+        ws->initOnceStart = ptr;
+    }
+    return ptr;
+}
+
 /*
  * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes).
  */
@@ -361,13 +401,17 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes)
  */
 MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes)
 {
-    const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned;
+    const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once;
     void* alloc;
     void* end;
     void* top;
 
-    if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) {
-        return NULL;
+    /* We can only start allocating tables after we are done reserving space for objects at the
+     * start of the workspace */
+    if(ws->phase < phase) {
+        if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) {
+            return NULL;
+        }
     }
     alloc = ws->tableEnd;
     end = (BYTE *)alloc + bytes;
@@ -451,7 +495,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) {
     assert(ws->tableValidEnd >= ws->objectEnd);
     assert(ws->tableValidEnd <= ws->allocStart);
     if (ws->tableValidEnd < ws->tableEnd) {
-        ZSTD_memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd);
+        ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd));
     }
     ZSTD_cwksp_mark_tables_clean(ws);
 }
@@ -478,10 +522,10 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) {
 
 
     ws->tableEnd = ws->objectEnd;
-    ws->allocStart = ws->workspaceEnd;
+    ws->allocStart = ZSTD_cwksp_initialAllocStart(ws);
     ws->allocFailed = 0;
-    if (ws->phase > ZSTD_cwksp_alloc_buffers) {
-        ws->phase = ZSTD_cwksp_alloc_buffers;
+    if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) {
+        ws->phase = ZSTD_cwksp_alloc_aligned_init_once;
     }
     ZSTD_cwksp_assert_internal_consistency(ws);
 }
@@ -498,6 +542,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c
     ws->workspaceEnd = (BYTE*)start + size;
     ws->objectEnd = ws->workspace;
     ws->tableValidEnd = ws->objectEnd;
+    ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws);
     ws->phase = ZSTD_cwksp_alloc_objects;
     ws->isStatic = isStatic;
     ZSTD_cwksp_clear(ws);
@@ -550,17 +595,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) {
  * Returns if the estimated space needed for a wksp is within an acceptable limit of the
  * actual amount of space used.
  */
-MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws,
-                                                        size_t const estimatedSpace, int resizedWorkspace) {
-    if (resizedWorkspace) {
-        /* Resized/newly allocated wksp should have exact bounds */
-        return ZSTD_cwksp_used(ws) == estimatedSpace;
-    } else {
-        /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes
-         * than estimatedSpace. See the comments in zstd_cwksp.h for details.
-         */
-        return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63);
-    }
+MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) {
+    /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice
+     * the alignment bytes difference between estimation and actual usage */
+    return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) &&
+           ZSTD_cwksp_used(ws) <= estimatedSpace;
 }
 
 
diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c
index 76933dea2624..ab9440a99603 100644
--- a/lib/zstd/compress/zstd_double_fast.c
+++ b/lib/zstd/compress/zstd_double_fast.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -11,8 +12,43 @@
 #include "zstd_compress_internal.h"
 #include "zstd_double_fast.h"
 
+static void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms,
+                              void const* end, ZSTD_dictTableLoadMethod_e dtlm)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashLarge = ms->hashTable;
+    U32  const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    U32  const mls = cParams->minMatch;
+    U32* const hashSmall = ms->chainTable;
+    U32  const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    const BYTE* const base = ms->window.base;
+    const BYTE* ip = base + ms->nextToUpdate;
+    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+    const U32 fastHashFillStep = 3;
 
-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+    /* Always insert every fastHashFillStep position into the hash tables.
+     * Insert the other positions into the large hash table if their entry
+     * is empty.
+     */
+    for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) {
+        U32 const curr = (U32)(ip - base);
+        U32 i;
+        for (i = 0; i < fastHashFillStep; ++i) {
+            size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls);
+            size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8);
+            if (i == 0) {
+                ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i);
+            }
+            if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) {
+                ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i);
+            }
+            /* Only load extra positions for ZSTD_dtlm_full */
+            if (dtlm == ZSTD_dtlm_fast)
+                break;
+    }   }
+}
+
+static void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms,
                               void const* end, ZSTD_dictTableLoadMethod_e dtlm)
 {
     const ZSTD_compressionParameters* const cParams = &ms->cParams;
@@ -43,7 +79,19 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
             /* Only load extra positions for ZSTD_dtlm_full */
             if (dtlm == ZSTD_dtlm_fast)
                 break;
-    }   }
+        }   }
+}
+
+void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+                        const void* const end,
+                        ZSTD_dictTableLoadMethod_e dtlm,
+                        ZSTD_tableFillPurpose_e tfp)
+{
+    if (tfp == ZSTD_tfp_forCDict) {
+        ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm);
+    } else {
+        ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm);
+    }
 }
 
 
@@ -67,7 +115,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
     const BYTE* const iend = istart + srcSize;
     const BYTE* const ilimit = iend - HASH_READ_SIZE;
     U32 offset_1=rep[0], offset_2=rep[1];
-    U32 offsetSaved = 0;
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;
 
     size_t mLength;
     U32 offset;
@@ -100,8 +148,8 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
         U32 const current = (U32)(ip - base);
         U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog);
         U32 const maxRep = current - windowLow;
-        if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;
-        if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
+        if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
+        if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
     }
 
     /* Outer Loop: one iteration per match found and stored */
@@ -131,7 +179,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
             if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) {
                 mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
                 ip++;
-                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
+                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
                 goto _match_stored;
             }
 
@@ -175,9 +223,13 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
         } while (ip1 <= ilimit);
 
 _cleanup:
+        /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
+         * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
+        offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
+
         /* save reps for next block */
-        rep[0] = offset_1 ? offset_1 : offsetSaved;
-        rep[1] = offset_2 ? offset_2 : offsetSaved;
+        rep[0] = offset_1 ? offset_1 : offsetSaved1;
+        rep[1] = offset_2 ? offset_2 : offsetSaved2;
 
         /* Return the last literals size */
         return (size_t)(iend - anchor);
@@ -217,7 +269,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
             hashLong[hl1] = (U32)(ip1 - base);
         }
 
-        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
 
 _match_stored:
         /* match found */
@@ -243,7 +295,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
                 U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff;  /* swap offset_2 <=> offset_1 */
                 hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base);
                 hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base);
-                ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength);
+                ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
                 ip += rLength;
                 anchor = ip;
                 continue;   /* faster when present ... (?) */
@@ -275,7 +327,6 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
     const BYTE* const iend = istart + srcSize;
     const BYTE* const ilimit = iend - HASH_READ_SIZE;
     U32 offset_1=rep[0], offset_2=rep[1];
-    U32 offsetSaved = 0;
 
     const ZSTD_matchState_t* const dms = ms->dictMatchState;
     const ZSTD_compressionParameters* const dictCParams = &dms->cParams;
@@ -286,8 +337,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
     const BYTE* const dictStart    = dictBase + dictStartIndex;
     const BYTE* const dictEnd      = dms->window.nextSrc;
     const U32 dictIndexDelta       = prefixLowestIndex - (U32)(dictEnd - dictBase);
-    const U32 dictHBitsL           = dictCParams->hashLog;
-    const U32 dictHBitsS           = dictCParams->chainLog;
+    const U32 dictHBitsL           = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    const U32 dictHBitsS           = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS;
     const U32 dictAndPrefixLength  = (U32)((ip - prefixLowest) + (dictEnd - dictStart));
 
     DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic");
@@ -295,6 +346,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
     /* if a dictionary is attached, it must be within window range */
     assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex);
 
+    if (ms->prefetchCDictTables) {
+        size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
+        size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32);
+        PREFETCH_AREA(dictHashLong, hashTableBytes)
+        PREFETCH_AREA(dictHashSmall, chainTableBytes)
+    }
+
     /* init */
     ip += (dictAndPrefixLength == 0);
 
@@ -309,8 +367,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
         U32 offset;
         size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8);
         size_t const h = ZSTD_hashPtr(ip, hBitsS, mls);
-        size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8);
-        size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls);
+        size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8);
+        size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls);
+        U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS];
+        U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS];
+        int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL);
+        int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS);
         U32 const curr = (U32)(ip-base);
         U32 const matchIndexL = hashLong[h2];
         U32 matchIndexS = hashSmall[h];
@@ -328,7 +390,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
             const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
             mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
             ip++;
-            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
+            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
             goto _match_stored;
         }
 
@@ -340,9 +402,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
                 while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
                 goto _match_found;
             }
-        } else {
+        } else if (dictTagsMatchL) {
             /* check dictMatchState long match */
-            U32 const dictMatchIndexL = dictHashLong[dictHL];
+            U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS;
             const BYTE* dictMatchL = dictBase + dictMatchIndexL;
             assert(dictMatchL < dictEnd);
 
@@ -358,9 +420,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
             if (MEM_read32(match) == MEM_read32(ip)) {
                 goto _search_next_long;
             }
-        } else {
+        } else if (dictTagsMatchS) {
             /* check dictMatchState short match */
-            U32 const dictMatchIndexS = dictHashSmall[dictHS];
+            U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS;
             match = dictBase + dictMatchIndexS;
             matchIndexS = dictMatchIndexS + dictIndexDelta;
 
@@ -375,10 +437,11 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
         continue;
 
 _search_next_long:
-
         {   size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
-            size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8);
+            size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8);
             U32 const matchIndexL3 = hashLong[hl3];
+            U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS];
+            int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3);
             const BYTE* matchL3 = base + matchIndexL3;
             hashLong[hl3] = curr + 1;
 
@@ -391,9 +454,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
                     while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */
                     goto _match_found;
                 }
-            } else {
+            } else if (dictTagsMatchL3) {
                 /* check dict long +1 match */
-                U32 const dictMatchIndexL3 = dictHashLong[dictHLNext];
+                U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS;
                 const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3;
                 assert(dictMatchL3 < dictEnd);
                 if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) {
@@ -419,7 +482,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
         offset_2 = offset_1;
         offset_1 = offset;
 
-        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
 
 _match_stored:
         /* match found */
@@ -448,7 +511,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
                     const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend;
                     size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4;
                     U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
-                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2);
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
                     hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
                     hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
                     ip += repLength2;
@@ -461,8 +524,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
     }   /* while (ip < ilimit) */
 
     /* save reps for next block */
-    rep[0] = offset_1 ? offset_1 : offsetSaved;
-    rep[1] = offset_2 ? offset_2 : offsetSaved;
+    rep[0] = offset_1;
+    rep[1] = offset_2;
 
     /* Return the last literals size */
     return (size_t)(iend - anchor);
@@ -585,7 +648,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
             const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
             mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
             ip++;
-            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
+            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
         } else {
             if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) {
                 const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend;
@@ -596,7 +659,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
                 while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; }   /* catch up */
                 offset_2 = offset_1;
                 offset_1 = offset;
-                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
 
             } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) {
                 size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
@@ -621,7 +684,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
                 }
                 offset_2 = offset_1;
                 offset_1 = offset;
-                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
 
             } else {
                 ip += ((ip-anchor) >> kSearchStrength) + 1;
@@ -653,7 +716,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
                     const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
                     size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
                     U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
-                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2);
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
                     hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
                     hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
                     ip += repLength2;
diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h
index 6822bde65a1d..0204f12e4cf7 100644
--- a/lib/zstd/compress/zstd_double_fast.h
+++ b/lib/zstd/compress/zstd_double_fast.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -16,7 +17,8 @@
 #include "zstd_compress_internal.h"     /* ZSTD_CCtx, size_t */
 
 void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
-                              void const* end, ZSTD_dictTableLoadMethod_e dtlm);
+                              void const* end, ZSTD_dictTableLoadMethod_e dtlm,
+                              ZSTD_tableFillPurpose_e tfp);
 size_t ZSTD_compressBlock_doubleFast(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c
index a752e6beab52..3399b39c5dbc 100644
--- a/lib/zstd/compress/zstd_fast.c
+++ b/lib/zstd/compress/zstd_fast.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -11,8 +12,42 @@
 #include "zstd_compress_internal.h"  /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */
 #include "zstd_fast.h"
 
+static void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms,
+                        const void* const end,
+                        ZSTD_dictTableLoadMethod_e dtlm)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashTable = ms->hashTable;
+    U32  const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    U32  const mls = cParams->minMatch;
+    const BYTE* const base = ms->window.base;
+    const BYTE* ip = base + ms->nextToUpdate;
+    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+    const U32 fastHashFillStep = 3;
 
-void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+    /* Currently, we always use ZSTD_dtlm_full for filling CDict tables.
+     * Feel free to remove this assert if there's a good reason! */
+    assert(dtlm == ZSTD_dtlm_full);
+
+    /* Always insert every fastHashFillStep position into the hash table.
+     * Insert the other positions if their hash entry is empty.
+     */
+    for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
+        U32 const curr = (U32)(ip - base);
+        {   size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls);
+            ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr);   }
+
+        if (dtlm == ZSTD_dtlm_fast) continue;
+        /* Only load extra positions for ZSTD_dtlm_full */
+        {   U32 p;
+            for (p = 1; p < fastHashFillStep; ++p) {
+                size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls);
+                if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) {  /* not yet filled */
+                    ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p);
+                }   }   }   }
+}
+
+static void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms,
                         const void* const end,
                         ZSTD_dictTableLoadMethod_e dtlm)
 {
@@ -25,6 +60,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
     const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
     const U32 fastHashFillStep = 3;
 
+    /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables.
+     * Feel free to remove this assert if there's a good reason! */
+    assert(dtlm == ZSTD_dtlm_fast);
+
     /* Always insert every fastHashFillStep position into the hash table.
      * Insert the other positions if their hash entry is empty.
      */
@@ -42,6 +81,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
     }   }   }   }
 }
 
+void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+                        const void* const end,
+                        ZSTD_dictTableLoadMethod_e dtlm,
+                        ZSTD_tableFillPurpose_e tfp)
+{
+    if (tfp == ZSTD_tfp_forCDict) {
+        ZSTD_fillHashTableForCDict(ms, end, dtlm);
+    } else {
+        ZSTD_fillHashTableForCCtx(ms, end, dtlm);
+    }
+}
+
 
 /*
  * If you squint hard enough (and ignore repcodes), the search operation at any
@@ -117,7 +168,7 @@ ZSTD_compressBlock_fast_noDict_generic(
 
     U32 rep_offset1 = rep[0];
     U32 rep_offset2 = rep[1];
-    U32 offsetSaved = 0;
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;
 
     size_t hash0; /* hash for ip0 */
     size_t hash1; /* hash for ip1 */
@@ -141,8 +192,8 @@ ZSTD_compressBlock_fast_noDict_generic(
     {   U32 const curr = (U32)(ip0 - base);
         U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog);
         U32 const maxRep = curr - windowLow;
-        if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0;
-        if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0;
+        if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0;
+        if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0;
     }
 
     /* start each op */
@@ -180,8 +231,14 @@ ZSTD_compressBlock_fast_noDict_generic(
             mLength = ip0[-1] == match0[-1];
             ip0 -= mLength;
             match0 -= mLength;
-            offcode = STORE_REPCODE_1;
+            offcode = REPCODE1_TO_OFFBASE;
             mLength += 4;
+
+            /* First write next hash table entry; we've already calculated it.
+             * This write is known to be safe because the ip1 is before the
+             * repcode (ip2). */
+            hashTable[hash1] = (U32)(ip1 - base);
+
             goto _match;
         }
 
@@ -195,6 +252,12 @@ ZSTD_compressBlock_fast_noDict_generic(
         /* check match at ip[0] */
         if (MEM_read32(ip0) == mval) {
             /* found a match! */
+
+            /* First write next hash table entry; we've already calculated it.
+             * This write is known to be safe because the ip1 == ip0 + 1, so
+             * we know we will resume searching after ip1 */
+            hashTable[hash1] = (U32)(ip1 - base);
+
             goto _offset;
         }
 
@@ -224,6 +287,21 @@ ZSTD_compressBlock_fast_noDict_generic(
         /* check match at ip[0] */
         if (MEM_read32(ip0) == mval) {
             /* found a match! */
+
+            /* first write next hash table entry; we've already calculated it */
+            if (step <= 4) {
+                /* We need to avoid writing an index into the hash table >= the
+                 * position at which we will pick up our searching after we've
+                 * taken this match.
+                 *
+                 * The minimum possible match has length 4, so the earliest ip0
+                 * can be after we take this match will be the current ip0 + 4.
+                 * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely
+                 * write this position.
+                 */
+                hashTable[hash1] = (U32)(ip1 - base);
+            }
+
             goto _offset;
         }
 
@@ -254,9 +332,24 @@ ZSTD_compressBlock_fast_noDict_generic(
      * However, it seems to be a meaningful performance hit to try to search
      * them. So let's not. */
 
+    /* When the repcodes are outside of the prefix, we set them to zero before the loop.
+     * When the offsets are still zero, we need to restore them after the block to have a correct
+     * repcode history. If only one offset was invalid, it is easy. The tricky case is when both
+     * offsets were invalid. We need to figure out which offset to refill with.
+     *     - If both offsets are zero they are in the same order.
+     *     - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`.
+     *     - If only one is zero, we need to decide which offset to restore.
+     *         - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1.
+     *         - It is impossible for rep_offset2 to be non-zero.
+     *
+     * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then
+     * set rep[0] = rep_offset1 and rep[1] = offsetSaved1.
+     */
+    offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2;
+
     /* save reps for next block */
-    rep[0] = rep_offset1 ? rep_offset1 : offsetSaved;
-    rep[1] = rep_offset2 ? rep_offset2 : offsetSaved;
+    rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1;
+    rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2;
 
     /* Return the last literals size */
     return (size_t)(iend - anchor);
@@ -267,7 +360,7 @@ ZSTD_compressBlock_fast_noDict_generic(
     match0 = base + idx;
     rep_offset2 = rep_offset1;
     rep_offset1 = (U32)(ip0-match0);
-    offcode = STORE_OFFSET(rep_offset1);
+    offcode = OFFSET_TO_OFFBASE(rep_offset1);
     mLength = 4;
 
     /* Count the backwards match length. */
@@ -287,11 +380,6 @@ ZSTD_compressBlock_fast_noDict_generic(
     ip0 += mLength;
     anchor = ip0;
 
-    /* write next hash table entry */
-    if (ip1 < ip0) {
-        hashTable[hash1] = (U32)(ip1 - base);
-    }
-
     /* Fill table and check for immediate repcode. */
     if (ip0 <= ilimit) {
         /* Fill Table */
@@ -306,7 +394,7 @@ ZSTD_compressBlock_fast_noDict_generic(
                 { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */
                 hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
                 ip0 += rLength;
-                ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength);
+                ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
                 anchor = ip0;
                 continue;   /* faster when present (confirmed on gcc-8) ... (?) */
     }   }   }
@@ -380,14 +468,14 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
     U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
     const BYTE* const base = ms->window.base;
     const BYTE* const istart = (const BYTE*)src;
-    const BYTE* ip = istart;
+    const BYTE* ip0 = istart;
+    const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */
     const BYTE* anchor = istart;
     const U32   prefixStartIndex = ms->window.dictLimit;
     const BYTE* const prefixStart = base + prefixStartIndex;
     const BYTE* const iend = istart + srcSize;
     const BYTE* const ilimit = iend - HASH_READ_SIZE;
     U32 offset_1=rep[0], offset_2=rep[1];
-    U32 offsetSaved = 0;
 
     const ZSTD_matchState_t* const dms = ms->dictMatchState;
     const ZSTD_compressionParameters* const dictCParams = &dms->cParams ;
@@ -397,13 +485,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
     const BYTE* const dictStart    = dictBase + dictStartIndex;
     const BYTE* const dictEnd      = dms->window.nextSrc;
     const U32 dictIndexDelta       = prefixStartIndex - (U32)(dictEnd - dictBase);
-    const U32 dictAndPrefixLength  = (U32)(ip - prefixStart + dictEnd - dictStart);
-    const U32 dictHLog             = dictCParams->hashLog;
+    const U32 dictAndPrefixLength  = (U32)(istart - prefixStart + dictEnd - dictStart);
+    const U32 dictHBits            = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
 
     /* if a dictionary is still attached, it necessarily means that
      * it is within window size. So we just check it. */
     const U32 maxDistance = 1U << cParams->windowLog;
-    const U32 endIndex = (U32)((size_t)(ip - base) + srcSize);
+    const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
     assert(endIndex - prefixStartIndex <= maxDistance);
     (void)maxDistance; (void)endIndex;   /* these variables are not used when assert() is disabled */
 
@@ -413,106 +501,155 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
      * when translating a dict index into a local index */
     assert(prefixStartIndex >= (U32)(dictEnd - dictBase));
 
+    if (ms->prefetchCDictTables) {
+        size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
+        PREFETCH_AREA(dictHashTable, hashTableBytes)
+    }
+
     /* init */
     DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic");
-    ip += (dictAndPrefixLength == 0);
+    ip0 += (dictAndPrefixLength == 0);
     /* dictMatchState repCode checks don't currently handle repCode == 0
      * disabling. */
     assert(offset_1 <= dictAndPrefixLength);
     assert(offset_2 <= dictAndPrefixLength);
 
-    /* Main Search Loop */
-    while (ip < ilimit) {   /* < instead of <=, because repcode check at (ip+1) */
+    /* Outer search loop */
+    assert(stepSize >= 1);
+    while (ip1 <= ilimit) {   /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */
         size_t mLength;
-        size_t const h = ZSTD_hashPtr(ip, hlog, mls);
-        U32 const curr = (U32)(ip-base);
-        U32 const matchIndex = hashTable[h];
-        const BYTE* match = base + matchIndex;
-        const U32 repIndex = curr + 1 - offset_1;
-        const BYTE* repMatch = (repIndex < prefixStartIndex) ?
-                               dictBase + (repIndex - dictIndexDelta) :
-                               base + repIndex;
-        hashTable[h] = curr;   /* update hash table */
-
-        if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */
-          && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
-            const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
-            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
-            ip++;
-            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
-        } else if ( (matchIndex <= prefixStartIndex) ) {
-            size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls);
-            U32 const dictMatchIndex = dictHashTable[dictHash];
-            const BYTE* dictMatch = dictBase + dictMatchIndex;
-            if (dictMatchIndex <= dictStartIndex ||
-                MEM_read32(dictMatch) != MEM_read32(ip)) {
-                assert(stepSize >= 1);
-                ip += ((ip-anchor) >> kSearchStrength) + stepSize;
-                continue;
-            } else {
-                /* found a dict match */
-                U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta);
-                mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4;
-                while (((ip>anchor) & (dictMatch>dictStart))
-                     && (ip[-1] == dictMatch[-1])) {
-                    ip--; dictMatch--; mLength++;
+        size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls);
+
+        size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls);
+        U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS];
+        int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0);
+
+        U32 matchIndex = hashTable[hash0];
+        U32 curr = (U32)(ip0 - base);
+        size_t step = stepSize;
+        const size_t kStepIncr = 1 << kSearchStrength;
+        const BYTE* nextStep = ip0 + kStepIncr;
+
+        /* Inner search loop */
+        while (1) {
+            const BYTE* match = base + matchIndex;
+            const U32 repIndex = curr + 1 - offset_1;
+            const BYTE* repMatch = (repIndex < prefixStartIndex) ?
+                                   dictBase + (repIndex - dictIndexDelta) :
+                                   base + repIndex;
+            const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls);
+            size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls);
+            hashTable[hash0] = curr;   /* update hash table */
+
+            if (((U32) ((prefixStartIndex - 1) - repIndex) >=
+                 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */
+                && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) {
+                const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+                mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4;
+                ip0++;
+                ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
+                break;
+            }
+
+            if (dictTagsMatch) {
+                /* Found a possible dict match */
+                const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
+                const BYTE* dictMatch = dictBase + dictMatchIndex;
+                if (dictMatchIndex > dictStartIndex &&
+                    MEM_read32(dictMatch) == MEM_read32(ip0)) {
+                    /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */
+                    if (matchIndex <= prefixStartIndex) {
+                        U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta);
+                        mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4;
+                        while (((ip0 > anchor) & (dictMatch > dictStart))
+                            && (ip0[-1] == dictMatch[-1])) {
+                            ip0--;
+                            dictMatch--;
+                            mLength++;
+                        } /* catch up */
+                        offset_2 = offset_1;
+                        offset_1 = offset;
+                        ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+                        break;
+                    }
+                }
+            }
+
+            if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) {
+                /* found a regular match */
+                U32 const offset = (U32) (ip0 - match);
+                mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4;
+                while (((ip0 > anchor) & (match > prefixStart))
+                       && (ip0[-1] == match[-1])) {
+                    ip0--;
+                    match--;
+                    mLength++;
                 } /* catch up */
                 offset_2 = offset_1;
                 offset_1 = offset;
-                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+                ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+                break;
             }
-        } else if (MEM_read32(match) != MEM_read32(ip)) {
-            /* it's not a match, and we're not going to check the dictionary */
-            assert(stepSize >= 1);
-            ip += ((ip-anchor) >> kSearchStrength) + stepSize;
-            continue;
-        } else {
-            /* found a regular match */
-            U32 const offset = (U32)(ip-match);
-            mLength = ZSTD_count(ip+4, match+4, iend) + 4;
-            while (((ip>anchor) & (match>prefixStart))
-                 && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
-            offset_2 = offset_1;
-            offset_1 = offset;
-            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
-        }
+
+            /* Prepare for next iteration */
+            dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS];
+            dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1);
+            matchIndex = hashTable[hash1];
+
+            if (ip1 >= nextStep) {
+                step++;
+                nextStep += kStepIncr;
+            }
+            ip0 = ip1;
+            ip1 = ip1 + step;
+            if (ip1 > ilimit) goto _cleanup;
+
+            curr = (U32)(ip0 - base);
+            hash0 = hash1;
+        }   /* end inner search loop */
 
         /* match found */
-        ip += mLength;
-        anchor = ip;
+        assert(mLength);
+        ip0 += mLength;
+        anchor = ip0;
 
-        if (ip <= ilimit) {
+        if (ip0 <= ilimit) {
             /* Fill Table */
             assert(base+curr+2 > istart);  /* check base overflow */
             hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2;  /* here because curr+2 could be > iend-8 */
-            hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
+            hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
 
             /* check immediate repcode */
-            while (ip <= ilimit) {
-                U32 const current2 = (U32)(ip-base);
+            while (ip0 <= ilimit) {
+                U32 const current2 = (U32)(ip0-base);
                 U32 const repIndex2 = current2 - offset_2;
                 const BYTE* repMatch2 = repIndex2 < prefixStartIndex ?
                         dictBase - dictIndexDelta + repIndex2 :
                         base + repIndex2;
                 if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */)
-                   && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+                   && (MEM_read32(repMatch2) == MEM_read32(ip0))) {
                     const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
-                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+                    size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
                     U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
-                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2);
-                    hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
-                    ip += repLength2;
-                    anchor = ip;
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
+                    hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2;
+                    ip0 += repLength2;
+                    anchor = ip0;
                     continue;
                 }
                 break;
             }
         }
+
+        /* Prepare for next iteration */
+        assert(ip0 == anchor);
+        ip1 = ip0 + stepSize;
     }
 
+_cleanup:
     /* save reps for next block */
-    rep[0] = offset_1 ? offset_1 : offsetSaved;
-    rep[1] = offset_2 ? offset_2 : offsetSaved;
+    rep[0] = offset_1;
+    rep[1] = offset_2;
 
     /* Return the last literals size */
     return (size_t)(iend - anchor);
@@ -553,11 +690,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
     U32* const hashTable = ms->hashTable;
     U32 const hlog = cParams->hashLog;
     /* support stepSize of 0 */
-    U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
+    size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1;
     const BYTE* const base = ms->window.base;
     const BYTE* const dictBase = ms->window.dictBase;
     const BYTE* const istart = (const BYTE*)src;
-    const BYTE* ip = istart;
     const BYTE* anchor = istart;
     const U32   endIndex = (U32)((size_t)(istart - base) + srcSize);
     const U32   lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
@@ -570,6 +706,28 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
     const BYTE* const iend = istart + srcSize;
     const BYTE* const ilimit = iend - 8;
     U32 offset_1=rep[0], offset_2=rep[1];
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;
+
+    const BYTE* ip0 = istart;
+    const BYTE* ip1;
+    const BYTE* ip2;
+    const BYTE* ip3;
+    U32 current0;
+
+
+    size_t hash0; /* hash for ip0 */
+    size_t hash1; /* hash for ip1 */
+    U32 idx; /* match idx for ip0 */
+    const BYTE* idxBase; /* base pointer for idx */
+
+    U32 offcode;
+    const BYTE* match0;
+    size_t mLength;
+    const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */
+
+    size_t step;
+    const BYTE* nextStep;
+    const size_t kStepIncr = (1 << (kSearchStrength - 1));
 
     (void)hasStep; /* not currently specialized on whether it's accelerated */
 
@@ -579,75 +737,202 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
     if (prefixStartIndex == dictStartIndex)
         return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);
 
-    /* Search Loop */
-    while (ip < ilimit) {  /* < instead of <=, because (ip+1) */
-        const size_t h = ZSTD_hashPtr(ip, hlog, mls);
-        const U32    matchIndex = hashTable[h];
-        const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base;
-        const BYTE*  match = matchBase + matchIndex;
-        const U32    curr = (U32)(ip-base);
-        const U32    repIndex = curr + 1 - offset_1;
-        const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
-        const BYTE* const repMatch = repBase + repIndex;
-        hashTable[h] = curr;   /* update hash table */
-        DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr);
-
-        if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */
-             & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */
-           && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
-            const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
-            size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4;
-            ip++;
-            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength);
-            ip += rLength;
-            anchor = ip;
-        } else {
-            if ( (matchIndex < dictStartIndex) ||
-                 (MEM_read32(match) != MEM_read32(ip)) ) {
-                assert(stepSize >= 1);
-                ip += ((ip-anchor) >> kSearchStrength) + stepSize;
-                continue;
+    {   U32 const curr = (U32)(ip0 - base);
+        U32 const maxRep = curr - dictStartIndex;
+        if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0;
+        if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0;
+    }
+
+    /* start each op */
+_start: /* Requires: ip0 */
+
+    step = stepSize;
+    nextStep = ip0 + kStepIncr;
+
+    /* calculate positions, ip0 - anchor == 0, so we skip step calc */
+    ip1 = ip0 + 1;
+    ip2 = ip0 + step;
+    ip3 = ip2 + 1;
+
+    if (ip3 >= ilimit) {
+        goto _cleanup;
+    }
+
+    hash0 = ZSTD_hashPtr(ip0, hlog, mls);
+    hash1 = ZSTD_hashPtr(ip1, hlog, mls);
+
+    idx = hashTable[hash0];
+    idxBase = idx < prefixStartIndex ? dictBase : base;
+
+    do {
+        {   /* load repcode match for ip[2] */
+            U32 const current2 = (U32)(ip2 - base);
+            U32 const repIndex = current2 - offset_1;
+            const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
+            U32 rval;
+            if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */
+                 & (offset_1 > 0) ) {
+                rval = MEM_read32(repBase + repIndex);
+            } else {
+                rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */
             }
-            {   const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend;
-                const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart;
-                U32 const offset = curr - matchIndex;
-                size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4;
-                while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; }   /* catch up */
-                offset_2 = offset_1; offset_1 = offset;  /* update offset history */
-                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
-                ip += mLength;
-                anchor = ip;
+
+            /* write back hash table entry */
+            current0 = (U32)(ip0 - base);
+            hashTable[hash0] = current0;
+
+            /* check repcode at ip[2] */
+            if (MEM_read32(ip2) == rval) {
+                ip0 = ip2;
+                match0 = repBase + repIndex;
+                matchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+                assert((match0 != prefixStart) & (match0 != dictStart));
+                mLength = ip0[-1] == match0[-1];
+                ip0 -= mLength;
+                match0 -= mLength;
+                offcode = REPCODE1_TO_OFFBASE;
+                mLength += 4;
+                goto _match;
         }   }
 
-        if (ip <= ilimit) {
-            /* Fill Table */
-            hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2;
-            hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
-            /* check immediate repcode */
-            while (ip <= ilimit) {
-                U32 const current2 = (U32)(ip-base);
-                U32 const repIndex2 = current2 - offset_2;
-                const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
-                if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex))  /* intentional overflow */
-                   && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
-                    const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
-                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
-                    { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; }  /* swap offset_2 <=> offset_1 */
-                    ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2);
-                    hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
-                    ip += repLength2;
-                    anchor = ip;
-                    continue;
-                }
-                break;
-    }   }   }
+        {   /* load match for ip[0] */
+            U32 const mval = idx >= dictStartIndex ?
+                    MEM_read32(idxBase + idx) :
+                    MEM_read32(ip0) ^ 1; /* guaranteed not to match */
+
+            /* check match at ip[0] */
+            if (MEM_read32(ip0) == mval) {
+                /* found a match! */
+                goto _offset;
+        }   }
+
+        /* lookup ip[1] */
+        idx = hashTable[hash1];
+        idxBase = idx < prefixStartIndex ? dictBase : base;
+
+        /* hash ip[2] */
+        hash0 = hash1;
+        hash1 = ZSTD_hashPtr(ip2, hlog, mls);
+
+        /* advance to next positions */
+        ip0 = ip1;
+        ip1 = ip2;
+        ip2 = ip3;
+
+        /* write back hash table entry */
+        current0 = (U32)(ip0 - base);
+        hashTable[hash0] = current0;
+
+        {   /* load match for ip[0] */
+            U32 const mval = idx >= dictStartIndex ?
+                    MEM_read32(idxBase + idx) :
+                    MEM_read32(ip0) ^ 1; /* guaranteed not to match */
+
+            /* check match at ip[0] */
+            if (MEM_read32(ip0) == mval) {
+                /* found a match! */
+                goto _offset;
+        }   }
+
+        /* lookup ip[1] */
+        idx = hashTable[hash1];
+        idxBase = idx < prefixStartIndex ? dictBase : base;
+
+        /* hash ip[2] */
+        hash0 = hash1;
+        hash1 = ZSTD_hashPtr(ip2, hlog, mls);
+
+        /* advance to next positions */
+        ip0 = ip1;
+        ip1 = ip2;
+        ip2 = ip0 + step;
+        ip3 = ip1 + step;
+
+        /* calculate step */
+        if (ip2 >= nextStep) {
+            step++;
+            PREFETCH_L1(ip1 + 64);
+            PREFETCH_L1(ip1 + 128);
+            nextStep += kStepIncr;
+        }
+    } while (ip3 < ilimit);
+
+_cleanup:
+    /* Note that there are probably still a couple positions we could search.
+     * However, it seems to be a meaningful performance hit to try to search
+     * them. So let's not. */
+
+    /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
+     * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
+    offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
 
     /* save reps for next block */
-    rep[0] = offset_1;
-    rep[1] = offset_2;
+    rep[0] = offset_1 ? offset_1 : offsetSaved1;
+    rep[1] = offset_2 ? offset_2 : offsetSaved2;
 
     /* Return the last literals size */
     return (size_t)(iend - anchor);
+
+_offset: /* Requires: ip0, idx, idxBase */
+
+    /* Compute the offset code. */
+    {   U32 const offset = current0 - idx;
+        const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart;
+        matchEnd = idx < prefixStartIndex ? dictEnd : iend;
+        match0 = idxBase + idx;
+        offset_2 = offset_1;
+        offset_1 = offset;
+        offcode = OFFSET_TO_OFFBASE(offset);
+        mLength = 4;
+
+        /* Count the backwards match length. */
+        while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) {
+            ip0--;
+            match0--;
+            mLength++;
+    }   }
+
+_match: /* Requires: ip0, match0, offcode, matchEnd */
+
+    /* Count the forward length. */
+    assert(matchEnd != 0);
+    mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart);
+
+    ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);
+
+    ip0 += mLength;
+    anchor = ip0;
+
+    /* write next hash table entry */
+    if (ip1 < ip0) {
+        hashTable[hash1] = (U32)(ip1 - base);
+    }
+
+    /* Fill table and check for immediate repcode. */
+    if (ip0 <= ilimit) {
+        /* Fill Table */
+        assert(base+current0+2 > istart);  /* check base overflow */
+        hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2;  /* here because current+2 could be > iend-8 */
+        hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
+
+        while (ip0 <= ilimit) {
+            U32 const repIndex2 = (U32)(ip0-base) - offset_2;
+            const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
+            if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0))  /* intentional underflow */
+                 && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) {
+                const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+                size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+                { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; }  /* swap offset_2 <=> offset_1 */
+                ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
+                hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
+                ip0 += repLength2;
+                anchor = ip0;
+                continue;
+            }
+            break;
+    }   }
+
+    goto _start;
 }
 
 ZSTD_GEN_FAST_FN(extDict, 4, 0)
@@ -660,6 +945,7 @@ size_t ZSTD_compressBlock_fast_extDict(
         void const* src, size_t srcSize)
 {
     U32 const mls = ms->cParams.minMatch;
+    assert(ms->dictMatchState == NULL);
     switch(mls)
     {
     default: /* includes case 3 */
diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h
index fddc2f532d21..e64d9e1b2d39 100644
--- a/lib/zstd/compress/zstd_fast.h
+++ b/lib/zstd/compress/zstd_fast.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -16,7 +17,8 @@
 #include "zstd_compress_internal.h"
 
 void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
-                        void const* end, ZSTD_dictTableLoadMethod_e dtlm);
+                        void const* end, ZSTD_dictTableLoadMethod_e dtlm,
+                        ZSTD_tableFillPurpose_e tfp);
 size_t ZSTD_compressBlock_fast(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c
index 0298a01a7504..f6b4978ceba7 100644
--- a/lib/zstd/compress/zstd_lazy.c
+++ b/lib/zstd/compress/zstd_lazy.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -10,6 +11,9 @@
 
 #include "zstd_compress_internal.h"
 #include "zstd_lazy.h"
+#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */
+
+#define kLazySkippingStep 8
 
 
 /*-*************************************
@@ -197,8 +201,8 @@ ZSTD_DUBT_findBetterDictMatch (
             U32 matchIndex = dictMatchIndex + dictIndexDelta;
             if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
                 DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
-                    curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex);
-                bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
+                    curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex);
+                bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
             }
             if (ip+matchLength == iend) {   /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
                 break;   /* drop, to guarantee consistency (miss a little bit of compression) */
@@ -218,7 +222,7 @@ ZSTD_DUBT_findBetterDictMatch (
     }
 
     if (bestLength >= MINMATCH) {
-        U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
+        U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex;
         DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
                     curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
     }
@@ -230,7 +234,7 @@ ZSTD_DUBT_findBetterDictMatch (
 static size_t
 ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
                         const BYTE* const ip, const BYTE* const iend,
-                        size_t* offsetPtr,
+                        size_t* offBasePtr,
                         U32 const mls,
                         const ZSTD_dictMode_e dictMode)
 {
@@ -327,8 +331,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
             if (matchLength > bestLength) {
                 if (matchLength > matchEndIdx - matchIndex)
                     matchEndIdx = matchIndex + (U32)matchLength;
-                if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
-                    bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
+                if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) )
+                    bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex);
                 if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
                     if (dictMode == ZSTD_dictMatchState) {
                         nbCompares = 0; /* in addition to avoiding checking any
@@ -361,16 +365,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
         if (dictMode == ZSTD_dictMatchState && nbCompares) {
             bestLength = ZSTD_DUBT_findBetterDictMatch(
                     ms, ip, iend,
-                    offsetPtr, bestLength, nbCompares,
+                    offBasePtr, bestLength, nbCompares,
                     mls, dictMode);
         }
 
         assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
         ms->nextToUpdate = matchEndIdx - 8;   /* skip repetitive patterns */
         if (bestLength >= MINMATCH) {
-            U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
+            U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex;
             DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
-                        curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
+                        curr, (U32)bestLength, (U32)*offBasePtr, mIndex);
         }
         return bestLength;
     }
@@ -381,14 +385,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
 FORCE_INLINE_TEMPLATE size_t
 ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
                 const BYTE* const ip, const BYTE* const iLimit,
-                      size_t* offsetPtr,
+                      size_t* offBasePtr,
                 const U32 mls /* template */,
                 const ZSTD_dictMode_e dictMode)
 {
     DEBUGLOG(7, "ZSTD_BtFindBestMatch");
     if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
     ZSTD_updateDUBT(ms, ip, iLimit, mls);
-    return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
+    return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode);
 }
 
 /* *********************************
@@ -561,7 +565,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
         /* save best solution */
         if (currentMl > ml) {
             ml = currentMl;
-            *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
+            *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
             if (ip+currentMl == iLimit) {
                 /* best possible, avoids read overflow on next attempt */
                 return ml;
@@ -598,7 +602,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
             /* save best solution */
             if (currentMl > ml) {
                 ml = currentMl;
-                *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
+                *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
                 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
             }
         }
@@ -617,7 +621,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
 FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
                         ZSTD_matchState_t* ms,
                         const ZSTD_compressionParameters* const cParams,
-                        const BYTE* ip, U32 const mls)
+                        const BYTE* ip, U32 const mls, U32 const lazySkipping)
 {
     U32* const hashTable  = ms->hashTable;
     const U32 hashLog = cParams->hashLog;
@@ -632,6 +636,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
         NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
         hashTable[h] = idx;
         idx++;
+        /* Stop inserting every position when in the lazy skipping mode. */
+        if (lazySkipping)
+            break;
     }
 
     ms->nextToUpdate = target;
@@ -640,7 +647,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
 
 U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
     const ZSTD_compressionParameters* const cParams = &ms->cParams;
-    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
+    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0);
 }
 
 /* inlining is important to hardwire a hot branch (template emulation) */
@@ -684,14 +691,15 @@ size_t ZSTD_HcFindBestMatch(
     }
 
     /* HC4 match finder */
-    matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
+    matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping);
 
     for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
         size_t currentMl=0;
         if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
             const BYTE* const match = base + matchIndex;
             assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
-            if (match[ml] == ip[ml])   /* potentially better */
+            /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
+            if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3))   /* potentially better */
                 currentMl = ZSTD_count(ip, match, iLimit);
         } else {
             const BYTE* const match = dictBase + matchIndex;
@@ -703,7 +711,7 @@ size_t ZSTD_HcFindBestMatch(
         /* save best solution */
         if (currentMl > ml) {
             ml = currentMl;
-            *offsetPtr = STORE_OFFSET(curr - matchIndex);
+            *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
             if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
         }
 
@@ -739,7 +747,7 @@ size_t ZSTD_HcFindBestMatch(
             if (currentMl > ml) {
                 ml = currentMl;
                 assert(curr > matchIndex + dmsIndexDelta);
-                *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
+                *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
                 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
             }
 
@@ -756,8 +764,6 @@ size_t ZSTD_HcFindBestMatch(
 * (SIMD) Row-based matchfinder
 ***********************************/
 /* Constants for row-based hash */
-#define ZSTD_ROW_HASH_TAG_OFFSET 16     /* byte offset of hashes in the match state's tagTable from the beginning of a row */
-#define ZSTD_ROW_HASH_TAG_BITS 8        /* nb bits to use for the tag */
 #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
 #define ZSTD_ROW_HASH_MAX_ENTRIES 64    /* absolute maximum number of entries per row, for all configurations */
 
@@ -769,64 +775,19 @@ typedef U64 ZSTD_VecMask;   /* Clarifies when we are interacting with a U64 repr
  * Starting from the LSB, returns the idx of the next non-zero bit.
  * Basically counting the nb of trailing zeroes.
  */
-static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
-    assert(val != 0);
-#   if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))
-    if (sizeof(size_t) == 4) {
-        U32 mostSignificantWord = (U32)(val >> 32);
-        U32 leastSignificantWord = (U32)val;
-        if (leastSignificantWord == 0) {
-            return 32 + (U32)__builtin_ctz(mostSignificantWord);
-        } else {
-            return (U32)__builtin_ctz(leastSignificantWord);
-        }
-    } else {
-        return (U32)__builtin_ctzll(val);
-    }
-#   else
-    /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count
-     * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer
-     */
-    val = ~val & (val - 1ULL); /* Lowest set bit mask */
-    val = val - ((val >> 1) & 0x5555555555555555);
-    val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL);
-    return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56);
-#   endif
-}
-
-/* ZSTD_rotateRight_*():
- * Rotates a bitfield to the right by "count" bits.
- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
- */
-FORCE_INLINE_TEMPLATE
-U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
-    assert(count < 64);
-    count &= 0x3F; /* for fickle pattern recognition */
-    return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
-}
-
-FORCE_INLINE_TEMPLATE
-U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
-    assert(count < 32);
-    count &= 0x1F; /* for fickle pattern recognition */
-    return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
-}
-
-FORCE_INLINE_TEMPLATE
-U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
-    assert(count < 16);
-    count &= 0x0F; /* for fickle pattern recognition */
-    return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
+MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
+    return ZSTD_countTrailingZeros64(val);
 }
 
 /* ZSTD_row_nextIndex():
  * Returns the next index to insert at within a tagTable row, and updates the "head"
- * value to reflect the update. Essentially cycles backwards from [0, {entries per row})
+ * value to reflect the update. Essentially cycles backwards from [1, {entries per row})
  */
 FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
-  U32 const next = (*tagRow - 1) & rowMask;
-  *tagRow = (BYTE)next;
-  return next;
+    U32 next = (*tagRow-1) & rowMask;
+    next += (next == 0) ? rowMask : 0; /* skip first position */
+    *tagRow = (BYTE)next;
+    return next;
 }
 
 /* ZSTD_isAligned():
@@ -840,7 +801,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
 /* ZSTD_row_prefetch():
  * Performs prefetching for the hashTable and tagTable at a given row.
  */
-FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
+FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) {
     PREFETCH_L1(hashTable + relRow);
     if (rowLog >= 5) {
         PREFETCH_L1(hashTable + relRow + 16);
@@ -864,13 +825,13 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B
                                    U32 idx, const BYTE* const iLimit)
 {
     U32 const* const hashTable = ms->hashTable;
-    U16 const* const tagTable = ms->tagTable;
+    BYTE const* const tagTable = ms->tagTable;
     U32 const hashLog = ms->rowHashLog;
     U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
     U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
 
     for (; idx < lim; ++idx) {
-        U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+        U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
         U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
         ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
         ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
@@ -886,11 +847,12 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B
  * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
  */
 FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
-                                                  U16 const* tagTable, BYTE const* base,
+                                                  BYTE const* tagTable, BYTE const* base,
                                                   U32 idx, U32 const hashLog,
-                                                  U32 const rowLog, U32 const mls)
+                                                  U32 const rowLog, U32 const mls,
+                                                  U64 const hashSalt)
 {
-    U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+    U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
     U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
     ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
     {   U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
@@ -908,22 +870,21 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
                                                         U32 const rowMask, U32 const useCache)
 {
     U32* const hashTable = ms->hashTable;
-    U16* const tagTable = ms->tagTable;
+    BYTE* const tagTable = ms->tagTable;
     U32 const hashLog = ms->rowHashLog;
     const BYTE* const base = ms->window.base;
 
     DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
     for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
-        U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls)
-                                  : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+        U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt)
+                                  : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
         U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
         U32* const row = hashTable + relRow;
-        BYTE* tagRow = (BYTE*)(tagTable + relRow);  /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
-                                                       Explicit cast allows us to get exact desired position within each row */
+        BYTE* tagRow = tagTable + relRow;
         U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
 
-        assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
-        ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
+        assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt));
+        tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK;
         row[pos] = updateStartIdx;
     }
 }
@@ -971,7 +932,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
     const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
 
     DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
-    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
+    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
+}
+
+/* Returns the mask width of bits group of which will be set to 1. Given not all
+ * architectures have easy movemask instruction, this helps to iterate over
+ * groups of bits easier and faster.
+ */
+FORCE_INLINE_TEMPLATE U32
+ZSTD_row_matchMaskGroupWidth(const U32 rowEntries)
+{
+    assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
+    assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
+    (void)rowEntries;
+#if defined(ZSTD_ARCH_ARM_NEON)
+    /* NEON path only works for little endian */
+    if (!MEM_isLittleEndian()) {
+        return 1;
+    }
+    if (rowEntries == 16) {
+        return 4;
+    }
+    if (rowEntries == 32) {
+        return 2;
+    }
+    if (rowEntries == 64) {
+        return 1;
+    }
+#endif
+    return 1;
 }
 
 #if defined(ZSTD_ARCH_X86_SSE2)
@@ -994,71 +983,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U
 }
 #endif
 
-/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
- * the hash at the nth position in a row of the tagTable.
- * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
- * to match up with the actual layout of the entries within the hashTable */
+#if defined(ZSTD_ARCH_ARM_NEON)
+FORCE_INLINE_TEMPLATE ZSTD_VecMask
+ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped)
+{
+    assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
+    if (rowEntries == 16) {
+        /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits.
+         * After that groups of 4 bits represent the equalMask. We lower
+         * all bits except the highest in these groups by doing AND with
+         * 0x88 = 0b10001000.
+         */
+        const uint8x16_t chunk = vld1q_u8(src);
+        const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
+        const uint8x8_t res = vshrn_n_u16(equalMask, 4);
+        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0);
+        return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull;
+    } else if (rowEntries == 32) {
+        /* Same idea as with rowEntries == 16 but doing AND with
+         * 0x55 = 0b01010101.
+         */
+        const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src);
+        const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
+        const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
+        const uint8x16_t dup = vdupq_n_u8(tag);
+        const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6);
+        const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6);
+        const uint8x8_t res = vsli_n_u8(t0, t1, 4);
+        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ;
+        return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull;
+    } else { /* rowEntries == 64 */
+        const uint8x16x4_t chunk = vld4q_u8(src);
+        const uint8x16_t dup = vdupq_n_u8(tag);
+        const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
+        const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
+        const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
+        const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
+
+        const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
+        const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
+        const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
+        const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
+        const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
+        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
+        return ZSTD_rotateRight_U64(matches, headGrouped);
+    }
+}
+#endif
+
+/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by
+ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag"
+ * matches the hash at the nth position in a row of the tagTable.
+ * Each row is a circular buffer beginning at the value of "headGrouped". So we
+ * must rotate the "matches" bitfield to match up with the actual layout of the
+ * entries within the hashTable */
 FORCE_INLINE_TEMPLATE ZSTD_VecMask
-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries)
+ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries)
 {
-    const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET;
+    const BYTE* const src = tagRow;
     assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
     assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
+    assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8);
 
 #if defined(ZSTD_ARCH_X86_SSE2)
 
-    return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head);
+    return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped);
 
 #else /* SW or NEON-LE */
 
 # if defined(ZSTD_ARCH_ARM_NEON)
   /* This NEON path only works for little endian - otherwise use SWAR below */
     if (MEM_isLittleEndian()) {
-        if (rowEntries == 16) {
-            const uint8x16_t chunk = vld1q_u8(src);
-            const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
-            const uint16x8_t t0 = vshlq_n_u16(equalMask, 7);
-            const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14));
-            const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14));
-            const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28));
-            const U16 hi = (U16)vgetq_lane_u8(t3, 8);
-            const U16 lo = (U16)vgetq_lane_u8(t3, 0);
-            return ZSTD_rotateRight_U16((hi << 8) | lo, head);
-        } else if (rowEntries == 32) {
-            const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src);
-            const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
-            const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
-            const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag));
-            const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag));
-            const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0));
-            const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1));
-            const uint8x8_t t0 = vreinterpret_u8_s8(pack0);
-            const uint8x8_t t1 = vreinterpret_u8_s8(pack1);
-            const uint8x8_t t2 = vsri_n_u8(t1, t0, 2);
-            const uint8x8x2_t t3 = vuzp_u8(t2, t0);
-            const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4);
-            const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0);
-            return ZSTD_rotateRight_U32(matches, head);
-        } else { /* rowEntries == 64 */
-            const uint8x16x4_t chunk = vld4q_u8(src);
-            const uint8x16_t dup = vdupq_n_u8(tag);
-            const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
-            const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
-            const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
-            const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
-
-            const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
-            const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
-            const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
-            const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
-            const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
-            const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
-            return ZSTD_rotateRight_U64(matches, head);
-        }
+        return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped);
     }
 # endif /* ZSTD_ARCH_ARM_NEON */
     /* SWAR */
-    {   const size_t chunkSize = sizeof(size_t);
+    {   const int chunkSize = sizeof(size_t);
         const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
         const size_t xFF = ~((size_t)0);
         const size_t x01 = xFF / 0xFF;
@@ -1091,11 +1091,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head,
         }
         matches = ~matches;
         if (rowEntries == 16) {
-            return ZSTD_rotateRight_U16((U16)matches, head);
+            return ZSTD_rotateRight_U16((U16)matches, headGrouped);
         } else if (rowEntries == 32) {
-            return ZSTD_rotateRight_U32((U32)matches, head);
+            return ZSTD_rotateRight_U32((U32)matches, headGrouped);
         } else {
-            return ZSTD_rotateRight_U64((U64)matches, head);
+            return ZSTD_rotateRight_U64((U64)matches, headGrouped);
         }
     }
 #endif
@@ -1125,7 +1125,7 @@ size_t ZSTD_RowFindBestMatch(
                         const U32 rowLog)
 {
     U32* const hashTable = ms->hashTable;
-    U16* const tagTable = ms->tagTable;
+    BYTE* const tagTable = ms->tagTable;
     U32* const hashCache = ms->hashCache;
     const U32 hashLog = ms->rowHashLog;
     const ZSTD_compressionParameters* const cParams = &ms->cParams;
@@ -1143,8 +1143,11 @@ size_t ZSTD_RowFindBestMatch(
     const U32 rowEntries = (1U << rowLog);
     const U32 rowMask = rowEntries - 1;
     const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
+    const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries);
+    const U64 hashSalt = ms->hashSalt;
     U32 nbAttempts = 1U << cappedSearchLog;
     size_t ml=4-1;
+    U32 hash;
 
     /* DMS/DDS variables that may be referenced laster */
     const ZSTD_matchState_t* const dms = ms->dictMatchState;
@@ -1168,7 +1171,7 @@ size_t ZSTD_RowFindBestMatch(
     if (dictMode == ZSTD_dictMatchState) {
         /* Prefetch DMS rows */
         U32* const dmsHashTable = dms->hashTable;
-        U16* const dmsTagTable = dms->tagTable;
+        BYTE* const dmsTagTable = dms->tagTable;
         U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
         U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
         dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
@@ -1178,23 +1181,34 @@ size_t ZSTD_RowFindBestMatch(
     }
 
     /* Update the hashTable and tagTable up to (but not including) ip */
-    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
+    if (!ms->lazySkipping) {
+        ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
+        hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt);
+    } else {
+        /* Stop inserting every position when in the lazy skipping mode.
+         * The hash cache is also not kept up to date in this mode.
+         */
+        hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
+        ms->nextToUpdate = curr;
+    }
+    ms->hashSaltEntropy += hash; /* collect salt entropy */
+
     {   /* Get the hash for ip, compute the appropriate row */
-        U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
         U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
         U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
         U32* const row = hashTable + relRow;
         BYTE* tagRow = (BYTE*)(tagTable + relRow);
-        U32 const head = *tagRow & rowMask;
+        U32 const headGrouped = (*tagRow & rowMask) * groupWidth;
         U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
         size_t numMatches = 0;
         size_t currMatch = 0;
-        ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
+        ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries);
 
         /* Cycle through the matches and prefetch */
-        for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
-            U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+        for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
+            U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
             U32 const matchIndex = row[matchPos];
+            if(matchPos == 0) continue;
             assert(numMatches < rowEntries);
             if (matchIndex < lowLimit)
                 break;
@@ -1204,13 +1218,14 @@ size_t ZSTD_RowFindBestMatch(
                 PREFETCH_L1(dictBase + matchIndex);
             }
             matchBuffer[numMatches++] = matchIndex;
+            --nbAttempts;
         }
 
         /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
            in ZSTD_row_update_internal() at the next search. */
         {
             U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
-            tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
+            tagRow[pos] = (BYTE)tag;
             row[pos] = ms->nextToUpdate++;
         }
 
@@ -1224,7 +1239,8 @@ size_t ZSTD_RowFindBestMatch(
             if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
                 const BYTE* const match = base + matchIndex;
                 assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
-                if (match[ml] == ip[ml])   /* potentially better */
+                /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
+                if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3))   /* potentially better */
                     currentMl = ZSTD_count(ip, match, iLimit);
             } else {
                 const BYTE* const match = dictBase + matchIndex;
@@ -1236,7 +1252,7 @@ size_t ZSTD_RowFindBestMatch(
             /* Save best solution */
             if (currentMl > ml) {
                 ml = currentMl;
-                *offsetPtr = STORE_OFFSET(curr - matchIndex);
+                *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
                 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
             }
         }
@@ -1254,19 +1270,21 @@ size_t ZSTD_RowFindBestMatch(
         const U32 dmsSize              = (U32)(dmsEnd - dmsBase);
         const U32 dmsIndexDelta        = dictLimit - dmsSize;
 
-        {   U32 const head = *dmsTagRow & rowMask;
+        {   U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth;
             U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
             size_t numMatches = 0;
             size_t currMatch = 0;
-            ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
+            ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries);
 
-            for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
-                U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+            for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
+                U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
                 U32 const matchIndex = dmsRow[matchPos];
+                if(matchPos == 0) continue;
                 if (matchIndex < dmsLowestIndex)
                     break;
                 PREFETCH_L1(dmsBase + matchIndex);
                 matchBuffer[numMatches++] = matchIndex;
+                --nbAttempts;
             }
 
             /* Return the longest match */
@@ -1285,7 +1303,7 @@ size_t ZSTD_RowFindBestMatch(
                 if (currentMl > ml) {
                     ml = currentMl;
                     assert(curr > matchIndex + dmsIndexDelta);
-                    *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
+                    *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
                     if (ip+currentMl == iLimit) break;
                 }
             }
@@ -1491,7 +1509,8 @@ ZSTD_compressBlock_lazy_generic(
     const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
     const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
 
-    U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
+    U32 offset_1 = rep[0], offset_2 = rep[1];
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;
 
     const int isDMS = dictMode == ZSTD_dictMatchState;
     const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
@@ -1512,8 +1531,8 @@ ZSTD_compressBlock_lazy_generic(
         U32 const curr = (U32)(ip - base);
         U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
         U32 const maxRep = curr - windowLow;
-        if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
-        if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
+        if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
+        if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
     }
     if (isDxS) {
         /* dictMatchState repCode checks don't currently handle repCode == 0
@@ -1522,10 +1541,11 @@ ZSTD_compressBlock_lazy_generic(
         assert(offset_2 <= dictAndPrefixLength);
     }
 
+    /* Reset the lazy skipping state */
+    ms->lazySkipping = 0;
+
     if (searchMethod == search_rowHash) {
-        ZSTD_row_fillHashCache(ms, base, rowLog,
-                            MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
-                            ms->nextToUpdate, ilimit);
+        ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
     }
 
     /* Match Loop */
@@ -1537,7 +1557,7 @@ ZSTD_compressBlock_lazy_generic(
 #endif
     while (ip < ilimit) {
         size_t matchLength=0;
-        size_t offcode=STORE_REPCODE_1;
+        size_t offBase = REPCODE1_TO_OFFBASE;
         const BYTE* start=ip+1;
         DEBUGLOG(7, "search baseline (depth 0)");
 
@@ -1562,14 +1582,23 @@ ZSTD_compressBlock_lazy_generic(
         }
 
         /* first search (depth 0) */
-        {   size_t offsetFound = 999999999;
-            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode);
+        {   size_t offbaseFound = 999999999;
+            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode);
             if (ml2 > matchLength)
-                matchLength = ml2, start = ip, offcode=offsetFound;
+                matchLength = ml2, start = ip, offBase = offbaseFound;
         }
 
         if (matchLength < 4) {
-            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
+            size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */;
+            ip += step;
+            /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
+             * In this mode we stop inserting every position into our tables, and only insert
+             * positions that we search, which is one in step positions.
+             * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
+             * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
+             * triggered once we've gone 2KB without finding any matches.
+             */
+            ms->lazySkipping = step > kLazySkippingStep;
             continue;
         }
 
@@ -1579,12 +1608,12 @@ ZSTD_compressBlock_lazy_generic(
             DEBUGLOG(7, "search depth 1");
             ip ++;
             if ( (dictMode == ZSTD_noDict)
-              && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+              && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
                 size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
                 int const gain2 = (int)(mlRep * 3);
-                int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
                 if ((mlRep >= 4) && (gain2 > gain1))
-                    matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
+                    matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
             }
             if (isDxS) {
                 const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -1596,17 +1625,17 @@ ZSTD_compressBlock_lazy_generic(
                     const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                     size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                     int const gain2 = (int)(mlRep * 3);
-                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
                     if ((mlRep >= 4) && (gain2 > gain1))
-                        matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
+                        matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
                 }
             }
-            {   size_t offset2=999999999;
-                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode);
-                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
-                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
+            {   size_t ofbCandidate=999999999;
+                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
+                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
                 if ((ml2 >= 4) && (gain2 > gain1)) {
-                    matchLength = ml2, offcode = offset2, start = ip;
+                    matchLength = ml2, offBase = ofbCandidate, start = ip;
                     continue;   /* search a better one */
             }   }
 
@@ -1615,12 +1644,12 @@ ZSTD_compressBlock_lazy_generic(
                 DEBUGLOG(7, "search depth 2");
                 ip ++;
                 if ( (dictMode == ZSTD_noDict)
-                  && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+                  && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
                     size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
                     int const gain2 = (int)(mlRep * 4);
-                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
                     if ((mlRep >= 4) && (gain2 > gain1))
-                        matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
+                        matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
                 }
                 if (isDxS) {
                     const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -1632,17 +1661,17 @@ ZSTD_compressBlock_lazy_generic(
                         const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                         size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                         int const gain2 = (int)(mlRep * 4);
-                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
                         if ((mlRep >= 4) && (gain2 > gain1))
-                            matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
+                            matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
                     }
                 }
-                {   size_t offset2=999999999;
-                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode);
-                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
-                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
+                {   size_t ofbCandidate=999999999;
+                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
+                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
                     if ((ml2 >= 4) && (gain2 > gain1)) {
-                        matchLength = ml2, offcode = offset2, start = ip;
+                        matchLength = ml2, offBase = ofbCandidate, start = ip;
                         continue;
             }   }   }
             break;  /* nothing found : store previous solution */
@@ -1653,26 +1682,33 @@ ZSTD_compressBlock_lazy_generic(
          * notably if `value` is unsigned, resulting in a large positive `-value`.
          */
         /* catch up */
-        if (STORED_IS_OFFSET(offcode)) {
+        if (OFFBASE_IS_OFFSET(offBase)) {
             if (dictMode == ZSTD_noDict) {
-                while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest))
-                     && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) )  /* only search for offset within prefix */
+                while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest))
+                     && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) )  /* only search for offset within prefix */
                     { start--; matchLength++; }
             }
             if (isDxS) {
-                U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
+                U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
                 const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
                 const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
                 while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
             }
-            offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
+            offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
         }
         /* store sequence */
 _storeSequence:
         {   size_t const litLength = (size_t)(start - anchor);
-            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
+            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
             anchor = ip = start + matchLength;
         }
+        if (ms->lazySkipping) {
+            /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
+            if (searchMethod == search_rowHash) {
+                ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+            }
+            ms->lazySkipping = 0;
+        }
 
         /* check immediate repcode */
         if (isDxS) {
@@ -1686,8 +1722,8 @@ ZSTD_compressBlock_lazy_generic(
                    && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
                     const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
                     matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
-                    offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode;   /* swap offset_2 <=> offset_1 */
-                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
+                    offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase;   /* swap offset_2 <=> offset_1 */
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
                     ip += matchLength;
                     anchor = ip;
                     continue;
@@ -1701,16 +1737,20 @@ ZSTD_compressBlock_lazy_generic(
                  && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
                 /* store sequence */
                 matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
-                offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */
-                ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
+                offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */
+                ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
                 ip += matchLength;
                 anchor = ip;
                 continue;   /* faster when present ... (?) */
     }   }   }
 
-    /* Save reps for next block */
-    rep[0] = offset_1 ? offset_1 : savedOffset;
-    rep[1] = offset_2 ? offset_2 : savedOffset;
+    /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
+     * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
+    offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
+
+    /* save reps for next block */
+    rep[0] = offset_1 ? offset_1 : offsetSaved1;
+    rep[1] = offset_2 ? offset_2 : offsetSaved2;
 
     /* Return the last literals size */
     return (size_t)(iend - anchor);
@@ -1886,12 +1926,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
 
     DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
 
+    /* Reset the lazy skipping state */
+    ms->lazySkipping = 0;
+
     /* init */
     ip += (ip == prefixStart);
     if (searchMethod == search_rowHash) {
-        ZSTD_row_fillHashCache(ms, base, rowLog,
-                               MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
-                               ms->nextToUpdate, ilimit);
+        ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
     }
 
     /* Match Loop */
@@ -1903,7 +1944,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
 #endif
     while (ip < ilimit) {
         size_t matchLength=0;
-        size_t offcode=STORE_REPCODE_1;
+        size_t offBase = REPCODE1_TO_OFFBASE;
         const BYTE* start=ip+1;
         U32 curr = (U32)(ip-base);
 
@@ -1922,14 +1963,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
         }   }
 
         /* first search (depth 0) */
-        {   size_t offsetFound = 999999999;
-            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict);
+        {   size_t ofbCandidate = 999999999;
+            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
             if (ml2 > matchLength)
-                matchLength = ml2, start = ip, offcode=offsetFound;
+                matchLength = ml2, start = ip, offBase = ofbCandidate;
         }
 
         if (matchLength < 4) {
-            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
+            size_t const step = ((size_t)(ip-anchor) >> kSearchStrength);
+            ip += step + 1;   /* jump faster over incompressible sections */
+            /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
+             * In this mode we stop inserting every position into our tables, and only insert
+             * positions that we search, which is one in step positions.
+             * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
+             * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
+             * triggered once we've gone 2KB without finding any matches.
+             */
+            ms->lazySkipping = step > kLazySkippingStep;
             continue;
         }
 
@@ -1939,7 +1989,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
             ip ++;
             curr++;
             /* check repCode */
-            if (offcode) {
+            if (offBase) {
                 const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
                 const U32 repIndex = (U32)(curr - offset_1);
                 const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
@@ -1951,18 +2001,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
                     const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                     size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                     int const gain2 = (int)(repLength * 3);
-                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
                     if ((repLength >= 4) && (gain2 > gain1))
-                        matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
+                        matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
             }   }
 
             /* search match, depth 1 */
-            {   size_t offset2=999999999;
-                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict);
-                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
-                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
+            {   size_t ofbCandidate = 999999999;
+                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
+                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
                 if ((ml2 >= 4) && (gain2 > gain1)) {
-                    matchLength = ml2, offcode = offset2, start = ip;
+                    matchLength = ml2, offBase = ofbCandidate, start = ip;
                     continue;   /* search a better one */
             }   }
 
@@ -1971,7 +2021,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
                 ip ++;
                 curr++;
                 /* check repCode */
-                if (offcode) {
+                if (offBase) {
                     const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
                     const U32 repIndex = (U32)(curr - offset_1);
                     const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
@@ -1983,38 +2033,45 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
                         const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                         size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                         int const gain2 = (int)(repLength * 4);
-                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
                         if ((repLength >= 4) && (gain2 > gain1))
-                            matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
+                            matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
                 }   }
 
                 /* search match, depth 2 */
-                {   size_t offset2=999999999;
-                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict);
-                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
-                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
+                {   size_t ofbCandidate = 999999999;
+                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
+                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
                     if ((ml2 >= 4) && (gain2 > gain1)) {
-                        matchLength = ml2, offcode = offset2, start = ip;
+                        matchLength = ml2, offBase = ofbCandidate, start = ip;
                         continue;
             }   }   }
             break;  /* nothing found : store previous solution */
         }
 
         /* catch up */
-        if (STORED_IS_OFFSET(offcode)) {
-            U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
+        if (OFFBASE_IS_OFFSET(offBase)) {
+            U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
             const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
             const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
             while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
-            offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
+            offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
         }
 
         /* store sequence */
 _storeSequence:
         {   size_t const litLength = (size_t)(start - anchor);
-            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
+            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
             anchor = ip = start + matchLength;
         }
+        if (ms->lazySkipping) {
+            /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
+            if (searchMethod == search_rowHash) {
+                ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+            }
+            ms->lazySkipping = 0;
+        }
 
         /* check immediate repcode */
         while (ip <= ilimit) {
@@ -2029,8 +2086,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
                 /* repcode detected we should take it */
                 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                 matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
-                offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode;   /* swap offset history */
-                ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
+                offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase;   /* swap offset history */
+                ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
                 ip += matchLength;
                 anchor = ip;
                 continue;   /* faster when present ... (?) */
@@ -2096,7 +2153,6 @@ size_t ZSTD_compressBlock_lazy_extDict_row(
 size_t ZSTD_compressBlock_lazy2_extDict_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
-
 {
     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
 }
diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h
index e5bdf4df8dde..9505bed93c03 100644
--- a/lib/zstd/compress/zstd_lazy.h
+++ b/lib/zstd/compress/zstd_lazy.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -22,6 +23,8 @@
  */
 #define ZSTD_LAZY_DDSS_BUCKET_LOG 2
 
+#define ZSTD_ROW_HASH_TAG_BITS 8        /* nb bits to use for the tag */
+
 U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip);
 void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip);
 
@@ -113,7 +116,7 @@ size_t ZSTD_compressBlock_lazy2_extDict_row(
 size_t ZSTD_compressBlock_btlazy2_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-        
+
 
 
 #endif /* ZSTD_LAZY_H */
diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c
index dd86fc83e7dd..b7da76b0db7c 100644
--- a/lib/zstd/compress/zstd_ldm.c
+++ b/lib/zstd/compress/zstd_ldm.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -242,11 +243,11 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms,
     switch(ms->cParams.strategy)
     {
     case ZSTD_fast:
-        ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast);
+        ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx);
         break;
 
     case ZSTD_dfast:
-        ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast);
+        ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx);
         break;
 
     case ZSTD_greedy:
@@ -549,7 +550,7 @@ size_t ZSTD_ldm_generateSequences(
          * the window through early invalidation.
          * TODO: * Test the chunk size.
          *       * Try invalidation after the sequence generation and test the
-         *         the offset against maxDist directly.
+         *         offset against maxDist directly.
          *
          * NOTE: Because of dictionaries + sequence splitting we MUST make sure
          * that any offset used is valid at the END of the sequence, since it may
@@ -711,7 +712,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
             rep[0] = sequence.offset;
             /* Store the sequence */
             ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend,
-                          STORE_OFFSET(sequence.offset),
+                          OFFSET_TO_OFFBASE(sequence.offset),
                           sequence.matchLength);
             ip += sequence.matchLength;
         }
diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h
index fbc6a5e88fd7..c540731abde7 100644
--- a/lib/zstd/compress/zstd_ldm.h
+++ b/lib/zstd/compress/zstd_ldm.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h
index 647f865be290..cfccfc46f6f7 100644
--- a/lib/zstd/compress/zstd_ldm_geartab.h
+++ b/lib/zstd/compress/zstd_ldm_geartab.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c
index fd82acfda62f..1e41cb04f482 100644
--- a/lib/zstd/compress/zstd_opt.c
+++ b/lib/zstd/compress/zstd_opt.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -16,7 +17,7 @@
 #define ZSTD_LITFREQ_ADD    2   /* scaling factor for litFreq, so that frequencies adapt faster to new stats */
 #define ZSTD_MAX_PRICE     (1<<30)
 
-#define ZSTD_PREDEF_THRESHOLD 1024   /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
+#define ZSTD_PREDEF_THRESHOLD 8   /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
 
 
 /*-*************************************
@@ -26,27 +27,35 @@
 #if 0    /* approximation at bit level (for tests) */
 #  define BITCOST_ACCURACY 0
 #  define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
-#  define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat))
+#  define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat))
 #elif 0  /* fractional bit accuracy (for tests) */
 #  define BITCOST_ACCURACY 8
 #  define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
-#  define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat))
+#  define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat))
 #else    /* opt==approx, ultra==accurate */
 #  define BITCOST_ACCURACY 8
 #  define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
-#  define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
+#  define WEIGHT(stat,opt) ((opt) ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
 #endif
 
+/* ZSTD_bitWeight() :
+ * provide estimated "cost" of a stat in full bits only */
 MEM_STATIC U32 ZSTD_bitWeight(U32 stat)
 {
     return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER);
 }
 
+/* ZSTD_fracWeight() :
+ * provide fractional-bit "cost" of a stat,
+ * using linear interpolation approximation */
 MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
 {
     U32 const stat = rawStat + 1;
     U32 const hb = ZSTD_highbit32(stat);
     U32 const BWeight = hb * BITCOST_MULTIPLIER;
+    /* Fweight was meant for "Fractional weight"
+     * but it's effectively a value between 1 and 2
+     * using fixed point arithmetic */
     U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb;
     U32 const weight = BWeight + FWeight;
     assert(hb + BITCOST_ACCURACY < 31);
@@ -57,7 +66,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
 /* debugging function,
  * @return price in bytes as fractional value
  * for debug messages only */
-MEM_STATIC double ZSTD_fCost(U32 price)
+MEM_STATIC double ZSTD_fCost(int price)
 {
     return (double)price / (BITCOST_MULTIPLIER*8);
 }
@@ -88,20 +97,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts)
     return total;
 }
 
-static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift)
+typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e;
+
+static U32
+ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1)
 {
     U32 s, sum=0;
-    DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift);
+    DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)",
+            (unsigned)lastEltIndex+1, (unsigned)shift );
     assert(shift < 30);
     for (s=0; s<lastEltIndex+1; s++) {
-        table[s] = 1 + (table[s] >> shift);
-        sum += table[s];
+        unsigned const base = base1 ? 1 : (table[s]>0);
+        unsigned const newStat = base + (table[s] >> shift);
+        sum += newStat;
+        table[s] = newStat;
     }
     return sum;
 }
 
 /* ZSTD_scaleStats() :
- * reduce all elements in table is sum too large
+ * reduce all elt frequencies in table if sum too large
  * return the resulting sum of elements */
 static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
 {
@@ -110,7 +125,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
     DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget);
     assert(logTarget < 30);
     if (factor <= 1) return prevsum;
-    return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor));
+    return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed);
 }
 
 /* ZSTD_rescaleFreqs() :
@@ -129,18 +144,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
     DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize);
     optPtr->priceType = zop_dynamic;
 
-    if (optPtr->litLengthSum == 0) {  /* first block : init */
-        if (srcSize <= ZSTD_PREDEF_THRESHOLD) {  /* heuristic */
-            DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef");
+    if (optPtr->litLengthSum == 0) {  /* no literals stats collected -> first block assumed -> init */
+
+        /* heuristic: use pre-defined stats for too small inputs */
+        if (srcSize <= ZSTD_PREDEF_THRESHOLD) {
+            DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD);
             optPtr->priceType = zop_predef;
         }
 
         assert(optPtr->symbolCosts != NULL);
         if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) {
-            /* huffman table presumed generated by dictionary */
+
+            /* huffman stats covering the full value set : table presumed generated by dictionary */
             optPtr->priceType = zop_dynamic;
 
             if (compressedLiterals) {
+                /* generate literals statistics from huffman table */
                 unsigned lit;
                 assert(optPtr->litFreq != NULL);
                 optPtr->litSum = 0;
@@ -188,13 +207,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
                     optPtr->offCodeSum += optPtr->offCodeFreq[of];
             }   }
 
-        } else {  /* not a dictionary */
+        } else {  /* first block, no dictionary */
 
             assert(optPtr->litFreq != NULL);
             if (compressedLiterals) {
+                /* base initial cost of literals on direct frequency within src */
                 unsigned lit = MaxLit;
                 HIST_count_simple(optPtr->litFreq, &lit, src, srcSize);   /* use raw first block to init statistics */
-                optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8);
+                optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible);
             }
 
             {   unsigned const baseLLfreqs[MaxLL+1] = {
@@ -224,10 +244,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
                 optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1);
             }
 
-
         }
 
-    } else {   /* new block : re-use previous statistics, scaled down */
+    } else {   /* new block : scale down accumulated statistics */
 
         if (compressedLiterals)
             optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12);
@@ -255,11 +274,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength,
         return (litLength*6) * BITCOST_MULTIPLIER;  /* 6 bit per literal - no statistic used */
 
     /* dynamic statistics */
-    {   U32 price = litLength * optPtr->litSumBasePrice;
+    {   U32 price = optPtr->litSumBasePrice * litLength;
+        U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER;
         U32 u;
+        assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER);
         for (u=0; u < litLength; u++) {
-            assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice);   /* literal cost should never be negative */
-            price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel);
+            U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel);
+            if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax;
+            price -= litPrice;
         }
         return price;
     }
@@ -272,10 +294,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP
     assert(litLength <= ZSTD_BLOCKSIZE_MAX);
     if (optPtr->priceType == zop_predef)
         return WEIGHT(litLength, optLevel);
-    /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX
-     * because it isn't representable in the zstd format. So instead just
-     * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block
-     * would be all literals.
+
+    /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX
+     * because it isn't representable in the zstd format.
+     * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1.
+     * In such a case, the block would be all literals.
      */
     if (litLength == ZSTD_BLOCKSIZE_MAX)
         return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel);
@@ -289,24 +312,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP
 }
 
 /* ZSTD_getMatchPrice() :
- * Provides the cost of the match part (offset + matchLength) of a sequence
+ * Provides the cost of the match part (offset + matchLength) of a sequence.
  * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence.
- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2
+ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq()
  * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency)
  */
 FORCE_INLINE_TEMPLATE U32
-ZSTD_getMatchPrice(U32 const offcode,
+ZSTD_getMatchPrice(U32 const offBase,
                    U32 const matchLength,
              const optState_t* const optPtr,
                    int const optLevel)
 {
     U32 price;
-    U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode));
+    U32 const offCode = ZSTD_highbit32(offBase);
     U32 const mlBase = matchLength - MINMATCH;
     assert(matchLength >= MINMATCH);
 
-    if (optPtr->priceType == zop_predef)  /* fixed scheme, do not use statistics */
-        return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER);
+    if (optPtr->priceType == zop_predef)  /* fixed scheme, does not use statistics */
+        return WEIGHT(mlBase, optLevel)
+             + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */
 
     /* dynamic statistics */
     price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel));
@@ -325,10 +349,10 @@ ZSTD_getMatchPrice(U32 const offcode,
 }
 
 /* ZSTD_updateStats() :
- * assumption : literals + litLengtn <= iend */
+ * assumption : literals + litLength <= iend */
 static void ZSTD_updateStats(optState_t* const optPtr,
                              U32 litLength, const BYTE* literals,
-                             U32 offsetCode, U32 matchLength)
+                             U32 offBase, U32 matchLength)
 {
     /* literals */
     if (ZSTD_compressedLiterals(optPtr)) {
@@ -344,8 +368,8 @@ static void ZSTD_updateStats(optState_t* const optPtr,
         optPtr->litLengthSum++;
     }
 
-    /* offset code : expected to follow storeSeq() numeric representation */
-    {   U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode));
+    /* offset code : follows storeSeq() numeric representation */
+    {   U32 const offCode = ZSTD_highbit32(offBase);
         assert(offCode <= MaxOff);
         optPtr->offCodeFreq[offCode]++;
         optPtr->offCodeSum++;
@@ -552,16 +576,17 @@ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) {
     ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict);
 }
 
-FORCE_INLINE_TEMPLATE
-U32 ZSTD_insertBtAndGetAllMatches (
-                    ZSTD_match_t* matches,   /* store result (found matches) in this table (presumed large enough) */
-                    ZSTD_matchState_t* ms,
-                    U32* nextToUpdate3,
-                    const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode,
-                    const U32 rep[ZSTD_REP_NUM],
-                    U32 const ll0,   /* tells if associated literal length is 0 or not. This value must be 0 or 1 */
-                    const U32 lengthToBeat,
-                    U32 const mls /* template */)
+FORCE_INLINE_TEMPLATE U32
+ZSTD_insertBtAndGetAllMatches (
+                ZSTD_match_t* matches,  /* store result (found matches) in this table (presumed large enough) */
+                ZSTD_matchState_t* ms,
+                U32* nextToUpdate3,
+                const BYTE* const ip, const BYTE* const iLimit,
+                const ZSTD_dictMode_e dictMode,
+                const U32 rep[ZSTD_REP_NUM],
+                const U32 ll0,  /* tells if associated literal length is 0 or not. This value must be 0 or 1 */
+                const U32 lengthToBeat,
+                const U32 mls /* template */)
 {
     const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
@@ -644,7 +669,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
                 DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u",
                             repCode, ll0, repOffset, repLen);
                 bestLength = repLen;
-                matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1);  /* expect value between 1 and 3 */
+                matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1);  /* expect value between 1 and 3 */
                 matches[mnum].len = (U32)repLen;
                 mnum++;
                 if ( (repLen > sufficient_len)
@@ -673,7 +698,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
                 bestLength = mlen;
                 assert(curr > matchIndex3);
                 assert(mnum==0);  /* no prior solution */
-                matches[0].off = STORE_OFFSET(curr - matchIndex3);
+                matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3);
                 matches[0].len = (U32)mlen;
                 mnum = 1;
                 if ( (mlen > sufficient_len) |
@@ -706,13 +731,13 @@ U32 ZSTD_insertBtAndGetAllMatches (
         }
 
         if (matchLength > bestLength) {
-            DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)",
-                    (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex));
+            DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)",
+                    (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex));
             assert(matchEndIdx > matchIndex);
             if (matchLength > matchEndIdx - matchIndex)
                 matchEndIdx = matchIndex + (U32)matchLength;
             bestLength = matchLength;
-            matches[mnum].off = STORE_OFFSET(curr - matchIndex);
+            matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex);
             matches[mnum].len = (U32)matchLength;
             mnum++;
             if ( (matchLength > ZSTD_OPT_NUM)
@@ -754,12 +779,12 @@ U32 ZSTD_insertBtAndGetAllMatches (
 
             if (matchLength > bestLength) {
                 matchIndex = dictMatchIndex + dmsIndexDelta;
-                DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)",
-                        (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex));
+                DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)",
+                        (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex));
                 if (matchLength > matchEndIdx - matchIndex)
                     matchEndIdx = matchIndex + (U32)matchLength;
                 bestLength = matchLength;
-                matches[mnum].off = STORE_OFFSET(curr - matchIndex);
+                matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex);
                 matches[mnum].len = (U32)matchLength;
                 mnum++;
                 if ( (matchLength > ZSTD_OPT_NUM)
@@ -960,7 +985,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
                                       const ZSTD_optLdm_t* optLdm, U32 currPosInBlock)
 {
     U32 const posDiff = currPosInBlock - optLdm->startPosInBlock;
-    /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */
+    /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */
     U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff;
 
     /* Ensure that current block position is not outside of the match */
@@ -971,11 +996,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
     }
 
     if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) {
-        U32 const candidateOffCode = STORE_OFFSET(optLdm->offset);
-        DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u",
-                 candidateOffCode, candidateMatchLength, currPosInBlock);
+        U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset);
+        DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u",
+                 candidateOffBase, candidateMatchLength, currPosInBlock);
         matches[*nbMatches].len = candidateMatchLength;
-        matches[*nbMatches].off = candidateOffCode;
+        matches[*nbMatches].off = candidateOffBase;
         (*nbMatches)++;
     }
 }
@@ -1062,6 +1087,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
     ZSTD_optimal_t lastSequence;
     ZSTD_optLdm_t optLdm;
 
+    ZSTD_memset(&lastSequence, 0, sizeof(ZSTD_optimal_t));
+
     optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore;
     optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0;
     ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip));
@@ -1098,14 +1125,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
 
             /* large match -> immediate encoding */
             {   U32 const maxML = matches[nbMatches-1].len;
-                U32 const maxOffcode = matches[nbMatches-1].off;
-                DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series",
-                            nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart));
+                U32 const maxOffBase = matches[nbMatches-1].off;
+                DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series",
+                            nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart));
 
                 if (maxML > sufficient_len) {
                     lastSequence.litlen = litlen;
                     lastSequence.mlen = maxML;
-                    lastSequence.off = maxOffcode;
+                    lastSequence.off = maxOffBase;
                     DEBUGLOG(6, "large match (%u>%u), immediate encoding",
                                 maxML, sufficient_len);
                     cur = 0;
@@ -1122,15 +1149,15 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                     opt[pos].price = ZSTD_MAX_PRICE;   /* mlen, litlen and price will be fixed during forward scanning */
                 }
                 for (matchNb = 0; matchNb < nbMatches; matchNb++) {
-                    U32 const offcode = matches[matchNb].off;
+                    U32 const offBase = matches[matchNb].off;
                     U32 const end = matches[matchNb].len;
                     for ( ; pos <= end ; pos++ ) {
-                        U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel);
+                        U32 const matchPrice = ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel);
                         U32 const sequencePrice = literalsPrice + matchPrice;
                         DEBUGLOG(7, "rPos:%u => set initial price : %.2f",
-                                    pos, ZSTD_fCost(sequencePrice));
+                                    pos, ZSTD_fCost((int)sequencePrice));
                         opt[pos].mlen = pos;
-                        opt[pos].off = offcode;
+                        opt[pos].off = offBase;
                         opt[pos].litlen = litlen;
                         opt[pos].price = (int)sequencePrice;
                 }   }
@@ -1230,7 +1257,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                     U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch;
                     U32 mlen;
 
-                    DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u",
+                    DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u",
                                 matchNb, matches[matchNb].off, lastML, litlen);
 
                     for (mlen = lastML; mlen >= startML; mlen--) {  /* scan downward */
@@ -1296,7 +1323,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                 for (storePos=storeStart; storePos <= storeEnd; storePos++) {
                     U32 const llen = opt[storePos].litlen;
                     U32 const mlen = opt[storePos].mlen;
-                    U32 const offCode = opt[storePos].off;
+                    U32 const offBase = opt[storePos].off;
                     U32 const advance = llen + mlen;
                     DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u",
                                 anchor - istart, (unsigned)llen, (unsigned)mlen);
@@ -1308,8 +1335,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                     }
 
                     assert(anchor + llen <= iend);
-                    ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen);
-                    ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen);
+                    ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen);
+                    ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen);
                     anchor += advance;
                     ip = anchor;
             }   }
@@ -1349,7 +1376,7 @@ size_t ZSTD_compressBlock_btopt(
 /* ZSTD_initStats_ultra():
  * make a first compression pass, just to seed stats with more accurate starting values.
  * only works on first block, with no dictionary and no ldm.
- * this function cannot error, hence its contract must be respected.
+ * this function cannot error out, its narrow contract must be respected.
  */
 static void
 ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
@@ -1368,7 +1395,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
 
     ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict);   /* generate stats into ms->opt*/
 
-    /* invalidate first scan from history */
+    /* invalidate first scan from history, only keep entropy stats */
     ZSTD_resetSeqStore(seqStore);
     ms->window.base -= srcSize;
     ms->window.dictLimit += (U32)srcSize;
@@ -1392,20 +1419,20 @@ size_t ZSTD_compressBlock_btultra2(
     U32 const curr = (U32)((const BYTE*)src - ms->window.base);
     DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize);
 
-    /* 2-pass strategy:
+    /* 2-passes strategy:
      * this strategy makes a first pass over first block to collect statistics
-     * and seed next round's statistics with it.
-     * After 1st pass, function forgets everything, and starts a new block.
+     * in order to seed next round's statistics with it.
+     * After 1st pass, function forgets history, and starts a new block.
      * Consequently, this can only work if no data has been previously loaded in tables,
      * aka, no dictionary, no prefix, no ldm preprocessing.
      * The compression ratio gain is generally small (~0.5% on first block),
-     * the cost is 2x cpu time on first block. */
+    ** the cost is 2x cpu time on first block. */
     assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
     if ( (ms->opt.litLengthSum==0)   /* first block */
       && (seqStore->sequences == seqStore->sequencesStart)  /* no ldm */
       && (ms->window.dictLimit == ms->window.lowLimit)   /* no dictionary */
-      && (curr == ms->window.dictLimit)   /* start of frame, nothing already loaded nor skipped */
-      && (srcSize > ZSTD_PREDEF_THRESHOLD)
+      && (curr == ms->window.dictLimit)    /* start of frame, nothing already loaded nor skipped */
+      && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */
       ) {
         ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize);
     }
diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h
index 22b862858ba7..faa73ff4b03d 100644
--- a/lib/zstd/compress/zstd_opt.h
+++ b/lib/zstd/compress/zstd_opt.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c
index 60958afebc41..d172e35fbd9a 100644
--- a/lib/zstd/decompress/huf_decompress.c
+++ b/lib/zstd/decompress/huf_decompress.c
@@ -1,7 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /* ******************************************************************
  * huff0 huffman decoder,
  * part of Finite State Entropy library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  *  You can contact the author at :
  *  - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -19,10 +20,10 @@
 #include "../common/compiler.h"
 #include "../common/bitstream.h"  /* BIT_* */
 #include "../common/fse.h"        /* to compress headers */
-#define HUF_STATIC_LINKING_ONLY
 #include "../common/huf.h"
 #include "../common/error_private.h"
 #include "../common/zstd_internal.h"
+#include "../common/bits.h"       /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */
 
 /* **************************************************************
 *  Constants
@@ -43,27 +44,25 @@
 #error "Cannot force the use of the X1 and X2 decoders at the same time!"
 #endif
 
-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2
-# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
+/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is
+ * supported at runtime, so we can add the BMI2 target attribute.
+ * When it is disabled, we will still get BMI2 if it is enabled statically.
+ */
+#if DYNAMIC_BMI2
+# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
 #else
-# define HUF_ASM_X86_64_BMI2_ATTRS
+# define HUF_FAST_BMI2_ATTRS
 #endif
 
 #define HUF_EXTERN_C
 #define HUF_ASM_DECL HUF_EXTERN_C
 
-#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
+#if DYNAMIC_BMI2
 # define HUF_NEED_BMI2_FUNCTION 1
 #else
 # define HUF_NEED_BMI2_FUNCTION 0
 #endif
 
-#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
-# define HUF_NEED_DEFAULT_FUNCTION 1
-#else
-# define HUF_NEED_DEFAULT_FUNCTION 0
-#endif
-
 /* **************************************************************
 *  Error Management
 ****************************************************************/
@@ -80,6 +79,11 @@
 /* **************************************************************
 *  BMI2 Variant Wrappers
 ****************************************************************/
+typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize,
+                                              const void *cSrc,
+                                              size_t cSrcSize,
+                                              const HUF_DTable *DTable);
+
 #if DYNAMIC_BMI2
 
 #define HUF_DGEN(fn)                                                        \
@@ -101,9 +105,9 @@
     }                                                                       \
                                                                             \
     static size_t fn(void* dst, size_t dstSize, void const* cSrc,           \
-                     size_t cSrcSize, HUF_DTable const* DTable, int bmi2)   \
+                     size_t cSrcSize, HUF_DTable const* DTable, int flags)  \
     {                                                                       \
-        if (bmi2) {                                                         \
+        if (flags & HUF_flags_bmi2) {                                       \
             return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);         \
         }                                                                   \
         return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable);          \
@@ -113,9 +117,9 @@
 
 #define HUF_DGEN(fn)                                                        \
     static size_t fn(void* dst, size_t dstSize, void const* cSrc,           \
-                     size_t cSrcSize, HUF_DTable const* DTable, int bmi2)   \
+                     size_t cSrcSize, HUF_DTable const* DTable, int flags)  \
     {                                                                       \
-        (void)bmi2;                                                         \
+        (void)flags;                                                        \
         return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);             \
     }
 
@@ -134,15 +138,28 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
     return dtd;
 }
 
-#if ZSTD_ENABLE_ASM_X86_64_BMI2
-
-static size_t HUF_initDStream(BYTE const* ip) {
+static size_t HUF_initFastDStream(BYTE const* ip) {
     BYTE const lastByte = ip[7];
-    size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
+    size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
     size_t const value = MEM_readLEST(ip) | 1;
     assert(bitsConsumed <= 8);
+    assert(sizeof(size_t) == 8);
     return value << bitsConsumed;
 }
+
+
+/*
+ * The input/output arguments to the Huffman fast decoding loop:
+ *
+ * ip [in/out] - The input pointers, must be updated to reflect what is consumed.
+ * op [in/out] - The output pointers, must be updated to reflect what is written.
+ * bits [in/out] - The bitstream containers, must be updated to reflect the current state.
+ * dt [in] - The decoding table.
+ * ilimit [in] - The input limit, stop when any input pointer is below ilimit.
+ * oend [in] - The end of the output stream. op[3] must not cross oend.
+ * iend [in] - The end of each input stream. ip[i] may cross iend[i],
+ *             as long as it is above ilimit, but that indicates corruption.
+ */
 typedef struct {
     BYTE const* ip[4];
     BYTE* op[4];
@@ -151,15 +168,17 @@ typedef struct {
     BYTE const* ilimit;
     BYTE* oend;
     BYTE const* iend[4];
-} HUF_DecompressAsmArgs;
+} HUF_DecompressFastArgs;
+
+typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*);
 
 /*
- * Initializes args for the asm decoding loop.
- * @returns 0 on success
- *          1 if the fallback implementation should be used.
+ * Initializes args for the fast decoding loop.
+ * @returns 1 on success
+ *          0 if the fallback implementation should be used.
  *          Or an error code on failure.
  */
-static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
+static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
 {
     void const* dt = DTable + 1;
     U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
@@ -168,9 +187,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
 
     BYTE* const oend = (BYTE*)dst + dstSize;
 
-    /* The following condition is false on x32 platform,
-     * but HUF_asm is not compatible with this ABI */
-    if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1;
+    /* The fast decoding loop assumes 64-bit little-endian.
+     * This condition is false on x32.
+     */
+    if (!MEM_isLittleEndian() || MEM_32bits())
+        return 0;
 
     /* strict minimum : jump table + 1 byte per stream */
     if (srcSize < 10)
@@ -181,7 +202,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
      * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
      */
     if (dtLog != HUF_DECODER_FAST_TABLELOG)
-        return 1;
+        return 0;
 
     /* Read the jump table. */
     {
@@ -195,13 +216,13 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
         args->iend[2] = args->iend[1] + length2;
         args->iend[3] = args->iend[2] + length3;
 
-        /* HUF_initDStream() requires this, and this small of an input
+        /* HUF_initFastDStream() requires this, and this small of an input
          * won't benefit from the ASM loop anyways.
          * length1 must be >= 16 so that ip[0] >= ilimit before the loop
          * starts.
          */
         if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
-            return 1;
+            return 0;
         if (length4 > srcSize) return ERROR(corruption_detected);   /* overflow */
     }
     /* ip[] contains the position that is currently loaded into bits[]. */
@@ -218,7 +239,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
 
     /* No point to call the ASM loop for tiny outputs. */
     if (args->op[3] >= oend)
-        return 1;
+        return 0;
 
     /* bits[] is the bit container.
         * It is read from the MSB down to the LSB.
@@ -227,10 +248,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
         * set, so that CountTrailingZeros(bits[]) can be used
         * to count how many bits we've consumed.
         */
-    args->bits[0] = HUF_initDStream(args->ip[0]);
-    args->bits[1] = HUF_initDStream(args->ip[1]);
-    args->bits[2] = HUF_initDStream(args->ip[2]);
-    args->bits[3] = HUF_initDStream(args->ip[3]);
+    args->bits[0] = HUF_initFastDStream(args->ip[0]);
+    args->bits[1] = HUF_initFastDStream(args->ip[1]);
+    args->bits[2] = HUF_initFastDStream(args->ip[2]);
+    args->bits[3] = HUF_initFastDStream(args->ip[3]);
 
     /* If ip[] >= ilimit, it is guaranteed to be safe to
         * reload bits[]. It may be beyond its section, but is
@@ -241,10 +262,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
     args->oend = oend;
     args->dt = dt;
 
-    return 0;
+    return 1;
 }
 
-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd)
+static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd)
 {
     /* Validate that we haven't overwritten. */
     if (args->op[stream] > segmentEnd)
@@ -258,15 +279,15 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs
         return ERROR(corruption_detected);
 
     /* Construct the BIT_DStream_t. */
-    bit->bitContainer = MEM_readLE64(args->ip[stream]);
-    bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]);
+    assert(sizeof(size_t) == 8);
+    bit->bitContainer = MEM_readLEST(args->ip[stream]);
+    bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]);
     bit->start = (const char*)args->iend[0];
     bit->limitPtr = bit->start + sizeof(size_t);
     bit->ptr = (const char*)args->ip[stream];
 
     return 0;
 }
-#endif
 
 
 #ifndef HUF_FORCE_DECOMPRESS_X2
@@ -283,10 +304,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1;   /* single-symbol decodi
 static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
     U64 D4;
     if (MEM_isLittleEndian()) {
-        D4 = (symbol << 8) + nbBits;
+        D4 = (U64)((symbol << 8) + nbBits);
     } else {
-        D4 = symbol + (nbBits << 8);
+        D4 = (U64)(symbol + (nbBits << 8));
     }
+    assert(D4 < (1U << 16));
     D4 *= 0x0001000100010001ULL;
     return D4;
 }
@@ -329,13 +351,7 @@ typedef struct {
         BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
 } HUF_ReadDTableX1_Workspace;
 
-
-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
-{
-    return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
-}
-
-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2)
+size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags)
 {
     U32 tableLog = 0;
     U32 nbSymbols = 0;
@@ -350,7 +366,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
     DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
     /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */   /* is not necessary, even though some analyzer complain ... */
 
-    iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2);
+    iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags);
     if (HUF_isError(iSize)) return iSize;
 
 
@@ -377,9 +393,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
      * rankStart[0] is not filled because there are no entries in the table for
      * weight 0.
      */
-    {
-        int n;
-        int nextRankStart = 0;
+    {   int n;
+        U32 nextRankStart = 0;
         int const unroll = 4;
         int const nLimit = (int)nbSymbols - unroll + 1;
         for (n=0; n<(int)tableLog+1; n++) {
@@ -406,10 +421,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
      * We can switch based on the length to a different inner loop which is
      * optimized for that particular case.
      */
-    {
-        U32 w;
-        int symbol=wksp->rankVal[0];
-        int rankStart=0;
+    {   U32 w;
+        int symbol = wksp->rankVal[0];
+        int rankStart = 0;
         for (w=1; w<tableLog+1; ++w) {
             int const symbolCount = wksp->rankVal[w];
             int const length = (1 << w) >> 1;
@@ -519,7 +533,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
     while (p < pEnd)
         HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
 
-    return pEnd-pStart;
+    return (size_t)(pEnd-pStart);
 }
 
 FORCE_INLINE_TEMPLATE size_t
@@ -545,6 +559,10 @@ HUF_decompress1X1_usingDTable_internal_body(
     return dstSize;
 }
 
+/* HUF_decompress4X1_usingDTable_internal_body():
+ * Conditions :
+ * @dstSize >= 6
+ */
 FORCE_INLINE_TEMPLATE size_t
 HUF_decompress4X1_usingDTable_internal_body(
           void* dst,  size_t dstSize,
@@ -588,6 +606,7 @@ HUF_decompress4X1_usingDTable_internal_body(
 
         if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
         if (opStart4 > oend) return ERROR(corruption_detected);      /* overflow */
+        if (dstSize < 6) return ERROR(corruption_detected);         /* stream 4-split doesn't work */
         CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
         CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
         CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
@@ -650,38 +669,142 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo
 }
 #endif
 
-#if HUF_NEED_DEFAULT_FUNCTION
 static
 size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
                     size_t cSrcSize, HUF_DTable const* DTable) {
     return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
 }
-#endif
 
 #if ZSTD_ENABLE_ASM_X86_64_BMI2
 
-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
+HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
+
+#endif
+
+static HUF_FAST_BMI2_ATTRS
+void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
+{
+    U64 bits[4];
+    BYTE const* ip[4];
+    BYTE* op[4];
+    U16 const* const dtable = (U16 const*)args->dt;
+    BYTE* const oend = args->oend;
+    BYTE const* const ilimit = args->ilimit;
+
+    /* Copy the arguments to local variables */
+    ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
+    ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
+    ZSTD_memcpy(&op, &args->op, sizeof(op));
+
+    assert(MEM_isLittleEndian());
+    assert(!MEM_32bits());
+
+    for (;;) {
+        BYTE* olimit;
+        int stream;
+        int symbol;
+
+        /* Assert loop preconditions */
+#ifndef NDEBUG
+        for (stream = 0; stream < 4; ++stream) {
+            assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
+            assert(ip[stream] >= ilimit);
+        }
+#endif
+        /* Compute olimit */
+        {
+            /* Each iteration produces 5 output symbols per stream */
+            size_t const oiters = (size_t)(oend - op[3]) / 5;
+            /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
+             * per stream.
+             */
+            size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
+            /* We can safely run iters iterations before running bounds checks */
+            size_t const iters = MIN(oiters, iiters);
+            size_t const symbols = iters * 5;
+
+            /* We can simply check that op[3] < olimit, instead of checking all
+             * of our bounds, since we can't hit the other bounds until we've run
+             * iters iterations, which only happens when op[3] == olimit.
+             */
+            olimit = op[3] + symbols;
+
+            /* Exit fast decoding loop once we get close to the end. */
+            if (op[3] + 20 > olimit)
+                break;
+
+            /* Exit the decoding loop if any input pointer has crossed the
+             * previous one. This indicates corruption, and a precondition
+             * to our loop is that ip[i] >= ip[0].
+             */
+            for (stream = 1; stream < 4; ++stream) {
+                if (ip[stream] < ip[stream - 1])
+                    goto _out;
+            }
+        }
+
+#ifndef NDEBUG
+        for (stream = 1; stream < 4; ++stream) {
+            assert(ip[stream] >= ip[stream - 1]);
+        }
+#endif
+
+        do {
+            /* Decode 5 symbols in each of the 4 streams */
+            for (symbol = 0; symbol < 5; ++symbol) {
+                for (stream = 0; stream < 4; ++stream) {
+                    int const index = (int)(bits[stream] >> 53);
+                    int const entry = (int)dtable[index];
+                    bits[stream] <<= (entry & 63);
+                    op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF);
+                }
+            }
+            /* Reload the bitstreams */
+            for (stream = 0; stream < 4; ++stream) {
+                int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
+                int const nbBits = ctz & 7;
+                int const nbBytes = ctz >> 3;
+                op[stream] += 5;
+                ip[stream] -= nbBytes;
+                bits[stream] = MEM_read64(ip[stream]) | 1;
+                bits[stream] <<= nbBits;
+            }
+        } while (op[3] < olimit);
+    }
+
+_out:
 
-static HUF_ASM_X86_64_BMI2_ATTRS
+    /* Save the final values of each of the state variables back to args. */
+    ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
+    ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
+    ZSTD_memcpy(&args->op, &op, sizeof(op));
+}
+
+/*
+ * @returns @p dstSize on success (>= 6)
+ *          0 if the fallback implementation should be used
+ *          An error if an error occurred
+ */
+static HUF_FAST_BMI2_ATTRS
 size_t
-HUF_decompress4X1_usingDTable_internal_bmi2_asm(
+HUF_decompress4X1_usingDTable_internal_fast(
           void* dst,  size_t dstSize,
     const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
+    const HUF_DTable* DTable,
+    HUF_DecompressFastLoopFn loopFn)
 {
     void const* dt = DTable + 1;
     const BYTE* const iend = (const BYTE*)cSrc + 6;
     BYTE* const oend = (BYTE*)dst + dstSize;
-    HUF_DecompressAsmArgs args;
-    {
-        size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
-        FORWARD_IF_ERROR(ret, "Failed to init asm args");
-        if (ret != 0)
-            return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+    HUF_DecompressFastArgs args;
+    {   size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
+        FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
+        if (ret == 0)
+            return 0;
     }
 
     assert(args.ip[0] >= args.ilimit);
-    HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args);
+    loopFn(&args);
 
     /* Our loop guarantees that ip[] >= ilimit and that we haven't
     * overwritten any op[].
@@ -694,8 +817,7 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
     (void)iend;
 
     /* finish bit streams one by one. */
-    {
-        size_t const segmentSize = (dstSize+3) / 4;
+    {   size_t const segmentSize = (dstSize+3) / 4;
         BYTE* segmentEnd = (BYTE*)dst;
         int i;
         for (i = 0; i < 4; ++i) {
@@ -712,97 +834,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
     }
 
     /* decoded size */
+    assert(dstSize != 0);
     return dstSize;
 }
-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
-
-typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
-                                               const void *cSrc,
-                                               size_t cSrcSize,
-                                               const HUF_DTable *DTable);
 
 HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
 
 static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
-                    size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
+                    size_t cSrcSize, HUF_DTable const* DTable, int flags)
 {
+    HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default;
+    HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop;
+
 #if DYNAMIC_BMI2
-    if (bmi2) {
+    if (flags & HUF_flags_bmi2) {
+        fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2;
 # if ZSTD_ENABLE_ASM_X86_64_BMI2
-        return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-# else
-        return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+        if (!(flags & HUF_flags_disableAsm)) {
+            loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
+        }
 # endif
+    } else {
+        return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
     }
-#else
-    (void)bmi2;
 #endif
 
 #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
-    return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-#else
-    return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
+    if (!(flags & HUF_flags_disableAsm)) {
+        loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
+    }
 #endif
-}
-
-
-size_t HUF_decompress1X1_usingDTable(
-          void* dst,  size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
-{
-    DTableDesc dtd = HUF_getDTableDesc(DTable);
-    if (dtd.tableType != 0) return ERROR(GENERIC);
-    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-}
 
-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
-                                   const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize)
-{
-    const BYTE* ip = (const BYTE*) cSrc;
-
-    size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
-    if (HUF_isError(hSize)) return hSize;
-    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
-    ip += hSize; cSrcSize -= hSize;
-
-    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
-}
-
-
-size_t HUF_decompress4X1_usingDTable(
-          void* dst,  size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
-{
-    DTableDesc dtd = HUF_getDTableDesc(DTable);
-    if (dtd.tableType != 0) return ERROR(GENERIC);
-    return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+    if (!(flags & HUF_flags_disableFast)) {
+        size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
+        if (ret != 0)
+            return ret;
+    }
+    return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
 }
 
-static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
                                    const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize, int bmi2)
+                                   void* workSpace, size_t wkspSize, int flags)
 {
     const BYTE* ip = (const BYTE*) cSrc;
 
-    size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+    size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;
 
-    return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+    return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
 }
 
-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
-                                   const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize)
-{
-    return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
-}
-
-
 #endif /* HUF_FORCE_DECOMPRESS_X2 */
 
 
@@ -985,7 +1069,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32
 
 static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
                            const sortedSymbol_t* sortedList,
-                           const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight,
+                           const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight,
                            const U32 nbBitsBaseline)
 {
     U32* const rankVal = rankValOrigin[0];
@@ -1040,14 +1124,7 @@ typedef struct {
 
 size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
                        const void* src, size_t srcSize,
-                             void* workSpace, size_t wkspSize)
-{
-    return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
-}
-
-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
-                       const void* src, size_t srcSize,
-                             void* workSpace, size_t wkspSize, int bmi2)
+                             void* workSpace, size_t wkspSize, int flags)
 {
     U32 tableLog, maxW, nbSymbols;
     DTableDesc dtd = HUF_getDTableDesc(DTable);
@@ -1069,7 +1146,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
     if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
     /* ZSTD_memset(weightList, 0, sizeof(weightList)); */  /* is not necessary, even though some analyzer complain ... */
 
-    iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2);
+    iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags);
     if (HUF_isError(iSize)) return iSize;
 
     /* check result */
@@ -1240,6 +1317,11 @@ HUF_decompress1X2_usingDTable_internal_body(
     /* decoded size */
     return dstSize;
 }
+
+/* HUF_decompress4X2_usingDTable_internal_body():
+ * Conditions:
+ * @dstSize >= 6
+ */
 FORCE_INLINE_TEMPLATE size_t
 HUF_decompress4X2_usingDTable_internal_body(
           void* dst,  size_t dstSize,
@@ -1280,8 +1362,9 @@ HUF_decompress4X2_usingDTable_internal_body(
         DTableDesc const dtd = HUF_getDTableDesc(DTable);
         U32 const dtLog = dtd.tableLog;
 
-        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
-        if (opStart4 > oend) return ERROR(corruption_detected);      /* overflow */
+        if (length4 > cSrcSize) return ERROR(corruption_detected);  /* overflow */
+        if (opStart4 > oend) return ERROR(corruption_detected);     /* overflow */
+        if (dstSize < 6) return ERROR(corruption_detected);         /* stream 4-split doesn't work */
         CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
         CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
         CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
@@ -1366,36 +1449,177 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo
 }
 #endif
 
-#if HUF_NEED_DEFAULT_FUNCTION
 static
 size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
                     size_t cSrcSize, HUF_DTable const* DTable) {
     return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
 }
-#endif
 
 #if ZSTD_ENABLE_ASM_X86_64_BMI2
 
-HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
+HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
 
-static HUF_ASM_X86_64_BMI2_ATTRS size_t
-HUF_decompress4X2_usingDTable_internal_bmi2_asm(
+#endif
+
+static HUF_FAST_BMI2_ATTRS
+void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
+{
+    U64 bits[4];
+    BYTE const* ip[4];
+    BYTE* op[4];
+    BYTE* oend[4];
+    HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
+    BYTE const* const ilimit = args->ilimit;
+
+    /* Copy the arguments to local registers. */
+    ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
+    ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
+    ZSTD_memcpy(&op, &args->op, sizeof(op));
+
+    oend[0] = op[1];
+    oend[1] = op[2];
+    oend[2] = op[3];
+    oend[3] = args->oend;
+
+    assert(MEM_isLittleEndian());
+    assert(!MEM_32bits());
+
+    for (;;) {
+        BYTE* olimit;
+        int stream;
+        int symbol;
+
+        /* Assert loop preconditions */
+#ifndef NDEBUG
+        for (stream = 0; stream < 4; ++stream) {
+            assert(op[stream] <= oend[stream]);
+            assert(ip[stream] >= ilimit);
+        }
+#endif
+        /* Compute olimit */
+        {
+            /* Each loop does 5 table lookups for each of the 4 streams.
+             * Each table lookup consumes up to 11 bits of input, and produces
+             * up to 2 bytes of output.
+             */
+            /* We can consume up to 7 bytes of input per iteration per stream.
+             * We also know that each input pointer is >= ip[0]. So we can run
+             * iters loops before running out of input.
+             */
+            size_t iters = (size_t)(ip[0] - ilimit) / 7;
+            /* Each iteration can produce up to 10 bytes of output per stream.
+             * Each output stream my advance at different rates. So take the
+             * minimum number of safe iterations among all the output streams.
+             */
+            for (stream = 0; stream < 4; ++stream) {
+                size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10;
+                iters = MIN(iters, oiters);
+            }
+
+            /* Each iteration produces at least 5 output symbols. So until
+             * op[3] crosses olimit, we know we haven't executed iters
+             * iterations yet. This saves us maintaining an iters counter,
+             * at the expense of computing the remaining # of iterations
+             * more frequently.
+             */
+            olimit = op[3] + (iters * 5);
+
+            /* Exit the fast decoding loop if we are too close to the end. */
+            if (op[3] + 10 > olimit)
+                break;
+
+            /* Exit the decoding loop if any input pointer has crossed the
+             * previous one. This indicates corruption, and a precondition
+             * to our loop is that ip[i] >= ip[0].
+             */
+            for (stream = 1; stream < 4; ++stream) {
+                if (ip[stream] < ip[stream - 1])
+                    goto _out;
+            }
+        }
+
+#ifndef NDEBUG
+        for (stream = 1; stream < 4; ++stream) {
+            assert(ip[stream] >= ip[stream - 1]);
+        }
+#endif
+
+        do {
+            /* Do 5 table lookups for each of the first 3 streams */
+            for (symbol = 0; symbol < 5; ++symbol) {
+                for (stream = 0; stream < 3; ++stream) {
+                    int const index = (int)(bits[stream] >> 53);
+                    HUF_DEltX2 const entry = dtable[index];
+                    MEM_write16(op[stream], entry.sequence);
+                    bits[stream] <<= (entry.nbBits);
+                    op[stream] += (entry.length);
+                }
+            }
+            /* Do 1 table lookup from the final stream */
+            {
+                int const index = (int)(bits[3] >> 53);
+                HUF_DEltX2 const entry = dtable[index];
+                MEM_write16(op[3], entry.sequence);
+                bits[3] <<= (entry.nbBits);
+                op[3] += (entry.length);
+            }
+            /* Do 4 table lookups from the final stream & reload bitstreams */
+            for (stream = 0; stream < 4; ++stream) {
+                /* Do a table lookup from the final stream.
+                 * This is interleaved with the reloading to reduce register
+                 * pressure. This shouldn't be necessary, but compilers can
+                 * struggle with codegen with high register pressure.
+                 */
+                {
+                    int const index = (int)(bits[3] >> 53);
+                    HUF_DEltX2 const entry = dtable[index];
+                    MEM_write16(op[3], entry.sequence);
+                    bits[3] <<= (entry.nbBits);
+                    op[3] += (entry.length);
+                }
+                /* Reload the bistreams. The final bitstream must be reloaded
+                 * after the 5th symbol was decoded.
+                 */
+                {
+                    int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
+                    int const nbBits = ctz & 7;
+                    int const nbBytes = ctz >> 3;
+                    ip[stream] -= nbBytes;
+                    bits[stream] = MEM_read64(ip[stream]) | 1;
+                    bits[stream] <<= nbBits;
+                }
+            }
+        } while (op[3] < olimit);
+    }
+
+_out:
+
+    /* Save the final values of each of the state variables back to args. */
+    ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
+    ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
+    ZSTD_memcpy(&args->op, &op, sizeof(op));
+}
+
+
+static HUF_FAST_BMI2_ATTRS size_t
+HUF_decompress4X2_usingDTable_internal_fast(
           void* dst,  size_t dstSize,
     const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable) {
+    const HUF_DTable* DTable,
+    HUF_DecompressFastLoopFn loopFn) {
     void const* dt = DTable + 1;
     const BYTE* const iend = (const BYTE*)cSrc + 6;
     BYTE* const oend = (BYTE*)dst + dstSize;
-    HUF_DecompressAsmArgs args;
+    HUF_DecompressFastArgs args;
     {
-        size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
+        size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
         FORWARD_IF_ERROR(ret, "Failed to init asm args");
-        if (ret != 0)
-            return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+        if (ret == 0)
+            return 0;
     }
 
     assert(args.ip[0] >= args.ilimit);
-    HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args);
+    loopFn(&args);
 
     /* note : op4 already verified within main loop */
     assert(args.ip[0] >= iend);
@@ -1426,91 +1650,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm(
     /* decoded size */
     return dstSize;
 }
-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
 
 static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
-                    size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
+                    size_t cSrcSize, HUF_DTable const* DTable, int flags)
 {
+    HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default;
+    HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop;
+
 #if DYNAMIC_BMI2
-    if (bmi2) {
+    if (flags & HUF_flags_bmi2) {
+        fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2;
 # if ZSTD_ENABLE_ASM_X86_64_BMI2
-        return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-# else
-        return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+        if (!(flags & HUF_flags_disableAsm)) {
+            loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
+        }
 # endif
+    } else {
+        return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
     }
-#else
-    (void)bmi2;
 #endif
 
 #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
-    return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-#else
-    return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
+    if (!(flags & HUF_flags_disableAsm)) {
+        loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
+    }
 #endif
+
+    if (!(flags & HUF_flags_disableFast)) {
+        size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
+        if (ret != 0)
+            return ret;
+    }
+    return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
 }
 
 HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
 
-size_t HUF_decompress1X2_usingDTable(
-          void* dst,  size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
-{
-    DTableDesc dtd = HUF_getDTableDesc(DTable);
-    if (dtd.tableType != 1) return ERROR(GENERIC);
-    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-}
-
 size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
                                    const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize)
+                                   void* workSpace, size_t wkspSize, int flags)
 {
     const BYTE* ip = (const BYTE*) cSrc;
 
     size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize,
-                                               workSpace, wkspSize);
+                                               workSpace, wkspSize, flags);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;
 
-    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
-}
-
-
-size_t HUF_decompress4X2_usingDTable(
-          void* dst,  size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
-{
-    DTableDesc dtd = HUF_getDTableDesc(DTable);
-    if (dtd.tableType != 1) return ERROR(GENERIC);
-    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags);
 }
 
-static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
                                    const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize, int bmi2)
+                                   void* workSpace, size_t wkspSize, int flags)
 {
     const BYTE* ip = (const BYTE*) cSrc;
 
     size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize,
-                                         workSpace, wkspSize);
+                                         workSpace, wkspSize, flags);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;
 
-    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
 }
 
-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
-                                   const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize)
-{
-    return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
-}
-
-
 #endif /* HUF_FORCE_DECOMPRESS_X1 */
 
 
@@ -1518,44 +1723,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
 /* Universal decompression selectors */
 /* ***********************************/
 
-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
-                                    const void* cSrc, size_t cSrcSize,
-                                    const HUF_DTable* DTable)
-{
-    DTableDesc const dtd = HUF_getDTableDesc(DTable);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
-    (void)dtd;
-    assert(dtd.tableType == 0);
-    return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
-    (void)dtd;
-    assert(dtd.tableType == 1);
-    return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#else
-    return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
-                           HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#endif
-}
-
-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
-                                    const void* cSrc, size_t cSrcSize,
-                                    const HUF_DTable* DTable)
-{
-    DTableDesc const dtd = HUF_getDTableDesc(DTable);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
-    (void)dtd;
-    assert(dtd.tableType == 0);
-    return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
-    (void)dtd;
-    assert(dtd.tableType == 1);
-    return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#else
-    return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
-                           HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#endif
-}
-
 
 #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
 typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
@@ -1610,36 +1777,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
 #endif
 }
 
-
-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst,
-                                     size_t dstSize, const void* cSrc,
-                                     size_t cSrcSize, void* workSpace,
-                                     size_t wkspSize)
-{
-    /* validation checks */
-    if (dstSize == 0) return ERROR(dstSize_tooSmall);
-    if (cSrcSize == 0) return ERROR(corruption_detected);
-
-    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
-        (void)algoNb;
-        assert(algoNb == 0);
-        return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
-        (void)algoNb;
-        assert(algoNb == 1);
-        return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
-#else
-        return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
-                            cSrcSize, workSpace, wkspSize):
-                        HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
-#endif
-    }
-}
-
 size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
                                   const void* cSrc, size_t cSrcSize,
-                                  void* workSpace, size_t wkspSize)
+                                  void* workSpace, size_t wkspSize, int flags)
 {
     /* validation checks */
     if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -1652,71 +1792,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
         (void)algoNb;
         assert(algoNb == 0);
         return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
-                                cSrcSize, workSpace, wkspSize);
+                                cSrcSize, workSpace, wkspSize, flags);
 #elif defined(HUF_FORCE_DECOMPRESS_X2)
         (void)algoNb;
         assert(algoNb == 1);
         return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
-                                cSrcSize, workSpace, wkspSize);
+                                cSrcSize, workSpace, wkspSize, flags);
 #else
         return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
-                                cSrcSize, workSpace, wkspSize):
+                                cSrcSize, workSpace, wkspSize, flags):
                         HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
-                                cSrcSize, workSpace, wkspSize);
+                                cSrcSize, workSpace, wkspSize, flags);
 #endif
     }
 }
 
 
-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
 {
     DTableDesc const dtd = HUF_getDTableDesc(DTable);
 #if defined(HUF_FORCE_DECOMPRESS_X1)
     (void)dtd;
     assert(dtd.tableType == 0);
-    return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #elif defined(HUF_FORCE_DECOMPRESS_X2)
     (void)dtd;
     assert(dtd.tableType == 1);
-    return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #else
-    return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
-                           HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
+                           HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #endif
 }
 
 #ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
 {
     const BYTE* ip = (const BYTE*) cSrc;
 
-    size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+    size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;
 
-    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
 }
 #endif
 
-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
 {
     DTableDesc const dtd = HUF_getDTableDesc(DTable);
 #if defined(HUF_FORCE_DECOMPRESS_X1)
     (void)dtd;
     assert(dtd.tableType == 0);
-    return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #elif defined(HUF_FORCE_DECOMPRESS_X2)
     (void)dtd;
     assert(dtd.tableType == 1);
-    return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #else
-    return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
-                           HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
+                           HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #endif
 }
 
-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
 {
     /* validation checks */
     if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -1726,15 +1866,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds
 #if defined(HUF_FORCE_DECOMPRESS_X1)
         (void)algoNb;
         assert(algoNb == 0);
-        return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+        return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
 #elif defined(HUF_FORCE_DECOMPRESS_X2)
         (void)algoNb;
         assert(algoNb == 1);
-        return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+        return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
 #else
-        return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) :
-                        HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+        return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) :
+                        HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
 #endif
     }
 }
-
diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c
index dbbc7919de53..30ef65e1ab5c 100644
--- a/lib/zstd/decompress/zstd_ddict.c
+++ b/lib/zstd/decompress/zstd_ddict.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,12 +15,12 @@
 /*-*******************************************************
 *  Dependencies
 *********************************************************/
+#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customFree */
 #include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
 #include "../common/cpu.h"         /* bmi2 */
 #include "../common/mem.h"         /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY
 #include "../common/fse.h"
-#define HUF_STATIC_LINKING_ONLY
 #include "../common/huf.h"
 #include "zstd_decompress_internal.h"
 #include "zstd_ddict.h"
@@ -131,7 +132,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict,
         ZSTD_memcpy(internalBuffer, dict, dictSize);
     }
     ddict->dictSize = dictSize;
-    ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001);  /* cover both little and big endian */
+    ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001);  /* cover both little and big endian */
 
     /* parse dictionary content */
     FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , "");
@@ -237,5 +238,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict)
 unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict)
 {
     if (ddict==NULL) return 0;
-    return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize);
+    return ddict->dictID;
 }
diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h
index 8c1a79d666f8..de459a0dacd1 100644
--- a/lib/zstd/decompress/zstd_ddict.h
+++ b/lib/zstd/decompress/zstd_ddict.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c
index 6b3177c94711..5e2a3ef03732 100644
--- a/lib/zstd/decompress/zstd_decompress.c
+++ b/lib/zstd/decompress/zstd_decompress.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -52,17 +53,18 @@
 /*-*******************************************************
 *  Dependencies
 *********************************************************/
+#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */
 #include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
 #include "../common/mem.h"         /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY
 #include "../common/fse.h"
-#define HUF_STATIC_LINKING_ONLY
 #include "../common/huf.h"
 #include <linux/xxhash.h> /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */
 #include "../common/zstd_internal.h"  /* blockProperties_t */
 #include "zstd_decompress_internal.h"   /* ZSTD_DCtx */
 #include "zstd_ddict.h"  /* ZSTD_DDictDictContent */
 #include "zstd_decompress_block.h"   /* ZSTD_decompressBlock_internal */
+#include "../common/bits.h"  /* ZSTD_highbit32 */
 
 
 
@@ -72,11 +74,11 @@
  *************************************/
 
 #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4
-#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3   /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float.
-                                                     * Currently, that means a 0.75 load factor.
-                                                     * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded
-                                                     * the load factor of the ddict hash set.
-                                                     */
+#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3  /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float.
+                                                    * Currently, that means a 0.75 load factor.
+                                                    * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded
+                                                    * the load factor of the ddict hash set.
+                                                    */
 
 #define DDICT_HASHSET_TABLE_BASE_SIZE 64
 #define DDICT_HASHSET_RESIZE_FACTOR 2
@@ -237,6 +239,7 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx)
     dctx->outBufferMode = ZSTD_bm_buffered;
     dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum;
     dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict;
+    dctx->disableHufAsm = 0;
 }
 
 static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)
@@ -421,16 +424,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize)
  *  note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless
  * @return : 0, `zfhPtr` is correctly filled,
  *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
- *           or an error code, which can be tested using ZSTD_isError() */
+**           or an error code, which can be tested using ZSTD_isError() */
 size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format)
 {
     const BYTE* ip = (const BYTE*)src;
     size_t const minInputSize = ZSTD_startingInputLength(format);
 
-    ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr));   /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */
-    if (srcSize < minInputSize) return minInputSize;
-    RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter");
+    DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize);
+
+    if (srcSize > 0) {
+        /* note : technically could be considered an assert(), since it's an invalid entry */
+        RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0");
+    }
+    if (srcSize < minInputSize) {
+        if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) {
+            /* when receiving less than @minInputSize bytes,
+             * control these bytes at least correspond to a supported magic number
+             * in order to error out early if they don't.
+            **/
+            size_t const toCopy = MIN(4, srcSize);
+            unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER);
+            assert(src != NULL);
+            ZSTD_memcpy(hbuf, src, toCopy);
+            if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) {
+                /* not a zstd frame : let's check if it's a skippable frame */
+                MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START);
+                ZSTD_memcpy(hbuf, src, toCopy);
+                if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) {
+                    RETURN_ERROR(prefix_unknown,
+                                "first bytes don't correspond to any supported magic number");
+        }   }   }
+        return minInputSize;
+    }
 
+    ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr));   /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */
     if ( (format != ZSTD_f_zstd1_magicless)
       && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) {
         if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
@@ -540,49 +567,52 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize)
     sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE);
     RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32,
                     frameParameter_unsupported, "");
-    {
-        size_t const skippableSize = skippableHeaderSize + sizeU32;
+    {   size_t const skippableSize = skippableHeaderSize + sizeU32;
         RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, "");
         return skippableSize;
     }
 }
 
 /*! ZSTD_readSkippableFrame() :
- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer.
+ * Retrieves content of a skippable frame, and writes it to dst buffer.
  *
  * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written,
  * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START.  This can be NULL if the caller is not interested
  * in the magicVariant.
  *
- * Returns an error if destination buffer is not large enough, or if the frame is not skippable.
+ * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame.
  *
  * @return : number of bytes written or a ZSTD error.
  */
-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant,
-                                            const void* src, size_t srcSize)
+size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity,
+                               unsigned* magicVariant,  /* optional, can be NULL */
+                         const void* src, size_t srcSize)
 {
-    U32 const magicNumber = MEM_readLE32(src);
-    size_t skippableFrameSize = readSkippableFrameSize(src, srcSize);
-    size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE;
-
-    /* check input validity */
-    RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, "");
-    RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, "");
-    RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, "");
+    RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, "");
 
-    /* deliver payload */
-    if (skippableContentSize > 0  && dst != NULL)
-        ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize);
-    if (magicVariant != NULL)
-        *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START;
-    return skippableContentSize;
+    {   U32 const magicNumber = MEM_readLE32(src);
+        size_t skippableFrameSize = readSkippableFrameSize(src, srcSize);
+        size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE;
+
+        /* check input validity */
+        RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, "");
+        RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, "");
+        RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, "");
+
+        /* deliver payload */
+        if (skippableContentSize > 0  && dst != NULL)
+            ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize);
+        if (magicVariant != NULL)
+            *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START;
+        return skippableContentSize;
+    }
 }
 
 /* ZSTD_findDecompressedSize() :
- *  compatible with legacy mode
  *  `srcSize` must be the exact length of some number of ZSTD compressed and/or
  *      skippable frames
- *  @return : decompressed size of the frames contained */
+ *  note: compatible with legacy mode
+ * @return : decompressed size of the frames contained */
 unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
 {
     unsigned long long totalDstSize = 0;
@@ -592,9 +622,7 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
 
         if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
             size_t const skippableSize = readSkippableFrameSize(src, srcSize);
-            if (ZSTD_isError(skippableSize)) {
-                return ZSTD_CONTENTSIZE_ERROR;
-            }
+            if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR;
             assert(skippableSize <= srcSize);
 
             src = (const BYTE *)src + skippableSize;
@@ -602,17 +630,17 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
             continue;
         }
 
-        {   unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize);
-            if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret;
+        {   unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize);
+            if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs;
 
-            /* check for overflow */
-            if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR;
-            totalDstSize += ret;
+            if (totalDstSize + fcs < totalDstSize)
+                return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */
+            totalDstSize += fcs;
         }
+        /* skip to next frame */
         {   size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize);
-            if (ZSTD_isError(frameSrcSize)) {
-                return ZSTD_CONTENTSIZE_ERROR;
-            }
+            if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR;
+            assert(frameSrcSize <= srcSize);
 
             src = (const BYTE *)src + frameSrcSize;
             srcSize -= frameSrcSize;
@@ -730,10 +758,11 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize
             ip += 4;
         }
 
+        frameSizeInfo.nbBlocks = nbBlocks;
         frameSizeInfo.compressedSize = (size_t)(ip - ipstart);
         frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN)
                                         ? zfh.frameContentSize
-                                        : nbBlocks * zfh.blockSizeMax;
+                                        : (unsigned long long)nbBlocks * zfh.blockSizeMax;
         return frameSizeInfo;
     }
 }
@@ -773,6 +802,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize)
     return bound;
 }
 
+size_t ZSTD_decompressionMargin(void const* src, size_t srcSize)
+{
+    size_t margin = 0;
+    unsigned maxBlockSize = 0;
+
+    /* Iterate over each frame */
+    while (srcSize > 0) {
+        ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
+        size_t const compressedSize = frameSizeInfo.compressedSize;
+        unsigned long long const decompressedBound = frameSizeInfo.decompressedBound;
+        ZSTD_frameHeader zfh;
+
+        FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), "");
+        if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR)
+            return ERROR(corruption_detected);
+
+        if (zfh.frameType == ZSTD_frame) {
+            /* Add the frame header to our margin */
+            margin += zfh.headerSize;
+            /* Add the checksum to our margin */
+            margin += zfh.checksumFlag ? 4 : 0;
+            /* Add 3 bytes per block */
+            margin += 3 * frameSizeInfo.nbBlocks;
+
+            /* Compute the max block size */
+            maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax);
+        } else {
+            assert(zfh.frameType == ZSTD_skippableFrame);
+            /* Add the entire skippable frame size to our margin. */
+            margin += compressedSize;
+        }
+
+        assert(srcSize >= compressedSize);
+        src = (const BYTE*)src + compressedSize;
+        srcSize -= compressedSize;
+    }
+
+    /* Add the max block size back to the margin. */
+    margin += maxBlockSize;
+
+    return margin;
+}
 
 /*-*************************************************************
  *   Frame decoding
@@ -930,6 +1001,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
     }
     ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0);
     /* Allow caller to get size read */
+    DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op-ostart, ip - (const BYTE*)*srcPtr);
     *srcPtr = ip;
     *srcSizePtr = remainingSrcSize;
     return (size_t)(op-ostart);
@@ -955,17 +1027,18 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
     while (srcSize >= ZSTD_startingInputLength(dctx->format)) {
 
 
-        {   U32 const magicNumber = MEM_readLE32(src);
-            DEBUGLOG(4, "reading magic number %08X (expecting %08X)",
-                        (unsigned)magicNumber, ZSTD_MAGICNUMBER);
+        if (srcSize >= 4) {
+            U32 const magicNumber = MEM_readLE32(src);
+            DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber);
             if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
+                /* skippable frame detected : skip it */
                 size_t const skippableSize = readSkippableFrameSize(src, srcSize);
-                FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed");
+                FORWARD_IF_ERROR(skippableSize, "invalid skippable frame");
                 assert(skippableSize <= srcSize);
 
                 src = (const BYTE *)src + skippableSize;
                 srcSize -= skippableSize;
-                continue;
+                continue; /* check next frame */
         }   }
 
         if (ddict) {
@@ -1061,8 +1134,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr
 size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; }
 
 /*
- * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed,
- * we allow taking a partial block as the input. Currently only raw uncompressed blocks can
+ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we
+ * allow taking a partial block as the input. Currently only raw uncompressed blocks can
  * be streamed.
  *
  * For blocks that can be streamed, this allows us to reduce the latency until we produce
@@ -1262,7 +1335,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c
 
     default:
         assert(0);   /* impossible */
-        RETURN_ERROR(GENERIC, "impossible to reach");   /* some compiler require default to do something */
+        RETURN_ERROR(GENERIC, "impossible to reach");   /* some compilers require default to do something */
     }
 }
 
@@ -1303,11 +1376,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
         /* in minimal huffman, we always use X1 variants */
         size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable,
                                                 dictPtr, dictEnd - dictPtr,
-                                                workspace, workspaceSize);
+                                                workspace, workspaceSize, /* flags */ 0);
 #else
         size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable,
                                                 dictPtr, (size_t)(dictEnd - dictPtr),
-                                                workspace, workspaceSize);
+                                                workspace, workspaceSize, /* flags */ 0);
 #endif
         RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, "");
         dictPtr += hSize;
@@ -1403,7 +1476,7 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
     dctx->prefixStart = NULL;
     dctx->virtualStart = NULL;
     dctx->dictEnd = NULL;
-    dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001);  /* cover both little and big endian */
+    dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001);  /* cover both little and big endian */
     dctx->litEntropy = dctx->fseEntropy = 0;
     dctx->dictID = 0;
     dctx->bType = bt_reserved;
@@ -1465,7 +1538,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
  *  This could for one of the following reasons :
  *  - The frame does not require a dictionary (most common case).
  *  - The frame was built with dictID intentionally removed.
- *    Needed dictionary is a hidden information.
+ *    Needed dictionary is a hidden piece of information.
  *    Note : this use case also happens when using a non-conformant dictionary.
  *  - `srcSize` is too small, and as a result, frame header could not be decoded.
  *    Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`.
@@ -1474,7 +1547,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
  *  ZSTD_getFrameHeader(), which will provide a more precise error code. */
 unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize)
 {
-    ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 };
+    ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 };
     size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize);
     if (ZSTD_isError(hError)) return 0;
     return zfp.dictID;
@@ -1581,7 +1654,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di
 size_t ZSTD_initDStream(ZSTD_DStream* zds)
 {
     DEBUGLOG(4, "ZSTD_initDStream");
-    return ZSTD_initDStream_usingDDict(zds, NULL);
+    FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), "");
+    FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), "");
+    return ZSTD_startingInputLength(zds->format);
 }
 
 /* ZSTD_initDStream_usingDDict() :
@@ -1589,20 +1664,12 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds)
  * this function cannot fail */
 size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict)
 {
+    DEBUGLOG(4, "ZSTD_initDStream_usingDDict");
     FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , "");
     FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , "");
     return ZSTD_startingInputLength(dctx->format);
 }
 
-/* ZSTD_resetDStream() :
- * return : expected size, aka ZSTD_startingInputLength().
- * this function cannot fail */
-size_t ZSTD_resetDStream(ZSTD_DStream* dctx)
-{
-    FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), "");
-    return ZSTD_startingInputLength(dctx->format);
-}
-
 
 size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
 {
@@ -1670,6 +1737,11 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam)
             bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict;
             bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts;
             return bounds;
+        case ZSTD_d_disableHuffmanAssembly:
+            bounds.lowerBound = 0;
+            bounds.upperBound = 1;
+            return bounds;
+
         default:;
     }
     bounds.error = ERROR(parameter_unsupported);
@@ -1710,6 +1782,9 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value
         case ZSTD_d_refMultipleDDicts:
             *value = (int)dctx->refMultipleDDicts;
             return 0;
+        case ZSTD_d_disableHuffmanAssembly:
+            *value = (int)dctx->disableHufAsm;
+            return 0;
         default:;
     }
     RETURN_ERROR(parameter_unsupported, "");
@@ -1743,6 +1818,10 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value
             }
             dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value;
             return 0;
+        case ZSTD_d_disableHuffmanAssembly:
+            CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value);
+            dctx->disableHufAsm = value != 0;
+            return 0;
         default:;
     }
     RETURN_ERROR(parameter_unsupported, "");
@@ -1918,7 +1997,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                 if (zds->refMultipleDDicts && zds->ddictSet) {
                     ZSTD_DCtx_selectFrameDDict(zds);
                 }
-                DEBUGLOG(5, "header size : %u", (U32)hSize);
                 if (ZSTD_isError(hSize)) {
                     return hSize;   /* error */
                 }
@@ -1932,6 +2010,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                             zds->lhSize += remainingInput;
                         }
                         input->pos = input->size;
+                        /* check first few bytes */
+                        FORWARD_IF_ERROR(
+                            ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format),
+                            "First few bytes detected incorrect" );
+                        /* return hint input size */
                         return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize;   /* remaining header bytes + next block header */
                     }
                     assert(ip != NULL);
@@ -1949,8 +2032,9 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                     size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds));
                     if (ZSTD_isError(decompressedSize)) return decompressedSize;
                     DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()")
+                    assert(istart != NULL);
                     ip = istart + cSize;
-                    op += decompressedSize;
+                    op = op ? op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */
                     zds->expected = 0;
                     zds->streamStage = zdss_init;
                     someMoreWork = 0;
@@ -2034,6 +2118,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                 }
                 if ((size_t)(iend-ip) >= neededInSize) {  /* decode directly from src */
                     FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), "");
+                    assert(ip != NULL);
                     ip += neededInSize;
                     /* Function modifies the stage so we must break */
                     break;
@@ -2048,7 +2133,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                 int const isSkipFrame = ZSTD_isSkipFrame(zds);
                 size_t loadedSize;
                 /* At this point we shouldn't be decompressing a block that we can stream. */
-                assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip));
+                assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip)));
                 if (isSkipFrame) {
                     loadedSize = MIN(toLoad, (size_t)(iend-ip));
                 } else {
@@ -2057,8 +2142,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                                     "should never happen");
                     loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip));
                 }
-                ip += loadedSize;
-                zds->inPos += loadedSize;
+                if (loadedSize != 0) {
+                    /* ip may be NULL */
+                    ip += loadedSize;
+                    zds->inPos += loadedSize;
+                }
                 if (loadedSize < toLoad) { someMoreWork = 0; break; }   /* not enough input, wait for more */
 
                 /* decode loaded input */
@@ -2068,14 +2156,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                 break;
             }
         case zdss_flush:
-            {   size_t const toFlushSize = zds->outEnd - zds->outStart;
+            {
+                size_t const toFlushSize = zds->outEnd - zds->outStart;
                 size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize);
-                op += flushedSize;
+
+                op = op ? op + flushedSize : op;
+
                 zds->outStart += flushedSize;
                 if (flushedSize == toFlushSize) {  /* flush completed */
                     zds->streamStage = zdss_read;
                     if ( (zds->outBuffSize < zds->fParams.frameContentSize)
-                      && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) {
+                        && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) {
                         DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)",
                                 (int)(zds->outBuffSize - zds->outStart),
                                 (U32)zds->fParams.blockSizeMax);
@@ -2089,7 +2180,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
 
         default:
             assert(0);    /* impossible */
-            RETURN_ERROR(GENERIC, "impossible to reach");   /* some compiler require default to do something */
+            RETURN_ERROR(GENERIC, "impossible to reach");   /* some compilers require default to do something */
     }   }
 
     /* result */
@@ -2102,8 +2193,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
     if ((ip==istart) && (op==ostart)) {  /* no forward progress */
         zds->noForwardProgress ++;
         if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) {
-            RETURN_ERROR_IF(op==oend, dstSize_tooSmall, "");
-            RETURN_ERROR_IF(ip==iend, srcSize_wrong, "");
+            RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, "");
+            RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, "");
             assert(0);
         }
     } else {
@@ -2140,11 +2231,17 @@ size_t ZSTD_decompressStream_simpleArgs (
                             void* dst, size_t dstCapacity, size_t* dstPos,
                       const void* src, size_t srcSize, size_t* srcPos)
 {
-    ZSTD_outBuffer output = { dst, dstCapacity, *dstPos };
-    ZSTD_inBuffer  input  = { src, srcSize, *srcPos };
-    /* ZSTD_compress_generic() will check validity of dstPos and srcPos */
-    size_t const cErr = ZSTD_decompressStream(dctx, &output, &input);
-    *dstPos = output.pos;
-    *srcPos = input.pos;
-    return cErr;
+    ZSTD_outBuffer output;
+    ZSTD_inBuffer  input;
+    output.dst = dst;
+    output.size = dstCapacity;
+    output.pos = *dstPos;
+    input.src = src;
+    input.size = srcSize;
+    input.pos = *srcPos;
+    {   size_t const cErr = ZSTD_decompressStream(dctx, &output, &input);
+        *dstPos = output.pos;
+        *srcPos = input.pos;
+        return cErr;
+    }
 }
diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c
index c1913b8e7c89..9f5577e5bc19 100644
--- a/lib/zstd/decompress/zstd_decompress_block.c
+++ b/lib/zstd/decompress/zstd_decompress_block.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -20,12 +21,12 @@
 #include "../common/mem.h"         /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY
 #include "../common/fse.h"
-#define HUF_STATIC_LINKING_ONLY
 #include "../common/huf.h"
 #include "../common/zstd_internal.h"
 #include "zstd_decompress_internal.h"   /* ZSTD_DCtx */
 #include "zstd_ddict.h"  /* ZSTD_DDictDictContent */
 #include "zstd_decompress_block.h"
+#include "../common/bits.h"  /* ZSTD_highbit32 */
 
 /*_*******************************************************
 *  Macros
@@ -89,7 +90,7 @@ static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const
             dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
         }
         else {
-            /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */
+            /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */
             dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
             dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
         }
@@ -134,13 +135,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
             ZSTD_FALLTHROUGH;
 
         case set_compressed:
-            RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
+            RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3");
             {   size_t lhSize, litSize, litCSize;
                 U32 singleStream=0;
                 U32 const lhlCode = (istart[0] >> 2) & 3;
                 U32 const lhc = MEM_readLE32(istart);
                 size_t hufSuccess;
                 size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
+                int const flags = 0
+                    | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0)
+                    | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0);
                 switch(lhlCode)
                 {
                 case 0: case 1: default:   /* note : default is impossible, since lhlCode into [0..3] */
@@ -165,6 +169,10 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                 }
                 RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
                 RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
+                if (!singleStream)
+                    RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong,
+                        "Not enough literals (%zu) for the 4-streams mode (min %u)",
+                        litSize, MIN_LITERALS_FOR_4_STREAMS);
                 RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
                 RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");
                 ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);
@@ -176,13 +184,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
 
                 if (litEncType==set_repeat) {
                     if (singleStream) {
-                        hufSuccess = HUF_decompress1X_usingDTable_bmi2(
+                        hufSuccess = HUF_decompress1X_usingDTable(
                             dctx->litBuffer, litSize, istart+lhSize, litCSize,
-                            dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
+                            dctx->HUFptr, flags);
                     } else {
-                        hufSuccess = HUF_decompress4X_usingDTable_bmi2(
+                        assert(litSize >= MIN_LITERALS_FOR_4_STREAMS);
+                        hufSuccess = HUF_decompress4X_usingDTable(
                             dctx->litBuffer, litSize, istart+lhSize, litCSize,
-                            dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
+                            dctx->HUFptr, flags);
                     }
                 } else {
                     if (singleStream) {
@@ -190,18 +199,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                         hufSuccess = HUF_decompress1X_DCtx_wksp(
                             dctx->entropy.hufTable, dctx->litBuffer, litSize,
                             istart+lhSize, litCSize, dctx->workspace,
-                            sizeof(dctx->workspace));
+                            sizeof(dctx->workspace), flags);
 #else
-                        hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(
+                        hufSuccess = HUF_decompress1X1_DCtx_wksp(
                             dctx->entropy.hufTable, dctx->litBuffer, litSize,
                             istart+lhSize, litCSize, dctx->workspace,
-                            sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
+                            sizeof(dctx->workspace), flags);
 #endif
                     } else {
-                        hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(
+                        hufSuccess = HUF_decompress4X_hufOnly_wksp(
                             dctx->entropy.hufTable, dctx->litBuffer, litSize,
                             istart+lhSize, litCSize, dctx->workspace,
-                            sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
+                            sizeof(dctx->workspace), flags);
                     }
                 }
                 if (dctx->litBufferLocation == ZSTD_split)
@@ -237,6 +246,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                     break;
                 case 3:
                     lhSize = 3;
+                    RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3");
                     litSize = MEM_readLE24(istart) >> 4;
                     break;
                 }
@@ -279,12 +289,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                     break;
                 case 1:
                     lhSize = 2;
+                    RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3");
                     litSize = MEM_readLE16(istart) >> 4;
                     break;
                 case 3:
                     lhSize = 3;
+                    RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4");
                     litSize = MEM_readLE24(istart) >> 4;
-                    RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
                     break;
                 }
                 RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
@@ -506,14 +517,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
                 for (i = 8; i < n; i += 8) {
                     MEM_write64(spread + pos + i, sv);
                 }
-                pos += n;
+                assert(n>=0);
+                pos += (size_t)n;
             }
         }
         /* Now we spread those positions across the table.
-         * The benefit of doing it in two stages is that we avoid the the
+         * The benefit of doing it in two stages is that we avoid the
          * variable size inner loop, which caused lots of branch misses.
          * Now we can run through all the positions without any branch misses.
-         * We unroll the loop twice, since that is what emperically worked best.
+         * We unroll the loop twice, since that is what empirically worked best.
          */
         {
             size_t position = 0;
@@ -540,7 +552,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
             for (i=0; i<n; i++) {
                 tableDecode[position].baseValue = s;
                 position = (position + step) & tableMask;
-                while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+                while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask;   /* lowprob area */
         }   }
         assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
     }
@@ -551,7 +563,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
         for (u=0; u<tableSize; u++) {
             U32 const symbol = tableDecode[u].baseValue;
             U32 const nextState = symbolNext[symbol]++;
-            tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+            tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
             tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
             assert(nbAdditionalBits[symbol] < 255);
             tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
@@ -964,6 +976,11 @@ size_t ZSTD_execSequence(BYTE* op,
 
     assert(op != NULL /* Precondition */);
     assert(oend_w < oend /* No underflow */);
+
+#if defined(__aarch64__)
+    /* prefetch sequence starting from match that will be used for copy later */
+    PREFETCH_L1(match);
+#endif
     /* Handle edge cases in a slow path:
      *   - Read beyond end of literals
      *   - Match end is within WILDCOPY_OVERLIMIT of oend
@@ -1154,7 +1171,7 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16
 }
 
 /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
- * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
+ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32
  * bits before reloading. This value is the maximum number of bytes we read
  * after reloading when we are decoding long offsets.
  */
@@ -1169,9 +1186,27 @@ FORCE_INLINE_TEMPLATE seq_t
 ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
 {
     seq_t seq;
+    /*
+     * ZSTD_seqSymbol is a structure with a total of 64 bits wide. So it can be
+     * loaded in one operation and extracted its fields by simply shifting or
+     * bit-extracting on aarch64.
+     * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh
+     * operations that cause performance drop. This can be avoided by using this
+     * ZSTD_memcpy hack.
+     */
+#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__))
+    ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS;
+    ZSTD_seqSymbol* const llDInfo = &llDInfoS;
+    ZSTD_seqSymbol* const mlDInfo = &mlDInfoS;
+    ZSTD_seqSymbol* const ofDInfo = &ofDInfoS;
+    ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol));
+    ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol));
+    ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol));
+#else
     const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
     const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
     const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
+#endif
     seq.matchLength = mlDInfo->baseValue;
     seq.litLength = llDInfo->baseValue;
     {   U32 const ofBase = ofDInfo->baseValue;
@@ -1186,28 +1221,31 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
         U32 const llnbBits = llDInfo->nbBits;
         U32 const mlnbBits = mlDInfo->nbBits;
         U32 const ofnbBits = ofDInfo->nbBits;
+
+        assert(llBits <= MaxLLBits);
+        assert(mlBits <= MaxMLBits);
+        assert(ofBits <= MaxOff);
         /*
          * As gcc has better branch and block analyzers, sometimes it is only
-         * valuable to mark likelyness for clang, it gives around 3-4% of
+         * valuable to mark likeliness for clang, it gives around 3-4% of
          * performance.
          */
 
         /* sequence */
         {   size_t offset;
-    #if defined(__clang__)
-            if (LIKELY(ofBits > 1)) {
-    #else
             if (ofBits > 1) {
-    #endif
                 ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
                 ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
-                assert(ofBits <= MaxOff);
+                ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32);
+                ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits);
                 if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
-                    U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
+                    /* Always read extra bits, this keeps the logic simple,
+                     * avoids branches, and avoids accidentally reading 0 bits.
+                     */
+                    U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32;
                     offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
                     BIT_reloadDStream(&seqState->DStream);
-                    if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
-                    assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32);   /* to avoid another reload */
+                    offset += BIT_readBitsFast(&seqState->DStream, extraBits);
                 } else {
                     offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/);   /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
                     if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
@@ -1232,11 +1270,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
             seq.offset = offset;
         }
 
-    #if defined(__clang__)
-        if (UNLIKELY(mlBits > 0))
-    #else
         if (mlBits > 0)
-    #endif
             seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
 
         if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
@@ -1246,11 +1280,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
         /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
         ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
 
-    #if defined(__clang__)
-        if (UNLIKELY(llBits > 0))
-    #else
         if (llBits > 0)
-    #endif
             seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
 
         if (MEM_32bits())
@@ -1552,7 +1582,7 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
     const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
     const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
     const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
-    DEBUGLOG(5, "ZSTD_decompressSequences_body");
+    DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq);
     (void)frame;
 
     /* Regen sequences */
@@ -1945,34 +1975,79 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
 
 
+/*
+ * @returns The total size of the history referenceable by zstd, including
+ * both the prefix and the extDict. At @p op any offset larger than this
+ * is invalid.
+ */
+static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart)
+{
+    return (size_t)(op - virtualStart);
+}
+
+typedef struct {
+    unsigned longOffsetShare;
+    unsigned maxNbAdditionalBits;
+} ZSTD_OffsetInfo;
 
-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
-    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
-/* ZSTD_getLongOffsetsShare() :
+/* ZSTD_getOffsetInfo() :
  * condition : offTable must be valid
  * @return : "share" of long offsets (arbitrarily defined as > (1<<23))
- *           compared to maximum possible of (1<<OffFSELog) */
-static unsigned
-ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
+ *           compared to maximum possible of (1<<OffFSELog),
+ *           as well as the maximum number additional bits required.
+ */
+static ZSTD_OffsetInfo
+ZSTD_getOffsetInfo(const ZSTD_seqSymbol* offTable, int nbSeq)
 {
-    const void* ptr = offTable;
-    U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
-    const ZSTD_seqSymbol* table = offTable + 1;
-    U32 const max = 1 << tableLog;
-    U32 u, total = 0;
-    DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
-
-    assert(max <= (1 << OffFSELog));  /* max not too large */
-    for (u=0; u<max; u++) {
-        if (table[u].nbAdditionalBits > 22) total += 1;
+    ZSTD_OffsetInfo info = {0, 0};
+    /* If nbSeq == 0, then the offTable is uninitialized, but we have
+     * no sequences, so both values should be 0.
+     */
+    if (nbSeq != 0) {
+        const void* ptr = offTable;
+        U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
+        const ZSTD_seqSymbol* table = offTable + 1;
+        U32 const max = 1 << tableLog;
+        U32 u;
+        DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
+
+        assert(max <= (1 << OffFSELog));  /* max not too large */
+        for (u=0; u<max; u++) {
+            info.maxNbAdditionalBits = MAX(info.maxNbAdditionalBits, table[u].nbAdditionalBits);
+            if (table[u].nbAdditionalBits > 22) info.longOffsetShare += 1;
+        }
+
+        assert(tableLog <= OffFSELog);
+        info.longOffsetShare <<= (OffFSELog - tableLog);  /* scale to OffFSELog */
     }
 
-    assert(tableLog <= OffFSELog);
-    total <<= (OffFSELog - tableLog);  /* scale to OffFSELog */
+    return info;
+}
 
-    return total;
+/*
+ * @returns The maximum offset we can decode in one read of our bitstream, without
+ * reloading more bits in the middle of the offset bits read. Any offsets larger
+ * than this must use the long offset decoder.
+ */
+static size_t ZSTD_maxShortOffset(void)
+{
+    if (MEM_64bits()) {
+        /* We can decode any offset without reloading bits.
+         * This might change if the max window size grows.
+         */
+        ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
+        return (size_t)-1;
+    } else {
+        /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1.
+         * This offBase would require STREAM_ACCUMULATOR_MIN extra bits.
+         * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset.
+         */
+        size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1;
+        size_t const maxOffset = maxOffbase - ZSTD_REP_NUM;
+        assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN);
+        return maxOffset;
+    }
 }
-#endif
 
 size_t
 ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
@@ -1980,20 +2055,21 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
                         const void* src, size_t srcSize, const int frame, const streaming_operation streaming)
 {   /* blockType == blockCompressed */
     const BYTE* ip = (const BYTE*)src;
-    /* isLongOffset must be true if there are long offsets.
-     * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
-     * We don't expect that to be the case in 64-bit mode.
-     * In block mode, window size is not known, so we have to be conservative.
-     * (note: but it could be evaluated from current-lowLimit)
-     */
-    ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
     DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
 
-    RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
+    /* Note : the wording of the specification
+     * allows compressed block to be sized exactly ZSTD_BLOCKSIZE_MAX.
+     * This generally does not happen, as it makes little sense,
+     * since an uncompressed block would feature same size and have no decompression cost.
+     * Also, note that decoder from reference libzstd before < v1.5.4
+     * would consider this edge case as an error.
+     * As a consequence, avoid generating compressed blocks of size ZSTD_BLOCKSIZE_MAX
+     * for broader compatibility with the deployed ecosystem of zstd decoders */
+    RETURN_ERROR_IF(srcSize > ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
 
     /* Decode literals section */
     {   size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
-        DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
+        DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize);
         if (ZSTD_isError(litCSize)) return litCSize;
         ip += litCSize;
         srcSize -= litCSize;
@@ -2001,6 +2077,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
 
     /* Build Decoding Tables */
     {
+        /* Compute the maximum block size, which must also work when !frame and fParams are unset.
+         * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t.
+         */
+        size_t const blockSizeMax = MIN(dstCapacity, (frame ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX));
+        size_t const totalHistorySize = ZSTD_totalHistorySize((BYTE*)dst + blockSizeMax, (BYTE const*)dctx->virtualStart);
+        /* isLongOffset must be true if there are long offsets.
+         * Offsets are long if they are larger than ZSTD_maxShortOffset().
+         * We don't expect that to be the case in 64-bit mode.
+         *
+         * We check here to see if our history is large enough to allow long offsets.
+         * If it isn't, then we can't possible have (valid) long offsets. If the offset
+         * is invalid, then it is okay to read it incorrectly.
+         *
+         * If isLongOffsets is true, then we will later check our decoding table to see
+         * if it is even possible to generate long offsets.
+         */
+        ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset()));
         /* These macros control at build-time which decompressor implementation
          * we use. If neither is defined, we do some inspection and dispatch at
          * runtime.
@@ -2008,6 +2101,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
 #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
     !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
         int usePrefetchDecoder = dctx->ddictIsCold;
+#else
+        /* Set to 1 to avoid computing offset info if we don't need to.
+         * Otherwise this value is ignored.
+         */
+        int usePrefetchDecoder = 1;
 #endif
         int nbSeq;
         size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
@@ -2015,28 +2113,42 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
         ip += seqHSize;
         srcSize -= seqHSize;
 
-        RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
+        RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
+        RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall,
+                "invalid dst");
 
-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
-    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
-        if ( !usePrefetchDecoder
-          && (!frame || (dctx->fParams.windowSize > (1<<24)))
-          && (nbSeq>ADVANCED_SEQS) ) {  /* could probably use a larger nbSeq limit */
-            U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
-            U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
-            usePrefetchDecoder = (shareLongOffsets >= minShare);
+        /* If we could potentially have long offsets, or we might want to use the prefetch decoder,
+         * compute information about the share of long offsets, and the maximum nbAdditionalBits.
+         * NOTE: could probably use a larger nbSeq limit
+         */
+        if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) {
+            ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq);
+            if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) {
+                /* If isLongOffset, but the maximum number of additional bits that we see in our table is small
+                 * enough, then we know it is impossible to have too long an offset in this block, so we can
+                 * use the regular offset decoder.
+                 */
+                isLongOffset = ZSTD_lo_isRegularOffset;
+            }
+            if (!usePrefetchDecoder) {
+                U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
+                usePrefetchDecoder = (info.longOffsetShare >= minShare);
+            }
         }
-#endif
 
         dctx->ddictIsCold = 0;
 
 #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
     !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
-        if (usePrefetchDecoder)
+        if (usePrefetchDecoder) {
+#else
+        (void)usePrefetchDecoder;
+        {
 #endif
 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
             return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
 #endif
+        }
 
 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
         /* else */
@@ -2060,9 +2172,9 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
 }
 
 
-size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
-                            void* dst, size_t dstCapacity,
-                      const void* src, size_t srcSize)
+size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx,
+                                       void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize)
 {
     size_t dSize;
     ZSTD_checkContinuity(dctx, dst, dstCapacity);
@@ -2070,3 +2182,12 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
     dctx->previousDstEnd = (char*)dst + dSize;
     return dSize;
 }
+
+
+/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */
+size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{
+    return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize);
+}
diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h
index 3d2d57a5d25a..5888e6cc788b 100644
--- a/lib/zstd/decompress/zstd_decompress_block.h
+++ b/lib/zstd/decompress/zstd_decompress_block.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
                    unsigned tableLog, void* wksp, size_t wkspSize,
                    int bmi2);
 
+/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. */
+size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize);
+
 
 #endif /* ZSTD_DEC_BLOCK_H */
diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h
index 98102edb6a83..32f79fb2873d 100644
--- a/lib/zstd/decompress/zstd_decompress_internal.h
+++ b/lib/zstd/decompress/zstd_decompress_internal.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -75,12 +76,13 @@ static UNUSED_ATTR const U32 ML_base[MaxML+1] = {
 
 #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64))
 #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32))
+#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12
 
 typedef struct {
     ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)];    /* Note : Space reserved for FSE Tables */
     ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)];   /* is also used as temporary workspace while building hufTable during DDict creation */
     ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)];    /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
-    HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)];  /* can accommodate HUF_decompress4X */
+    HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)];  /* can accommodate HUF_decompress4X */
     U32 rep[ZSTD_REP_NUM];
     U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32];
 } ZSTD_entropyDTables_t;
@@ -164,6 +166,7 @@ struct ZSTD_DCtx_s
     ZSTD_dictUses_e dictUses;
     ZSTD_DDictHashSet* ddictSet;                    /* Hash set for multiple ddicts */
     ZSTD_refMultipleDDicts_e refMultipleDDicts;     /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */
+    int disableHufAsm;
 
     /* streaming */
     ZSTD_dStreamStage streamStage;
diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h
index a06ca187aab5..8a47eb2a4514 100644
--- a/lib/zstd/decompress_sources.h
+++ b/lib/zstd/decompress_sources.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c
index 22686e367e6f..466828e35752 100644
--- a/lib/zstd/zstd_common_module.c
+++ b/lib/zstd/zstd_common_module.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -24,9 +24,6 @@ EXPORT_SYMBOL_GPL(HUF_readStats_wksp);
 EXPORT_SYMBOL_GPL(ZSTD_isError);
 EXPORT_SYMBOL_GPL(ZSTD_getErrorName);
 EXPORT_SYMBOL_GPL(ZSTD_getErrorCode);
-EXPORT_SYMBOL_GPL(ZSTD_customMalloc);
-EXPORT_SYMBOL_GPL(ZSTD_customCalloc);
-EXPORT_SYMBOL_GPL(ZSTD_customFree);
 
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_DESCRIPTION("Zstd Common");
diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c
index 04e1b5c01d9b..8ecf43226af2 100644
--- a/lib/zstd/zstd_compress_module.c
+++ b/lib/zstd/zstd_compress_module.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c
index f4ed952ed485..7d31518e9d5a 100644
--- a/lib/zstd/zstd_decompress_module.c
+++ b/lib/zstd/zstd_decompress_module.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -77,7 +77,7 @@ EXPORT_SYMBOL(zstd_init_dstream);
 
 size_t zstd_reset_dstream(zstd_dstream *dstream)
 {
-	return ZSTD_resetDStream(dstream);
+	return ZSTD_DCtx_reset(dstream, ZSTD_reset_session_only);
 }
 EXPORT_SYMBOL(zstd_reset_dstream);
 
-- 
2.41.0