From 9ee19b282653761511acbde09e77416a96f55a5b Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Fri, 10 Mar 2023 17:59:47 +0100 Subject: [PATCH 01/15] bbr2 Signed-off-by: Peter Jung --- include/linux/tcp.h | 3 +- include/net/inet_connection_sock.h | 3 +- include/net/tcp.h | 41 +- include/uapi/linux/inet_diag.h | 33 + net/ipv4/Kconfig | 22 + net/ipv4/Makefile | 1 + net/ipv4/tcp.c | 1 + net/ipv4/tcp_bbr.c | 38 +- net/ipv4/tcp_bbr2.c | 2674 ++++++++++++++++++++++++++++ net/ipv4/tcp_cong.c | 1 + net/ipv4/tcp_input.c | 27 +- net/ipv4/tcp_output.c | 26 +- net/ipv4/tcp_rate.c | 30 +- net/ipv4/tcp_timer.c | 1 + 14 files changed, 2867 insertions(+), 34 deletions(-) create mode 100644 net/ipv4/tcp_bbr2.c diff --git a/include/linux/tcp.h b/include/linux/tcp.h index ca7f05a130d2..09dbcd67ee8e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -255,7 +255,8 @@ struct tcp_sock { u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ - unused:5; + fast_ack_mode:2, /* which fast ack mode ? */ + unused:3; u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c2b15f7e5516..d85858efa571 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -135,7 +135,8 @@ struct inet_connection_sock { u32 icsk_probes_tstamp; u32 icsk_user_timeout; - u64 icsk_ca_priv[104 / sizeof(u64)]; +/* XXX inflated by temporary internal debugging info */ + u64 icsk_ca_priv[216 / sizeof(u64)]; #define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) }; diff --git a/include/net/tcp.h b/include/net/tcp.h index db9f828e9d1e..e1f05c2b4707 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -375,6 +375,7 @@ static inline void tcp_dec_quickack_mode(struct sock *sk, #define TCP_ECN_QUEUE_CWR 2 #define TCP_ECN_DEMAND_CWR 4 #define TCP_ECN_SEEN 8 +#define TCP_ECN_ECT_PERMANENT 16 enum tcp_tw_status { TCP_TW_SUCCESS = 0, @@ -823,6 +824,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) return max_t(s64, t1 - t0, 0); } +static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) +{ + return max_t(s32, t1 - t0, 0); +} + static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) { return tcp_ns_to_ts(skb->skb_mstamp_ns); @@ -898,9 +904,14 @@ struct tcp_skb_cb { /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ - u64 first_tx_mstamp; + u32 first_tx_mstamp; /* when we reached the "delivered" count */ - u64 delivered_mstamp; + u32 delivered_mstamp; +#define TCPCB_IN_FLIGHT_BITS 20 +#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) + u32 in_flight:20, /* packets in flight at transmit */ + unused2:12; + u32 lost; /* packets lost so far upon tx of skb */ } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; @@ -1026,7 +1037,11 @@ enum tcp_ca_ack_event_flags { #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 -#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) +/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). 
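+ * Congestion control modules that set this flag (without necessarily
+ * setting TCP_CONG_NEEDS_ECN) still receive CA_EVENT_ECN_IS_CE and
+ * CA_EVENT_ECN_NO_CE through their cwnd_event() hook; see
+ * tcp_ca_wants_ce_events() below for the check the stack performs.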
*/ +#define TCP_CONG_WANTS_CE_EVENTS 0x4 +#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ + TCP_CONG_NEEDS_ECN | \ + TCP_CONG_WANTS_CE_EVENTS) union tcp_cc_info; @@ -1046,8 +1061,11 @@ struct ack_sample { */ struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ + u32 prior_lost; /* tp->lost at "prior_mstamp" */ u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ + u32 tx_in_flight; /* packets in flight at starting timestamp */ + s32 lost; /* number of packets lost over interval */ s32 delivered; /* number of packets delivered over interval */ s32 delivered_ce; /* number of packets delivered w/ CE marks*/ long interval_us; /* time for tp->delivered to incr "delivered" */ @@ -1061,6 +1079,7 @@ struct rate_sample { bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ bool is_ack_delayed; /* is this (likely) a delayed ACK? */ + bool is_ece; /* did this ACK have ECN marked? */ }; struct tcp_congestion_ops { @@ -1084,8 +1103,11 @@ struct tcp_congestion_ops { /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); - /* override sysctl_tcp_min_tso_segs */ - u32 (*min_tso_segs)(struct sock *sk); + /* pick target number of segments per TSO/GSO skb (optional): */ + u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); + + /* react to a specific lost skb (optional) */ + void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) @@ -1148,6 +1170,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) } #endif +static inline bool tcp_ca_wants_ce_events(const struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | + TCP_CONG_WANTS_CE_EVENTS); +} + static inline bool tcp_ca_needs_ecn(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1167,6 +1197,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) void tcp_set_ca_state(struct sock *sk, const u8 ca_state); /* From tcp_rate.c */ +void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 50655de04c9b..0e24f11627d5 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -231,9 +231,42 @@ struct tcp_bbr_info { __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ }; +/* Phase as reported in netlink/ss stats. 
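+ * The current phase is exported in tcp_bbr2_info.bbr_phase below, which
+ * user space (e.g. the ss tool) reads back via inet_diag / union
+ * tcp_cc_info.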
*/ +enum tcp_bbr2_phase { + BBR2_PHASE_INVALID = 0, + BBR2_PHASE_STARTUP = 1, + BBR2_PHASE_DRAIN = 2, + BBR2_PHASE_PROBE_RTT = 3, + BBR2_PHASE_PROBE_BW_UP = 4, + BBR2_PHASE_PROBE_BW_DOWN = 5, + BBR2_PHASE_PROBE_BW_CRUISE = 6, + BBR2_PHASE_PROBE_BW_REFILL = 7 +}; + +struct tcp_bbr2_info { + /* u64 bw: bandwidth (app throughput) estimate in Byte per sec: */ + __u32 bbr_bw_lsb; /* lower 32 bits of bw */ + __u32 bbr_bw_msb; /* upper 32 bits of bw */ + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ + __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ + __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ + __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ + __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ + __u8 bbr_mode; /* current bbr_mode in state machine */ + __u8 bbr_phase; /* current state machine phase */ + __u8 unused1; /* alignment padding; not used yet */ + __u8 bbr_version; /* MUST be at this offset in struct */ + __u32 bbr_inflight_lo; /* lower/short-term data volume bound */ + __u32 bbr_inflight_hi; /* higher/long-term data volume bound */ + __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ +}; + union tcp_cc_info { struct tcpvegas_info vegas; struct tcp_dctcp_info dctcp; struct tcp_bbr_info bbr; + struct tcp_bbr2_info bbr2; }; #endif /* _UAPI_INET_DIAG_H_ */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 2dfb12230f08..b6bec331a82e 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -678,6 +678,24 @@ config TCP_CONG_BBR AQM schemes that do not provide a delay signal. It requires the fq ("Fair Queue") pacing packet scheduler. +config TCP_CONG_BBR2 + tristate "BBR2 TCP" + default n + help + + BBR2 TCP congestion control is a model-based congestion control + algorithm that aims to maximize network utilization, keep queues and + retransmit rates low, and to be able to coexist with Reno/CUBIC in + common scenarios. It builds an explicit model of the network path. It + tolerates a targeted degree of random packet loss and delay that are + unrelated to congestion. It can operate over LAN, WAN, cellular, wifi, + or cable modem links, and can use DCTCP-L4S-style ECN signals. It can + coexist with flows that use loss-based congestion control, and can + operate with shallow buffers, deep buffers, bufferbloat, policers, or + AQM schemes that do not provide a delay signal. It requires pacing, + using either TCP internal pacing or the fq ("Fair Queue") pacing packet + scheduler. 
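+
+	  For example, once built you can make BBR2 the system-wide default
+	  with "sysctl net.ipv4.tcp_congestion_control=bbr2", or request it
+	  for a single socket via
+	  setsockopt(IPPROTO_TCP, TCP_CONGESTION, "bbr2").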
+ choice prompt "Default TCP congestion control" default DEFAULT_CUBIC @@ -715,6 +733,9 @@ choice config DEFAULT_BBR bool "BBR" if TCP_CONG_BBR=y + config DEFAULT_BBR2 + bool "BBR2" if TCP_CONG_BBR2=y + config DEFAULT_RENO bool "Reno" endchoice @@ -739,6 +760,7 @@ config DEFAULT_TCP_CONG default "dctcp" if DEFAULT_DCTCP default "cdg" if DEFAULT_CDG default "bbr" if DEFAULT_BBR + default "bbr2" if DEFAULT_BBR2 default "cubic" config TCP_MD5SIG diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index af7d2cf490fb..e7a86a50838a 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o +obj-$(CONFIG_TCP_CONG_BBR2) += tcp_bbr2.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 33f559f491c8..e9e8040d6491 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3191,6 +3191,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; tp->rcv_ooopack = 0; + tp->fast_ack_mode = 0; /* Clean up fastopen related fields */ diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index d2c470524e58..af08fb3cb139 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -294,26 +294,40 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) sk->sk_pacing_rate = rate; } -/* override sysctl_tcp_min_tso_segs */ static u32 bbr_min_tso_segs(struct sock *sk) { return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; } +/* Return the number of segments BBR would like in a TSO/GSO skb, given + * a particular max gso size as a constraint. + */ +static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, + u32 gso_max_size) +{ + u32 segs; + u64 bytes; + + /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ + bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; + + bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); + segs = max_t(u32, div_u64(bytes, mss_now), bbr_min_tso_segs(sk)); + return segs; +} + +/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ +static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) +{ + return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); +} + +/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ static u32 bbr_tso_segs_goal(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - u32 segs, bytes; - - /* Sort of tcp_tso_autosize() but ignoring - * driver provided sk_gso_max_size. 
- */ - bytes = min_t(unsigned long, - sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), - GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); - segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); - return min(segs, 0x7FU); + return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); } /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ @@ -1149,7 +1163,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { .undo_cwnd = bbr_undo_cwnd, .cwnd_event = bbr_cwnd_event, .ssthresh = bbr_ssthresh, - .min_tso_segs = bbr_min_tso_segs, + .tso_segs = bbr_tso_segs, .get_info = bbr_get_info, .set_state = bbr_set_state, }; diff --git a/net/ipv4/tcp_bbr2.c b/net/ipv4/tcp_bbr2.c new file mode 100644 index 000000000000..85f8052144d1 --- /dev/null +++ b/net/ipv4/tcp_bbr2.c @@ -0,0 +1,2674 @@ +/* BBR (Bottleneck Bandwidth and RTT) congestion control, v2 + * + * BBRv2 is a model-based congestion control algorithm that aims for low + * queues, low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model + * of the network path, it uses measurements of bandwidth and RTT, as well as + * (if they occur) packet loss and/or DCTCP/L4S-style ECN signals. Note that + * although it can use ECN or loss signals explicitly, it does not require + * either; it can bound its in-flight data based on its estimate of the BDP. + * + * The model has both higher and lower bounds for the operating range: + * lo: bw_lo, inflight_lo: conservative short-term lower bound + * hi: bw_hi, inflight_hi: robust long-term upper bound + * The bandwidth-probing time scale is (a) extended dynamically based on + * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by + * an interactive wall-clock time-scale to be more scalable and responsive + * than Reno and CUBIC. + * + * Here is a state transition diagram for BBR: + * + * | + * V + * +---> STARTUP ----+ + * | | | + * | V | + * | DRAIN ----+ + * | | | + * | V | + * +---> PROBE_BW ----+ + * | ^ | | + * | | | | + * | +----+ | + * | | + * +---- PROBE_RTT <--+ + * + * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. + * When it estimates the pipe is full, it enters DRAIN to drain the queue. + * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. + * A long-lived BBR flow spends the vast majority of its time remaining + * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth + * in a fair manner, with a small, bounded queue. *If* a flow has been + * continuously sending for the entire min_rtt window, and hasn't seen an RTT + * sample that matches or decreases its min_rtt estimate for 10 seconds, then + * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe + * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if + * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; + * otherwise we enter STARTUP to try to fill the pipe. + * + * BBR is described in detail in: + * "BBR: Congestion-Based Congestion Control", + * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, + * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. + * + * There is a public e-mail list for discussing BBR development and testing: + * https://groups.google.com/forum/#!forum/bbr-dev + * + * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled, + * otherwise TCP stack falls back to an internal pacing using one high + * resolution timer per TCP socket and may use more resources. 
+ */ +#include +#include +#include +#include +#include + +#include "tcp_dctcp.h" + +/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. + * Since the minimum window is >=4 packets, the lower bound isn't + * an issue. The upper bound isn't an issue with existing technologies. + */ +#define BW_SCALE 24 +#define BW_UNIT (1 << BW_SCALE) + +#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */ +#define BBR_UNIT (1 << BBR_SCALE) + +#define FLAG_DEBUG_VERBOSE 0x1 /* Verbose debugging messages */ +#define FLAG_DEBUG_LOOPBACK 0x2 /* Do NOT skip loopback addr */ + +#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ + +/* BBR has the following modes for deciding how fast to send: */ +enum bbr_mode { + BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ + BBR_DRAIN, /* drain any queue created during startup */ + BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ +}; + +/* How does the incoming ACK stream relate to our bandwidth probing? */ +enum bbr_ack_phase { + BBR_ACKS_INIT, /* not probing; not getting probe feedback */ + BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ + BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ + BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ + BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ +}; + +/* BBR congestion control block */ +struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ + u32 probe_rtt_min_us; /* min RTT in bbr_probe_rtt_win_ms window */ + u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ + u64 cycle_mstamp; /* time of this cycle phase start */ + u32 mode:3, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ + packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ + ce_state:1, /* If most recent data has CE bit set */ + bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ + try_fast_path:1, /* can we take fast path? */ + unused2:11, + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ + cycle_idx:3, /* current index in pacing_gain cycle array */ + has_seen_rtt:1; /* have we seen an RTT sample yet? */ + u32 pacing_gain:11, /* current gain for setting pacing rate */ + cwnd_gain:11, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? 
*/ + full_bw_cnt:2, /* number of rounds without large bw gains */ + init_cwnd:7; /* initial cwnd */ + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + + /* For tracking ACK aggregation: */ + u64 ack_epoch_mstamp; /* start of ACK sampling epoch */ + u16 extra_acked[2]; /* max excess data ACKed in epoch */ + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ + /* BBR v2 state: */ + unused1:2, + startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ + loss_in_cycle:1, /* packet loss in this cycle? */ + ecn_in_cycle:1; /* ECN in this cycle? */ + u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ + u32 undo_bw_lo; /* bw_lo before latest losses */ + u32 undo_inflight_lo; /* inflight_lo before latest losses */ + u32 undo_inflight_hi; /* inflight_hi before latest losses */ + u32 bw_latest; /* max delivered bw in last round trip */ + u32 bw_lo; /* lower bound on sending bandwidth */ + u32 bw_hi[2]; /* upper bound of sending bandwidth range*/ + u32 inflight_latest; /* max delivered data in last round trip */ + u32 inflight_lo; /* lower bound of inflight data range */ + u32 inflight_hi; /* upper bound of inflight data range */ + u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ + u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ + u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ + u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ + ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ + bw_probe_samples:1, /* rate samples reflect bw probing? */ + prev_probe_too_high:1, /* did last PROBE_UP go too high? */ + stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ + rounds_since_probe:8, /* packet-timed rounds since probed bw */ + loss_round_start:1, /* loss_round_delivered round trip? */ + loss_in_round:1, /* loss marked in this round trip? */ + ecn_in_round:1, /* ECN marked in this round trip? */ + ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ + loss_events_in_round:4,/* losses in STARTUP round */ + initialized:1; /* has bbr_init() been called? */ + u32 alpha_last_delivered; /* tp->delivered at alpha update */ + u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ + + /* Params configurable using setsockopt. Refer to correspoding + * module param for detailed description of params. 
+ */ + struct bbr_params { + u32 high_gain:11, /* max allowed value: 2047 */ + drain_gain:10, /* max allowed value: 1023 */ + cwnd_gain:11; /* max allowed value: 2047 */ + u32 cwnd_min_target:4, /* max allowed value: 15 */ + min_rtt_win_sec:5, /* max allowed value: 31 */ + probe_rtt_mode_ms:9, /* max allowed value: 511 */ + full_bw_cnt:3, /* max allowed value: 7 */ + cwnd_tso_budget:1, /* allowed values: {0, 1} */ + unused3:6, + drain_to_target:1, /* boolean */ + precise_ece_ack:1, /* boolean */ + extra_acked_in_startup:1, /* allowed values: {0, 1} */ + fast_path:1; /* boolean */ + u32 full_bw_thresh:10, /* max allowed value: 1023 */ + startup_cwnd_gain:11, /* max allowed value: 2047 */ + bw_probe_pif_gain:9, /* max allowed value: 511 */ + usage_based_cwnd:1, /* boolean */ + unused2:1; + u16 probe_rtt_win_ms:14, /* max allowed value: 16383 */ + refill_add_inc:2; /* max allowed value: 3 */ + u16 extra_acked_gain:11, /* max allowed value: 2047 */ + extra_acked_win_rtts:5; /* max allowed value: 31*/ + u16 pacing_gain[CYCLE_LEN]; /* max allowed value: 1023 */ + /* Mostly BBR v2 parameters below here: */ + u32 ecn_alpha_gain:8, /* max allowed value: 255 */ + ecn_factor:8, /* max allowed value: 255 */ + ecn_thresh:8, /* max allowed value: 255 */ + beta:8; /* max allowed value: 255 */ + u32 ecn_max_rtt_us:19, /* max allowed value: 524287 */ + bw_probe_reno_gain:9, /* max allowed value: 511 */ + full_loss_cnt:4; /* max allowed value: 15 */ + u32 probe_rtt_cwnd_gain:8, /* max allowed value: 255 */ + inflight_headroom:8, /* max allowed value: 255 */ + loss_thresh:8, /* max allowed value: 255 */ + bw_probe_max_rounds:8; /* max allowed value: 255 */ + u32 bw_probe_rand_rounds:4, /* max allowed value: 15 */ + bw_probe_base_us:26, /* usecs: 0..2^26-1 (67 secs) */ + full_ecn_cnt:2; /* max allowed value: 3 */ + u32 bw_probe_rand_us:26, /* usecs: 0..2^26-1 (67 secs) */ + undo:1, /* boolean */ + tso_rtt_shift:4, /* max allowed value: 15 */ + unused5:1; + u32 ecn_reprobe_gain:9, /* max allowed value: 511 */ + unused1:14, + ecn_alpha_init:9; /* max allowed value: 256 */ + } params; + + struct { + u32 snd_isn; /* Initial sequence number */ + u32 rs_bw; /* last valid rate sample bw */ + u32 target_cwnd; /* target cwnd, based on BDP */ + u8 undo:1, /* Undo even happened but not yet logged */ + unused:7; + char event; /* single-letter event debug codes */ + u16 unused2; + } debug; +}; + +struct bbr_context { + u32 sample_bw; + u32 target_cwnd; + u32 log:1; +}; + +/* Window length of min_rtt filter (in sec). Max allowed value is 31 (0x1F) */ +static u32 bbr_min_rtt_win_sec = 10; +/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode. + * Max allowed value is 511 (0x1FF). + */ +static u32 bbr_probe_rtt_mode_ms = 200; +/* Window length of probe_rtt_min_us filter (in ms), and consequently the + * typical interval between PROBE_RTT mode entries. + * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC + */ +static u32 bbr_probe_rtt_win_ms = 5000; +/* Skip TSO below the following bandwidth (bits/sec): */ +static int bbr_min_tso_rate = 1200000; + +/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting + * in bigger TSO bursts. By default we cut the RTT-based allowance in half + * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance + * is below 1500 bytes after 6 * ~500 usec = 3ms. 
+ */ +static u32 bbr_tso_rtt_shift = 9; /* halve allowance per 2^9 usecs, 512us */ + +/* Select cwnd TSO budget approach: + * 0: padding + * 1: flooring + */ +static uint bbr_cwnd_tso_budget = 1; + +/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. + * In order to help drive the network toward lower queues and low latency while + * maintaining high utilization, the average pacing rate aims to be slightly + * lower than the estimated bandwidth. This is an important aspect of the + * design. + */ +static const int bbr_pacing_margin_percent = 1; + +/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would. Max allowed value is 2047 (0x7FF). + */ +static int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +/* The gain for deriving startup cwnd. Max allowed value is 2047 (0x7FF). */ +static int bbr_startup_cwnd_gain = BBR_UNIT * 2885 / 1000 + 1; +/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round. Max allowed value + * is 1023 (0x3FF). + */ +static int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs. + * Max allowed value is 2047 (0x7FF). + */ +static int bbr_cwnd_gain = BBR_UNIT * 2; +/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw. + * Max allowed value for each element is 1023 (0x3FF). + */ +enum bbr_pacing_gain_phase { + BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ + BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ + BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ + BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ +}; +static int bbr_pacing_gain[] = { + BBR_UNIT * 5 / 4, /* probe for more available bw */ + BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ + BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ + BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ +}; + +/* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet + * needs at least 4 packets in flight. Max allowed value is 15 (0xF). + */ +static u32 bbr_cwnd_min_target = 4; + +/* Cwnd to BDP proportion in PROBE_RTT mode scaled by BBR_UNIT. Default: 50%. + * Use 0 to disable. Max allowed value is 255. + */ +static u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; + +/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ +/* If bw has increased significantly (1.25x), there may be more bw available. + * Max allowed value is 1023 (0x3FF). + */ +static u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; +/* But after 3 rounds w/o significant bw growth, estimate pipe is full. + * Max allowed value is 7 (0x7). + */ +static u32 bbr_full_bw_cnt = 3; + +static u32 bbr_flags; /* Debugging related stuff */ + +/* Whether to debug using printk. + */ +static bool bbr_debug_with_printk; + +/* Whether to debug using ftrace event tcp:tcp_bbr_event. + * Ignored when bbr_debug_with_printk is set. + */ +static bool bbr_debug_ftrace; + +/* Experiment: each cycle, try to hold sub-unity gain until inflight <= BDP. 
*/ +static bool bbr_drain_to_target = true; /* default: enabled */ + +/* Experiment: Flags to control BBR with ECN behavior. + */ +static bool bbr_precise_ece_ack = true; /* default: enabled */ + +/* The max rwin scaling shift factor is 14 (RFC 1323), so the max sane rwin is + * (2^(16+14) B)/(1024 B/packet) = 1M packets. + */ +static u32 bbr_cwnd_warn_val = 1U << 20; + +static u16 bbr_debug_port_mask; + +/* BBR module parameters. These are module parameters only in Google prod. + * Upstream these are intentionally not module parameters. + */ +static int bbr_pacing_gain_size = CYCLE_LEN; + +/* Gain factor for adding extra_acked to target cwnd: */ +static int bbr_extra_acked_gain = 256; + +/* Window length of extra_acked window. Max allowed val is 31. */ +static u32 bbr_extra_acked_win_rtts = 5; + +/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */ +static u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + +/* Time period for clamping cwnd increment due to ack aggregation */ +static u32 bbr_extra_acked_max_us = 100 * 1000; + +/* Use extra acked in startup ? + * 0: disabled + * 1: use latest extra_acked value from 1-2 rtt in startup + */ +static int bbr_extra_acked_in_startup = 1; /* default: enabled */ + +/* Experiment: don't grow cwnd beyond twice of what we just probed. */ +static bool bbr_usage_based_cwnd; /* default: disabled */ + +/* For lab testing, researchers can enable BBRv2 ECN support with this flag, + * when they know that any ECN marks that the connections experience will be + * DCTCP/L4S-style ECN marks, rather than RFC3168 ECN marks. + * TODO(ncardwell): Production use of the BBRv2 ECN functionality depends on + * negotiation or configuration that is outside the scope of the BBRv2 + * alpha release. + */ +static bool bbr_ecn_enable = false; + +module_param_named(min_tso_rate, bbr_min_tso_rate, int, 0644); +module_param_named(tso_rtt_shift, bbr_tso_rtt_shift, int, 0644); +module_param_named(high_gain, bbr_high_gain, int, 0644); +module_param_named(drain_gain, bbr_drain_gain, int, 0644); +module_param_named(startup_cwnd_gain, bbr_startup_cwnd_gain, int, 0644); +module_param_named(cwnd_gain, bbr_cwnd_gain, int, 0644); +module_param_array_named(pacing_gain, bbr_pacing_gain, int, + &bbr_pacing_gain_size, 0644); +module_param_named(cwnd_min_target, bbr_cwnd_min_target, uint, 0644); +module_param_named(probe_rtt_cwnd_gain, + bbr_probe_rtt_cwnd_gain, uint, 0664); +module_param_named(cwnd_warn_val, bbr_cwnd_warn_val, uint, 0664); +module_param_named(debug_port_mask, bbr_debug_port_mask, ushort, 0644); +module_param_named(flags, bbr_flags, uint, 0644); +module_param_named(debug_ftrace, bbr_debug_ftrace, bool, 0644); +module_param_named(debug_with_printk, bbr_debug_with_printk, bool, 0644); +module_param_named(min_rtt_win_sec, bbr_min_rtt_win_sec, uint, 0644); +module_param_named(probe_rtt_mode_ms, bbr_probe_rtt_mode_ms, uint, 0644); +module_param_named(probe_rtt_win_ms, bbr_probe_rtt_win_ms, uint, 0644); +module_param_named(full_bw_thresh, bbr_full_bw_thresh, uint, 0644); +module_param_named(full_bw_cnt, bbr_full_bw_cnt, uint, 0644); +module_param_named(cwnd_tso_bduget, bbr_cwnd_tso_budget, uint, 0664); +module_param_named(extra_acked_gain, bbr_extra_acked_gain, int, 0664); +module_param_named(extra_acked_win_rtts, + bbr_extra_acked_win_rtts, uint, 0664); +module_param_named(extra_acked_max_us, + bbr_extra_acked_max_us, uint, 0664); +module_param_named(ack_epoch_acked_reset_thresh, + bbr_ack_epoch_acked_reset_thresh, uint, 0664); 
+module_param_named(drain_to_target, bbr_drain_to_target, bool, 0664); +module_param_named(precise_ece_ack, bbr_precise_ece_ack, bool, 0664); +module_param_named(extra_acked_in_startup, + bbr_extra_acked_in_startup, int, 0664); +module_param_named(usage_based_cwnd, bbr_usage_based_cwnd, bool, 0664); +module_param_named(ecn_enable, bbr_ecn_enable, bool, 0664); + +static void bbr2_exit_probe_rtt(struct sock *sk); +static void bbr2_reset_congestion_signals(struct sock *sk); + +static void bbr_check_probe_rtt_done(struct sock *sk); + +/* Do we estimate that STARTUP filled the pipe? */ +static bool bbr_full_bw_reached(const struct sock *sk) +{ + const struct bbr *bbr = inet_csk_ca(sk); + + return bbr->full_bw_reached; +} + +/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ +static u32 bbr_max_bw(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return max(bbr->bw_hi[0], bbr->bw_hi[1]); +} + +/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ +static u32 bbr_bw(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return min(bbr_max_bw(sk), bbr->bw_lo); +} + +/* Return maximum extra acked in past k-2k round trips, + * where k = bbr_extra_acked_win_rtts. + */ +static u16 bbr_extra_acked(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return max(bbr->extra_acked[0], bbr->extra_acked[1]); +} + +/* Return rate in bytes per second, optionally with a gain. + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. + */ +static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, + int margin) +{ + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; + rate *= USEC_PER_SEC / 100 * (100 - margin); + rate >>= BW_SCALE; + rate = max(rate, 1ULL); + return rate; +} + +static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) +{ + return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); +} + +static u64 bbr_rate_kbps(struct sock *sk, u64 rate) +{ + rate = bbr_bw_bytes_per_sec(sk, rate); + rate *= 8; + do_div(rate, 1000); + return rate; +} + +static u32 bbr_tso_segs_goal(struct sock *sk); +static void bbr_debug(struct sock *sk, u32 acked, + const struct rate_sample *rs, struct bbr_context *ctx) +{ + static const char ca_states[] = { + [TCP_CA_Open] = 'O', + [TCP_CA_Disorder] = 'D', + [TCP_CA_CWR] = 'C', + [TCP_CA_Recovery] = 'R', + [TCP_CA_Loss] = 'L', + }; + static const char mode[] = { + 'G', /* Growing - BBR_STARTUP */ + 'D', /* Drain - BBR_DRAIN */ + 'W', /* Window - BBR_PROBE_BW */ + 'M', /* Min RTT - BBR_PROBE_RTT */ + }; + static const char ack_phase[] = { /* bbr_ack_phase strings */ + 'I', /* BBR_ACKS_INIT - 'Init' */ + 'R', /* BBR_ACKS_REFILLING - 'Refilling' */ + 'B', /* BBR_ACKS_PROBE_STARTING - 'Before' */ + 'F', /* BBR_ACKS_PROBE_FEEDBACK - 'Feedback' */ + 'A', /* BBR_ACKS_PROBE_STOPPING - 'After' */ + }; + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + const u32 una = tp->snd_una - bbr->debug.snd_isn; + const u32 fack = tcp_highest_sack_seq(tp); + const u16 dport = ntohs(inet_sk(sk)->inet_dport); + bool is_port_match = (bbr_debug_port_mask && + ((dport & bbr_debug_port_mask) == 0)); + char debugmsg[320]; + + if (sk->sk_state == TCP_SYN_SENT) + return; /* no bbr_init() yet if SYN retransmit -> CA_Loss */ + + if (!tp->snd_cwnd || tp->snd_cwnd > bbr_cwnd_warn_val) { + char addr[INET6_ADDRSTRLEN + 10] = { 0 }; + + if (sk->sk_family == 
AF_INET) + snprintf(addr, sizeof(addr), "%pI4:%u", + &inet_sk(sk)->inet_daddr, dport); + else if (sk->sk_family == AF_INET6) + snprintf(addr, sizeof(addr), "%pI6:%u", + &sk->sk_v6_daddr, dport); + + WARN_ONCE(1, + "BBR %s cwnd alert: %u " + "snd_una: %u ca: %d pacing_gain: %u cwnd_gain: %u " + "bw: %u rtt: %u min_rtt: %u " + "acked: %u tso_segs: %u " + "bw: %d %ld %d pif: %u\n", + addr, tp->snd_cwnd, + una, inet_csk(sk)->icsk_ca_state, + bbr->pacing_gain, bbr->cwnd_gain, + bbr_max_bw(sk), (tp->srtt_us >> 3), bbr->min_rtt_us, + acked, bbr_tso_segs_goal(sk), + rs->delivered, rs->interval_us, rs->is_retrans, + tcp_packets_in_flight(tp)); + } + + if (likely(!bbr_debug_with_printk && !bbr_debug_ftrace)) + return; + + if (!sock_flag(sk, SOCK_DBG) && !is_port_match) + return; + + if (!ctx->log && !tp->app_limited && !(bbr_flags & FLAG_DEBUG_VERBOSE)) + return; + + if (ipv4_is_loopback(inet_sk(sk)->inet_daddr) && + !(bbr_flags & FLAG_DEBUG_LOOPBACK)) + return; + + snprintf(debugmsg, sizeof(debugmsg) - 1, + "BBR %pI4:%-5u %5u,%03u:%-7u %c " + "%c %2u br %2u cr %2d rtt %5ld d %2d i %5ld mrtt %d %cbw %llu " + "bw %llu lb %llu ib %llu qb %llu " + "a %u if %2u %c %c dl %u l %u al %u # %u t %u %c %c " + "lr %d er %d ea %d bwl %lld il %d ih %d c %d " + "v %d %c %u %c %s\n", + &inet_sk(sk)->inet_daddr, dport, + una / 1000, una % 1000, fack - tp->snd_una, + ca_states[inet_csk(sk)->icsk_ca_state], + bbr->debug.undo ? '@' : mode[bbr->mode], + tp->snd_cwnd, + bbr_extra_acked(sk), /* br (legacy): extra_acked */ + rs->tx_in_flight, /* cr (legacy): tx_inflight */ + rs->rtt_us, + rs->delivered, + rs->interval_us, + bbr->min_rtt_us, + rs->is_app_limited ? '_' : 'l', + bbr_rate_kbps(sk, ctx->sample_bw), /* lbw: latest sample bw */ + bbr_rate_kbps(sk, bbr_max_bw(sk)), /* bw: max bw */ + 0ULL, /* lb: [obsolete] */ + 0ULL, /* ib: [obsolete] */ + div_u64((u64)sk->sk_pacing_rate * 8, 1000), + acked, + tcp_packets_in_flight(tp), + rs->is_ack_delayed ? 'd' : '.', + bbr->round_start ? '*' : '.', + tp->delivered, tp->lost, + tp->app_limited, + 0, /* #: [obsolete] */ + ctx->target_cwnd, + tp->reord_seen ? 'r' : '.', /* r: reordering seen? */ + ca_states[bbr->prev_ca_state], + (rs->lost + rs->delivered) > 0 ? + (1000 * rs->lost / + (rs->lost + rs->delivered)) : 0, /* lr: loss rate x1000 */ + (rs->delivered) > 0 ? + (1000 * rs->delivered_ce / + (rs->delivered)) : 0, /* er: ECN rate x1000 */ + 1000 * bbr->ecn_alpha >> BBR_SCALE, /* ea: ECN alpha x1000 */ + bbr->bw_lo == ~0U ? + -1 : (s64)bbr_rate_kbps(sk, bbr->bw_lo), /* bwl */ + bbr->inflight_lo, /* il */ + bbr->inflight_hi, /* ih */ + bbr->bw_probe_up_cnt, /* c */ + 2, /* v: version */ + bbr->debug.event, + bbr->cycle_idx, + ack_phase[bbr->ack_phase], + bbr->bw_probe_samples ? "Y" : "N"); + debugmsg[sizeof(debugmsg) - 1] = 0; + + /* printk takes a higher precedence. */ + if (bbr_debug_with_printk) + printk(KERN_DEBUG "%s", debugmsg); + + if (unlikely(bbr->debug.undo)) + bbr->debug.undo = 0; +} + +/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) +{ + u64 rate = bw; + + rate = bbr_rate_bytes_per_sec(sk, rate, gain, + bbr_pacing_margin_percent); + rate = min_t(u64, rate, sk->sk_max_pacing_rate); + return rate; +} + +/* Initialize pacing rate to: high_gain * init_cwnd / RTT. 
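+ * As a rough example, with an initial cwnd of 10 packets, an MSS of about
+ * 1448 bytes and a SYN RTT of 10 ms, this starts pacing near
+ * 2.89 * 10 * 1448 B / 10 ms ~= 4.2 MB/s (roughly 33 Mbit/s); with no RTT
+ * sample at all, a nominal 1 ms RTT is assumed instead.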
*/ +static void bbr_init_pacing_rate_from_rtt(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw; + u32 rtt_us; + + if (tp->srtt_us) { /* any RTT sample yet? */ + rtt_us = max(tp->srtt_us >> 3, 1U); + bbr->has_seen_rtt = 1; + } else { /* no RTT sample yet */ + rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ + } + bw = (u64)tp->snd_cwnd * BW_UNIT; + do_div(bw, rtt_us); + sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr->params.high_gain); +} + +/* Pace using current bw estimate and a gain factor. */ +static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain); + + if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) + bbr_init_pacing_rate_from_rtt(sk); + if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) + sk->sk_pacing_rate = rate; +} + +static u32 bbr_min_tso_segs(struct sock *sk) +{ + return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; +} + +/* Return the number of segments BBR would like in a TSO/GSO skb, given + * a particular max gso size as a constraint. + */ +static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, + u32 gso_max_size) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 segs, r; + u64 bytes; + + /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ + bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; + + /* Budget a TSO/GSO burst size allowance based on min_rtt. For every + * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. + * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) + */ + if (bbr->params.tso_rtt_shift) { + r = bbr->min_rtt_us >> bbr->params.tso_rtt_shift; + if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ + bytes += GSO_MAX_SIZE >> r; + } + + bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); + segs = max_t(u32, div_u64(bytes, mss_now), bbr_min_tso_segs(sk)); + return segs; +} + +/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ +static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) +{ + return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); +} + +/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ +static u32 bbr_tso_segs_goal(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); +} + +/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +static void bbr_save_cwnd(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) + bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ + else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ + bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); +} + +static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (event == CA_EVENT_TX_START && tp->app_limited) { + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; + /* Avoid pointless buffer overflows: pace at est. bw if we don't + * need more speed (we're restarting from idle and app-limited). 
+ */ + if (bbr->mode == BBR_PROBE_BW) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); + } else if ((event == CA_EVENT_ECN_IS_CE || + event == CA_EVENT_ECN_NO_CE) && + bbr_ecn_enable && + bbr->params.precise_ece_ack) { + u32 state = bbr->ce_state; + dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); + bbr->ce_state = state; + if (tp->fast_ack_mode == 2 && event == CA_EVENT_ECN_IS_CE) + tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); + } +} + +/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: + * + * bdp = ceil(bw * min_rtt * gain) + * + * The key factor, gain, controls the amount of queue. While a small gain + * builds a smaller queue, it becomes more vulnerable to noise in RTT + * measurements (e.g., delayed ACKs or other ACK compression effects). This + * noise may cause BBR to under-estimate the rate. + */ +static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 bdp; + u64 w; + + /* If we've never had a valid RTT sample, cap cwnd at the initial + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which + * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ + return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + + /* Apply a gain to the given value, remove the BW_SCALE shift, and + * round the value up to avoid a negative feedback loop. + */ + bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; + + return bdp; +} + +/* To achieve full performance in high-speed paths, we budget enough cwnd to + * fit full-sized skbs in-flight on both end hosts to fully utilize the path: + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine + * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because + * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ +static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 tso_segs_goal; + + tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + + /* Allow enough full-sized skbs in flight to utilize end systems. */ + if (bbr->params.cwnd_tso_budget == 1) { + cwnd = max_t(u32, cwnd, tso_segs_goal); + cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); + } else { + cwnd += tso_segs_goal; + cwnd = (cwnd + 1) & ~1U; + } + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ + if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +} + +/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */ +static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) +{ + u32 inflight; + + inflight = bbr_bdp(sk, bw, gain); + inflight = bbr_quantization_budget(sk, inflight); + + return inflight; +} + +/* With pacing at lower layers, there's often less data "in the network" than + * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq), + * we often have several skbs queued in the pacing layer with a pre-scheduled + * earliest departure time (EDT). 
BBR adapts its pacing rate based on the + * inflight level that it estimates has already been "baked in" by previous + * departure time decisions. We calculate a rough estimate of the number of our + * packets that might be in the network at the earliest departure time for the + * next skb scheduled: + * in_network_at_edt = inflight_at_edt - (EDT - now) * bw + * If we're increasing inflight, then we want to know if the transmit of the + * EDT skb will push inflight above the target, so inflight_at_edt includes + * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight, + * then estimate if inflight will sink too low just before the EDT transmit. + */ +static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 now_ns, edt_ns, interval_us; + u32 interval_delivered, inflight_at_edt; + + now_ns = tp->tcp_clock_cache; + edt_ns = max(tp->tcp_wstamp_ns, now_ns); + interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC); + interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE; + inflight_at_edt = inflight_now; + if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */ + inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */ + if (interval_delivered >= inflight_at_edt) + return 0; + return inflight_at_edt - interval_delivered; +} + +/* Find the cwnd increment based on estimate of ack aggregation */ +static u32 bbr_ack_aggregation_cwnd(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 max_aggr_cwnd, aggr_cwnd = 0; + + if (bbr->params.extra_acked_gain && + (bbr_full_bw_reached(sk) || bbr->params.extra_acked_in_startup)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; + aggr_cwnd = (bbr->params.extra_acked_gain * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } + + return aggr_cwnd; +} + +/* Returns the cwnd for PROBE_RTT mode. */ +static u32 bbr_probe_rtt_cwnd(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->params.probe_rtt_cwnd_gain == 0) + return bbr->params.cwnd_min_target; + return max_t(u32, bbr->params.cwnd_min_target, + bbr_bdp(sk, bbr_bw(sk), bbr->params.probe_rtt_cwnd_gain)); +} + +/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. + */ +static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + u32 acked, u32 bw, int gain, u32 cwnd, + struct bbr_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 target_cwnd = 0, prev_cwnd = tp->snd_cwnd, max_probe; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems + * due to aggregation (of data and/or ACKs) visible in the ACK stream. + */ + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + + /* If we're below target cwnd, slow start cwnd toward target cwnd. */ + bbr->debug.target_cwnd = target_cwnd; + + /* Update cwnd and enable fast path if cwnd reaches target_cwnd. 
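+ * Below: once the pipe is estimated to be full we grow cwnd by the newly
+ * ACKed packets but clamp it at target_cwnd; before that we keep
+ * slow-starting while cwnd is under target_cwnd (or under 2 * init_cwnd),
+ * and otherwise we hold cwnd and allow the fast path on later ACKs.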
*/ + bbr->try_fast_path = 0; + if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ + cwnd += acked; + if (cwnd >= target_cwnd) { + cwnd = target_cwnd; + bbr->try_fast_path = 1; + } + } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { + cwnd += acked; + } else { + bbr->try_fast_path = 1; + } + + /* When growing cwnd, don't grow beyond twice what we just probed. */ + if (bbr->params.usage_based_cwnd) { + max_probe = max(2 * tp->max_packets_out, tp->snd_cwnd); + cwnd = min(cwnd, max_probe); + } + + cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); +done: + tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ + tp->snd_cwnd = min_t(u32, tp->snd_cwnd, bbr_probe_rtt_cwnd(sk)); + + ctx->target_cwnd = target_cwnd; + ctx->log = (tp->snd_cwnd != prev_cwnd); +} + +/* See if we have reached next round trip */ +static void bbr_update_round_start(struct sock *sk, + const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->round_start = 0; + + /* See if we've reached the next RTT */ + if (rs->interval_us > 0 && + !before(rs->prior_delivered, bbr->next_rtt_delivered)) { + bbr->next_rtt_delivered = tp->delivered; + bbr->round_start = 1; + } +} + +/* Calculate the bandwidth based on how fast packets are delivered */ +static void bbr_calculate_bw_sample(struct sock *sk, + const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. + * Round up to allow growth at low rates, even with integer division. + */ + if (rs->interval_us > 0) { + if (WARN_ONCE(rs->delivered < 0, + "negative delivered: %d interval_us: %ld\n", + rs->delivered, rs->interval_us)) + return; + + bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } + + ctx->sample_bw = bw; + bbr->debug.rs_bw = bw; +} + +/* Estimates the windowed max degree of ack aggregation. + * This is used to provision extra in-flight data to keep sending during + * inter-ACK silences. + * + * Degree of ack aggregation is estimated as extra data acked beyond expected. + * + * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval" + * cwnd += max_extra_acked + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round + * trips for non-startup phase, and 1-2 round trips for startup. + */ +static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +{ + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 extra_acked_win_rtts_thresh = bbr->params.extra_acked_win_rtts; + + if (!bbr->params.extra_acked_gain || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); + if (bbr->params.extra_acked_in_startup && + !bbr_full_bw_reached(sk)) + extra_acked_win_rtts_thresh = 1; + if (bbr->extra_acked_win_rtts >= + extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? 
+ 0 : 1; + bbr->extra_acked[bbr->extra_acked_win_idx] = 0; + } + } + + /* Compute how many packets we expected to be delivered over epoch. */ + epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp, + bbr->ack_epoch_mstamp); + expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT; + + /* Reset the aggregation epoch if ACK rate is below expected rate or + * significantly large no. of ack received since epoch (potentially + * quite old epoch). + */ + if (bbr->ack_epoch_acked <= expected_acked || + (bbr->ack_epoch_acked + rs->acked_sacked >= + bbr_ack_epoch_acked_reset_thresh)) { + bbr->ack_epoch_acked = 0; + bbr->ack_epoch_mstamp = tp->delivered_mstamp; + expected_acked = 0; + } + + /* Compute excess data delivered, beyond what was expected. */ + bbr->ack_epoch_acked = min_t(u32, 0xFFFFF, + bbr->ack_epoch_acked + rs->acked_sacked); + extra_acked = bbr->ack_epoch_acked - expected_acked; + extra_acked = min(extra_acked, tp->snd_cwnd); + if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx]) + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; +} + +/* Estimate when the pipe is full, using the change in delivery rate: BBR + * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by + * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited + * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the + * higher rwin, 3: we get higher delivery rate samples. Or transient + * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar + * design goal, but uses delay and inter-ACK spacing instead of bandwidth. + */ +static void bbr_check_full_bw_reached(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 bw_thresh; + + if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) + return; + + bw_thresh = (u64)bbr->full_bw * bbr->params.full_bw_thresh >> BBR_SCALE; + if (bbr_max_bw(sk) >= bw_thresh) { + bbr->full_bw = bbr_max_bw(sk); + bbr->full_bw_cnt = 0; + return; + } + ++bbr->full_bw_cnt; + bbr->full_bw_reached = bbr->full_bw_cnt >= bbr->params.full_bw_cnt; +} + +/* If pipe is probably full, drain the queue and then enter steady-state. */ +static bool bbr_check_drain(struct sock *sk, const struct rate_sample *rs, + struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { + bbr->mode = BBR_DRAIN; /* drain queue we created */ + tcp_sk(sk)->snd_ssthresh = + bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); + bbr2_reset_congestion_signals(sk); + } /* fall through to check if in-flight is already small: */ + if (bbr->mode == BBR_DRAIN && + bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= + bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) + return true; /* exiting DRAIN now */ + return false; +} + +static void bbr_check_probe_rtt_done(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (!(bbr->probe_rtt_done_stamp && + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + + bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); + bbr2_exit_probe_rtt(sk); +} + +/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and + * periodically drain the bottleneck queue, to converge to measure the true + * min_rtt (unloaded propagation delay). 
This allows the flows to keep queues + * small (reducing queuing delay and packet loss) and achieve fairness among + * BBR flows. + * + * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, + * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. + * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed + * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and + * re-enter the previous mode. BBR uses 200ms to approximately bound the + * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). + * + * Note that flows need only pay 2% if they are busy sending over the last 10 + * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have + * natural silences or low-rate periods within 10 seconds where the rate is low + * enough for long enough to drain its queue in the bottleneck. We pick up + * these min RTT measurements opportunistically with our min_rtt filter. :-) + */ +static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + bool probe_rtt_expired, min_rtt_expired; + u32 expire; + + /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ + expire = bbr->probe_rtt_min_stamp + + msecs_to_jiffies(bbr->params.probe_rtt_win_ms); + probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && + (rs->rtt_us <= bbr->probe_rtt_min_us || + (probe_rtt_expired && !rs->is_ack_delayed))) { + bbr->probe_rtt_min_us = rs->rtt_us; + bbr->probe_rtt_min_stamp = tcp_jiffies32; + } + /* Track min RTT seen in the min_rtt_win_sec filter window: */ + expire = bbr->min_rtt_stamp + bbr->params.min_rtt_win_sec * HZ; + min_rtt_expired = after(tcp_jiffies32, expire); + if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || + min_rtt_expired) { + bbr->min_rtt_us = bbr->probe_rtt_min_us; + bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + + if (bbr->params.probe_rtt_mode_ms > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; + bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; + bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { + /* Ignore low rate samples during this mode. */ + tp->app_limited = + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
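+ * The timer below is armed only once inflight has fallen to the PROBE_RTT
+ * cwnd; after that both the probe_rtt_mode_ms timeout and one full
+ * packet-timed round at that low inflight must elapse before
+ * bbr_check_probe_rtt_done() lets the flow leave PROBE_RTT.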
*/ + if (!bbr->probe_rtt_done_stamp && + tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + + msecs_to_jiffies(bbr->params.probe_rtt_mode_ms); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { + if (bbr->round_start) + bbr->probe_rtt_round_done = 1; + if (bbr->probe_rtt_round_done) + bbr_check_probe_rtt_done(sk); + } + } + /* Restart after idle ends only once we process a new S/ACK for data */ + if (rs->delivered > 0) + bbr->idle_restart = 0; +} + +static void bbr_update_gains(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + switch (bbr->mode) { + case BBR_STARTUP: + bbr->pacing_gain = bbr->params.high_gain; + bbr->cwnd_gain = bbr->params.startup_cwnd_gain; + break; + case BBR_DRAIN: + bbr->pacing_gain = bbr->params.drain_gain; /* slow, to drain */ + bbr->cwnd_gain = bbr->params.startup_cwnd_gain; /* keep cwnd */ + break; + case BBR_PROBE_BW: + bbr->pacing_gain = bbr->params.pacing_gain[bbr->cycle_idx]; + bbr->cwnd_gain = bbr->params.cwnd_gain; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; + bbr->cwnd_gain = BBR_UNIT; + break; + default: + WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode); + break; + } +} + +static void bbr_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + int i; + + WARN_ON_ONCE(tp->snd_cwnd >= bbr_cwnd_warn_val); + + bbr->initialized = 1; + bbr->params.high_gain = min(0x7FF, bbr_high_gain); + bbr->params.drain_gain = min(0x3FF, bbr_drain_gain); + bbr->params.startup_cwnd_gain = min(0x7FF, bbr_startup_cwnd_gain); + bbr->params.cwnd_gain = min(0x7FF, bbr_cwnd_gain); + bbr->params.cwnd_tso_budget = min(0x1U, bbr_cwnd_tso_budget); + bbr->params.cwnd_min_target = min(0xFU, bbr_cwnd_min_target); + bbr->params.min_rtt_win_sec = min(0x1FU, bbr_min_rtt_win_sec); + bbr->params.probe_rtt_mode_ms = min(0x1FFU, bbr_probe_rtt_mode_ms); + bbr->params.full_bw_cnt = min(0x7U, bbr_full_bw_cnt); + bbr->params.full_bw_thresh = min(0x3FFU, bbr_full_bw_thresh); + bbr->params.extra_acked_gain = min(0x7FF, bbr_extra_acked_gain); + bbr->params.extra_acked_win_rtts = min(0x1FU, bbr_extra_acked_win_rtts); + bbr->params.drain_to_target = bbr_drain_to_target ? 1 : 0; + bbr->params.precise_ece_ack = bbr_precise_ece_ack ? 1 : 0; + bbr->params.extra_acked_in_startup = bbr_extra_acked_in_startup ? 1 : 0; + bbr->params.probe_rtt_cwnd_gain = min(0xFFU, bbr_probe_rtt_cwnd_gain); + bbr->params.probe_rtt_win_ms = + min(0x3FFFU, + min_t(u32, bbr_probe_rtt_win_ms, + bbr->params.min_rtt_win_sec * MSEC_PER_SEC)); + for (i = 0; i < CYCLE_LEN; i++) + bbr->params.pacing_gain[i] = min(0x3FF, bbr_pacing_gain[i]); + bbr->params.usage_based_cwnd = bbr_usage_based_cwnd ? 
1 : 0; + bbr->params.tso_rtt_shift = min(0xFU, bbr_tso_rtt_shift); + + bbr->debug.snd_isn = tp->snd_una; + bbr->debug.target_cwnd = 0; + bbr->debug.undo = 0; + + bbr->init_cwnd = min(0x7FU, tp->snd_cwnd); + bbr->prior_cwnd = tp->prior_cwnd; + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + bbr->next_rtt_delivered = 0; + bbr->prev_ca_state = TCP_CA_Open; + bbr->packet_conservation = 0; + + bbr->probe_rtt_done_stamp = 0; + bbr->probe_rtt_round_done = 0; + bbr->probe_rtt_min_us = tcp_min_rtt(tp); + bbr->probe_rtt_min_stamp = tcp_jiffies32; + bbr->min_rtt_us = tcp_min_rtt(tp); + bbr->min_rtt_stamp = tcp_jiffies32; + + bbr->has_seen_rtt = 0; + bbr_init_pacing_rate_from_rtt(sk); + + bbr->round_start = 0; + bbr->idle_restart = 0; + bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; + bbr->cycle_mstamp = 0; + bbr->cycle_idx = 0; + bbr->mode = BBR_STARTUP; + bbr->debug.rs_bw = 0; + + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = 0; + bbr->extra_acked[0] = 0; + bbr->extra_acked[1] = 0; + + bbr->ce_state = 0; + bbr->prior_rcv_nxt = tp->rcv_nxt; + bbr->try_fast_path = 0; + + cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); +} + +static u32 bbr_sndbuf_expand(struct sock *sk) +{ + /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ + return 3; +} + +/* __________________________________________________________________________ + * + * Functions new to BBR v2 ("bbr") congestion control are below here. + * __________________________________________________________________________ + */ + +/* Incorporate a new bw sample into the current window of our max filter. */ +static void bbr2_take_bw_hi_sample(struct sock *sk, u32 bw) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); +} + +/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ +static void bbr2_advance_bw_hi_filter(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (!bbr->bw_hi[1]) + return; /* no samples in this window; remember old window */ + bbr->bw_hi[0] = bbr->bw_hi[1]; + bbr->bw_hi[1] = 0; +} + +/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ +static u32 bbr2_target_inflight(struct sock *sk) +{ + u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + + return min(bdp, tcp_sk(sk)->snd_cwnd); +} + +static bool bbr2_is_probing_bandwidth(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return (bbr->mode == BBR_STARTUP) || + (bbr->mode == BBR_PROBE_BW && + (bbr->cycle_idx == BBR_BW_PROBE_REFILL || + bbr->cycle_idx == BBR_BW_PROBE_UP)); +} + +/* Has the given amount of time elapsed since we marked the phase start? */ +static bool bbr2_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct bbr *bbr = inet_csk_ca(sk); + + return tcp_stamp_us_delta(tp->tcp_mstamp, + bbr->cycle_mstamp + interval_us) > 0; +} + +static void bbr2_handle_queue_too_high_in_startup(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->full_bw_reached = 1; + bbr->inflight_hi = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +} + +/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. 
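The bw_hi[] helpers above implement a max filter over the last one to two PROBE_BW cycles by keeping two buckets and flipping them at each cycle boundary. A minimal userspace sketch of that idea; the struct, helper names, and the tiny main() are illustrative, not taken from the patch:

#include <stdio.h>

struct max_filter2 {
	unsigned int win[2];	/* win[1] = current cycle, win[0] = previous */
};

/* Incorporate a sample into the current window. */
static void filter_take_sample(struct max_filter2 *f, unsigned int bw)
{
	if (bw > f->win[1])
		f->win[1] = bw;
}

/* At a cycle boundary, age the windows (only if the new window saw samples). */
static void filter_advance(struct max_filter2 *f)
{
	if (!f->win[1])
		return;
	f->win[0] = f->win[1];
	f->win[1] = 0;
}

/* The filter's answer: the max over the last 1-2 cycles. */
static unsigned int filter_max(const struct max_filter2 *f)
{
	return f->win[0] > f->win[1] ? f->win[0] : f->win[1];
}

int main(void)
{
	struct max_filter2 f = { { 0, 0 } };

	filter_take_sample(&f, 100);
	filter_advance(&f);
	filter_take_sample(&f, 80);
	printf("max over last two cycles: %u\n", filter_max(&f)); /* prints 100 */
	return 0;
}
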
*/ +static void bbr2_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || + !bbr->params.full_ecn_cnt || !bbr->params.ecn_thresh) + return; + + if (ce_ratio >= bbr->params.ecn_thresh) + bbr->startup_ecn_rounds++; + else + bbr->startup_ecn_rounds = 0; + + if (bbr->startup_ecn_rounds >= bbr->params.full_ecn_cnt) { + bbr->debug.event = 'E'; /* ECN caused STARTUP exit */ + bbr2_handle_queue_too_high_in_startup(sk); + return; + } +} + +static void bbr2_update_ecn_alpha(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + s32 delivered, delivered_ce; + u64 alpha, ce_ratio; + u32 gain; + + if (bbr->params.ecn_factor == 0) + return; + + delivered = tp->delivered - bbr->alpha_last_delivered; + delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; + + if (delivered == 0 || /* avoid divide by zero */ + WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ + return; + + /* See if we should use ECN sender logic for this connection. */ + if (!bbr->ecn_eligible && bbr_ecn_enable && + (bbr->min_rtt_us <= bbr->params.ecn_max_rtt_us || + !bbr->params.ecn_max_rtt_us)) + bbr->ecn_eligible = 1; + + ce_ratio = (u64)delivered_ce << BBR_SCALE; + do_div(ce_ratio, delivered); + gain = bbr->params.ecn_alpha_gain; + alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; + alpha += (gain * ce_ratio) >> BBR_SCALE; + bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); + + bbr->alpha_last_delivered = tp->delivered; + bbr->alpha_last_delivered_ce = tp->delivered_ce; + + bbr2_check_ecn_too_high_in_startup(sk, ce_ratio); +} + +/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ +static void bbr2_raise_inflight_hi_slope(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 growth_this_round, cnt; + + /* Calculate "slope": packets S/Acked per inflight_hi increment. */ + growth_this_round = 1 << bbr->bw_probe_up_rounds; + bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); + cnt = tp->snd_cwnd / growth_this_round; + cnt = max(cnt, 1U); + bbr->bw_probe_up_cnt = cnt; + bbr->debug.event = 'G'; /* Grow inflight_hi slope */ +} + +/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ +static void bbr2_probe_inflight_hi_upward(struct sock *sk, + const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 delta; + + if (!tp->is_cwnd_limited || tp->snd_cwnd < bbr->inflight_hi) { + bbr->bw_probe_up_acks = 0; /* don't accmulate unused credits */ + return; /* not fully using inflight_hi, so don't grow it */ + } + + /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ + bbr->bw_probe_up_acks += rs->acked_sacked; + if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { + delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; + bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; + bbr->inflight_hi += delta; + bbr->debug.event = 'I'; /* Increment inflight_hi */ + } + + if (bbr->round_start) + bbr2_raise_inflight_hi_slope(sk); +} + +/* Does loss/ECN rate for this sample say inflight is "too high"? + * This is used by both the bbr_check_loss_too_high_in_startup() function, + * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which + * uses it to notice when loss/ECN rates suggest inflight is too high. 
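The ecn_alpha update above is a geometrically-weighted moving average of the per-round CE mark ratio, kept in BBR's 8-bit fixed point (BBR_UNIT = 256). A self-contained sketch of the same arithmetic; the sample CE ratios fed in by main() are invented:

#include <stdio.h>
#include <stdint.h>

#define SCALE	8
#define UNIT	(1u << SCALE)	/* 1.0 in fixed point */

/* alpha <- (1 - gain) * alpha + gain * ce_ratio, all in UNIT fixed point. */
static uint32_t ewma_update(uint32_t alpha, uint32_t gain, uint32_t ce_ratio)
{
	uint64_t next = ((uint64_t)(UNIT - gain) * alpha) >> SCALE;

	next += ((uint64_t)gain * ce_ratio) >> SCALE;
	return next > UNIT ? UNIT : (uint32_t)next;
}

int main(void)
{
	uint32_t alpha = UNIT;		/* start at 1.0, as bbr2 does */
	uint32_t gain = UNIT / 16;	/* 6.25% weight for the newest sample */

	/* Three rounds with 50%, 0%, 0% of delivered packets CE-marked. */
	alpha = ewma_update(alpha, gain, UNIT / 2);
	alpha = ewma_update(alpha, gain, 0);
	alpha = ewma_update(alpha, gain, 0);
	printf("ecn_alpha ~= %.3f\n", alpha / (double)UNIT);
	return 0;
}
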
+ */ +static bool bbr2_is_inflight_too_high(const struct sock *sk, + const struct rate_sample *rs) +{ + const struct bbr *bbr = inet_csk_ca(sk); + u32 loss_thresh, ecn_thresh; + + if (rs->lost > 0 && rs->tx_in_flight) { + loss_thresh = (u64)rs->tx_in_flight * bbr->params.loss_thresh >> + BBR_SCALE; + if (rs->lost > loss_thresh) + return true; + } + + if (rs->delivered_ce > 0 && rs->delivered > 0 && + bbr->ecn_eligible && bbr->params.ecn_thresh) { + ecn_thresh = (u64)rs->delivered * bbr->params.ecn_thresh >> + BBR_SCALE; + if (rs->delivered_ce >= ecn_thresh) + return true; + } + + return false; +} + +/* Calculate the tx_in_flight level that corresponded to excessive loss. + * We find "lost_prefix" segs of the skb where loss rate went too high, + * by solving for "lost_prefix" in the following equation: + * lost / inflight >= loss_thresh + * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh + * Then we take that equation, convert it to fixed point, and + * round up to the nearest packet. + */ +static u32 bbr2_inflight_hi_from_lost_skb(const struct sock *sk, + const struct rate_sample *rs, + const struct sk_buff *skb) +{ + const struct bbr *bbr = inet_csk_ca(sk); + u32 loss_thresh = bbr->params.loss_thresh; + u32 pcount, divisor, inflight_hi; + s32 inflight_prev, lost_prev; + u64 loss_budget, lost_prefix; + + pcount = tcp_skb_pcount(skb); + + /* How much data was in flight before this skb? */ + inflight_prev = rs->tx_in_flight - pcount; + if (WARN_ONCE(inflight_prev < 0, + "tx_in_flight: %u pcount: %u reneg: %u", + rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg)) + return ~0U; + + /* How much inflight data was marked lost before this skb? */ + lost_prev = rs->lost - pcount; + if (WARN_ON_ONCE(lost_prev < 0)) + return ~0U; + + /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ + loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; + loss_budget >>= BBR_SCALE; + if (lost_prev >= loss_budget) { + lost_prefix = 0; /* previous losses crossed loss_thresh */ + } else { + lost_prefix = loss_budget - lost_prev; + lost_prefix <<= BBR_SCALE; + divisor = BBR_UNIT - loss_thresh; + if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ + return ~0U; + do_div(lost_prefix, divisor); + } + + inflight_hi = inflight_prev + lost_prefix; + return inflight_hi; +} + +/* If loss/ECN rates during probing indicated we may have overfilled a + * buffer, return an operating point that tries to leave unutilized headroom in + * the path for other flows, for fairness convergence and lower RTTs and loss. + */ +static u32 bbr2_inflight_with_headroom(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 headroom, headroom_fraction; + + if (bbr->inflight_hi == ~0U) + return ~0U; + + headroom_fraction = bbr->params.inflight_headroom; + headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; + headroom = max(headroom, 1U); + return max_t(s32, bbr->inflight_hi - headroom, + bbr->params.cwnd_min_target); +} + +/* Bound cwnd to a sensible level, based on our current probing state + * machine phase and model of a good inflight level (inflight_lo, inflight_hi). + */ +static void bbr2_bound_cwnd_for_inflight_model(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 cap; + + /* tcp_rcv_synsent_state_process() currently calls tcp_ack() + * and thus cong_control() without first initializing us(!). 
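The lost_prefix computation in bbr2_inflight_hi_from_lost_skb() above just solves the inequality stated in its comment for lost_prefix, using the same fixed-point steps. A standalone sketch of that algebra; the inputs in main() and the 2% threshold (the default loss_thresh below) are only examples:

#include <stdio.h>
#include <stdint.h>

#define SCALE	8
#define UNIT	(1u << SCALE)

/*
 * Solve (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= thresh
 * for lost_prefix, i.e. how deep into the lost skb the loss rate crossed the
 * threshold, then return inflight_prev + lost_prefix.
 */
static uint32_t inflight_hi_from_lost(uint32_t inflight_prev, uint32_t lost_prev,
				      uint32_t thresh)
{
	uint64_t budget, prefix;

	/* Loss budget for inflight_prev packets, with the division rounded up. */
	budget = ((uint64_t)inflight_prev * thresh + UNIT - 1) >> SCALE;
	if (lost_prev >= budget)
		return inflight_prev;	/* earlier losses already crossed it */

	prefix = ((budget - lost_prev) << SCALE) / (UNIT - thresh);
	return inflight_prev + (uint32_t)prefix;
}

int main(void)
{
	/* e.g. 1000 packets in flight before this skb, none lost so far. */
	printf("inflight_hi estimate: %u\n",
	       inflight_hi_from_lost(1000, 0, UNIT * 2 / 100));
	return 0;
}
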
+ */ + if (!bbr->initialized) + return; + + cap = ~0U; + if (bbr->mode == BBR_PROBE_BW && + bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { + /* Probe to see if more packets fit in the path. */ + cap = bbr->inflight_hi; + } else { + if (bbr->mode == BBR_PROBE_RTT || + (bbr->mode == BBR_PROBE_BW && + bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) + cap = bbr2_inflight_with_headroom(sk); + } + /* Adapt to any loss/ECN since our last bw probe. */ + cap = min(cap, bbr->inflight_lo); + + cap = max_t(u32, cap, bbr->params.cwnd_min_target); + tp->snd_cwnd = min(cap, tp->snd_cwnd); +} + +/* Estimate a short-term lower bound on the capacity available now, based + * on measurements of the current delivery process and recent history. When we + * are seeing loss/ECN at times when we are not probing bw, then conservatively + * move toward flow balance by multiplicatively cutting our short-term + * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a + * multiplicative decrease in order to converge to a lower capacity in time + * logarithmic in the magnitude of the decrease. + * + * However, we do not cut our short-term estimates lower than the current rate + * and volume of delivered data from this round trip, since from the current + * delivery process we can estimate the measured capacity available now. + * + * Anything faster than that approach would knowingly risk high loss, which can + * cause low bw for Reno/CUBIC and high loss recovery latency for + * request/response flows using any congestion control. + */ +static void bbr2_adapt_lower_bounds(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 ecn_cut, ecn_inflight_lo, beta; + + /* We only use lower-bound estimates when not probing bw. + * When probing we need to push inflight higher to probe bw. + */ + if (bbr2_is_probing_bandwidth(sk)) + return; + + /* ECN response. */ + if (bbr->ecn_in_round && bbr->ecn_eligible && bbr->params.ecn_factor) { + /* Reduce inflight to (1 - alpha*ecn_factor). */ + ecn_cut = (BBR_UNIT - + ((bbr->ecn_alpha * bbr->params.ecn_factor) >> + BBR_SCALE)); + if (bbr->inflight_lo == ~0U) + bbr->inflight_lo = tp->snd_cwnd; + ecn_inflight_lo = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; + } else { + ecn_inflight_lo = ~0U; + } + + /* Loss response. */ + if (bbr->loss_in_round) { + /* Reduce bw and inflight to (1 - beta). */ + if (bbr->bw_lo == ~0U) + bbr->bw_lo = bbr_max_bw(sk); + if (bbr->inflight_lo == ~0U) + bbr->inflight_lo = tp->snd_cwnd; + beta = bbr->params.beta; + bbr->bw_lo = + max_t(u32, bbr->bw_latest, + (u64)bbr->bw_lo * + (BBR_UNIT - beta) >> BBR_SCALE); + bbr->inflight_lo = + max_t(u32, bbr->inflight_latest, + (u64)bbr->inflight_lo * + (BBR_UNIT - beta) >> BBR_SCALE); + } + + /* Adjust to the lower of the levels implied by loss or ECN. */ + bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); +} + +/* Reset any short-term lower-bound adaptation to congestion, so that we can + * push our inflight up. + */ +static void bbr2_reset_lower_bounds(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->bw_lo = ~0U; + bbr->inflight_lo = ~0U; +} + +/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state + * machine phase where we adapt our lower bound based on congestion signals. 
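The loss response in bbr2_adapt_lower_bounds() above is a multiplicative cut floored at what the current round trip actually delivered. A small sketch of that step; the function name and the numbers in main() are illustrative, and beta uses the default defined later in this file:

#include <stdio.h>
#include <stdint.h>

#define SCALE	8
#define UNIT	(1u << SCALE)

/*
 * One loss-response step for a lower bound: multiplicatively cut by beta,
 * but never go below what the latest round trip delivered.
 */
static uint32_t cut_lower_bound(uint32_t lo, uint32_t latest, uint32_t beta)
{
	uint64_t cut = ((uint64_t)lo * (UNIT - beta)) >> SCALE;

	return cut > latest ? (uint32_t)cut : latest;
}

int main(void)
{
	uint32_t beta = UNIT * 30 / 100;	/* ~0.3, the default beta */
	uint32_t inflight_lo = 100, inflight_latest = 80;

	/* 100 * ~0.7 = 70, but floor at the 80 packets just delivered. */
	inflight_lo = cut_lower_bound(inflight_lo, inflight_latest, beta);
	printf("inflight_lo after one lossy round: %u\n", inflight_lo); /* 80 */
	return 0;
}
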
+ */ +static void bbr2_reset_congestion_signals(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->loss_in_round = 0; + bbr->ecn_in_round = 0; + bbr->loss_in_cycle = 0; + bbr->ecn_in_cycle = 0; + bbr->bw_latest = 0; + bbr->inflight_latest = 0; +} + +/* Update (most of) our congestion signals: track the recent rate and volume of + * delivered data, presence of loss, and EWMA degree of ECN marking. + */ +static void bbr2_update_congestion_signals( + struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw; + + bbr->loss_round_start = 0; + if (rs->interval_us <= 0 || !rs->acked_sacked) + return; /* Not a valid observation */ + bw = ctx->sample_bw; + + if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) + bbr2_take_bw_hi_sample(sk, bw); + + bbr->loss_in_round |= (rs->losses > 0); + + /* Update rate and volume of delivered data from latest round trip: */ + bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); + bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); + + if (before(rs->prior_delivered, bbr->loss_round_delivered)) + return; /* skip the per-round-trip updates */ + /* Now do per-round-trip updates. */ + bbr->loss_round_delivered = tp->delivered; /* mark round trip */ + bbr->loss_round_start = 1; + bbr2_adapt_lower_bounds(sk); + + /* Update windowed "latest" (single-round-trip) filters. */ + bbr->loss_in_round = 0; + bbr->ecn_in_round = 0; + bbr->bw_latest = ctx->sample_bw; + bbr->inflight_latest = rs->delivered; +} + +/* Bandwidth probing can cause loss. To help coexistence with loss-based + * congestion control we spread out our probing in a Reno-conscious way. Due to + * the shape of the Reno sawtooth, the time required between loss epochs for an + * idealized Reno flow is a number of round trips that is the BDP of that + * flow. We count packet-timed round trips directly, since measured RTT can + * vary widely, and Reno is driven by packet-timed round trips. + */ +static bool bbr2_is_reno_coexistence_probe_time(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 inflight, rounds, reno_gain, reno_rounds; + + /* Random loss can shave some small percentage off of our inflight + * in each round. To survive this, flows need robust periodic probes. + */ + rounds = bbr->params.bw_probe_max_rounds; + + reno_gain = bbr->params.bw_probe_reno_gain; + if (reno_gain) { + inflight = bbr2_target_inflight(sk); + reno_rounds = ((u64)inflight * reno_gain) >> BBR_SCALE; + rounds = min(rounds, reno_rounds); + } + return bbr->rounds_since_probe >= rounds; +} + +/* How long do we want to wait before probing for bandwidth (and risking + * loss)? We randomize the wait, for better mixing and fairness convergence. + * + * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. + * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, + * (eg 4K video to a broadband user): + * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets + * + * We bound the BBR-native inter-bw-probe wall clock time to be: + * (a) higher than 2 sec: to try to avoid causing loss for a long enough time + * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must + * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs + * (b) lower than 3 sec: to ensure flows can start probing in a reasonable + * amount of time to discover unutilized bw on human-scale interactive + * time-scales (e.g. 
perhaps traffic from a web page download that we + * were competing with is now complete). + */ +static void bbr2_pick_probe_wait(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + /* Decide the random round-trip bound for wait until probe: */ + bbr->rounds_since_probe = + get_random_u32_below(bbr->params.bw_probe_rand_rounds); + /* Decide the random wall clock bound for wait until probe: */ + bbr->probe_wait_us = bbr->params.bw_probe_base_us + + get_random_u32_below(bbr->params.bw_probe_rand_us); +} + +static void bbr2_set_cycle_idx(struct sock *sk, int cycle_idx) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->cycle_idx = cycle_idx; + /* New phase, so need to update cwnd and pacing rate. */ + bbr->try_fast_path = 0; +} + +/* Send at estimated bw to fill the pipe, but not queue. We need this phase + * before PROBE_UP, because as soon as we send faster than the available bw + * we will start building a queue, and if the buffer is shallow we can cause + * loss. If we do not fill the pipe before we cause this loss, our bw_hi and + * inflight_hi estimates will underestimate. + */ +static void bbr2_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr2_reset_lower_bounds(sk); + if (bbr->inflight_hi != ~0U) + bbr->inflight_hi += bbr->params.refill_add_inc; + bbr->bw_probe_up_rounds = bw_probe_up_rounds; + bbr->bw_probe_up_acks = 0; + bbr->stopped_risky_probe = 0; + bbr->ack_phase = BBR_ACKS_REFILLING; + bbr->next_rtt_delivered = tp->delivered; + bbr2_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); +} + +/* Now probe max deliverable data rate and volume. */ +static void bbr2_start_bw_probe_up(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->ack_phase = BBR_ACKS_PROBE_STARTING; + bbr->next_rtt_delivered = tp->delivered; + bbr->cycle_mstamp = tp->tcp_mstamp; + bbr2_set_cycle_idx(sk, BBR_BW_PROBE_UP); + bbr2_raise_inflight_hi_slope(sk); +} + +/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall + * clock time at which to probe beyond an inflight that we think to be + * safe. This will knowingly risk packet loss, so we want to do this rarely, to + * keep packet loss rates low. Also start a round-trip counter, to probe faster + * if we estimate a Reno flow at our BDP would probe faster. + */ +static void bbr2_start_bw_probe_down(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr2_reset_congestion_signals(sk); + bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ + bbr2_pick_probe_wait(sk); + bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ + bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; + bbr->next_rtt_delivered = tp->delivered; + bbr2_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); +} + +/* Cruise: maintain what we estimate to be a neutral, conservative + * operating point, without attempting to probe up for bandwidth or down for + * RTT, and only reducing inflight in response to loss/ECN signals. + */ +static void bbr2_start_bw_probe_cruise(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->inflight_lo != ~0U) + bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); + + bbr2_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); +} + +/* Loss and/or ECN rate is too high while probing. + * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. 
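The 61.9-packet and 1.9-second figures quoted in the probe-wait comment above follow from straightforward arithmetic. A tiny standalone check, not part of the patch:

#include <stdio.h>

int main(void)
{
	const double bw_bps = 25e6;	/* 25 Mbit/s reference Reno flow      */
	const double rtt_s  = 0.030;	/* 30 ms round-trip time              */
	const double mss    = 1514.0;	/* on-the-wire bytes per packet       */

	double bdp_pkts = bw_bps / 8.0 * rtt_s / mss;
	double reno_epoch_s = bdp_pkts * rtt_s;	/* one Reno sawtooth, in RTTs */

	printf("BDP              ~= %.1f packets\n", bdp_pkts);	/* ~61.9 */
	printf("inter-loss epoch ~= %.1f s\n", reno_epoch_s);	/* ~1.9  */
	return 0;
}
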
+ */ +static void bbr2_handle_inflight_too_high(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + const u32 beta = bbr->params.beta; + + bbr->prev_probe_too_high = 1; + bbr->bw_probe_samples = 0; /* only react once per probe */ + bbr->debug.event = 'L'; /* Loss/ECN too high */ + /* If we are app-limited then we are not robustly + * probing the max volume of inflight data we think + * might be safe (analogous to how app-limited bw + * samples are not known to be robustly probing bw). + */ + if (!rs->is_app_limited) + bbr->inflight_hi = max_t(u32, rs->tx_in_flight, + (u64)bbr2_target_inflight(sk) * + (BBR_UNIT - beta) >> BBR_SCALE); + if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + bbr2_start_bw_probe_down(sk); +} + +/* If we're seeing bw and loss samples reflecting our bw probing, adapt + * using the signals we see. If loss or ECN mark rate gets too high, then adapt + * inflight_hi downward. If we're able to push inflight higher without such + * signals, push higher: adapt inflight_hi upward. + */ +static bool bbr2_adapt_upper_bounds(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + + /* Track when we'll see bw/loss samples resulting from our bw probes. */ + if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) + bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; + if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { + /* End of samples from bw probing phase. */ + bbr->bw_probe_samples = 0; + bbr->ack_phase = BBR_ACKS_INIT; + /* At this point in the cycle, our current bw sample is also + * our best recent chance at finding the highest available bw + * for this flow. So now is the best time to forget the bw + * samples from the previous cycle, by advancing the window. + */ + if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) + bbr2_advance_bw_hi_filter(sk); + /* If we had an inflight_hi, then probed and pushed inflight all + * the way up to hit that inflight_hi without seeing any + * high loss/ECN in all the resulting ACKs from that probing, + * then probe up again, this time letting inflight persist at + * inflight_hi for a round trip, then accelerating beyond. + */ + if (bbr->mode == BBR_PROBE_BW && + bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { + bbr->debug.event = 'R'; /* reprobe */ + bbr2_start_bw_probe_refill(sk, 0); + return true; /* yes, decided state transition */ + } + } + + if (bbr2_is_inflight_too_high(sk, rs)) { + if (bbr->bw_probe_samples) /* sample is from bw probing? */ + bbr2_handle_inflight_too_high(sk, rs); + } else { + /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ + if (bbr->inflight_hi == ~0U) /* no excess queue signals yet? */ + return false; + + /* To be resilient to random loss, we must raise inflight_hi + * if we observe in any phase that a higher level is safe. + */ + if (rs->tx_in_flight > bbr->inflight_hi) { + bbr->inflight_hi = rs->tx_in_flight; + bbr->debug.event = 'U'; /* raise up inflight_hi */ + } + + if (bbr->mode == BBR_PROBE_BW && + bbr->cycle_idx == BBR_BW_PROBE_UP) + bbr2_probe_inflight_hi_upward(sk, rs); + } + + return false; +} + +/* Check if it's time to probe for bandwidth now, and if so, kick it off. 
*/ +static bool bbr2_check_time_to_probe_bw(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 n; + + /* If we seem to be at an operating point where we are not seeing loss + * but we are seeing ECN marks, then when the ECN marks cease we reprobe + * quickly (in case a burst of cross-traffic has ceased and freed up bw, + * or in case we are sharing with multiplicatively probing traffic). + */ + if (bbr->params.ecn_reprobe_gain && bbr->ecn_eligible && + bbr->ecn_in_cycle && !bbr->loss_in_cycle && + inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { + bbr->debug.event = 'A'; /* *A*ll clear to probe *A*gain */ + /* Calculate n so that when bbr2_raise_inflight_hi_slope() + * computes growth_this_round as 2^n it will be roughly the + * desired volume of data (inflight_hi*ecn_reprobe_gain). + */ + n = ilog2((((u64)bbr->inflight_hi * + bbr->params.ecn_reprobe_gain) >> BBR_SCALE)); + bbr2_start_bw_probe_refill(sk, n); + return true; + } + + if (bbr2_has_elapsed_in_phase(sk, bbr->probe_wait_us) || + bbr2_is_reno_coexistence_probe_time(sk)) { + bbr2_start_bw_probe_refill(sk, 0); + return true; + } + return false; +} + +/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ +static bool bbr2_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) +{ + struct bbr *bbr = inet_csk_ca(sk); + bool is_under_bdp, is_long_enough; + + /* Always need to pull inflight down to leave headroom in queue. */ + if (inflight > bbr2_inflight_with_headroom(sk)) + return false; + + is_under_bdp = inflight <= bbr_inflight(sk, bw, BBR_UNIT); + if (bbr->params.drain_to_target) + return is_under_bdp; + + is_long_enough = bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us); + return is_under_bdp || is_long_enough; +} + +/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ +static void bbr2_update_cycle_phase(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + bool is_risky = false, is_queuing = false; + u32 inflight, bw; + + if (!bbr_full_bw_reached(sk)) + return; + + /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ + if (bbr2_adapt_upper_bounds(sk, rs)) + return; /* already decided state transition */ + + if (bbr->mode != BBR_PROBE_BW) + return; + + inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); + bw = bbr_max_bw(sk); + + switch (bbr->cycle_idx) { + /* First we spend most of our time cruising with a pacing_gain of 1.0, + * which paces at the estimated bw, to try to fully use the pipe + * without building queue. If we encounter loss/ECN marks, we adapt + * by slowing down. + */ + case BBR_BW_PROBE_CRUISE: + if (bbr2_check_time_to_probe_bw(sk)) + return; /* already decided state transition */ + break; + + /* After cruising, when it's time to probe, we first "refill": we send + * at the estimated bw to fill the pipe, before probing higher and + * knowingly risking overflowing the bottleneck buffer (causing loss). + */ + case BBR_BW_PROBE_REFILL: + if (bbr->round_start) { + /* After one full round trip of sending in REFILL, we + * start to see bw samples reflecting our REFILL, which + * may be putting too much data in flight. + */ + bbr->bw_probe_samples = 1; + bbr2_start_bw_probe_up(sk); + } + break; + + /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to + * probe for bw. If we have not seen loss/ECN, we try to raise inflight + * to at least pacing_gain*BDP; note that this may take more than + * min_rtt if min_rtt is small (e.g. on a LAN). 
+ * + * We terminate PROBE_UP bandwidth probing upon any of the following: + * + * (1) We've pushed inflight up to hit the inflight_hi target set in the + * most recent previous bw probe phase. Thus we want to start + * draining the queue immediately because it's very likely the most + * recently sent packets will fill the queue and cause drops. + * (checked here) + * (2) We have probed for at least 1*min_rtt_us, and the + * estimated queue is high enough (inflight > 1.25 * estimated_bdp). + * (checked here) + * (3) Loss filter says loss rate is "too high". + * (checked in bbr_is_inflight_too_high()) + * (4) ECN filter says ECN mark rate is "too high". + * (checked in bbr_is_inflight_too_high()) + */ + case BBR_BW_PROBE_UP: + if (bbr->prev_probe_too_high && + inflight >= bbr->inflight_hi) { + bbr->stopped_risky_probe = 1; + is_risky = true; + bbr->debug.event = 'D'; /* D for danger */ + } else if (bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us) && + inflight >= + bbr_inflight(sk, bw, + bbr->params.bw_probe_pif_gain)) { + is_queuing = true; + bbr->debug.event = 'Q'; /* building Queue */ + } + if (is_risky || is_queuing) { + bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ + bbr2_start_bw_probe_down(sk); /* restart w/ down */ + } + break; + + /* After probing in PROBE_UP, we have usually accumulated some data in + * the bottleneck buffer (if bw probing didn't find more bw). We next + * enter PROBE_DOWN to try to drain any excess data from the queue. To + * do this, we use a pacing_gain < 1.0. We hold this pacing gain until + * our inflight is less then that target cruising point, which is the + * minimum of (a) the amount needed to leave headroom, and (b) the + * estimated BDP. Once inflight falls to match the target, we estimate + * the queue is drained; persisting would underutilize the pipe. + */ + case BBR_BW_PROBE_DOWN: + if (bbr2_check_time_to_probe_bw(sk)) + return; /* already decided state transition */ + if (bbr2_check_time_to_cruise(sk, inflight, bw)) + bbr2_start_bw_probe_cruise(sk); + break; + + default: + WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); + } +} + +/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ +static void bbr2_exit_probe_rtt(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr2_reset_lower_bounds(sk); + if (bbr_full_bw_reached(sk)) { + bbr->mode = BBR_PROBE_BW; + /* Raising inflight after PROBE_RTT may cause loss, so reset + * the PROBE_BW clock and schedule the next bandwidth probe for + * a friendly and randomized future point in time. + */ + bbr2_start_bw_probe_down(sk); + /* Since we are exiting PROBE_RTT, we know inflight is + * below our estimated BDP, so it is reasonable to cruise. + */ + bbr2_start_bw_probe_cruise(sk); + } else { + bbr->mode = BBR_STARTUP; + } +} + +/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until + * the end of the round in recovery to get a good estimate of how many packets + * have been lost, and how many we need to drain with a low pacing rate. + */ +static void bbr2_check_loss_too_high_in_startup(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr_full_bw_reached(sk)) + return; + + /* For STARTUP exit, check the loss rate at the end of each round trip + * of Recovery episodes in STARTUP. We check the loss rate at the end + * of the round trip to filter out noisy/low loss and have a better + * sense of inflight (extent of loss), so we can drain more accurately. 
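Condensed, the PROBE_UP exit test above comes down to two checks: a "risky" condition (we already hit the inflight_hi ceiling from the previous probe) and a "queuing" condition (we probed for at least one min_rtt and inflight grew past roughly 1.25x the estimated BDP). A simplified decision helper; the names, the flat 1.25x factor, and the example values are illustrative only:

#include <stdbool.h>
#include <stdio.h>

static bool probe_up_done(bool prev_probe_too_high, unsigned int inflight,
			  unsigned int inflight_hi, bool min_rtt_elapsed,
			  unsigned int bdp)
{
	bool risky = prev_probe_too_high && inflight >= inflight_hi;
	bool queuing = min_rtt_elapsed && inflight >= bdp + bdp / 4;

	return risky || queuing;
}

int main(void)
{
	/* Probing past 1.25x of a 100-packet BDP after a full min_rtt: stop. */
	printf("%d\n", probe_up_done(false, 130, ~0u, true, 100)); /* 1 */
	return 0;
}
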
+ */ + if (rs->losses && bbr->loss_events_in_round < 0xf) + bbr->loss_events_in_round++; /* update saturating counter */ + if (bbr->params.full_loss_cnt && bbr->loss_round_start && + inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && + bbr->loss_events_in_round >= bbr->params.full_loss_cnt && + bbr2_is_inflight_too_high(sk, rs)) { + bbr->debug.event = 'P'; /* Packet loss caused STARTUP exit */ + bbr2_handle_queue_too_high_in_startup(sk); + return; + } + if (bbr->loss_round_start) + bbr->loss_events_in_round = 0; +} + +/* If we are done draining, advance into steady state operation in PROBE_BW. */ +static void bbr2_check_drain(struct sock *sk, const struct rate_sample *rs, + struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr_check_drain(sk, rs, ctx)) { + bbr->mode = BBR_PROBE_BW; + bbr2_start_bw_probe_down(sk); + } +} + +static void bbr2_update_model(struct sock *sk, const struct rate_sample *rs, + struct bbr_context *ctx) +{ + bbr2_update_congestion_signals(sk, rs, ctx); + bbr_update_ack_aggregation(sk, rs); + bbr2_check_loss_too_high_in_startup(sk, rs); + bbr_check_full_bw_reached(sk, rs); + bbr2_check_drain(sk, rs, ctx); + bbr2_update_cycle_phase(sk, rs); + bbr_update_min_rtt(sk, rs); +} + +/* Fast path for app-limited case. + * + * On each ack, we execute bbr state machine, which primarily consists of: + * 1) update model based on new rate sample, and + * 2) update control based on updated model or state change. + * + * There are certain workload/scenarios, e.g. app-limited case, where + * either we can skip updating model or we can skip update of both model + * as well as control. This provides signifcant softirq cpu savings for + * processing incoming acks. + * + * In case of app-limited, if there is no congestion (loss/ecn) and + * if observed bw sample is less than current estimated bw, then we can + * skip some of the computation in bbr state processing: + * + * - if there is no rtt/mode/phase change: In this case, since all the + * parameters of the network model are constant, we can skip model + * as well control update. + * + * - else we can skip rest of the model update. But we still need to + * update the control to account for the new rtt/mode/phase. + * + * Returns whether we can take fast path or not. 
+ */ +static bool bbr2_fast_path(struct sock *sk, bool *update_model, + const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 prev_min_rtt_us, prev_mode; + + if (bbr->params.fast_path && bbr->try_fast_path && + rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && + !bbr->loss_in_round && !bbr->ecn_in_round) { + prev_mode = bbr->mode; + prev_min_rtt_us = bbr->min_rtt_us; + bbr2_check_drain(sk, rs, ctx); + bbr2_update_cycle_phase(sk, rs); + bbr_update_min_rtt(sk, rs); + + if (bbr->mode == prev_mode && + bbr->min_rtt_us == prev_min_rtt_us && + bbr->try_fast_path) + return true; + + /* Skip model update, but control still needs to be updated */ + *update_model = false; + } + return false; +} + +static void bbr2_main(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + struct bbr_context ctx = { 0 }; + bool update_model = true; + u32 bw; + + bbr->debug.event = '.'; /* init to default NOP (no event yet) */ + + bbr_update_round_start(sk, rs, &ctx); + if (bbr->round_start) { + bbr->rounds_since_probe = + min_t(s32, bbr->rounds_since_probe + 1, 0xFF); + bbr2_update_ecn_alpha(sk); + } + + bbr->ecn_in_round |= rs->is_ece; + bbr_calculate_bw_sample(sk, rs, &ctx); + + if (bbr2_fast_path(sk, &update_model, rs, &ctx)) + goto out; + + if (update_model) + bbr2_update_model(sk, rs, &ctx); + + bbr_update_gains(sk); + bw = bbr_bw(sk); + bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); + bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, + tp->snd_cwnd, &ctx); + bbr2_bound_cwnd_for_inflight_model(sk); + +out: + bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; + bbr->loss_in_cycle |= rs->lost > 0; + bbr->ecn_in_cycle |= rs->delivered_ce > 0; + + bbr_debug(sk, rs->acked_sacked, rs, &ctx); +} + +/* Module parameters that are settable by TCP_CONGESTION_PARAMS are declared + * down here, so that the algorithm functions that use the parameters must use + * the per-socket parameters; if they accidentally use the global version + * then there will be a compile error. + * TODO(ncardwell): move all per-socket parameters down to this section. + */ + +/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. + * No loss response when 0. Max allwed value is 255. + */ +static u32 bbr_beta = BBR_UNIT * 30 / 100; + +/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE. + * Max allowed value is 255. + */ +static u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; /* 1/16 = 6.25% */ + +/* The initial value for the ecn_alpha state variable. Default and max + * BBR_UNIT (256), representing 1.0. This allows a flow to respond quickly + * to congestion if the bottleneck is congested when the flow starts up. + */ +static u32 bbr_ecn_alpha_init = BBR_UNIT; /* 1.0, to respond quickly */ + +/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. + * No ECN based bounding when 0. Max allwed value is 255. + */ +static u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ + +/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. + * Scaled by BBR_SCALE. Disabled when 0. Max allowed is 255. + */ +static u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ + +/* Max RTT (in usec) at which to use sender-side ECN logic. + * Disabled when 0 (ECN allowed at any RTT). + * Max allowed for the parameter is 524287 (0x7ffff) us, ~524 ms. 
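Because these knobs are expressed in BBR_UNIT (256) fixed point, the integer defaults are not obvious at a glance. A quick standalone conversion of the defaults defined in this section:

#include <stdio.h>

#define UNIT 256.0

int main(void)
{
	printf("beta        = %3d -> %.3f\n", 256 * 30 / 100, (256 * 30 / 100) / UNIT);
	printf("ecn_factor  = %3d -> %.3f\n", 256 * 1 / 3,    (256 * 1 / 3) / UNIT);
	printf("ecn_thresh  = %3d -> %.3f\n", 256 * 1 / 2,    (256 * 1 / 2) / UNIT);
	printf("loss_thresh = %3d -> %.3f\n", 256 * 2 / 100,  (256 * 2 / 100) / UNIT);
	return 0;
}
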
+ */ +static u32 bbr_ecn_max_rtt_us = 5000; + +/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN + * clears then use a multiplicative increase to quickly reprobe bw by + * starting inflight probing at the given multiple of inflight_hi. + * Default for this experimental knob is 0 (disabled). + * Planned value for experiments: BBR_UNIT * 1 / 2 = 128, representing 0.5. + */ +static u32 bbr_ecn_reprobe_gain; + +/* Estimate bw probing has gone too far if loss rate exceeds this level. */ +static u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ + +/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, + * and loss rate is higher than bbr_loss_thresh. + * Disabled if 0. Max allowed value is 15 (0xF). + */ +static u32 bbr_full_loss_cnt = 8; + +/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh + * meets this count. Max allowed value is 3. + */ +static u32 bbr_full_ecn_cnt = 2; + +/* Fraction of unutilized headroom to try to leave in path upon high loss. */ +static u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; + +/* Multiplier to get target inflight (as multiple of BDP) for PROBE_UP phase. + * Default is 1.25x, as in BBR v1. Max allowed is 511. + */ +static u32 bbr_bw_probe_pif_gain = BBR_UNIT * 5 / 4; + +/* Multiplier to get Reno-style probe epoch duration as: k * BDP round trips. + * If zero, disables this BBR v2 Reno-style BDP-scaled coexistence mechanism. + * Max allowed is 511. + */ +static u32 bbr_bw_probe_reno_gain = BBR_UNIT; + +/* Max number of packet-timed rounds to wait before probing for bandwidth. If + * we want to tolerate 1% random loss per round, and not have this cut our + * inflight too much, we must probe for bw periodically on roughly this scale. + * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. + * We aim to be fair with Reno/CUBIC up to a BDP of at least: + * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets + */ +static u32 bbr_bw_probe_max_rounds = 63; + +/* Max amount of randomness to inject in round counting for Reno-coexistence. + * Max value is 15. + */ +static u32 bbr_bw_probe_rand_rounds = 2; + +/* Use BBR-native probe time scale starting at this many usec. + * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: + * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs + */ +static u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ + +/* Use BBR-native probes spread over this many usec: */ +static u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ + +/* Undo the model changes made in loss recovery if recovery was spurious? */ +static bool bbr_undo = true; + +/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ +static bool bbr_fast_path = true; /* default: enabled */ + +/* Use fast ack mode ? */ +static int bbr_fast_ack_mode = 1; /* default: rwnd check off */ + +/* How much to additively increase inflight_hi when entering REFILL? 
*/ +static u32 bbr_refill_add_inc; /* default: disabled */ + +module_param_named(beta, bbr_beta, uint, 0644); +module_param_named(ecn_alpha_gain, bbr_ecn_alpha_gain, uint, 0644); +module_param_named(ecn_alpha_init, bbr_ecn_alpha_init, uint, 0644); +module_param_named(ecn_factor, bbr_ecn_factor, uint, 0644); +module_param_named(ecn_thresh, bbr_ecn_thresh, uint, 0644); +module_param_named(ecn_max_rtt_us, bbr_ecn_max_rtt_us, uint, 0644); +module_param_named(ecn_reprobe_gain, bbr_ecn_reprobe_gain, uint, 0644); +module_param_named(loss_thresh, bbr_loss_thresh, uint, 0664); +module_param_named(full_loss_cnt, bbr_full_loss_cnt, uint, 0664); +module_param_named(full_ecn_cnt, bbr_full_ecn_cnt, uint, 0664); +module_param_named(inflight_headroom, bbr_inflight_headroom, uint, 0664); +module_param_named(bw_probe_pif_gain, bbr_bw_probe_pif_gain, uint, 0664); +module_param_named(bw_probe_reno_gain, bbr_bw_probe_reno_gain, uint, 0664); +module_param_named(bw_probe_max_rounds, bbr_bw_probe_max_rounds, uint, 0664); +module_param_named(bw_probe_rand_rounds, bbr_bw_probe_rand_rounds, uint, 0664); +module_param_named(bw_probe_base_us, bbr_bw_probe_base_us, uint, 0664); +module_param_named(bw_probe_rand_us, bbr_bw_probe_rand_us, uint, 0664); +module_param_named(undo, bbr_undo, bool, 0664); +module_param_named(fast_path, bbr_fast_path, bool, 0664); +module_param_named(fast_ack_mode, bbr_fast_ack_mode, uint, 0664); +module_param_named(refill_add_inc, bbr_refill_add_inc, uint, 0664); + +static void bbr2_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr_init(sk); /* run shared init code for v1 and v2 */ + + /* BBR v2 parameters: */ + bbr->params.beta = min_t(u32, 0xFFU, bbr_beta); + bbr->params.ecn_alpha_gain = min_t(u32, 0xFFU, bbr_ecn_alpha_gain); + bbr->params.ecn_alpha_init = min_t(u32, BBR_UNIT, bbr_ecn_alpha_init); + bbr->params.ecn_factor = min_t(u32, 0xFFU, bbr_ecn_factor); + bbr->params.ecn_thresh = min_t(u32, 0xFFU, bbr_ecn_thresh); + bbr->params.ecn_max_rtt_us = min_t(u32, 0x7ffffU, bbr_ecn_max_rtt_us); + bbr->params.ecn_reprobe_gain = min_t(u32, 0x1FF, bbr_ecn_reprobe_gain); + bbr->params.loss_thresh = min_t(u32, 0xFFU, bbr_loss_thresh); + bbr->params.full_loss_cnt = min_t(u32, 0xFU, bbr_full_loss_cnt); + bbr->params.full_ecn_cnt = min_t(u32, 0x3U, bbr_full_ecn_cnt); + bbr->params.inflight_headroom = + min_t(u32, 0xFFU, bbr_inflight_headroom); + bbr->params.bw_probe_pif_gain = + min_t(u32, 0x1FFU, bbr_bw_probe_pif_gain); + bbr->params.bw_probe_reno_gain = + min_t(u32, 0x1FFU, bbr_bw_probe_reno_gain); + bbr->params.bw_probe_max_rounds = + min_t(u32, 0xFFU, bbr_bw_probe_max_rounds); + bbr->params.bw_probe_rand_rounds = + min_t(u32, 0xFU, bbr_bw_probe_rand_rounds); + bbr->params.bw_probe_base_us = + min_t(u32, (1 << 26) - 1, bbr_bw_probe_base_us); + bbr->params.bw_probe_rand_us = + min_t(u32, (1 << 26) - 1, bbr_bw_probe_rand_us); + bbr->params.undo = bbr_undo; + bbr->params.fast_path = bbr_fast_path ? 
1 : 0; + bbr->params.refill_add_inc = min_t(u32, 0x3U, bbr_refill_add_inc); + + /* BBR v2 state: */ + bbr->initialized = 1; + /* Start sampling ECN mark rate after first full flight is ACKed: */ + bbr->loss_round_delivered = tp->delivered + 1; + bbr->loss_round_start = 0; + bbr->undo_bw_lo = 0; + bbr->undo_inflight_lo = 0; + bbr->undo_inflight_hi = 0; + bbr->loss_events_in_round = 0; + bbr->startup_ecn_rounds = 0; + bbr2_reset_congestion_signals(sk); + bbr->bw_lo = ~0U; + bbr->bw_hi[0] = 0; + bbr->bw_hi[1] = 0; + bbr->inflight_lo = ~0U; + bbr->inflight_hi = ~0U; + bbr->bw_probe_up_cnt = ~0U; + bbr->bw_probe_up_acks = 0; + bbr->bw_probe_up_rounds = 0; + bbr->probe_wait_us = 0; + bbr->stopped_risky_probe = 0; + bbr->ack_phase = BBR_ACKS_INIT; + bbr->rounds_since_probe = 0; + bbr->bw_probe_samples = 0; + bbr->prev_probe_too_high = 0; + bbr->ecn_eligible = 0; + bbr->ecn_alpha = bbr->params.ecn_alpha_init; + bbr->alpha_last_delivered = 0; + bbr->alpha_last_delivered_ce = 0; + + tp->fast_ack_mode = min_t(u32, 0x2U, bbr_fast_ack_mode); + + if ((tp->ecn_flags & TCP_ECN_OK) && bbr_ecn_enable) + tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; +} + +/* Core TCP stack informs us that the given skb was just marked lost. */ +static void bbr2_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + struct rate_sample rs; + + /* Capture "current" data over the full round trip of loss, + * to have a better chance to see the full capacity of the path. + */ + if (!bbr->loss_in_round) /* first loss in this round trip? */ + bbr->loss_round_delivered = tp->delivered; /* set round trip */ + bbr->loss_in_round = 1; + bbr->loss_in_cycle = 1; + + if (!bbr->bw_probe_samples) + return; /* not an skb sent while probing for bandwidth */ + if (unlikely(!scb->tx.delivered_mstamp)) + return; /* skb was SACKed, reneged, marked lost; ignore it */ + /* We are probing for bandwidth. Construct a rate sample that + * estimates what happened in the flight leading up to this lost skb, + * then see if the loss rate went too high, and if so at which packet. + */ + memset(&rs, 0, sizeof(rs)); + rs.tx_in_flight = scb->tx.in_flight; + rs.lost = tp->lost - scb->tx.lost; + rs.is_app_limited = scb->tx.is_app_limited; + if (bbr2_is_inflight_too_high(sk, &rs)) { + rs.tx_in_flight = bbr2_inflight_hi_from_lost_skb(sk, &rs, skb); + bbr2_handle_inflight_too_high(sk, &rs); + } +} + +/* Revert short-term model if current loss recovery event was spurious. */ +static u32 bbr2_undo_cwnd(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->debug.undo = 1; + bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ + bbr->full_bw_cnt = 0; + bbr->loss_in_round = 0; + + if (!bbr->params.undo) + return tp->snd_cwnd; + + /* Revert to cwnd and other state saved before loss episode. */ + bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); + bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); + bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); + return bbr->prior_cwnd; +} + +/* Entering loss recovery, so save state for when we undo recovery. */ +static u32 bbr2_ssthresh(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr_save_cwnd(sk); + /* For undo, save state that adapts based on loss signal. 
*/ + bbr->undo_bw_lo = bbr->bw_lo; + bbr->undo_inflight_lo = bbr->inflight_lo; + bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; +} + +static enum tcp_bbr2_phase bbr2_get_phase(struct bbr *bbr) +{ + switch (bbr->mode) { + case BBR_STARTUP: + return BBR2_PHASE_STARTUP; + case BBR_DRAIN: + return BBR2_PHASE_DRAIN; + case BBR_PROBE_BW: + break; + case BBR_PROBE_RTT: + return BBR2_PHASE_PROBE_RTT; + default: + return BBR2_PHASE_INVALID; + } + switch (bbr->cycle_idx) { + case BBR_BW_PROBE_UP: + return BBR2_PHASE_PROBE_BW_UP; + case BBR_BW_PROBE_DOWN: + return BBR2_PHASE_PROBE_BW_DOWN; + case BBR_BW_PROBE_CRUISE: + return BBR2_PHASE_PROBE_BW_CRUISE; + case BBR_BW_PROBE_REFILL: + return BBR2_PHASE_PROBE_BW_REFILL; + default: + return BBR2_PHASE_INVALID; + } +} + +static size_t bbr2_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) +{ + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct bbr *bbr = inet_csk_ca(sk); + u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); + u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); + u64 bw_lo = bbr->bw_lo == ~0U ? + ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); + + memset(&info->bbr2, 0, sizeof(info->bbr2)); + info->bbr2.bbr_bw_lsb = (u32)bw; + info->bbr2.bbr_bw_msb = (u32)(bw >> 32); + info->bbr2.bbr_min_rtt = bbr->min_rtt_us; + info->bbr2.bbr_pacing_gain = bbr->pacing_gain; + info->bbr2.bbr_cwnd_gain = bbr->cwnd_gain; + info->bbr2.bbr_bw_hi_lsb = (u32)bw_hi; + info->bbr2.bbr_bw_hi_msb = (u32)(bw_hi >> 32); + info->bbr2.bbr_bw_lo_lsb = (u32)bw_lo; + info->bbr2.bbr_bw_lo_msb = (u32)(bw_lo >> 32); + info->bbr2.bbr_mode = bbr->mode; + info->bbr2.bbr_phase = (__u8)bbr2_get_phase(bbr); + info->bbr2.bbr_version = (__u8)2; + info->bbr2.bbr_inflight_lo = bbr->inflight_lo; + info->bbr2.bbr_inflight_hi = bbr->inflight_hi; + info->bbr2.bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; + return sizeof(info->bbr2); + } + return 0; +} + +static void bbr2_set_state(struct sock *sk, u8 new_state) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { + struct rate_sample rs = { .losses = 1 }; + struct bbr_context ctx = { 0 }; + + bbr->prev_ca_state = TCP_CA_Loss; + bbr->full_bw = 0; + if (!bbr2_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { + /* bbr_adapt_lower_bounds() needs cwnd before + * we suffered an RTO, to update inflight_lo: + */ + bbr->inflight_lo = + max(tp->snd_cwnd, bbr->prior_cwnd); + } + bbr_debug(sk, 0, &rs, &ctx); + } else if (bbr->prev_ca_state == TCP_CA_Loss && + new_state != TCP_CA_Loss) { + tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); + bbr->try_fast_path = 0; /* bound cwnd using latest model */ + } +} + +static struct tcp_congestion_ops tcp_bbr2_cong_ops __read_mostly = { + .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr2", + .owner = THIS_MODULE, + .init = bbr2_init, + .cong_control = bbr2_main, + .sndbuf_expand = bbr_sndbuf_expand, + .skb_marked_lost = bbr2_skb_marked_lost, + .undo_cwnd = bbr2_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr2_ssthresh, + .tso_segs = bbr_tso_segs, + .get_info = bbr2_get_info, + .set_state = bbr2_set_state, +}; + +static int __init bbr_register(void) +{ + BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_bbr2_cong_ops); +} + +static void __exit bbr_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_bbr2_cong_ops); +} + 
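bbr2_get_info() above exports the 64-bit bandwidth estimates as lsb/msb pairs because the diag fields are 32 bits wide. A consumer (an ss-like tool, for example) would reassemble them roughly as below; the helper name and the sample value are invented for illustration:

#include <stdint.h>
#include <stdio.h>

/* Reassemble a 64-bit rate from the 32-bit lsb/msb pair in tcp_bbr2_info. */
static uint64_t reassemble_bw(uint32_t lsb, uint32_t msb)
{
	return ((uint64_t)msb << 32) | lsb;
}

int main(void)
{
	/* e.g. a ~5 GB/s estimate does not fit in 32 bits. */
	uint64_t bw = 5ull * 1000 * 1000 * 1000;
	uint32_t lsb = (uint32_t)bw, msb = (uint32_t)(bw >> 32);

	printf("reassembled: %llu B/s\n",
	       (unsigned long long)reassemble_bw(lsb, msb));
	return 0;
}
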
+module_init(bbr_register); +module_exit(bbr_unregister); + +MODULE_AUTHOR("Van Jacobson "); +MODULE_AUTHOR("Neal Cardwell "); +MODULE_AUTHOR("Yuchung Cheng "); +MODULE_AUTHOR("Soheil Hassas Yeganeh "); +MODULE_AUTHOR("Priyaranjan Jha "); +MODULE_AUTHOR("Yousuk Seung "); +MODULE_AUTHOR("Kevin Yang "); +MODULE_AUTHOR("Arjun Roy "); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index d3cae40749e8..0f268f2ff2e9 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -189,6 +189,7 @@ void tcp_init_congestion_control(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); tcp_sk(sk)->prior_ssthresh = 0; + tcp_sk(sk)->fast_ack_mode = 0; if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cc072d2cfcd8..754e0212c951 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -349,7 +349,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) tcp_enter_quickack_mode(sk, 2); break; case INET_ECN_CE: - if (tcp_ca_needs_ecn(sk)) + if (tcp_ca_wants_ce_events(sk)) tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { @@ -360,7 +360,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) tp->ecn_flags |= TCP_ECN_SEEN; break; default: - if (tcp_ca_needs_ecn(sk)) + if (tcp_ca_wants_ce_events(sk)) tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); tp->ecn_flags |= TCP_ECN_SEEN; break; @@ -1079,7 +1079,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) */ static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) { + struct sock *sk = (struct sock *)tp; + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + tp->lost += tcp_skb_pcount(skb); + if (ca_ops->skb_marked_lost) + ca_ops->skb_marked_lost(sk, skb); } void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) @@ -1460,6 +1465,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); tcp_skb_pcount_add(skb, -pcount); + /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ + if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, + "prev in_flight: %u skb in_flight: %u pcount: %u", + TCP_SKB_CB(prev)->tx.in_flight, + TCP_SKB_CB(skb)->tx.in_flight, + pcount)) + TCP_SKB_CB(skb)->tx.in_flight = 0; + else + TCP_SKB_CB(skb)->tx.in_flight -= pcount; + TCP_SKB_CB(prev)->tx.in_flight += pcount; + /* When we're adding to gso_segs == 1, gso_size will be zero, * in theory this shouldn't be necessary but as long as DSACK * code can come after this skb later on it's better to keep @@ -3813,6 +3829,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; rs.prior_in_flight = tcp_packets_in_flight(tp); + tcp_rate_check_app_limited(sk); /* ts_recent update must be made after we are sure that the packet * is in window. 
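The tcp_shifted_skb hunk above keeps per-skb tx.in_flight consistent when SACK processing shifts pcount segments from one skb to its predecessor: the shifted segments' share of the in-flight snapshot moves with them. A toy illustration of that invariant; the struct and values are invented, not kernel code:

#include <stdio.h>

struct skb_tx {
	unsigned int pcount;	/* segments carried by this skb         */
	unsigned int in_flight;	/* packets in flight when it was sent   */
};

/* Move @n segments from @skb to @prev, keeping tx.in_flight consistent. */
static void shift_segments(struct skb_tx *prev, struct skb_tx *skb, unsigned int n)
{
	prev->pcount += n;
	skb->pcount -= n;
	skb->in_flight -= n;
	prev->in_flight += n;
}

int main(void)
{
	struct skb_tx prev = { .pcount = 2, .in_flight = 10 };
	struct skb_tx skb  = { .pcount = 3, .in_flight = 12 };

	shift_segments(&prev, &skb, 2);
	printf("prev: %u/%u  skb: %u/%u\n",
	       prev.pcount, prev.in_flight, skb.pcount, skb.in_flight);
	return 0;
}
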
@@ -3911,6 +3928,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tcp_newly_delivered(sk, delivered, flag); lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); + rs.is_ece = !!(flag & FLAG_ECE); tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); @@ -5521,13 +5539,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && + (tp->fast_ack_mode == 1 || /* ... and right edge of window advances far enough. * (tcp_recvmsg() will send ACK otherwise). * If application uses SO_RCVLOWAT, we want send ack now if * we have not received enough bytes to satisfy the condition. */ - (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || - __tcp_select_window(sk) >= tp->rcv_wnd)) || + (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || + __tcp_select_window(sk) >= tp->rcv_wnd))) || /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ba839e441450..5ffec885e66f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -375,7 +375,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, th->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; } - } else if (!tcp_ca_needs_ecn(sk)) { + } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && + !tcp_ca_needs_ecn(sk)) { /* ACK or retransmitted segment: clear ECT|CE */ INET_ECN_dontxmit(sk); } @@ -1530,7 +1531,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; - int nsize, old_factor; + int nsize, old_factor, inflight_prev; long limit; int nlen; u8 flags; @@ -1607,6 +1608,15 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, if (diff) tcp_adjust_pcount(sk, skb, diff); + + /* Set buff tx.in_flight as if buff were sent by itself. */ + inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; + if (WARN_ONCE(inflight_prev < 0, + "inconsistent: tx.in_flight: %u old_factor: %d", + TCP_SKB_CB(skb)->tx.in_flight, old_factor)) + inflight_prev = 0; + TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + + tcp_skb_pcount(buff); } /* Link BUFF into the send queue. */ @@ -1990,13 +2000,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; - u32 min_tso, tso_segs; - - min_tso = ca_ops->min_tso_segs ? - ca_ops->min_tso_segs(sk) : - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + u32 tso_segs; - tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); + tso_segs = ca_ops->tso_segs ? 
+ ca_ops->tso_segs(sk, mss_now) : + tcp_tso_autosize(sk, mss_now, + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs)); return min_t(u32, tso_segs, sk->sk_gso_max_segs); } @@ -2632,6 +2641,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); + tcp_set_tx_in_flight(sk, skb); goto repair; /* Skip network transmission */ } diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index a8f6d9d06f2e..a8b4c9504570 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -34,6 +34,24 @@ * ready to send in the write queue. */ +void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 in_flight; + + /* Check, sanitize, and record packets in flight after skb was sent. */ + in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); + if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, + "insane in_flight %u cc %s mss %u " + "cwnd %u pif %u %u %u %u\n", + in_flight, inet_csk(sk)->icsk_ca_ops->name, + tp->mss_cache, tp->snd_cwnd, + tp->packets_out, tp->retrans_out, + tp->sacked_out, tp->lost_out)) + in_flight = TCPCB_IN_FLIGHT_MAX; + TCP_SKB_CB(skb)->tx.in_flight = in_flight; +} + /* Snapshot the current delivery information in the skb, to generate * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). */ @@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; TCP_SKB_CB(skb)->tx.delivered = tp->delivered; TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; + TCP_SKB_CB(skb)->tx.lost = tp->lost; TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; + tcp_set_tx_in_flight(sk, skb); } /* When an skb is sacked or acked, we fill in the rate sample with the (prior) @@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, if (!rs->prior_delivered || tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, scb->end_seq, rs->last_end_seq)) { + rs->prior_lost = scb->tx.lost; rs->prior_delivered_ce = scb->tx.delivered_ce; rs->prior_delivered = scb->tx.delivered; rs->prior_mstamp = scb->tx.delivered_mstamp; rs->is_app_limited = scb->tx.is_app_limited; rs->is_retrans = scb->sacked & TCPCB_RETRANS; rs->last_end_seq = scb->end_seq; + rs->tx_in_flight = scb->tx.in_flight; /* Record send time of most recently ACKed packet: */ tp->first_tx_mstamp = tx_tstamp; /* Find the duration of the "send phase" of this window: */ - rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, - scb->tx.first_tx_mstamp); + rs->interval_us = tcp_stamp32_us_delta( + tp->first_tx_mstamp, + scb->tx.first_tx_mstamp); } /* Mark off the skb delivered once it's sacked to avoid being @@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, return; } rs->delivered = tp->delivered - rs->prior_delivered; + rs->lost = tp->lost - rs->prior_lost; rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; /* delivered_ce occupies less than 32 bits in the skb control block */ @@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, * longer phase. 
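Since the min_tso_segs hook is replaced above by the more general tso_segs hook, here is a brief hypothetical sketch of what a congestion-control module could do with it; the demo_ name and the "about 1 ms of data per burst" policy are assumptions for illustration, not the behavior of any in-tree module, and the caller still clamps the result to sk->sk_gso_max_segs:

static u32 demo_tso_segs(struct sock *sk, unsigned int mss_now)
{
	unsigned long bytes;

	/* Aim for roughly 1/1024 s worth of payload per TSO/GSO burst
	 * at the current pacing rate, but never go below 2 segments.
	 */
	bytes = READ_ONCE(sk->sk_pacing_rate) >> 10;
	return max_t(u32, bytes / mss_now, 2);
}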
*/ snd_us = rs->interval_us; /* send phase */ - ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, + ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, rs->prior_mstamp); /* ack phase */ rs->interval_us = max(snd_us, ack_us); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index cb79127f45c3..70e4de876a7f 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -605,6 +605,7 @@ void tcp_write_timer_handler(struct sock *sk) return; } + tcp_rate_check_app_limited(sk); tcp_mstamp_refresh(tcp_sk(sk)); event = icsk->icsk_pending; -- 2.40.0 From 5610d33c45f6785da3a9b856d1fee11fb06c78e8 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Thu, 13 Apr 2023 18:13:28 +0200 Subject: [PATCH 02/15] bfq Signed-off-by: Peter Jung --- block/bfq-cgroup.c | 101 ++++--- block/bfq-iosched.c | 637 ++++++++++++++++++++++++++++-------------- block/bfq-iosched.h | 144 ++++++++-- block/bfq-wf2q.c | 2 +- block/blk-cgroup.c | 122 ++++---- block/blk-cgroup.h | 10 +- block/blk-iocost.c | 58 ++-- block/blk-iolatency.c | 39 ++- block/blk-rq-qos.h | 2 +- block/blk-throttle.c | 16 +- block/blk.h | 6 - 11 files changed, 747 insertions(+), 390 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 0fbde0fc0628..59929dfd559b 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -706,12 +706,52 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_activate_bfqq(bfqd, bfqq); } - if (!bfqd->in_service_queue && !bfqd->rq_in_driver) + if (!bfqd->in_service_queue && !bfqd->tot_rq_in_driver) bfq_schedule_dispatch(bfqd); /* release extra ref taken above, bfqq may happen to be freed now */ bfq_put_queue(bfqq); } +static void bfq_sync_bfqq_move(struct bfq_data *bfqd, + struct bfq_queue *sync_bfqq, + struct bfq_io_cq *bic, + struct bfq_group *bfqg, + unsigned int act_idx) +{ + struct bfq_queue *bfqq; + + if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { + /* We are the only user of this bfqq, just move it */ + if (sync_bfqq->entity.sched_data != &bfqg->sched_data) + bfq_bfqq_move(bfqd, sync_bfqq, bfqg); + return; + } + + /* + * The queue was merged to a different queue. Check + * that the merge chain still belongs to the same + * cgroup. + */ + for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) + if (bfqq->entity.sched_data != &bfqg->sched_data) + break; + if (bfqq) { + /* + * Some queue changed cgroup so the merge is not valid + * anymore. We cannot easily just cancel the merge (by + * clearing new_bfqq) as there may be other processes + * using this queue and holding refs to all queues + * below sync_bfqq->new_bfqq. Similarly if the merge + * already happened, we need to detach from bfqq now + * so that we cannot merge bio to a request from the + * old cgroup. + */ + bfq_put_cooperator(sync_bfqq); + bic_set_bfqq(bic, NULL, true, act_idx); + bfq_release_process_ref(bfqd, sync_bfqq); + } +} + /** * __bfq_bic_change_cgroup - move @bic to @bfqg. * @bfqd: the queue descriptor. 
@@ -726,53 +766,20 @@ static void __bfq_bic_change_cgroup(struct bfq_data *bfqd, struct bfq_io_cq *bic, struct bfq_group *bfqg) { - struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false); - struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true); - struct bfq_entity *entity; + unsigned int act_idx; - if (async_bfqq) { - entity = &async_bfqq->entity; + for (act_idx = 0; act_idx < bfqd->num_actuators; act_idx++) { + struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false, act_idx); + struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true, act_idx); - if (entity->sched_data != &bfqg->sched_data) { - bic_set_bfqq(bic, NULL, false); + if (async_bfqq && + async_bfqq->entity.sched_data != &bfqg->sched_data) { + bic_set_bfqq(bic, NULL, false, act_idx); bfq_release_process_ref(bfqd, async_bfqq); } - } - if (sync_bfqq) { - if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { - /* We are the only user of this bfqq, just move it */ - if (sync_bfqq->entity.sched_data != &bfqg->sched_data) - bfq_bfqq_move(bfqd, sync_bfqq, bfqg); - } else { - struct bfq_queue *bfqq; - - /* - * The queue was merged to a different queue. Check - * that the merge chain still belongs to the same - * cgroup. - */ - for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) - if (bfqq->entity.sched_data != - &bfqg->sched_data) - break; - if (bfqq) { - /* - * Some queue changed cgroup so the merge is - * not valid anymore. We cannot easily just - * cancel the merge (by clearing new_bfqq) as - * there may be other processes using this - * queue and holding refs to all queues below - * sync_bfqq->new_bfqq. Similarly if the merge - * already happened, we need to detach from - * bfqq now so that we cannot merge bio to a - * request from the old cgroup. - */ - bfq_put_cooperator(sync_bfqq); - bic_set_bfqq(bic, NULL, true); - bfq_release_process_ref(bfqd, sync_bfqq); - } - } + if (sync_bfqq) + bfq_sync_bfqq_move(bfqd, sync_bfqq, bic, bfqg, act_idx); } } @@ -1106,9 +1113,11 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of, struct bfq_group *bfqg; u64 v; - ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, buf, &ctx); + blkg_conf_init(&ctx, buf); + + ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, &ctx); if (ret) - return ret; + goto out; if (sscanf(ctx.body, "%llu", &v) == 1) { /* require "default" on dfl */ @@ -1130,7 +1139,7 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of, ret = 0; } out: - blkg_conf_finish(&ctx); + blkg_conf_exit(&ctx); return ret ?: nbytes; } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 380e9bda2e57..aa644973d260 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -377,20 +377,23 @@ static const unsigned long bfq_late_stable_merging = 600; #define RQ_BIC(rq) ((struct bfq_io_cq *)((rq)->elv.priv[0])) #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) -struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) +struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync, + unsigned int actuator_idx) { - return bic->bfqq[is_sync]; + if (is_sync) + return bic->bfqq[1][actuator_idx]; + + return bic->bfqq[0][actuator_idx]; } static void bfq_put_stable_ref(struct bfq_queue *bfqq); -void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) +void bic_set_bfqq(struct bfq_io_cq *bic, + struct bfq_queue *bfqq, + bool is_sync, + unsigned int actuator_idx) { - struct bfq_queue *old_bfqq = bic->bfqq[is_sync]; - - /* Clear bic pointer if bfqq is detached from this bic */ - if (old_bfqq && old_bfqq->bic == bic) - old_bfqq->bic = NULL; + struct bfq_queue 
*old_bfqq = bic->bfqq[is_sync][actuator_idx]; /* * If bfqq != NULL, then a non-stable queue merge between @@ -405,9 +408,18 @@ void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) * we cancel the stable merge if * bic->stable_merge_bfqq == bfqq. */ - bic->bfqq[is_sync] = bfqq; + struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[actuator_idx]; + + /* Clear bic pointer if bfqq is detached from this bic */ + if (old_bfqq && old_bfqq->bic == bic) + old_bfqq->bic = NULL; - if (bfqq && bic->stable_merge_bfqq == bfqq) { + if (is_sync) + bic->bfqq[1][actuator_idx] = bfqq; + else + bic->bfqq[0][actuator_idx] = bfqq; + + if (bfqq && bfqq_data->stable_merge_bfqq == bfqq) { /* * Actually, these same instructions are executed also * in bfq_setup_cooperator, in case of abort or actual @@ -416,9 +428,9 @@ void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) * did so, we would nest even more complexity in this * function. */ - bfq_put_stable_ref(bic->stable_merge_bfqq); + bfq_put_stable_ref(bfqq_data->stable_merge_bfqq); - bic->stable_merge_bfqq = NULL; + bfqq_data->stable_merge_bfqq = NULL; } } @@ -678,9 +690,9 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { struct bfq_data *bfqd = data->q->elevator->elevator_data; struct bfq_io_cq *bic = bfq_bic_lookup(data->q); - struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(opf)) : NULL; int depth; unsigned limit = data->q->nr_requests; + unsigned int act_idx; /* Sync reads have full depth available */ if (op_is_sync(opf) && !op_is_write(opf)) { @@ -690,14 +702,21 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) limit = (limit * depth) >> bfqd->full_depth_shift; } - /* - * Does queue (or any parent entity) exceed number of requests that - * should be available to it? Heavily limit depth so that it cannot - * consume more available requests and thus starve other entities. - */ - if (bfqq && bfqq_request_over_limit(bfqq, limit)) - depth = 1; + for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) { + struct bfq_queue *bfqq = + bic_to_bfqq(bic, op_is_sync(opf), act_idx); + /* + * Does queue (or any parent entity) exceed number of + * requests that should be available to it? Heavily + * limit depth so that it cannot consume more + * available requests and thus starve other entities. 
+ */ + if (bfqq && bfqq_request_over_limit(bfqq, limit)) { + depth = 1; + break; + } + } bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", __func__, bfqd->wr_busy_queues, op_is_sync(opf), depth); if (depth) @@ -1074,9 +1093,6 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) { u64 dur; - if (bfqd->bfq_wr_max_time > 0) - return bfqd->bfq_wr_max_time; - dur = bfqd->rate_dur_prod; do_div(dur, bfqd->peak_rate); @@ -1118,36 +1134,39 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, { unsigned int old_wr_coeff = 1; bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); + unsigned int a_idx = bfqq->actuator_idx; + struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; - if (bic->saved_has_short_ttime) + if (bfqq_data->saved_has_short_ttime) bfq_mark_bfqq_has_short_ttime(bfqq); else bfq_clear_bfqq_has_short_ttime(bfqq); - if (bic->saved_IO_bound) + if (bfqq_data->saved_IO_bound) bfq_mark_bfqq_IO_bound(bfqq); else bfq_clear_bfqq_IO_bound(bfqq); - bfqq->last_serv_time_ns = bic->saved_last_serv_time_ns; - bfqq->inject_limit = bic->saved_inject_limit; - bfqq->decrease_time_jif = bic->saved_decrease_time_jif; + bfqq->last_serv_time_ns = bfqq_data->saved_last_serv_time_ns; + bfqq->inject_limit = bfqq_data->saved_inject_limit; + bfqq->decrease_time_jif = bfqq_data->saved_decrease_time_jif; - bfqq->entity.new_weight = bic->saved_weight; - bfqq->ttime = bic->saved_ttime; - bfqq->io_start_time = bic->saved_io_start_time; - bfqq->tot_idle_time = bic->saved_tot_idle_time; + bfqq->entity.new_weight = bfqq_data->saved_weight; + bfqq->ttime = bfqq_data->saved_ttime; + bfqq->io_start_time = bfqq_data->saved_io_start_time; + bfqq->tot_idle_time = bfqq_data->saved_tot_idle_time; /* * Restore weight coefficient only if low_latency is on */ if (bfqd->low_latency) { old_wr_coeff = bfqq->wr_coeff; - bfqq->wr_coeff = bic->saved_wr_coeff; + bfqq->wr_coeff = bfqq_data->saved_wr_coeff; } - bfqq->service_from_wr = bic->saved_service_from_wr; - bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; - bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; - bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; + bfqq->service_from_wr = bfqq_data->saved_service_from_wr; + bfqq->wr_start_at_switch_to_srt = + bfqq_data->saved_wr_start_at_switch_to_srt; + bfqq->last_wr_start_finish = bfqq_data->saved_last_wr_start_finish; + bfqq->wr_cur_max_time = bfqq_data->saved_wr_cur_max_time; if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || time_is_before_jiffies(bfqq->last_wr_start_finish + @@ -1766,6 +1785,33 @@ static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq, return bfqq_weight > in_serv_weight; } +/* + * Get the index of the actuator that will serve bio. 
+ */ +static unsigned int bfq_actuator_index(struct bfq_data *bfqd, struct bio *bio) +{ + unsigned int i; + sector_t end; + + /* no search needed if one or zero ranges present */ + if (bfqd->num_actuators == 1) + return 0; + + /* bio_end_sector(bio) gives the sector after the last one */ + end = bio_end_sector(bio) - 1; + + for (i = 0; i < bfqd->num_actuators; i++) { + if (end >= bfqd->sector[i] && + end < bfqd->sector[i] + bfqd->nr_sectors[i]) + return i; + } + + WARN_ONCE(true, + "bfq_actuator_index: bio sector out of ranges: end=%llu\n", + end); + return 0; +} + static bool bfq_better_to_idle(struct bfq_queue *bfqq); static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, @@ -1785,7 +1831,9 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, arrived_in_time = ktime_get_ns() <= bfqq->ttime.last_end_request + bfqd->bfq_slice_idle * 3; - + unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio); + bool bfqq_non_merged_or_stably_merged = + bfqq->bic || RQ_BIC(rq)->bfqq_data[act_idx].stably_merged; /* * bfqq deserves to be weight-raised if: @@ -1819,9 +1867,8 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, */ wr_or_deserves_wr = bfqd->low_latency && (bfqq->wr_coeff > 1 || - (bfq_bfqq_sync(bfqq) && - (bfqq->bic || RQ_BIC(rq)->stably_merged) && - (*interactive || soft_rt))); + (bfq_bfqq_sync(bfqq) && bfqq_non_merged_or_stably_merged && + (*interactive || soft_rt))); /* * Using the last flag, update budget and check whether bfqq @@ -2098,7 +2145,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, * We reset waker detection logic also if too much time has passed * since the first detection. If wakeups are rare, pointless idling * doesn't hurt throughput that much. The condition below makes sure - * we do not uselessly idle blocking waker in more than 1/64 cases. + * we do not uselessly idle blocking waker in more than 1/64 cases. */ if (bfqd->last_completed_rq_bfqq != bfqq->tentative_waker_bfqq || @@ -2209,9 +2256,9 @@ static void bfq_add_request(struct request *rq) * elapsed. */ if (bfqq == bfqd->in_service_queue && - (bfqd->rq_in_driver == 0 || + (bfqd->tot_rq_in_driver == 0 || (bfqq->last_serv_time_ns > 0 && - bfqd->rqs_injected && bfqd->rq_in_driver > 0)) && + bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) && time_is_before_eq_jiffies(bfqq->decrease_time_jif + msecs_to_jiffies(10))) { bfqd->last_empty_occupied_ns = ktime_get_ns(); @@ -2235,7 +2282,7 @@ static void bfq_add_request(struct request *rq) * will be set in case injection is performed * on bfqq before rq is completed). 
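As a concrete (hypothetical) illustration of bfq_actuator_index(): on a dual-actuator drive split at the halfway LBA, bfqd->sector[] would hold {0, N/2} and bfqd->nr_sectors[] would hold {N/2, N/2}; a bio whose last sector lies in the upper half matches the range test only for i = 1, so its requests are steered to the bfq_queues of the second actuator, while a bio that somehow ends outside every range trips the WARN_ONCE and is conservatively mapped to actuator 0.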
*/ - if (bfqd->rq_in_driver == 0) + if (bfqd->tot_rq_in_driver == 0) bfqd->rqs_injected = false; } } @@ -2418,7 +2465,8 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, */ bfq_bic_update_cgroup(bic, bio); - bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); + bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf), + bfq_actuator_index(bfqd, bio)); } else { bfqd->bio_bfqq = NULL; } @@ -2584,24 +2632,29 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) { - int i, j; + int i, j, k; - for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_NR_LEVELS; j++) - if (bfqg->async_bfqq[i][j]) - bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); - if (bfqg->async_idle_bfqq) - bfq_bfqq_end_wr(bfqg->async_idle_bfqq); + for (k = 0; k < bfqd->num_actuators; k++) { + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_NR_LEVELS; j++) + if (bfqg->async_bfqq[i][j][k]) + bfq_bfqq_end_wr(bfqg->async_bfqq[i][j][k]); + if (bfqg->async_idle_bfqq[k]) + bfq_bfqq_end_wr(bfqg->async_idle_bfqq[k]); + } } static void bfq_end_wr(struct bfq_data *bfqd) { struct bfq_queue *bfqq; + int i; spin_lock_irq(&bfqd->lock); - list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) - bfq_bfqq_end_wr(bfqq); + for (i = 0; i < bfqd->num_actuators; i++) { + list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list) + bfq_bfqq_end_wr(bfqq); + } list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) bfq_bfqq_end_wr(bfqq); bfq_end_wr_async(bfqd); @@ -2794,6 +2847,40 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, struct bfq_queue *bfqq); +static struct bfq_queue * +bfq_setup_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_queue *stable_merge_bfqq, + struct bfq_iocq_bfqq_data *bfqq_data) +{ + int proc_ref = min(bfqq_process_refs(bfqq), + bfqq_process_refs(stable_merge_bfqq)); + struct bfq_queue *new_bfqq = NULL; + + bfqq_data->stable_merge_bfqq = NULL; + if (idling_boosts_thr_without_issues(bfqd, bfqq) || proc_ref == 0) + goto out; + + /* next function will take at least one ref */ + new_bfqq = bfq_setup_merge(bfqq, stable_merge_bfqq); + + if (new_bfqq) { + bfqq_data->stably_merged = true; + if (new_bfqq->bic) { + unsigned int new_a_idx = new_bfqq->actuator_idx; + struct bfq_iocq_bfqq_data *new_bfqq_data = + &new_bfqq->bic->bfqq_data[new_a_idx]; + + new_bfqq_data->stably_merged = true; + } + } + +out: + /* deschedule stable merge, because done or aborted here */ + bfq_put_stable_ref(stable_merge_bfqq); + + return new_bfqq; +} + /* * Attempt to schedule a merge of bfqq with the currently in-service * queue or with a close queue among the scheduled queues. 
Return @@ -2819,6 +2906,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, void *io_struct, bool request, struct bfq_io_cq *bic) { struct bfq_queue *in_service_bfqq, *new_bfqq; + unsigned int a_idx = bfqq->actuator_idx; + struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; /* if a merge has already been setup, then proceed with that first */ if (bfqq->new_bfqq) @@ -2840,37 +2929,18 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, * stable merging) also if bic is associated with a * sync queue, but this bfqq is async */ - if (bfq_bfqq_sync(bfqq) && bic->stable_merge_bfqq && + if (bfq_bfqq_sync(bfqq) && bfqq_data->stable_merge_bfqq && !bfq_bfqq_just_created(bfqq) && time_is_before_jiffies(bfqq->split_time + msecs_to_jiffies(bfq_late_stable_merging)) && time_is_before_jiffies(bfqq->creation_time + msecs_to_jiffies(bfq_late_stable_merging))) { struct bfq_queue *stable_merge_bfqq = - bic->stable_merge_bfqq; - int proc_ref = min(bfqq_process_refs(bfqq), - bfqq_process_refs(stable_merge_bfqq)); - - /* deschedule stable merge, because done or aborted here */ - bfq_put_stable_ref(stable_merge_bfqq); - - bic->stable_merge_bfqq = NULL; - - if (!idling_boosts_thr_without_issues(bfqd, bfqq) && - proc_ref > 0) { - /* next function will take at least one ref */ - struct bfq_queue *new_bfqq = - bfq_setup_merge(bfqq, stable_merge_bfqq); - - if (new_bfqq) { - bic->stably_merged = true; - if (new_bfqq->bic) - new_bfqq->bic->stably_merged = - true; - } - return new_bfqq; - } else - return NULL; + bfqq_data->stable_merge_bfqq; + + return bfq_setup_stable_merge(bfqd, bfqq, + stable_merge_bfqq, + bfqq_data); } } @@ -2965,6 +3035,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, static void bfq_bfqq_save_state(struct bfq_queue *bfqq) { struct bfq_io_cq *bic = bfqq->bic; + unsigned int a_idx = bfqq->actuator_idx; + struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; /* * If !bfqq->bic, the queue is already shared or its requests @@ -2974,18 +3046,21 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) if (!bic) return; - bic->saved_last_serv_time_ns = bfqq->last_serv_time_ns; - bic->saved_inject_limit = bfqq->inject_limit; - bic->saved_decrease_time_jif = bfqq->decrease_time_jif; - - bic->saved_weight = bfqq->entity.orig_weight; - bic->saved_ttime = bfqq->ttime; - bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); - bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bic->saved_io_start_time = bfqq->io_start_time; - bic->saved_tot_idle_time = bfqq->tot_idle_time; - bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); + bfqq_data->saved_last_serv_time_ns = bfqq->last_serv_time_ns; + bfqq_data->saved_inject_limit = bfqq->inject_limit; + bfqq_data->saved_decrease_time_jif = bfqq->decrease_time_jif; + + bfqq_data->saved_weight = bfqq->entity.orig_weight; + bfqq_data->saved_ttime = bfqq->ttime; + bfqq_data->saved_has_short_ttime = + bfq_bfqq_has_short_ttime(bfqq); + bfqq_data->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bfqq_data->saved_io_start_time = bfqq->io_start_time; + bfqq_data->saved_tot_idle_time = bfqq->tot_idle_time; + bfqq_data->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); + bfqq_data->was_in_burst_list = + !hlist_unhashed(&bfqq->burst_list_node); + if (unlikely(bfq_bfqq_just_created(bfqq) && !bfq_bfqq_in_large_burst(bfqq) && bfqq->bfqd->low_latency)) { @@ -2998,17 +3073,21 @@ static void bfq_bfqq_save_state(struct bfq_queue 
*bfqq) * to bfqq, so that to avoid that bfqq unjustly fails * to enjoy weight raising if split soon. */ - bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; - bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now(); - bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); - bic->saved_last_wr_start_finish = jiffies; + bfqq_data->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; + bfqq_data->saved_wr_start_at_switch_to_srt = + bfq_smallest_from_now(); + bfqq_data->saved_wr_cur_max_time = + bfq_wr_duration(bfqq->bfqd); + bfqq_data->saved_last_wr_start_finish = jiffies; } else { - bic->saved_wr_coeff = bfqq->wr_coeff; - bic->saved_wr_start_at_switch_to_srt = + bfqq_data->saved_wr_coeff = bfqq->wr_coeff; + bfqq_data->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; - bic->saved_service_from_wr = bfqq->service_from_wr; - bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; - bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + bfqq_data->saved_service_from_wr = + bfqq->service_from_wr; + bfqq_data->saved_last_wr_start_finish = + bfqq->last_wr_start_finish; + bfqq_data->saved_wr_cur_max_time = bfqq->wr_cur_max_time; } } @@ -3114,7 +3193,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, /* * Merge queues (that is, let bic redirect its requests to new_bfqq) */ - bic_set_bfqq(bic, new_bfqq, true); + bic_set_bfqq(bic, new_bfqq, true, bfqq->actuator_idx); bfq_mark_bfqq_coop(new_bfqq); /* * new_bfqq now belongs to at least two bics (it is a shared queue): @@ -3532,13 +3611,13 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) * - start a new observation interval with this dispatch */ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && - bfqd->rq_in_driver == 0) + bfqd->tot_rq_in_driver == 0) goto update_rate_and_reset; /* Update sampling information */ bfqd->peak_rate_samples++; - if ((bfqd->rq_in_driver > 0 || + if ((bfqd->tot_rq_in_driver > 0 || now_ns - bfqd->last_completion < BFQ_MIN_TT) && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq)) bfqd->sequential_samples++; @@ -3803,10 +3882,8 @@ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, return false; return (bfqq->wr_coeff > 1 && - (bfqd->wr_busy_queues < - tot_busy_queues || - bfqd->rq_in_driver >= - bfqq->dispatched + 4)) || + (bfqd->wr_busy_queues < tot_busy_queues || + bfqd->tot_rq_in_driver >= bfqq->dispatched + 4)) || bfq_asymmetric_scenario(bfqd, bfqq) || tot_busy_queues == 1; } @@ -4072,8 +4149,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, * function to evaluate the I/O speed of a process. */ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool compensate, enum bfqq_expiration reason, - unsigned long *delta_ms) + bool compensate, unsigned long *delta_ms) { ktime_t delta_ktime; u32 delta_usecs; @@ -4269,7 +4345,7 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, /* * Check whether the process is slow (see bfq_bfqq_is_slow). 
*/ - slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); + slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, &delta); /* * As above explained, charge slow (typically seeky) and @@ -4577,6 +4653,8 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) { struct bfq_queue *bfqq, *in_serv_bfqq = bfqd->in_service_queue; unsigned int limit = in_serv_bfqq->inject_limit; + int i; + /* * If * - bfqq is not weight-raised and therefore does not carry @@ -4608,7 +4686,7 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) ) limit = 1; - if (bfqd->rq_in_driver >= limit) + if (bfqd->tot_rq_in_driver >= limit) return NULL; /* @@ -4623,11 +4701,12 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) * (and re-added only if it gets new requests, but then it * is assigned again enough budget for its new backlog). */ - list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) - if (!RB_EMPTY_ROOT(&bfqq->sort_list) && - (in_serv_always_inject || bfqq->wr_coeff > 1) && - bfq_serv_to_charge(bfqq->next_rq, bfqq) <= - bfq_bfqq_budget_left(bfqq)) { + for (i = 0; i < bfqd->num_actuators; i++) { + list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list) + if (!RB_EMPTY_ROOT(&bfqq->sort_list) && + (in_serv_always_inject || bfqq->wr_coeff > 1) && + bfq_serv_to_charge(bfqq->next_rq, bfqq) <= + bfq_bfqq_budget_left(bfqq)) { /* * Allow for only one large in-flight request * on non-rotational devices, for the @@ -4647,27 +4726,80 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) */ if (blk_queue_nonrot(bfqd->queue) && blk_rq_sectors(bfqq->next_rq) >= - BFQQ_SECT_THR_NONROT) - limit = min_t(unsigned int, 1, limit); - else - limit = in_serv_bfqq->inject_limit; - - if (bfqd->rq_in_driver < limit) { + BFQQ_SECT_THR_NONROT && + bfqd->tot_rq_in_driver >= 1) + continue; + else { bfqd->rqs_injected = true; return bfqq; } } + } return NULL; } +static struct bfq_queue * +bfq_find_active_bfqq_for_actuator(struct bfq_data *bfqd, int idx) +{ + struct bfq_queue *bfqq; + + if (bfqd->in_service_queue && + bfqd->in_service_queue->actuator_idx == idx) + return bfqd->in_service_queue; + + list_for_each_entry(bfqq, &bfqd->active_list[idx], bfqq_list) { + if (!RB_EMPTY_ROOT(&bfqq->sort_list) && + bfq_serv_to_charge(bfqq->next_rq, bfqq) <= + bfq_bfqq_budget_left(bfqq)) { + return bfqq; + } + } + + return NULL; +} + +/* + * Perform a linear scan of each actuator, until an actuator is found + * for which the following three conditions hold: the load of the + * actuator is below the threshold (see comments on + * actuator_load_threshold for details) and lower than that of the + * next actuator (comments on this extra condition below), and there + * is a queue that contains I/O for that actuator. On success, return + * that queue. + * + * Performing a plain linear scan entails a prioritization among + * actuators. The extra condition above breaks this prioritization and + * tends to distribute injection uniformly across actuators. + */ +static struct bfq_queue * +bfq_find_bfqq_for_underused_actuator(struct bfq_data *bfqd) +{ + int i; + + for (i = 0 ; i < bfqd->num_actuators; i++) { + if (bfqd->rq_in_driver[i] < bfqd->actuator_load_threshold && + (i == bfqd->num_actuators - 1 || + bfqd->rq_in_driver[i] < bfqd->rq_in_driver[i+1])) { + struct bfq_queue *bfqq = + bfq_find_active_bfqq_for_actuator(bfqd, i); + + if (bfqq) + return bfqq; + } + } + + return NULL; +} + + /* * Select a queue for service. If we have a current queue in service, * check whether to continue servicing it, or retrieve and set a new one. 
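A worked example of the scan above, with invented numbers: on a two-actuator drive with actuator_load_threshold = 4 and rq_in_driver = {2, 1}, actuator 0 is below the threshold but fails the extra condition (2 is not lower than 1), so the loop moves on and injection is attempted from a queue of actuator 1, the less loaded one; with rq_in_driver = {5, 1} only actuator 1 qualifies at all. In either case bfq_find_active_bfqq_for_actuator() returns a queue to inject from only if that queue has enough budget left for its next request.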
*/ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) { - struct bfq_queue *bfqq; + struct bfq_queue *bfqq, *inject_bfqq; struct request *next_rq; enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT; @@ -4689,6 +4821,15 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) goto expire; check_queue: + /* + * If some actuator is underutilized, but the in-service + * queue does not contain I/O for that actuator, then try to + * inject I/O for that actuator. + */ + inject_bfqq = bfq_find_bfqq_for_underused_actuator(bfqd); + if (inject_bfqq && inject_bfqq != bfqq) + return inject_bfqq; + /* * This loop is rarely executed more than once. Even when it * happens, it is much more convenient to re-execute this loop @@ -4748,11 +4889,8 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) */ if (bfq_bfqq_wait_request(bfqq) || (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { - struct bfq_queue *async_bfqq = - bfqq->bic && bfqq->bic->bfqq[0] && - bfq_bfqq_busy(bfqq->bic->bfqq[0]) && - bfqq->bic->bfqq[0]->next_rq ? - bfqq->bic->bfqq[0] : NULL; + unsigned int act_idx = bfqq->actuator_idx; + struct bfq_queue *async_bfqq = NULL; struct bfq_queue *blocked_bfqq = !hlist_empty(&bfqq->woken_list) ? container_of(bfqq->woken_list.first, @@ -4760,6 +4898,10 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) woken_list_node) : NULL; + if (bfqq->bic && bfqq->bic->bfqq[0][act_idx] && + bfq_bfqq_busy(bfqq->bic->bfqq[0][act_idx]) && + bfqq->bic->bfqq[0][act_idx]->next_rq) + async_bfqq = bfqq->bic->bfqq[0][act_idx]; /* * The next four mutually-exclusive ifs decide * whether to try injection, and choose the queue to @@ -4844,7 +4986,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic && bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <= bfq_bfqq_budget_left(async_bfqq)) - bfqq = bfqq->bic->bfqq[0]; + bfqq = async_bfqq; else if (bfqq->waker_bfqq && bfq_bfqq_busy(bfqq->waker_bfqq) && bfqq->waker_bfqq->next_rq && @@ -4975,7 +5117,7 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, bfq_dispatch_remove(bfqd->queue, rq); if (bfqq != bfqd->in_service_queue) - goto return_rq; + return rq; /* * If weight raising has to terminate for bfqq, then next @@ -4995,12 +5137,9 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, * belongs to CLASS_IDLE and other queues are waiting for * service. */ - if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq))) - goto return_rq; - - bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); + if (bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)) + bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); -return_rq: return rq; } @@ -5043,11 +5182,11 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) /* * We exploit the bfq_finish_requeue_request hook to - * decrement rq_in_driver, but + * decrement tot_rq_in_driver, but * bfq_finish_requeue_request will not be invoked on * this request. So, to avoid unbalance, just start - * this request, without incrementing rq_in_driver. As - * a negative consequence, rq_in_driver is deceptively + * this request, without incrementing tot_rq_in_driver. As + * a negative consequence, tot_rq_in_driver is deceptively * lower than it should be while this request is in * service. This may cause bfq_schedule_dispatch to be * invoked uselessly. 
@@ -5056,7 +5195,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) * bfq_finish_requeue_request hook, if defined, is * probably invoked also on this request. So, by * exploiting this hook, we could 1) increment - * rq_in_driver here, and 2) decrement it in + * tot_rq_in_driver here, and 2) decrement it in * bfq_finish_requeue_request. Such a solution would * let the value of the counter be always accurate, * but it would entail using an extra interface @@ -5085,7 +5224,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) * Of course, serving one request at a time may cause loss of * throughput. */ - if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) + if (bfqd->strict_guarantees && bfqd->tot_rq_in_driver > 0) goto exit; bfqq = bfq_select_queue(bfqd); @@ -5096,7 +5235,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) if (rq) { inc_in_driver_start_rq: - bfqd->rq_in_driver++; + bfqd->rq_in_driver[bfqq->actuator_idx]++; + bfqd->tot_rq_in_driver++; start_rq: rq->rq_flags |= RQF_STARTED; } @@ -5283,8 +5423,6 @@ void bfq_put_cooperator(struct bfq_queue *bfqq) */ __bfqq = bfqq->new_bfqq; while (__bfqq) { - if (__bfqq == bfqq) - break; next = __bfqq->new_bfqq; bfq_put_queue(__bfqq); __bfqq = next; @@ -5305,48 +5443,55 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_release_process_ref(bfqd, bfqq); } -static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) +static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync, + unsigned int actuator_idx) { - struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); + struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, actuator_idx); struct bfq_data *bfqd; if (bfqq) bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ if (bfqq && bfqd) { - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); - bic_set_bfqq(bic, NULL, is_sync); + bic_set_bfqq(bic, NULL, is_sync, actuator_idx); bfq_exit_bfqq(bfqd, bfqq); - spin_unlock_irqrestore(&bfqd->lock, flags); } } static void bfq_exit_icq(struct io_cq *icq) { struct bfq_io_cq *bic = icq_to_bic(icq); + struct bfq_data *bfqd = bic_to_bfqd(bic); + unsigned long flags; + unsigned int act_idx; + /* + * If bfqd and thus bfqd->num_actuators is not available any + * longer, then cycle over all possible per-actuator bfqqs in + * next loop. We rely on bic being zeroed on creation, and + * therefore on its unused per-actuator fields being NULL. + */ + unsigned int num_actuators = BFQ_MAX_ACTUATORS; + struct bfq_iocq_bfqq_data *bfqq_data = bic->bfqq_data; - if (bic->stable_merge_bfqq) { - struct bfq_data *bfqd = bic->stable_merge_bfqq->bfqd; + /* + * bfqd is NULL if scheduler already exited, and in that case + * this is the last time these queues are accessed. + */ + if (bfqd) { + spin_lock_irqsave(&bfqd->lock, flags); + num_actuators = bfqd->num_actuators; + } - /* - * bfqd is NULL if scheduler already exited, and in - * that case this is the last time bfqq is accessed. 
- */ - if (bfqd) { - unsigned long flags; + for (act_idx = 0; act_idx < num_actuators; act_idx++) { + if (bfqq_data[act_idx].stable_merge_bfqq) + bfq_put_stable_ref(bfqq_data[act_idx].stable_merge_bfqq); - spin_lock_irqsave(&bfqd->lock, flags); - bfq_put_stable_ref(bic->stable_merge_bfqq); - spin_unlock_irqrestore(&bfqd->lock, flags); - } else { - bfq_put_stable_ref(bic->stable_merge_bfqq); - } + bfq_exit_icq_bfqq(bic, true, act_idx); + bfq_exit_icq_bfqq(bic, false, act_idx); } - bfq_exit_icq_bfqq(bic, true); - bfq_exit_icq_bfqq(bic, false); + if (bfqd) + spin_unlock_irqrestore(&bfqd->lock, flags); } /* @@ -5423,25 +5568,27 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) bic->ioprio = ioprio; - bfqq = bic_to_bfqq(bic, false); + bfqq = bic_to_bfqq(bic, false, bfq_actuator_index(bfqd, bio)); if (bfqq) { struct bfq_queue *old_bfqq = bfqq; bfqq = bfq_get_queue(bfqd, bio, false, bic, true); - bic_set_bfqq(bic, bfqq, false); + bic_set_bfqq(bic, bfqq, false, bfq_actuator_index(bfqd, bio)); bfq_release_process_ref(bfqd, old_bfqq); } - bfqq = bic_to_bfqq(bic, true); + bfqq = bic_to_bfqq(bic, true, bfq_actuator_index(bfqd, bio)); if (bfqq) bfq_set_next_ioprio_data(bfqq, bic); } static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_io_cq *bic, pid_t pid, int is_sync) + struct bfq_io_cq *bic, pid_t pid, int is_sync, + unsigned int act_idx) { u64 now_ns = ktime_get_ns(); + bfqq->actuator_idx = act_idx; RB_CLEAR_NODE(&bfqq->entity.rb_node); INIT_LIST_HEAD(&bfqq->fifo); INIT_HLIST_NODE(&bfqq->burst_list_node); @@ -5501,22 +5648,24 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, /* first request is almost certainly seeky */ bfqq->seek_history = 1; + + bfqq->decrease_time_jif = jiffies; } static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, struct bfq_group *bfqg, - int ioprio_class, int ioprio) + int ioprio_class, int ioprio, int act_idx) { switch (ioprio_class) { case IOPRIO_CLASS_RT: - return &bfqg->async_bfqq[0][ioprio]; + return &bfqg->async_bfqq[0][ioprio][act_idx]; case IOPRIO_CLASS_NONE: ioprio = IOPRIO_BE_NORM; fallthrough; case IOPRIO_CLASS_BE: - return &bfqg->async_bfqq[1][ioprio]; + return &bfqg->async_bfqq[1][ioprio][act_idx]; case IOPRIO_CLASS_IDLE: - return &bfqg->async_idle_bfqq; + return &bfqg->async_idle_bfqq[act_idx]; default: return NULL; } @@ -5527,6 +5676,7 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_io_cq *bic, struct bfq_queue *last_bfqq_created) { + unsigned int a_idx = last_bfqq_created->actuator_idx; struct bfq_queue *new_bfqq = bfq_setup_merge(bfqq, last_bfqq_created); @@ -5534,8 +5684,8 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, return bfqq; if (new_bfqq->bic) - new_bfqq->bic->stably_merged = true; - bic->stably_merged = true; + new_bfqq->bic->bfqq_data[a_idx].stably_merged = true; + bic->bfqq_data[a_idx].stably_merged = true; /* * Reusing merge functions. This implies that @@ -5610,9 +5760,13 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, * it has been set already, but too long ago, then move it * forward to bfqq. Finally, move also if bfqq belongs to a * different group than last_bfqq_created, or if bfqq has a - * different ioprio or ioprio_class. If none of these - * conditions holds true, then try an early stable merge or - * schedule a delayed stable merge. + * different ioprio, ioprio_class or actuator_idx. 
If none of + * these conditions holds true, then try an early stable merge + * or schedule a delayed stable merge. As for the condition on + * actuator_idx, the reason is that, if queues associated with + * different actuators are merged, then control is lost on + * each actuator. Therefore some actuator may be + * underutilized, and throughput may decrease. * * A delayed merge is scheduled (instead of performing an * early merge), in case bfqq might soon prove to be more @@ -5630,7 +5784,8 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, bfqq->creation_time) || bfqq->entity.parent != last_bfqq_created->entity.parent || bfqq->ioprio != last_bfqq_created->ioprio || - bfqq->ioprio_class != last_bfqq_created->ioprio_class) + bfqq->ioprio_class != last_bfqq_created->ioprio_class || + bfqq->actuator_idx != last_bfqq_created->actuator_idx) *source_bfqq = bfqq; else if (time_after_eq(last_bfqq_created->creation_time + bfqd->bfq_burst_interval, @@ -5660,7 +5815,8 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, /* * Record the bfqq to merge to. */ - bic->stable_merge_bfqq = last_bfqq_created; + bic->bfqq_data[last_bfqq_created->actuator_idx].stable_merge_bfqq = + last_bfqq_created; } } @@ -5682,7 +5838,8 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, bfqg = bfq_bio_bfqg(bfqd, bio); if (!is_sync) { async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, - ioprio); + ioprio, + bfq_actuator_index(bfqd, bio)); bfqq = *async_bfqq; if (bfqq) goto out; @@ -5694,7 +5851,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, if (bfqq) { bfq_init_bfqq(bfqd, bfqq, bic, current->pid, - is_sync); + is_sync, bfq_actuator_index(bfqd, bio)); bfq_init_entity(&bfqq->entity, bfqg); bfq_log_bfqq(bfqd, bfqq, "allocated"); } else { @@ -6009,7 +6166,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) * then complete the merge and redirect it to * new_bfqq. */ - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) + if (bic_to_bfqq(RQ_BIC(rq), true, + bfq_actuator_index(bfqd, rq->bio)) == bfqq) bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq); @@ -6147,7 +6305,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) struct bfq_queue *bfqq = bfqd->in_service_queue; bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, - bfqd->rq_in_driver); + bfqd->tot_rq_in_driver); if (bfqd->hw_tag == 1) return; @@ -6158,7 +6316,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) * sum is not exact, as it's not taking into account deactivated * requests. 
*/ - if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) + if (bfqd->tot_rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) return; /* @@ -6169,7 +6327,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) if (bfqq && bfq_bfqq_has_short_ttime(bfqq) && bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] < BFQ_HW_QUEUE_THRESHOLD && - bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) + bfqd->tot_rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) return; if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) @@ -6190,7 +6348,8 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) bfq_update_hw_tag(bfqd); - bfqd->rq_in_driver--; + bfqd->rq_in_driver[bfqq->actuator_idx]--; + bfqd->tot_rq_in_driver--; bfqq->dispatched--; if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { @@ -6310,7 +6469,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) BFQQE_NO_MORE_REQUESTS); } - if (!bfqd->rq_in_driver) + if (!bfqd->tot_rq_in_driver) bfq_schedule_dispatch(bfqd); } @@ -6441,13 +6600,13 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, * conditions to do it, or we can lower the last base value * computed. * - * NOTE: (bfqd->rq_in_driver == 1) means that there is no I/O + * NOTE: (bfqd->tot_rq_in_driver == 1) means that there is no I/O * request in flight, because this function is in the code * path that handles the completion of a request of bfqq, and, * in particular, this function is executed before - * bfqd->rq_in_driver is decremented in such a code path. + * bfqd->tot_rq_in_driver is decremented in such a code path. */ - if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) || + if ((bfqq->last_serv_time_ns == 0 && bfqd->tot_rq_in_driver == 1) || tot_time_ns < bfqq->last_serv_time_ns) { if (bfqq->last_serv_time_ns == 0) { /* @@ -6457,7 +6616,7 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, bfqq->inject_limit = max_t(unsigned int, 1, old_limit); } bfqq->last_serv_time_ns = tot_time_ns; - } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1) + } else if (!bfqd->rqs_injected && bfqd->tot_rq_in_driver == 1) /* * No I/O injected and no request still in service in * the drive: these are the exact conditions for @@ -6564,7 +6723,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) return bfqq; } - bic_set_bfqq(bic, NULL, true); + bic_set_bfqq(bic, NULL, true, bfqq->actuator_idx); bfq_put_cooperator(bfqq); @@ -6578,7 +6737,9 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, bool split, bool is_sync, bool *new_queue) { - struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); + unsigned int act_idx = bfq_actuator_index(bfqd, bio); + struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, act_idx); + struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[act_idx]; if (likely(bfqq && bfqq != &bfqd->oom_bfqq)) return bfqq; @@ -6590,14 +6751,14 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, bfq_put_queue(bfqq); bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, split); - bic_set_bfqq(bic, bfqq, is_sync); + bic_set_bfqq(bic, bfqq, is_sync, act_idx); if (split && is_sync) { - if ((bic->was_in_burst_list && bfqd->large_burst) || - bic->saved_in_large_burst) + if ((bfqq_data->was_in_burst_list && bfqd->large_burst) || + bfqq_data->saved_in_large_burst) bfq_mark_bfqq_in_large_burst(bfqq); else { bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) + if (bfqq_data->was_in_burst_list) /* * If bfqq was in the current * burst list before being @@ -6686,19 
+6847,20 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) struct bfq_queue *bfqq; bool new_queue = false; bool bfqq_already_existing = false, split = false; + unsigned int a_idx = bfq_actuator_index(bfqd, bio); if (unlikely(!rq->elv.icq)) return NULL; /* - * Assuming that elv.priv[1] is set only if everything is set + * Assuming that RQ_BFQQ(rq) is set only if everything is set * for this rq. This holds true, because this function is * invoked only for insertion or merging, and, after such * events, a request cannot be manipulated any longer before * being removed from bfq. */ - if (rq->elv.priv[1]) - return rq->elv.priv[1]; + if (RQ_BFQQ(rq)) + return RQ_BFQQ(rq); bic = icq_to_bic(rq->elv.icq); @@ -6712,12 +6874,13 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) if (likely(!new_queue)) { /* If the queue was seeky for too long, break it apart. */ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) && - !bic->stably_merged) { + !bic->bfqq_data[a_idx].stably_merged) { struct bfq_queue *old_bfqq = bfqq; /* Update bic before losing reference to bfqq */ if (bfq_bfqq_in_large_burst(bfqq)) - bic->saved_in_large_burst = true; + bic->bfqq_data[a_idx].saved_in_large_burst = + true; bfqq = bfq_split_bfqq(bic, bfqq); split = true; @@ -6900,13 +7063,15 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, */ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) { - int i, j; + int i, j, k; - for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_NR_LEVELS; j++) - __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); + for (k = 0; k < bfqd->num_actuators; k++) { + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_NR_LEVELS; j++) + __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j][k]); - __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); + __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq[k]); + } } /* @@ -7018,6 +7183,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) { struct bfq_data *bfqd; struct elevator_queue *eq; + unsigned int i; + struct blk_independent_access_ranges *ia_ranges = q->disk->ia_ranges; eq = elevator_alloc(q, e); if (!eq) @@ -7038,8 +7205,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. * Grab a permanent reference to it, so that the normal code flow * will not attempt to free it. + * Set zero as actuator index: we will pretend that + * all I/O requests are for the same actuator. */ - bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); + bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0, 0); bfqd->oom_bfqq.ref++; bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; @@ -7058,6 +7227,39 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->queue = q; + bfqd->num_actuators = 1; + /* + * If the disk supports multiple actuators, copy independent + * access ranges from the request queue structure. + */ + spin_lock_irq(&q->queue_lock); + if (ia_ranges) { + /* + * Check if the disk ia_ranges size exceeds the current bfq + * actuator limit. 
+ */ + if (ia_ranges->nr_ia_ranges > BFQ_MAX_ACTUATORS) { + pr_crit("nr_ia_ranges higher than act limit: iars=%d, max=%d.\n", + ia_ranges->nr_ia_ranges, BFQ_MAX_ACTUATORS); + pr_crit("Falling back to single actuator mode.\n"); + } else { + bfqd->num_actuators = ia_ranges->nr_ia_ranges; + + for (i = 0; i < bfqd->num_actuators; i++) { + bfqd->sector[i] = ia_ranges->ia_range[i].sector; + bfqd->nr_sectors[i] = + ia_ranges->ia_range[i].nr_sectors; + } + } + } + + /* Otherwise use single-actuator dev info */ + if (bfqd->num_actuators == 1) { + bfqd->sector[0] = 0; + bfqd->nr_sectors[0] = get_capacity(q->disk); + } + spin_unlock_irq(&q->queue_lock); + INIT_LIST_HEAD(&bfqd->dispatch); hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, @@ -7069,7 +7271,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->num_groups_with_pending_reqs = 0; #endif - INIT_LIST_HEAD(&bfqd->active_list); + INIT_LIST_HEAD(&bfqd->active_list[0]); + INIT_LIST_HEAD(&bfqd->active_list[1]); INIT_LIST_HEAD(&bfqd->idle_list); INIT_HLIST_HEAD(&bfqd->burst_list); @@ -7095,7 +7298,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) */ bfqd->bfq_wr_coeff = 30; bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); - bfqd->bfq_wr_max_time = 0; bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); bfqd->bfq_wr_max_softrt_rate = 7000; /* @@ -7114,6 +7316,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) ref_wr_duration[blk_queue_nonrot(bfqd->queue)]; bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3; + /* see comments on the definition of next field inside bfq_data */ + bfqd->actuator_load_threshold = 4; + spin_lock_init(&bfqd->lock); /* @@ -7412,6 +7617,7 @@ MODULE_ALIAS("bfq-iosched"); static int __init bfq_init(void) { int ret; + char msg[60] = "BFQ I/O-scheduler: BFQ-CachyOS v6.2"; #ifdef CONFIG_BFQ_GROUP_IOSCHED ret = blkcg_policy_register(&blkcg_policy_bfq); @@ -7443,6 +7649,11 @@ static int __init bfq_init(void) if (ret) goto slab_kill; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + strcat(msg, " (with cgroups support)"); +#endif + pr_info("%s", msg); + return 0; slab_kill: diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 466e4865ace6..75cc6a324267 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -33,6 +33,14 @@ */ #define BFQ_SOFTRT_WEIGHT_FACTOR 100 +/* + * Maximum number of actuators supported. This constant is used simply + * to define the size of the static array that will contain + * per-actuator data. The current value is hopefully a good upper + * bound to the possible number of actuators of any actual drive. + */ +#define BFQ_MAX_ACTUATORS 8 + struct bfq_entity; /** @@ -227,12 +235,14 @@ struct bfq_ttime { * struct bfq_queue - leaf schedulable entity. * * A bfq_queue is a leaf request queue; it can be associated with an - * io_context or more, if it is async or shared between cooperating - * processes. @cgroup holds a reference to the cgroup, to be sure that it - * does not disappear while a bfqq still references it (mostly to avoid - * races between request issuing and task migration followed by cgroup - * destruction). - * All the fields are protected by the queue lock of the containing bfqd. + * io_context or more, if it is async or shared between cooperating + * processes. Besides, it contains I/O requests for only one actuator + * (an io_context is associated with a different bfq_queue for each + * actuator it generates I/O for). 
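The hunk above only consumes q->disk->ia_ranges; for context, here is a rough sketch of how a driver for a hypothetical dual-actuator disk would publish those ranges through the block layer's independent-access-ranges helpers (function and variable names are illustrative, error handling trimmed):

#include <linux/blkdev.h>

static void demo_publish_ranges(struct gendisk *disk)
{
	struct blk_independent_access_ranges *iars;
	sector_t half = get_capacity(disk) / 2;

	iars = disk_alloc_independent_access_ranges(disk, 2);
	if (!iars)
		return;

	iars->ia_range[0].sector = 0;
	iars->ia_range[0].nr_sectors = half;
	iars->ia_range[1].sector = half;
	iars->ia_range[1].nr_sectors = get_capacity(disk) - half;

	/* bfq_init_queue() will later copy these into bfqd->sector[]
	 * and bfqd->nr_sectors[].
	 */
	disk_set_independent_access_ranges(disk, iars);
}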
@cgroup holds a reference to the + * cgroup, to be sure that it does not disappear while a bfqq still + * references it (mostly to avoid races between request issuing and + * task migration followed by cgroup destruction). All the fields are + * protected by the queue lock of the containing bfqd. */ struct bfq_queue { /* reference counter */ @@ -397,24 +407,18 @@ struct bfq_queue { * the woken queues when this queue exits. */ struct hlist_head woken_list; + + /* index of the actuator this queue is associated with */ + unsigned int actuator_idx; }; /** - * struct bfq_io_cq - per (request_queue, io_context) structure. - */ -struct bfq_io_cq { - /* associated io_cq structure */ - struct io_cq icq; /* must be the first member */ - /* array of two process queues, the sync and the async */ - struct bfq_queue *bfqq[2]; - /* per (request_queue, blkcg) ioprio */ - int ioprio; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - uint64_t blkcg_serial_nr; /* the current blkcg serial */ -#endif +* struct bfq_data - bfqq data unique and persistent for associated bfq_io_cq +*/ +struct bfq_iocq_bfqq_data { /* * Snapshot of the has_short_time flag before merging; taken - * to remember its value while the queue is merged, so as to + * to remember its values while the queue is merged, so as to * be able to restore it in case of split. */ bool saved_has_short_ttime; @@ -428,7 +432,7 @@ struct bfq_io_cq { u64 saved_tot_idle_time; /* - * Same purpose as the previous fields for the value of the + * Same purpose as the previous fields for the values of the * field keeping the queue's belonging to a large burst */ bool saved_in_large_burst; @@ -466,6 +470,38 @@ struct bfq_io_cq { struct bfq_queue *stable_merge_bfqq; bool stably_merged; /* non splittable if true */ +}; + +/** + * struct bfq_io_cq - per (request_queue, io_context) structure. + */ +struct bfq_io_cq { + /* associated io_cq structure */ + struct io_cq icq; /* must be the first member */ + /* + * Matrix of associated process queues: first row for async + * queues, second row sync queues. Each row contains one + * column for each actuator. An I/O request generated by the + * process is inserted into the queue pointed by bfqq[i][j] if + * the request is to be served by the j-th actuator of the + * drive, where i==0 or i==1, depending on whether the request + * is async or sync. So there is a distinct queue for each + * actuator. + */ + struct bfq_queue *bfqq[2][BFQ_MAX_ACTUATORS]; + /* per (request_queue, blkcg) ioprio */ + int ioprio; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + uint64_t blkcg_serial_nr; /* the current blkcg serial */ +#endif + + /* + * Persistent data for associated synchronous process queues + * (one queue per actuator, see field bfqq above). In + * particular, each of these queues may undergo a merge. 
+ */ + struct bfq_iocq_bfqq_data bfqq_data[BFQ_MAX_ACTUATORS]; + unsigned int requests; /* Number of requests this process has in flight */ }; @@ -554,7 +590,12 @@ struct bfq_data { /* number of queued requests */ int queued; /* number of requests dispatched and waiting for completion */ - int rq_in_driver; + int tot_rq_in_driver; + /* + * number of requests dispatched and waiting for completion + * for each actuator + */ + int rq_in_driver[BFQ_MAX_ACTUATORS]; /* true if the device is non rotational and performs queueing */ bool nonrot_with_queueing; @@ -648,8 +689,13 @@ struct bfq_data { /* maximum budget allotted to a bfq_queue before rescheduling */ int bfq_max_budget; - /* list of all the bfq_queues active on the device */ - struct list_head active_list; + /* + * List of all the bfq_queues active for a specific actuator + * on the device. Keeping active queues separate on a + * per-actuator basis helps implementing per-actuator + * injection more efficiently. + */ + struct list_head active_list[BFQ_MAX_ACTUATORS]; /* list of all the bfq_queues idle on the device */ struct list_head idle_list; @@ -723,8 +769,6 @@ struct bfq_data { * is multiplied. */ unsigned int bfq_wr_coeff; - /* maximum duration of a weight-raising period (jiffies) */ - unsigned int bfq_wr_max_time; /* Maximum weight-raising duration for soft real-time processes */ unsigned int bfq_wr_rt_max_time; @@ -772,6 +816,42 @@ struct bfq_data { */ unsigned int word_depths[2][2]; unsigned int full_depth_shift; + + /* + * Number of independent actuators. This is equal to 1 in + * case of single-actuator drives. + */ + unsigned int num_actuators; + /* + * Disk independent access ranges for each actuator + * in this device. + */ + sector_t sector[BFQ_MAX_ACTUATORS]; + sector_t nr_sectors[BFQ_MAX_ACTUATORS]; + struct blk_independent_access_range ia_ranges[BFQ_MAX_ACTUATORS]; + + /* + * If the number of I/O requests queued in the device for a + * given actuator is below next threshold, then the actuator + * is deemed as underutilized. If this condition is found to + * hold for some actuator upon a dispatch, but (i) the + * in-service queue does not contain I/O for that actuator, + * while (ii) some other queue does contain I/O for that + * actuator, then the head I/O request of the latter queue is + * returned (injected), instead of the head request of the + * currently in-service queue. + * + * We set the threshold, empirically, to the minimum possible + * value for which an actuator is fully utilized, or close to + * be fully utilized. By doing so, injected I/O 'steals' as + * few drive-queue slots as possibile to the in-service + * queue. This reduces as much as possible the probability + * that the service of I/O from the in-service bfq_queue gets + * delayed because of slot exhaustion, i.e., because all the + * slots of the drive queue are filled with I/O injected from + * other queues (NCQ provides for 32 slots). 
+ */ + unsigned int actuator_load_threshold; }; enum bfqq_state_flags { @@ -937,8 +1017,8 @@ struct bfq_group { struct bfq_data *bfqd; - struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; - struct bfq_queue *async_idle_bfqq; + struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS]; + struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS]; struct bfq_entity *my_entity; @@ -955,8 +1035,8 @@ struct bfq_group { struct bfq_entity entity; struct bfq_sched_data sched_data; - struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; - struct bfq_queue *async_idle_bfqq; + struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS]; + struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS]; struct rb_root rq_pos_tree; }; @@ -969,8 +1049,10 @@ struct bfq_group { extern const int bfq_timeout; -struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync); -void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync); +struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync, + unsigned int actuator_idx); +void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync, + unsigned int actuator_idx); struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); void bfq_weights_tree_add(struct bfq_queue *bfqq); diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index ea4c3d757fdd..7941b6f07391 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -493,7 +493,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, bfq_update_active_tree(node); if (bfqq) - list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); + list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list[bfqq->actuator_idx]); bfq_inc_active_entities(entity); } diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 9ac1efb053e0..4272599a3f08 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -33,7 +33,6 @@ #include "blk-cgroup.h" #include "blk-ioprio.h" #include "blk-throttle.h" -#include "blk-rq-qos.h" /* * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. @@ -626,69 +625,93 @@ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) EXPORT_SYMBOL_GPL(__blkg_prfill_u64); /** - * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update - * @inputp: input string pointer + * blkg_conf_init - initialize a blkg_conf_ctx + * @ctx: blkg_conf_ctx to initialize + * @input: input string + * + * Initialize @ctx which can be used to parse blkg config input string @input. + * Once initialized, @ctx can be used with blkg_conf_open_bdev() and + * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit(). + */ +void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input) +{ + *ctx = (struct blkg_conf_ctx){ .input = input }; +} +EXPORT_SYMBOL_GPL(blkg_conf_init); + +/** + * blkg_conf_open_bdev - parse and open bdev for per-blkg config update + * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * - * Parse the device node prefix part, MAJ:MIN, of per-blkg config update - * from @input and get and return the matching bdev. *@inputp is - * updated to point past the device node prefix. Returns an ERR_PTR() - * value on error. + * Parse the device node prefix part, MAJ:MIN, of per-blkg config update from + * @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is + * set to point past the device node prefix. * - * Use this function iff blkg_conf_prep() can't be used for some reason. 
+ * This function may be called multiple times on @ctx and the extra calls become + * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function + * explicitly if bdev access is needed without resolving the blkcg / policy part + * of @ctx->input. Returns -errno on error. */ -struct block_device *blkcg_conf_open_bdev(char **inputp) +int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) { - char *input = *inputp; + char *input = ctx->input; unsigned int major, minor; struct block_device *bdev; int key_len; + if (ctx->bdev) + return 0; + if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) - return ERR_PTR(-EINVAL); + return -EINVAL; input += key_len; if (!isspace(*input)) - return ERR_PTR(-EINVAL); + return -EINVAL; input = skip_spaces(input); bdev = blkdev_get_no_open(MKDEV(major, minor)); if (!bdev) - return ERR_PTR(-ENODEV); + return -ENODEV; if (bdev_is_partition(bdev)) { blkdev_put_no_open(bdev); - return ERR_PTR(-ENODEV); + return -ENODEV; } - *inputp = input; - return bdev; + ctx->body = input; + ctx->bdev = bdev; + return 0; } /** * blkg_conf_prep - parse and prepare for per-blkg config update * @blkcg: target block cgroup * @pol: target policy - * @input: input string - * @ctx: blkg_conf_ctx to be filled + * @ctx: blkg_conf_ctx initialized with blkg_conf_init() + * + * Parse per-blkg config update from @ctx->input and initialize @ctx + * accordingly. On success, @ctx->body points to the part of @ctx->input + * following MAJ:MIN, @ctx->bdev points to the target block device and + * @ctx->blkg to the blkg being configured. * - * Parse per-blkg config update from @input and initialize @ctx with the - * result. @ctx->blkg points to the blkg to be updated and @ctx->body the - * part of @input following MAJ:MIN. This function returns with RCU read - * lock and queue lock held and must be paired with blkg_conf_finish(). + * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this + * function returns with queue lock held and must be followed by + * blkg_conf_exit(). */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - char *input, struct blkg_conf_ctx *ctx) - __acquires(rcu) __acquires(&bdev->bd_queue->queue_lock) + struct blkg_conf_ctx *ctx) + __acquires(&bdev->bd_queue->queue_lock) { - struct block_device *bdev; struct gendisk *disk; struct request_queue *q; struct blkcg_gq *blkg; int ret; - bdev = blkcg_conf_open_bdev(&input); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - disk = bdev->bd_disk; + ret = blkg_conf_open_bdev(ctx); + if (ret) + return ret; + + disk = ctx->bdev->bd_disk; q = disk->queue; /* @@ -699,7 +722,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, if (ret) goto fail; - rcu_read_lock(); spin_lock_irq(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) { @@ -728,7 +750,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, /* Drop locks to do new blkg allocation with GFP_KERNEL. 
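For callers, the rework replaces the single prep/finish pair with an init/prep/exit lifecycle in which blkg_conf_exit() is called unconditionally once the ctx has been initialized. The function below is a schematic writer modeled on the converted callers later in this series; blkcg_policy_example and the surrounding error handling are placeholders, not code taken from the patch.

/* Schematic cgroup-file writer using the reworked blkg_conf_* helpers. */
static ssize_t example_set_limit(struct kernfs_open_file *of, char *buf,
				 size_t nbytes, loff_t off)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct blkg_conf_ctx ctx;
	u64 v;
	int ret;

	blkg_conf_init(&ctx, buf);

	ret = blkg_conf_prep(blkcg, &blkcg_policy_example, &ctx); /* takes queue_lock */
	if (ret)
		goto out;			/* exit is still required after init */

	ret = -EINVAL;
	if (sscanf(ctx.body, "%llu", &v) != 1)
		goto out;

	/* ... apply @v to the policy data behind ctx.blkg ... */
	ret = 0;
out:
	blkg_conf_exit(&ctx);			/* drops queue_lock, puts the bdev */
	return ret ?: nbytes;
}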
*/ spin_unlock_irq(&q->queue_lock); - rcu_read_unlock(); new_blkg = blkg_alloc(pos, disk, GFP_KERNEL); if (unlikely(!new_blkg)) { @@ -742,7 +763,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, goto fail_exit_queue; } - rcu_read_lock(); spin_lock_irq(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) { @@ -769,20 +789,16 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, } success: blk_queue_exit(q); - ctx->bdev = bdev; ctx->blkg = blkg; - ctx->body = input; return 0; fail_preloaded: radix_tree_preload_end(); fail_unlock: spin_unlock_irq(&q->queue_lock); - rcu_read_unlock(); fail_exit_queue: blk_queue_exit(q); fail: - blkdev_put_no_open(bdev); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue @@ -798,20 +814,27 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, EXPORT_SYMBOL_GPL(blkg_conf_prep); /** - * blkg_conf_finish - finish up per-blkg config update - * @ctx: blkg_conf_ctx initialized by blkg_conf_prep() + * blkg_conf_exit - clean up per-blkg config update + * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * - * Finish up after per-blkg config update. This function must be paired - * with blkg_conf_prep(). + * Clean up after per-blkg config update. This function must be called on all + * blkg_conf_ctx's initialized with blkg_conf_init(). */ -void blkg_conf_finish(struct blkg_conf_ctx *ctx) - __releases(&ctx->bdev->bd_queue->queue_lock) __releases(rcu) +void blkg_conf_exit(struct blkg_conf_ctx *ctx) + __releases(&ctx->bdev->bd_queue->queue_lock) { - spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); - rcu_read_unlock(); - blkdev_put_no_open(ctx->bdev); + if (ctx->blkg) { + spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); + ctx->blkg = NULL; + } + + if (ctx->bdev) { + blkdev_put_no_open(ctx->bdev); + ctx->body = NULL; + ctx->bdev = NULL; + } } -EXPORT_SYMBOL_GPL(blkg_conf_finish); +EXPORT_SYMBOL_GPL(blkg_conf_exit); static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) { @@ -1300,14 +1323,8 @@ int blkcg_init_disk(struct gendisk *disk) if (ret) goto err_ioprio_exit; - ret = blk_iolatency_init(disk); - if (ret) - goto err_throtl_exit; - return 0; -err_throtl_exit: - blk_throtl_exit(disk); err_ioprio_exit: blk_ioprio_exit(disk); err_destroy_all: @@ -1323,7 +1340,6 @@ int blkcg_init_disk(struct gendisk *disk) void blkcg_exit_disk(struct gendisk *disk) { blkg_destroy_all(disk); - rq_qos_exit(disk->queue); blk_throtl_exit(disk); } diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 1e94e404eaa8..fe09e8b4c2a8 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -208,15 +208,17 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); struct blkg_conf_ctx { + char *input; + char *body; struct block_device *bdev; struct blkcg_gq *blkg; - char *body; }; -struct block_device *blkcg_conf_open_bdev(char **inputp); +void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); +int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - char *input, struct blkg_conf_ctx *ctx); -void blkg_conf_finish(struct blkg_conf_ctx *ctx); + struct blkg_conf_ctx *ctx); +void blkg_conf_exit(struct blkg_conf_ctx *ctx); /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 
ec7219caea16..c31d57e29bf8 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3096,9 +3096,11 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, return nbytes; } - ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx); + blkg_conf_init(&ctx, buf); + + ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx); if (ret) - return ret; + goto err; iocg = blkg_to_iocg(ctx.blkg); @@ -3117,12 +3119,14 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, weight_updated(iocg, &now); spin_unlock(&iocg->ioc->lock); - blkg_conf_finish(&ctx); + blkg_conf_exit(&ctx); return nbytes; einval: - blkg_conf_finish(&ctx); - return -EINVAL; + ret = -EINVAL; +err: + blkg_conf_exit(&ctx); + return ret; } static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, @@ -3177,19 +3181,22 @@ static const match_table_t qos_tokens = { static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { - struct block_device *bdev; + struct blkg_conf_ctx ctx; struct gendisk *disk; struct ioc *ioc; u32 qos[NR_QOS_PARAMS]; bool enable, user; - char *p; + char *body, *p; int ret; - bdev = blkcg_conf_open_bdev(&input); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); + blkg_conf_init(&ctx, input); - disk = bdev->bd_disk; + ret = blkg_conf_open_bdev(&ctx); + if (ret) + goto err; + + body = ctx.body; + disk = ctx.bdev->bd_disk; ioc = q_to_ioc(disk->queue); if (!ioc) { ret = blk_iocost_init(disk); @@ -3206,7 +3213,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, enable = ioc->enabled; user = ioc->user_qos_params; - while ((p = strsep(&input, " \t\n"))) { + while ((p = strsep(&body, " \t\n"))) { substring_t args[MAX_OPT_ARGS]; char buf[32]; int tok; @@ -3295,7 +3302,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, blk_mq_unquiesce_queue(disk->queue); blk_mq_unfreeze_queue(disk->queue); - blkdev_put_no_open(bdev); + blkg_conf_exit(&ctx); return nbytes; einval: spin_unlock_irq(&ioc->lock); @@ -3305,7 +3312,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ret = -EINVAL; err: - blkdev_put_no_open(bdev); + blkg_conf_exit(&ctx); return ret; } @@ -3356,22 +3363,25 @@ static const match_table_t i_lcoef_tokens = { static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { - struct block_device *bdev; + struct blkg_conf_ctx ctx; struct request_queue *q; struct ioc *ioc; u64 u[NR_I_LCOEFS]; bool user; - char *p; + char *body, *p; int ret; - bdev = blkcg_conf_open_bdev(&input); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); + blkg_conf_init(&ctx, input); + + ret = blkg_conf_open_bdev(&ctx); + if (ret) + goto err; - q = bdev_get_queue(bdev); + body = ctx.body; + q = bdev_get_queue(ctx.bdev); ioc = q_to_ioc(q); if (!ioc) { - ret = blk_iocost_init(bdev->bd_disk); + ret = blk_iocost_init(ctx.bdev->bd_disk); if (ret) goto err; ioc = q_to_ioc(q); @@ -3384,7 +3394,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, memcpy(u, ioc->params.i_lcoefs, sizeof(u)); user = ioc->user_cost_model; - while ((p = strsep(&input, " \t\n"))) { + while ((p = strsep(&body, " \t\n"))) { substring_t args[MAX_OPT_ARGS]; char buf[32]; int tok; @@ -3431,7 +3441,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); - blkdev_put_no_open(bdev); + blkg_conf_exit(&ctx); return nbytes; einval: @@ -3442,7 +3452,7 @@ static ssize_t 
ioc_cost_model_write(struct kernfs_open_file *of, char *input, ret = -EINVAL; err: - blkdev_put_no_open(bdev); + blkg_conf_exit(&ctx); return ret; } diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index ecdc10741836..3484393dbc4a 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -755,7 +755,7 @@ static void blkiolatency_enable_work_fn(struct work_struct *work) } } -int blk_iolatency_init(struct gendisk *disk) +static int blk_iolatency_init(struct gendisk *disk) { struct request_queue *q = disk->queue; struct blk_iolatency *blkiolat; @@ -830,6 +830,29 @@ static void iolatency_clear_scaling(struct blkcg_gq *blkg) } } +static int blk_iolatency_try_init(struct blkg_conf_ctx *ctx) +{ + static DEFINE_MUTEX(init_mutex); + int ret; + + ret = blkg_conf_open_bdev(ctx); + if (ret) + return ret; + + /* + * blk_iolatency_init() may fail after rq_qos_add() succeeds which can + * confuse iolat_rq_qos() test. Make the test and init atomic. + */ + mutex_lock(&init_mutex); + + if (!iolat_rq_qos(ctx->bdev->bd_queue)) + ret = blk_iolatency_init(ctx->bdev->bd_disk); + + mutex_unlock(&init_mutex); + + return ret; +} + static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -842,9 +865,15 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, u64 oldval; int ret; - ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx); + blkg_conf_init(&ctx, buf); + + ret = blk_iolatency_try_init(&ctx); if (ret) - return ret; + goto out; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, &ctx); + if (ret) + goto out; iolat = blkg_to_lat(ctx.blkg); p = ctx.body; @@ -880,7 +909,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, iolatency_clear_scaling(blkg); ret = 0; out: - blkg_conf_finish(&ctx); + blkg_conf_exit(&ctx); return ret ?: nbytes; } @@ -974,7 +1003,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) { struct iolatency_grp *iolat = pd_to_lat(pd); struct blkcg_gq *blkg = lat_to_blkg(iolat); - struct rq_qos *rqos = blkcg_rq_qos(blkg->q); + struct rq_qos *rqos = iolat_rq_qos(blkg->q); struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); u64 now = ktime_to_ns(ktime_get()); int cpu; diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 1ef1f7d4bc3c..27f004fae66b 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -74,7 +74,7 @@ static inline struct rq_qos *wbt_rq_qos(struct request_queue *q) return rq_qos_id(q, RQ_QOS_WBT); } -static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q) +static inline struct rq_qos *iolat_rq_qos(struct request_queue *q) { return rq_qos_id(q, RQ_QOS_LATENCY); } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 6fb5a2f9e1ee..75841d1d9bf4 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1369,9 +1369,11 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, int ret; u64 v; - ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + blkg_conf_init(&ctx, buf); + + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) - return ret; + goto out_finish; ret = -EINVAL; if (sscanf(ctx.body, "%llu", &v) != 1) @@ -1390,7 +1392,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, tg_conf_updated(tg, false); ret = 0; out_finish: - blkg_conf_finish(&ctx); + blkg_conf_exit(&ctx); return ret ?: nbytes; } @@ -1562,9 +1564,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, int ret; int index = of_cft(of)->private; - ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, 
buf, &ctx); + blkg_conf_init(&ctx, buf); + + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) - return ret; + goto out_finish; tg = blkg_to_tg(ctx.blkg); tg_update_carryover(tg); @@ -1663,7 +1667,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, tg->td->limit_valid[LIMIT_LOW]); ret = 0; out_finish: - blkg_conf_finish(&ctx); + blkg_conf_exit(&ctx); return ret ?: nbytes; } diff --git a/block/blk.h b/block/blk.h index e835f21d48af..ad384230d0c7 100644 --- a/block/blk.h +++ b/block/blk.h @@ -392,12 +392,6 @@ static inline struct bio *blk_queue_bounce(struct bio *bio, return bio; } -#ifdef CONFIG_BLK_CGROUP_IOLATENCY -int blk_iolatency_init(struct gendisk *disk); -#else -static inline int blk_iolatency_init(struct gendisk *disk) { return 0; }; -#endif - #ifdef CONFIG_BLK_DEV_ZONED void disk_free_zone_bitmaps(struct gendisk *disk); void disk_clear_zone_settings(struct gendisk *disk); -- 2.40.0 From f3810e5d25c92b9a4a0da90dc5c39fe1421ee533 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 13 Feb 2023 11:26:20 +0100 Subject: [PATCH 03/15] bitmap Signed-off-by: Peter Jung --- include/linux/bitmap.h | 46 ++++++------- include/linux/cpumask.h | 144 +++++++++++++++++++-------------------- include/linux/find.h | 40 +++++------ include/linux/nodemask.h | 86 +++++++++++------------ 4 files changed, 158 insertions(+), 158 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 7d6d73b78147..40e53a2ecc0d 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -189,7 +189,7 @@ unsigned long bitmap_find_next_zero_area_off(unsigned long *map, * the bit offset of all zero areas this function finds is multiples of that * power of 2. A @align_mask of 0 means no alignment is required. */ -static inline unsigned long +static __always_inline unsigned long bitmap_find_next_zero_area(unsigned long *map, unsigned long size, unsigned long start, @@ -237,7 +237,7 @@ extern int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp, #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) -static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) +static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); @@ -247,7 +247,7 @@ static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) memset(dst, 0, len); } -static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) +static __always_inline void bitmap_fill(unsigned long *dst, unsigned int nbits) { unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); @@ -257,7 +257,7 @@ static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) memset(dst, 0xff, len); } -static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, +static __always_inline void bitmap_copy(unsigned long *dst, const unsigned long *src, unsigned int nbits) { unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); @@ -271,7 +271,7 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, /* * Copy bitmap and clear tail bits in last word. 
*/ -static inline void bitmap_copy_clear_tail(unsigned long *dst, +static __always_inline void bitmap_copy_clear_tail(unsigned long *dst, const unsigned long *src, unsigned int nbits) { bitmap_copy(dst, src, nbits); @@ -317,7 +317,7 @@ void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits); bitmap_copy_clear_tail((unsigned long *)(buf), (const unsigned long *)(bitmap), (nbits)) #endif -static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, +static __always_inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) @@ -325,7 +325,7 @@ static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, return __bitmap_and(dst, src1, src2, nbits); } -static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, +static __always_inline void bitmap_or(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) @@ -334,7 +334,7 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, __bitmap_or(dst, src1, src2, nbits); } -static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, +static __always_inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) @@ -343,7 +343,7 @@ static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, __bitmap_xor(dst, src1, src2, nbits); } -static inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, +static __always_inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) @@ -351,7 +351,7 @@ static inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, return __bitmap_andnot(dst, src1, src2, nbits); } -static inline void bitmap_complement(unsigned long *dst, const unsigned long *src, +static __always_inline void bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) @@ -367,7 +367,7 @@ static inline void bitmap_complement(unsigned long *dst, const unsigned long *sr #endif #define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1) -static inline bool bitmap_equal(const unsigned long *src1, +static __always_inline bool bitmap_equal(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) @@ -387,7 +387,7 @@ static inline bool bitmap_equal(const unsigned long *src1, * * Returns: True if (*@src1 | *@src2) == *@src3, false otherwise */ -static inline bool bitmap_or_equal(const unsigned long *src1, +static __always_inline bool bitmap_or_equal(const unsigned long *src1, const unsigned long *src2, const unsigned long *src3, unsigned int nbits) @@ -398,7 +398,7 @@ static inline bool bitmap_or_equal(const unsigned long *src1, return !(((*src1 | *src2) ^ *src3) & BITMAP_LAST_WORD_MASK(nbits)); } -static inline bool bitmap_intersects(const unsigned long *src1, +static __always_inline bool bitmap_intersects(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { @@ -408,7 +408,7 @@ static inline bool bitmap_intersects(const unsigned long *src1, return __bitmap_intersects(src1, src2, nbits); } -static inline bool bitmap_subset(const unsigned long *src1, +static __always_inline bool bitmap_subset(const unsigned long *src1, const unsigned long *src2, unsigned 
int nbits) { if (small_const_nbits(nbits)) @@ -417,7 +417,7 @@ static inline bool bitmap_subset(const unsigned long *src1, return __bitmap_subset(src1, src2, nbits); } -static inline bool bitmap_empty(const unsigned long *src, unsigned nbits) +static __always_inline bool bitmap_empty(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); @@ -425,7 +425,7 @@ static inline bool bitmap_empty(const unsigned long *src, unsigned nbits) return find_first_bit(src, nbits) == nbits; } -static inline bool bitmap_full(const unsigned long *src, unsigned int nbits) +static __always_inline bool bitmap_full(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits)); @@ -482,7 +482,7 @@ static __always_inline void bitmap_clear(unsigned long *map, unsigned int start, __bitmap_clear(map, start, nbits); } -static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, +static __always_inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits) { if (small_const_nbits(nbits)) @@ -491,7 +491,7 @@ static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *s __bitmap_shift_right(dst, src, shift, nbits); } -static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, +static __always_inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits) { if (small_const_nbits(nbits)) @@ -500,7 +500,7 @@ static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *sr __bitmap_shift_left(dst, src, shift, nbits); } -static inline void bitmap_replace(unsigned long *dst, +static __always_inline void bitmap_replace(unsigned long *dst, const unsigned long *old, const unsigned long *new, const unsigned long *mask, @@ -512,7 +512,7 @@ static inline void bitmap_replace(unsigned long *dst, __bitmap_replace(dst, old, new, mask, nbits); } -static inline void bitmap_next_set_region(unsigned long *bitmap, +static __always_inline void bitmap_next_set_region(unsigned long *bitmap, unsigned int *rs, unsigned int *re, unsigned int end) { @@ -563,7 +563,7 @@ static inline void bitmap_next_set_region(unsigned long *bitmap, * That is ``(u32 *)(&val)[0]`` gets the upper 32 bits, * but we expect the lower 32-bits of u64. */ -static inline void bitmap_from_u64(unsigned long *dst, u64 mask) +static __always_inline void bitmap_from_u64(unsigned long *dst, u64 mask) { bitmap_from_arr64(dst, &mask, 64); } @@ -576,7 +576,7 @@ static inline void bitmap_from_u64(unsigned long *dst, u64 mask) * Returns the 8-bit value located at the @start bit offset within the @src * memory region. 
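One plausible reading of converting these helpers from inline to __always_inline is to guarantee that small_const_nbits() and the constant-size fast paths are resolved at compile time even when the compiler would otherwise decline to inline (debug builds, sanitizers, cold call sites). The user-space sketch below models that pattern; small_const_nbits() and the slow path are reimplemented here purely for illustration and are not the kernel's definitions.

/* User-space model of the small_const_nbits() fast path; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define BITS_PER_LONG		(8 * (int)sizeof(long))
#define BITS_TO_LONGS(n)	(((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)
#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))

/* True only when @nbits is a compile-time constant fitting in one word. */
#define small_const_nbits(nbits) \
	(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)

/*
 * When the wrapper is always inlined, the small_const_nbits() branch folds
 * away and a constant-sized bitmap_empty() reduces to one AND plus a compare.
 */
static inline __attribute__((always_inline))
bool bitmap_empty_model(const unsigned long *src, unsigned int nbits)
{
	if (small_const_nbits(nbits))
		return !(*src & BITMAP_LAST_WORD_MASK(nbits));

	/* slow path: scan every word (out-of-line in the kernel) */
	for (unsigned int i = 0; i < BITS_TO_LONGS(nbits); i++) {
		unsigned long w = src[i];

		if (i == BITS_TO_LONGS(nbits) - 1)
			w &= BITMAP_LAST_WORD_MASK(nbits);
		if (w)
			return false;
	}
	return true;
}

int main(void)
{
	unsigned long map[1] = { 0 };

	printf("%d\n", bitmap_empty_model(map, 16));	/* constant nbits: fast path */
	map[0] |= 1UL << 3;
	printf("%d\n", bitmap_empty_model(map, 16));
	return 0;
}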
*/ -static inline unsigned long bitmap_get_value8(const unsigned long *map, +static __always_inline unsigned long bitmap_get_value8(const unsigned long *map, unsigned long start) { const size_t index = BIT_WORD(start); @@ -591,7 +591,7 @@ static inline unsigned long bitmap_get_value8(const unsigned long *map, * @value: the 8-bit value; values wider than 8 bits may clobber bitmap * @start: bit offset of the 8-bit value; must be a multiple of 8 */ -static inline void bitmap_set_value8(unsigned long *map, unsigned long value, +static __always_inline void bitmap_set_value8(unsigned long *map, unsigned long value, unsigned long start) { const size_t index = BIT_WORD(start); diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index c2aa0aa26b45..9543b22d6dc2 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -41,7 +41,7 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; extern unsigned int nr_cpu_ids; #endif -static inline void set_nr_cpu_ids(unsigned int nr) +static __always_inline void set_nr_cpu_ids(unsigned int nr) { #if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS) WARN_ON(nr != nr_cpu_ids); @@ -124,7 +124,7 @@ static __always_inline unsigned int cpumask_check(unsigned int cpu) * * Returns >= nr_cpu_ids if no cpus set. */ -static inline unsigned int cpumask_first(const struct cpumask *srcp) +static __always_inline unsigned int cpumask_first(const struct cpumask *srcp) { return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); } @@ -135,7 +135,7 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) * * Returns >= nr_cpu_ids if all cpus are set. */ -static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) +static __always_inline unsigned int cpumask_first_zero(const struct cpumask *srcp) { return find_first_zero_bit(cpumask_bits(srcp), nr_cpumask_bits); } @@ -147,7 +147,7 @@ static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) * * Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). */ -static inline +static __always_inline unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2) { return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits); @@ -159,7 +159,7 @@ unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask * * Returns >= nr_cpumask_bits if no CPUs set. */ -static inline unsigned int cpumask_last(const struct cpumask *srcp) +static __always_inline unsigned int cpumask_last(const struct cpumask *srcp) { return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits); } @@ -171,7 +171,7 @@ static inline unsigned int cpumask_last(const struct cpumask *srcp) * * Returns >= nr_cpu_ids if no further cpus set. */ -static inline +static __always_inline unsigned int cpumask_next(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. */ @@ -187,7 +187,7 @@ unsigned int cpumask_next(int n, const struct cpumask *srcp) * * Returns >= nr_cpu_ids if no further cpus unset. */ -static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) +static __always_inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. 
*/ if (n != -1) @@ -197,18 +197,18 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) #if NR_CPUS == 1 /* Uniprocessor: there is only one valid CPU */ -static inline unsigned int cpumask_local_spread(unsigned int i, int node) +static __always_inline unsigned int cpumask_local_spread(unsigned int i, int node) { return 0; } -static inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, +static __always_inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p) { return cpumask_first_and(src1p, src2p); } -static inline unsigned int cpumask_any_distribute(const struct cpumask *srcp) +static __always_inline unsigned int cpumask_any_distribute(const struct cpumask *srcp) { return cpumask_first(srcp); } @@ -227,7 +227,7 @@ unsigned int cpumask_any_distribute(const struct cpumask *srcp); * * Returns >= nr_cpu_ids if no further cpus set in both. */ -static inline +static __always_inline unsigned int cpumask_next_and(int n, const struct cpumask *src1p, const struct cpumask *src2p) { @@ -259,7 +259,7 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p, for_each_clear_bit(cpu, cpumask_bits(mask), nr_cpumask_bits) #if NR_CPUS == 1 -static inline +static __always_inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) { cpumask_check(start); @@ -335,7 +335,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta * Often used to find any cpu but smp_processor_id() in a mask. * Returns >= nr_cpu_ids if no cpus set. */ -static inline +static __always_inline unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) { unsigned int i; @@ -354,7 +354,7 @@ unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) * * Returns >= nr_cpu_ids if such cpu doesn't exist. */ -static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp) +static __always_inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp) { return find_nth_bit(cpumask_bits(srcp), nr_cpumask_bits, cpumask_check(cpu)); } @@ -367,7 +367,7 @@ static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *s * * Returns >= nr_cpu_ids if such cpu doesn't exist. */ -static inline +static __always_inline unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1, const struct cpumask *srcp2) { @@ -383,7 +383,7 @@ unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1, * * Returns >= nr_cpu_ids if such cpu doesn't exist. 
*/ -static inline +static __always_inline unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1, const struct cpumask *srcp2) { @@ -476,7 +476,7 @@ static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask * * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ -static inline void cpumask_setall(struct cpumask *dstp) +static __always_inline void cpumask_setall(struct cpumask *dstp) { bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits); } @@ -485,7 +485,7 @@ static inline void cpumask_setall(struct cpumask *dstp) * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ -static inline void cpumask_clear(struct cpumask *dstp) +static __always_inline void cpumask_clear(struct cpumask *dstp) { bitmap_zero(cpumask_bits(dstp), nr_cpumask_bits); } @@ -498,7 +498,7 @@ static inline void cpumask_clear(struct cpumask *dstp) * * If *@dstp is empty, returns false, else returns true */ -static inline bool cpumask_and(struct cpumask *dstp, +static __always_inline bool cpumask_and(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { @@ -512,7 +512,7 @@ static inline bool cpumask_and(struct cpumask *dstp, * @src1p: the first input * @src2p: the second input */ -static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, +static __always_inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p), @@ -525,7 +525,7 @@ static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, * @src1p: the first input * @src2p: the second input */ -static inline void cpumask_xor(struct cpumask *dstp, +static __always_inline void cpumask_xor(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { @@ -541,7 +541,7 @@ static inline void cpumask_xor(struct cpumask *dstp, * * If *@dstp is empty, returns false, else returns true */ -static inline bool cpumask_andnot(struct cpumask *dstp, +static __always_inline bool cpumask_andnot(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { @@ -554,7 +554,7 @@ static inline bool cpumask_andnot(struct cpumask *dstp, * @dstp: the cpumask result * @srcp: the input to invert */ -static inline void cpumask_complement(struct cpumask *dstp, +static __always_inline void cpumask_complement(struct cpumask *dstp, const struct cpumask *srcp) { bitmap_complement(cpumask_bits(dstp), cpumask_bits(srcp), @@ -566,7 +566,7 @@ static inline void cpumask_complement(struct cpumask *dstp, * @src1p: the first input * @src2p: the second input */ -static inline bool cpumask_equal(const struct cpumask *src1p, +static __always_inline bool cpumask_equal(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p), @@ -579,7 +579,7 @@ static inline bool cpumask_equal(const struct cpumask *src1p, * @src2p: the second input * @src3p: the third input */ -static inline bool cpumask_or_equal(const struct cpumask *src1p, +static __always_inline bool cpumask_or_equal(const struct cpumask *src1p, const struct cpumask *src2p, const struct cpumask *src3p) { @@ -592,7 +592,7 @@ static inline bool cpumask_or_equal(const struct cpumask *src1p, * @src1p: the first input * @src2p: the second input */ -static inline bool cpumask_intersects(const struct cpumask *src1p, +static __always_inline bool cpumask_intersects(const struct cpumask 
*src1p, const struct cpumask *src2p) { return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p), @@ -606,7 +606,7 @@ static inline bool cpumask_intersects(const struct cpumask *src1p, * * Returns true if *@src1p is a subset of *@src2p, else returns false */ -static inline bool cpumask_subset(const struct cpumask *src1p, +static __always_inline bool cpumask_subset(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p), @@ -617,7 +617,7 @@ static inline bool cpumask_subset(const struct cpumask *src1p, * cpumask_empty - *srcp == 0 * @srcp: the cpumask to that all cpus < nr_cpu_ids are clear. */ -static inline bool cpumask_empty(const struct cpumask *srcp) +static __always_inline bool cpumask_empty(const struct cpumask *srcp) { return bitmap_empty(cpumask_bits(srcp), nr_cpumask_bits); } @@ -626,7 +626,7 @@ static inline bool cpumask_empty(const struct cpumask *srcp) * cpumask_full - *srcp == 0xFFFFFFFF... * @srcp: the cpumask to that all cpus < nr_cpu_ids are set. */ -static inline bool cpumask_full(const struct cpumask *srcp) +static __always_inline bool cpumask_full(const struct cpumask *srcp) { return bitmap_full(cpumask_bits(srcp), nr_cpumask_bits); } @@ -635,7 +635,7 @@ static inline bool cpumask_full(const struct cpumask *srcp) * cpumask_weight - Count of bits in *srcp * @srcp: the cpumask to count bits (< nr_cpu_ids) in. */ -static inline unsigned int cpumask_weight(const struct cpumask *srcp) +static __always_inline unsigned int cpumask_weight(const struct cpumask *srcp) { return bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits); } @@ -645,7 +645,7 @@ static inline unsigned int cpumask_weight(const struct cpumask *srcp) * @srcp1: the cpumask to count bits (< nr_cpu_ids) in. * @srcp2: the cpumask to count bits (< nr_cpu_ids) in. */ -static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, +static __always_inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, const struct cpumask *srcp2) { return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits); @@ -657,7 +657,7 @@ static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, * @srcp: the input to shift * @n: the number of bits to shift by */ -static inline void cpumask_shift_right(struct cpumask *dstp, +static __always_inline void cpumask_shift_right(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n, @@ -670,7 +670,7 @@ static inline void cpumask_shift_right(struct cpumask *dstp, * @srcp: the input to shift * @n: the number of bits to shift by */ -static inline void cpumask_shift_left(struct cpumask *dstp, +static __always_inline void cpumask_shift_left(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n, @@ -682,7 +682,7 @@ static inline void cpumask_shift_left(struct cpumask *dstp, * @dstp: the result * @srcp: the input cpumask */ -static inline void cpumask_copy(struct cpumask *dstp, +static __always_inline void cpumask_copy(struct cpumask *dstp, const struct cpumask *srcp) { bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits); @@ -719,7 +719,7 @@ static inline void cpumask_copy(struct cpumask *dstp, * * Returns -errno, or 0 for success. 
*/ -static inline int cpumask_parse_user(const char __user *buf, int len, +static __always_inline int cpumask_parse_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); @@ -733,7 +733,7 @@ static inline int cpumask_parse_user(const char __user *buf, int len, * * Returns -errno, or 0 for success. */ -static inline int cpumask_parselist_user(const char __user *buf, int len, +static __always_inline int cpumask_parselist_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parselist_user(buf, len, cpumask_bits(dstp), @@ -747,7 +747,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len, * * Returns -errno, or 0 for success. */ -static inline int cpumask_parse(const char *buf, struct cpumask *dstp) +static __always_inline int cpumask_parse(const char *buf, struct cpumask *dstp) { return bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp), nr_cpumask_bits); } @@ -759,7 +759,7 @@ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) * * Returns -errno, or 0 for success. */ -static inline int cpulist_parse(const char *buf, struct cpumask *dstp) +static __always_inline int cpulist_parse(const char *buf, struct cpumask *dstp) { return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits); } @@ -767,7 +767,7 @@ static inline int cpulist_parse(const char *buf, struct cpumask *dstp) /** * cpumask_size - size to allocate for a 'struct cpumask' in bytes */ -static inline unsigned int cpumask_size(void) +static __always_inline unsigned int cpumask_size(void) { return BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long); } @@ -820,7 +820,7 @@ typedef struct cpumask *cpumask_var_t; bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); -static inline +static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node); @@ -836,13 +836,13 @@ bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) * * See alloc_cpumask_var_node. 
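Taken together, the allocation and parsing helpers above are normally used in a pattern like the kernel-style sketch below. The function is hypothetical; the point is only that cpulist_parse() fills a mask that zalloc_cpumask_var() provided, and that free_cpumask_var() must pair with every successful allocation, including on the error path.

/* Hypothetical helper: parse a CPU list string such as "0-3,8" into a mask. */
static int example_parse_cpus(const char *list, cpumask_var_t *out)
{
	int ret;

	if (!zalloc_cpumask_var(out, GFP_KERNEL))
		return -ENOMEM;

	ret = cpulist_parse(list, *out);
	if (ret) {
		free_cpumask_var(*out);
		return ret;
	}

	pr_info("parsed %u cpus, first=%u\n",
		cpumask_weight(*out), cpumask_first(*out));
	return 0;
}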
*/ -static inline +static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return alloc_cpumask_var_node(mask, flags, NUMA_NO_NODE); } -static inline +static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return alloc_cpumask_var(mask, flags | __GFP_ZERO); @@ -852,7 +852,7 @@ void alloc_bootmem_cpumask_var(cpumask_var_t *mask); void free_cpumask_var(cpumask_var_t mask); void free_bootmem_cpumask_var(cpumask_var_t mask); -static inline bool cpumask_available(cpumask_var_t mask) +static __always_inline bool cpumask_available(cpumask_var_t mask) { return mask != NULL; } @@ -863,43 +863,43 @@ typedef struct cpumask cpumask_var_t[1]; #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) #define __cpumask_var_read_mostly -static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) +static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return true; } -static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, +static __always_inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { return true; } -static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) +static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { cpumask_clear(*mask); return true; } -static inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, +static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { cpumask_clear(*mask); return true; } -static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) +static __always_inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) { } -static inline void free_cpumask_var(cpumask_var_t mask) +static __always_inline void free_cpumask_var(cpumask_var_t mask) { } -static inline void free_bootmem_cpumask_var(cpumask_var_t mask) +static __always_inline void free_bootmem_cpumask_var(cpumask_var_t mask) { } -static inline bool cpumask_available(cpumask_var_t mask) +static __always_inline bool cpumask_available(cpumask_var_t mask) { return true; } @@ -929,12 +929,12 @@ void init_cpu_present(const struct cpumask *src); void init_cpu_possible(const struct cpumask *src); void init_cpu_online(const struct cpumask *src); -static inline void reset_cpu_possible_mask(void) +static __always_inline void reset_cpu_possible_mask(void) { bitmap_zero(cpumask_bits(&__cpu_possible_mask), NR_CPUS); } -static inline void +static __always_inline void set_cpu_possible(unsigned int cpu, bool possible) { if (possible) @@ -943,7 +943,7 @@ set_cpu_possible(unsigned int cpu, bool possible) cpumask_clear_cpu(cpu, &__cpu_possible_mask); } -static inline void +static __always_inline void set_cpu_present(unsigned int cpu, bool present) { if (present) @@ -954,7 +954,7 @@ set_cpu_present(unsigned int cpu, bool present) void set_cpu_online(unsigned int cpu, bool online); -static inline void +static __always_inline void set_cpu_active(unsigned int cpu, bool active) { if (active) @@ -963,7 +963,7 @@ set_cpu_active(unsigned int cpu, bool active) cpumask_clear_cpu(cpu, &__cpu_active_mask); } -static inline void +static __always_inline void set_cpu_dying(unsigned int cpu, bool dying) { if (dying) @@ -986,7 +986,7 @@ set_cpu_dying(unsigned int cpu, bool dying) ((struct cpumask *)(1 ? 
(bitmap) \ : (void *)sizeof(__check_is_bitmap(bitmap)))) -static inline int __check_is_bitmap(const unsigned long *bitmap) +static __always_inline int __check_is_bitmap(const unsigned long *bitmap) { return 1; } @@ -1001,7 +1001,7 @@ static inline int __check_is_bitmap(const unsigned long *bitmap) extern const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)]; -static inline const struct cpumask *get_cpu_mask(unsigned int cpu) +static __always_inline const struct cpumask *get_cpu_mask(unsigned int cpu) { const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; p -= cpu / BITS_PER_LONG; @@ -1017,7 +1017,7 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu) * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held * region. */ -static inline unsigned int num_online_cpus(void) +static __always_inline unsigned int num_online_cpus(void) { return atomic_read(&__num_online_cpus); } @@ -1025,27 +1025,27 @@ static inline unsigned int num_online_cpus(void) #define num_present_cpus() cpumask_weight(cpu_present_mask) #define num_active_cpus() cpumask_weight(cpu_active_mask) -static inline bool cpu_online(unsigned int cpu) +static __always_inline bool cpu_online(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_online_mask); } -static inline bool cpu_possible(unsigned int cpu) +static __always_inline bool cpu_possible(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_possible_mask); } -static inline bool cpu_present(unsigned int cpu) +static __always_inline bool cpu_present(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_present_mask); } -static inline bool cpu_active(unsigned int cpu) +static __always_inline bool cpu_active(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_active_mask); } -static inline bool cpu_dying(unsigned int cpu) +static __always_inline bool cpu_dying(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_dying_mask); } @@ -1057,27 +1057,27 @@ static inline bool cpu_dying(unsigned int cpu) #define num_present_cpus() 1U #define num_active_cpus() 1U -static inline bool cpu_online(unsigned int cpu) +static __always_inline bool cpu_online(unsigned int cpu) { return cpu == 0; } -static inline bool cpu_possible(unsigned int cpu) +static __always_inline bool cpu_possible(unsigned int cpu) { return cpu == 0; } -static inline bool cpu_present(unsigned int cpu) +static __always_inline bool cpu_present(unsigned int cpu) { return cpu == 0; } -static inline bool cpu_active(unsigned int cpu) +static __always_inline bool cpu_active(unsigned int cpu) { return cpu == 0; } -static inline bool cpu_dying(unsigned int cpu) +static __always_inline bool cpu_dying(unsigned int cpu) { return false; } @@ -1111,7 +1111,7 @@ static inline bool cpu_dying(unsigned int cpu) * Returns the length of the (null-terminated) @buf string, zero if * nothing is copied. */ -static inline ssize_t +static __always_inline ssize_t cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) { return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask), @@ -1134,7 +1134,7 @@ cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) * Returns the length of how many bytes have been copied, excluding * terminating '\0'. 
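The print helpers here are what sysfs attributes exposing a cpumask are typically built on. A hypothetical show() method follows; the device attribute and the choice of cpu_online_mask as the exported mask are both invented for illustration.

/* Hypothetical sysfs attribute exposing a mask in list format ("0-3,8"). */
static ssize_t demo_cpus_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	const struct cpumask *mask = cpu_online_mask;	/* stand-in for a driver mask */

	return cpumap_print_to_pagebuf(true, buf, mask);	/* true selects list format */
}
static DEVICE_ATTR_RO(demo_cpus);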
*/ -static inline ssize_t +static __always_inline ssize_t cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, loff_t off, size_t count) { @@ -1149,7 +1149,7 @@ cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, * Everything is same with the above cpumap_print_bitmask_to_buf() * except the print format. */ -static inline ssize_t +static __always_inline ssize_t cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, loff_t off, size_t count) { diff --git a/include/linux/find.h b/include/linux/find.h index ccaf61a0f5fd..db2f2851601d 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -45,7 +45,7 @@ unsigned long _find_next_bit_le(const unsigned long *addr, unsigned * Returns the bit number for the next set bit * If no bits are set, returns @size. */ -static inline +static __always_inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { @@ -74,7 +74,7 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size, * Returns the bit number for the next set bit * If no bits are set, returns @size. */ -static inline +static __always_inline unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) @@ -105,7 +105,7 @@ unsigned long find_next_and_bit(const unsigned long *addr1, * Returns the bit number for the next set bit * If no bits are set, returns @size. */ -static inline +static __always_inline unsigned long find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) @@ -134,7 +134,7 @@ unsigned long find_next_andnot_bit(const unsigned long *addr1, * Returns the bit number of the next zero bit * If no bits are zero, returns @size. */ -static inline +static __always_inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { @@ -161,7 +161,7 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, * Returns the bit number of the first set bit. * If no bits are set, returns @size. */ -static inline +static __always_inline unsigned long find_first_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { @@ -187,7 +187,7 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) * Returns the bit number of the N'th set bit. * If no such, returns @size. */ -static inline +static __always_inline unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n) { if (n >= size) @@ -212,7 +212,7 @@ unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsign * Returns the bit number of the N'th set bit. * If no such, returns @size. */ -static inline +static __always_inline unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { @@ -239,7 +239,7 @@ unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ -static inline +static __always_inline unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { @@ -265,7 +265,7 @@ unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned lon * Returns the bit number for the next set bit * If no bits are set, returns @size. 
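These prototypes keep the existing calling convention: a return value equal to @size means no bit was found. A kernel-style walk over the set bits of a small bitmap, with the bitmap and its size invented for illustration, looks like the sketch below (essentially what the for_each_set_bit() macro expands to).

/* Hypothetical walk over set bits; DEMO_BITS and demo_mask are illustrative. */
#define DEMO_BITS 64

static DECLARE_BITMAP(demo_mask, DEMO_BITS);

static void demo_walk(void)
{
	unsigned long bit;

	for (bit = find_first_bit(demo_mask, DEMO_BITS);
	     bit < DEMO_BITS;				/* == DEMO_BITS means not found */
	     bit = find_next_bit(demo_mask, DEMO_BITS, bit + 1))
		pr_info("bit %lu is set\n", bit);
}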
*/ -static inline +static __always_inline unsigned long find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size) @@ -289,7 +289,7 @@ unsigned long find_first_and_bit(const unsigned long *addr1, * Returns the bit number of the first cleared bit. * If no bits are zero, returns @size. */ -static inline +static __always_inline unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { @@ -310,7 +310,7 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) * * Returns the bit number of the last set bit, or size. */ -static inline +static __always_inline unsigned long find_last_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { @@ -333,7 +333,7 @@ unsigned long find_last_bit(const unsigned long *addr, unsigned long size) * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ -static inline +static __always_inline unsigned long find_next_and_bit_wrap(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) @@ -356,7 +356,7 @@ unsigned long find_next_and_bit_wrap(const unsigned long *addr1, * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ -static inline +static __always_inline unsigned long find_next_bit_wrap(const unsigned long *addr, unsigned long size, unsigned long offset) { @@ -373,7 +373,7 @@ unsigned long find_next_bit_wrap(const unsigned long *addr, * Helper for for_each_set_bit_wrap(). Make sure you're doing right thing * before using it alone. */ -static inline +static __always_inline unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size, unsigned long start, unsigned long n) { @@ -414,19 +414,19 @@ extern unsigned long find_next_clump8(unsigned long *clump, #if defined(__LITTLE_ENDIAN) -static inline unsigned long find_next_zero_bit_le(const void *addr, +static __always_inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_zero_bit(addr, size, offset); } -static inline unsigned long find_next_bit_le(const void *addr, +static __always_inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_bit(addr, size, offset); } -static inline unsigned long find_first_zero_bit_le(const void *addr, +static __always_inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { return find_first_zero_bit(addr, size); @@ -435,7 +435,7 @@ static inline unsigned long find_first_zero_bit_le(const void *addr, #elif defined(__BIG_ENDIAN) #ifndef find_next_zero_bit_le -static inline +static __always_inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { @@ -454,7 +454,7 @@ unsigned long find_next_zero_bit_le(const void *addr, unsigned #endif #ifndef find_first_zero_bit_le -static inline +static __always_inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { if (small_const_nbits(size)) { @@ -468,7 +468,7 @@ unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) #endif #ifndef find_next_bit_le -static inline +static __always_inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index bb0ee80526b2..8c04254c5284 
100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -107,11 +107,11 @@ extern nodemask_t _unused_nodemask_arg_; */ #define nodemask_pr_args(maskp) __nodemask_pr_numnodes(maskp), \ __nodemask_pr_bits(maskp) -static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) +static __always_inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) { return m ? MAX_NUMNODES : 0; } -static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) +static __always_inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) { return m ? m->bits : NULL; } @@ -132,19 +132,19 @@ static __always_inline void __node_set(int node, volatile nodemask_t *dstp) } #define node_clear(node, dst) __node_clear((node), &(dst)) -static inline void __node_clear(int node, volatile nodemask_t *dstp) +static __always_inline void __node_clear(int node, volatile nodemask_t *dstp) { clear_bit(node, dstp->bits); } #define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES) -static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) +static __always_inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) { bitmap_fill(dstp->bits, nbits); } #define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES) -static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) +static __always_inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) { bitmap_zero(dstp->bits, nbits); } @@ -154,14 +154,14 @@ static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) #define node_test_and_set(node, nodemask) \ __node_test_and_set((node), &(nodemask)) -static inline bool __node_test_and_set(int node, nodemask_t *addr) +static __always_inline bool __node_test_and_set(int node, nodemask_t *addr) { return test_and_set_bit(node, addr->bits); } #define nodes_and(dst, src1, src2) \ __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) -static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, +static __always_inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); @@ -169,7 +169,7 @@ static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_or(dst, src1, src2) \ __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES) -static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, +static __always_inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); @@ -177,7 +177,7 @@ static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_xor(dst, src1, src2) \ __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES) -static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, +static __always_inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); @@ -185,7 +185,7 @@ static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_andnot(dst, src1, src2) \ __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) -static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, +static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); @@ -193,7 +193,7 @@ static inline 
void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_complement(dst, src) \ __nodes_complement(&(dst), &(src), MAX_NUMNODES) -static inline void __nodes_complement(nodemask_t *dstp, +static __always_inline void __nodes_complement(nodemask_t *dstp, const nodemask_t *srcp, unsigned int nbits) { bitmap_complement(dstp->bits, srcp->bits, nbits); @@ -201,7 +201,7 @@ static inline void __nodes_complement(nodemask_t *dstp, #define nodes_equal(src1, src2) \ __nodes_equal(&(src1), &(src2), MAX_NUMNODES) -static inline bool __nodes_equal(const nodemask_t *src1p, +static __always_inline bool __nodes_equal(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_equal(src1p->bits, src2p->bits, nbits); @@ -209,7 +209,7 @@ static inline bool __nodes_equal(const nodemask_t *src1p, #define nodes_intersects(src1, src2) \ __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) -static inline bool __nodes_intersects(const nodemask_t *src1p, +static __always_inline bool __nodes_intersects(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_intersects(src1p->bits, src2p->bits, nbits); @@ -217,33 +217,33 @@ static inline bool __nodes_intersects(const nodemask_t *src1p, #define nodes_subset(src1, src2) \ __nodes_subset(&(src1), &(src2), MAX_NUMNODES) -static inline bool __nodes_subset(const nodemask_t *src1p, +static __always_inline bool __nodes_subset(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_subset(src1p->bits, src2p->bits, nbits); } #define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) -static inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits) +static __always_inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits) { return bitmap_empty(srcp->bits, nbits); } #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) -static inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits) +static __always_inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits) { return bitmap_full(srcp->bits, nbits); } #define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES) -static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) +static __always_inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) { return bitmap_weight(srcp->bits, nbits); } #define nodes_shift_right(dst, src, n) \ __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES) -static inline void __nodes_shift_right(nodemask_t *dstp, +static __always_inline void __nodes_shift_right(nodemask_t *dstp, const nodemask_t *srcp, int n, int nbits) { bitmap_shift_right(dstp->bits, srcp->bits, n, nbits); @@ -251,7 +251,7 @@ static inline void __nodes_shift_right(nodemask_t *dstp, #define nodes_shift_left(dst, src, n) \ __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES) -static inline void __nodes_shift_left(nodemask_t *dstp, +static __always_inline void __nodes_shift_left(nodemask_t *dstp, const nodemask_t *srcp, int n, int nbits) { bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); @@ -261,13 +261,13 @@ static inline void __nodes_shift_left(nodemask_t *dstp, > MAX_NUMNODES, then the silly min_ts could be dropped. 
*/ #define first_node(src) __first_node(&(src)) -static inline unsigned int __first_node(const nodemask_t *srcp) +static __always_inline unsigned int __first_node(const nodemask_t *srcp) { return min_t(unsigned int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES)); } #define next_node(n, src) __next_node((n), &(src)) -static inline unsigned int __next_node(int n, const nodemask_t *srcp) +static __always_inline unsigned int __next_node(int n, const nodemask_t *srcp) { return min_t(unsigned int, MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); } @@ -277,7 +277,7 @@ static inline unsigned int __next_node(int n, const nodemask_t *srcp) * the first node in src if needed. Returns MAX_NUMNODES if src is empty. */ #define next_node_in(n, src) __next_node_in((n), &(src)) -static inline unsigned int __next_node_in(int node, const nodemask_t *srcp) +static __always_inline unsigned int __next_node_in(int node, const nodemask_t *srcp) { unsigned int ret = __next_node(node, srcp); @@ -286,7 +286,7 @@ static inline unsigned int __next_node_in(int node, const nodemask_t *srcp) return ret; } -static inline void init_nodemask_of_node(nodemask_t *mask, int node) +static __always_inline void init_nodemask_of_node(nodemask_t *mask, int node) { nodes_clear(*mask); node_set(node, *mask); @@ -304,7 +304,7 @@ static inline void init_nodemask_of_node(nodemask_t *mask, int node) }) #define first_unset_node(mask) __first_unset_node(&(mask)) -static inline unsigned int __first_unset_node(const nodemask_t *maskp) +static __always_inline unsigned int __first_unset_node(const nodemask_t *maskp) { return min_t(unsigned int, MAX_NUMNODES, find_first_zero_bit(maskp->bits, MAX_NUMNODES)); @@ -338,21 +338,21 @@ static inline unsigned int __first_unset_node(const nodemask_t *maskp) #define nodemask_parse_user(ubuf, ulen, dst) \ __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES) -static inline int __nodemask_parse_user(const char __user *buf, int len, +static __always_inline int __nodemask_parse_user(const char __user *buf, int len, nodemask_t *dstp, int nbits) { return bitmap_parse_user(buf, len, dstp->bits, nbits); } #define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES) -static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) +static __always_inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) { return bitmap_parselist(buf, dstp->bits, nbits); } #define node_remap(oldbit, old, new) \ __node_remap((oldbit), &(old), &(new), MAX_NUMNODES) -static inline int __node_remap(int oldbit, +static __always_inline int __node_remap(int oldbit, const nodemask_t *oldp, const nodemask_t *newp, int nbits) { return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits); @@ -360,7 +360,7 @@ static inline int __node_remap(int oldbit, #define nodes_remap(dst, src, old, new) \ __nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES) -static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, +static __always_inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, const nodemask_t *oldp, const nodemask_t *newp, int nbits) { bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits); @@ -368,7 +368,7 @@ static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, #define nodes_onto(dst, orig, relmap) \ __nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES) -static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, +static __always_inline void __nodes_onto(nodemask_t *dstp, const 
nodemask_t *origp, const nodemask_t *relmapp, int nbits) { bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits); @@ -376,7 +376,7 @@ static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, #define nodes_fold(dst, orig, sz) \ __nodes_fold(&(dst), &(orig), sz, MAX_NUMNODES) -static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, +static __always_inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, int sz, int nbits) { bitmap_fold(dstp->bits, origp->bits, sz, nbits); @@ -418,22 +418,22 @@ enum node_states { extern nodemask_t node_states[NR_NODE_STATES]; #if MAX_NUMNODES > 1 -static inline int node_state(int node, enum node_states state) +static __always_inline int node_state(int node, enum node_states state) { return node_isset(node, node_states[state]); } -static inline void node_set_state(int node, enum node_states state) +static __always_inline void node_set_state(int node, enum node_states state) { __node_set(node, &node_states[state]); } -static inline void node_clear_state(int node, enum node_states state) +static __always_inline void node_clear_state(int node, enum node_states state) { __node_clear(node, &node_states[state]); } -static inline int num_node_state(enum node_states state) +static __always_inline int num_node_state(enum node_states state) { return nodes_weight(node_states[state]); } @@ -443,11 +443,11 @@ static inline int num_node_state(enum node_states state) #define first_online_node first_node(node_states[N_ONLINE]) #define first_memory_node first_node(node_states[N_MEMORY]) -static inline unsigned int next_online_node(int nid) +static __always_inline unsigned int next_online_node(int nid) { return next_node(nid, node_states[N_ONLINE]); } -static inline unsigned int next_memory_node(int nid) +static __always_inline unsigned int next_memory_node(int nid) { return next_node(nid, node_states[N_MEMORY]); } @@ -455,13 +455,13 @@ static inline unsigned int next_memory_node(int nid) extern unsigned int nr_node_ids; extern unsigned int nr_online_nodes; -static inline void node_set_online(int nid) +static __always_inline void node_set_online(int nid) { node_set_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); } -static inline void node_set_offline(int nid) +static __always_inline void node_set_offline(int nid) { node_clear_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); @@ -469,20 +469,20 @@ static inline void node_set_offline(int nid) #else -static inline int node_state(int node, enum node_states state) +static __always_inline int node_state(int node, enum node_states state) { return node == 0; } -static inline void node_set_state(int node, enum node_states state) +static __always_inline void node_set_state(int node, enum node_states state) { } -static inline void node_clear_state(int node, enum node_states state) +static __always_inline void node_clear_state(int node, enum node_states state) { } -static inline int num_node_state(enum node_states state) +static __always_inline int num_node_state(enum node_states state) { return 1; } @@ -502,7 +502,7 @@ static inline int num_node_state(enum node_states state) #endif -static inline int node_random(const nodemask_t *maskp) +static __always_inline int node_random(const nodemask_t *maskp) { #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1) int w, bit; -- 2.40.0 From 955ced3981bf578dc6cc299ed5f620f68ae29c39 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Thu, 13 Apr 2023 18:13:45 +0200 Subject: [PATCH 04/15] cachy Signed-off-by: Peter Jung --- 
.gitignore | 1 + .../admin-guide/kernel-parameters.txt | 11 +- Documentation/dontdiff | 1 + Makefile | 8 +- arch/arc/configs/axs101_defconfig | 1 + arch/arc/configs/axs103_defconfig | 1 + arch/arc/configs/axs103_smp_defconfig | 1 + arch/arc/configs/haps_hs_defconfig | 1 + arch/arc/configs/haps_hs_smp_defconfig | 1 + arch/arc/configs/hsdk_defconfig | 1 + arch/arc/configs/nsim_700_defconfig | 1 + arch/arc/configs/nsimosci_defconfig | 1 + arch/arc/configs/nsimosci_hs_defconfig | 1 + arch/arc/configs/nsimosci_hs_smp_defconfig | 1 + arch/arc/configs/tb10x_defconfig | 1 + arch/arc/configs/vdk_hs38_defconfig | 1 + arch/arc/configs/vdk_hs38_smp_defconfig | 1 + arch/x86/Kconfig.cpu | 416 ++++++++++- arch/x86/Makefile | 45 +- arch/x86/Makefile.postlink | 41 ++ arch/x86/boot/compressed/.gitignore | 1 - arch/x86/boot/compressed/Makefile | 10 +- arch/x86/include/asm/vermagic.h | 72 ++ drivers/Makefile | 15 +- drivers/i2c/busses/Kconfig | 9 + drivers/i2c/busses/Makefile | 1 + drivers/i2c/busses/i2c-nct6775.c | 647 ++++++++++++++++++ drivers/i2c/busses/i2c-piix4.c | 4 +- drivers/md/dm-crypt.c | 5 + drivers/pci/quirks.c | 102 +++ drivers/platform/x86/Kconfig | 14 + drivers/platform/x86/Makefile | 3 + drivers/platform/x86/steamdeck.c | 523 ++++++++++++++ include/linux/pagemap.h | 2 +- include/linux/user_namespace.h | 4 + include/net/netns/ipv4.h | 1 + include/trace/events/tcp.h | 7 + init/Kconfig | 39 ++ kernel/Kconfig.hz | 24 + kernel/fork.c | 14 + kernel/module/Kconfig | 25 + kernel/rcu/Kconfig | 4 +- kernel/rcu/rcutorture.c | 2 +- kernel/rcu/tree.c | 6 +- kernel/rcu/tree_nocb.h | 4 +- kernel/rcu/tree_plugin.h | 4 +- kernel/sched/fair.c | 20 +- kernel/sysctl.c | 12 + kernel/user_namespace.c | 7 + mm/Kconfig | 2 +- mm/compaction.c | 4 + mm/page-writeback.c | 8 + mm/swap.c | 5 + mm/vmpressure.c | 4 + mm/vmscan.c | 8 + net/ipv4/sysctl_net_ipv4.c | 7 + net/ipv4/tcp_input.c | 36 + net/ipv4/tcp_ipv4.c | 2 + scripts/Makefile.lib | 13 +- scripts/Makefile.modinst | 7 +- 60 files changed, 2156 insertions(+), 57 deletions(-) create mode 100644 arch/x86/Makefile.postlink create mode 100644 drivers/i2c/busses/i2c-nct6775.c create mode 100644 drivers/platform/x86/steamdeck.c diff --git a/.gitignore b/.gitignore index 20dce5c3b9e0..466c23de56ce 100644 --- a/.gitignore +++ b/.gitignore @@ -63,6 +63,7 @@ modules.order /vmlinux /vmlinux.32 /vmlinux.map +/vmlinux.relocs /vmlinux.symvers /vmlinux-gdb.py /vmlinuz diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6cfa6e3996cf..54239257209c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4178,6 +4178,15 @@ nomsi [MSI] If the PCI_MSI kernel config parameter is enabled, this kernel boot option can be used to disable the use of MSI interrupts system-wide. + pcie_acs_override = + [PCIE] Override missing PCIe ACS support for: + downstream + All downstream ports - full ACS capabilities + multifunction + All multifunction devices - multifunction ACS subset + id:nnnn:nnnn + Specific device - full ACS capabilities + Specified as vid:did (vendor/device ID) in hex noioapicquirk [APIC] Disable all boot interrupt quirks. Safety option to keep boot IRQs enabled. This should never be necessary. @@ -4751,7 +4760,7 @@ overwritten. rcutree.kthread_prio= [KNL,BOOT] - Set the SCHED_FIFO priority of the RCU per-CPU + Set the SCHED_RR priority of the RCU per-CPU kthreads (rcuc/N). 
This value is also used for the priority of the RCU boost threads (rcub/N) and for the RCU grace-period kthreads (rcu_bh, diff --git a/Documentation/dontdiff b/Documentation/dontdiff index 352ff53a2306..7c210744d84c 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff @@ -255,6 +255,7 @@ vmlinux.aout vmlinux.bin.all vmlinux.lds vmlinux.map +vmlinux.relocs vmlinux.symvers vmlinuz voffset.h diff --git a/Makefile b/Makefile index 068374cc2601..f7890837c555 100644 --- a/Makefile +++ b/Makefile @@ -834,6 +834,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE KBUILD_CFLAGS += -O2 KBUILD_RUSTFLAGS += -Copt-level=2 +else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 +KBUILD_CFLAGS += -O3 +KBUILD_RUSTFLAGS += -Copt-level=3 else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE KBUILD_CFLAGS += -Os KBUILD_RUSTFLAGS += -Copt-level=s @@ -1075,11 +1078,6 @@ KBUILD_CFLAGS += -fno-strict-overflow # Make sure -fstack-check isn't enabled (like gentoo apparently did) KBUILD_CFLAGS += -fno-stack-check -# conserve stack if available -ifdef CONFIG_CC_IS_GCC -KBUILD_CFLAGS += -fconserve-stack -endif - # Prohibit date/time macros, which would make the build non-deterministic KBUILD_CFLAGS += -Werror=date-time diff --git a/arch/arc/configs/axs101_defconfig b/arch/arc/configs/axs101_defconfig index 81764160451f..2c15d3bf747a 100644 --- a/arch/arc/configs/axs101_defconfig +++ b/arch/arc/configs/axs101_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/axs103_defconfig b/arch/arc/configs/axs103_defconfig index d5181275490e..7d868e148d9a 100644 --- a/arch/arc/configs/axs103_defconfig +++ b/arch/arc/configs/axs103_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/axs103_smp_defconfig b/arch/arc/configs/axs103_smp_defconfig index 2f336d99a8cf..777a9f21eb6b 100644 --- a/arch/arc/configs/axs103_smp_defconfig +++ b/arch/arc/configs/axs103_smp_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/haps_hs_defconfig b/arch/arc/configs/haps_hs_defconfig index 899b2fd5c71d..bda15a876849 100644 --- a/arch/arc/configs/haps_hs_defconfig +++ b/arch/arc/configs/haps_hs_defconfig @@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EXPERT=y CONFIG_PERF_EVENTS=y # CONFIG_COMPAT_BRK is not set diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig index 0d32aac8069f..dbd74fea69aa 100644 --- a/arch/arc/configs/haps_hs_smp_defconfig +++ b/arch/arc/configs/haps_hs_smp_defconfig @@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/hsdk_defconfig b/arch/arc/configs/hsdk_defconfig 
index d18378d2c2a6..2396ca417182 100644 --- a/arch/arc/configs/hsdk_defconfig +++ b/arch/arc/configs/hsdk_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y CONFIG_BLK_DEV_RAM=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig index 3e9829775992..5044609540cc 100644 --- a/arch/arc/configs/nsim_700_defconfig +++ b/arch/arc/configs/nsim_700_defconfig @@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig index 502c87f351c8..748c809d1c4c 100644 --- a/arch/arc/configs/nsimosci_defconfig +++ b/arch/arc/configs/nsimosci_defconfig @@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y diff --git a/arch/arc/configs/nsimosci_hs_defconfig b/arch/arc/configs/nsimosci_hs_defconfig index f721cc3997d0..205c32b0074c 100644 --- a/arch/arc/configs/nsimosci_hs_defconfig +++ b/arch/arc/configs/nsimosci_hs_defconfig @@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y diff --git a/arch/arc/configs/nsimosci_hs_smp_defconfig b/arch/arc/configs/nsimosci_hs_smp_defconfig index 1419fc946a08..2477b7c80977 100644 --- a/arch/arc/configs/nsimosci_hs_smp_defconfig +++ b/arch/arc/configs/nsimosci_hs_smp_defconfig @@ -8,6 +8,7 @@ CONFIG_IKCONFIG_PROC=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_PERF_EVENTS=y # CONFIG_COMPAT_BRK is not set CONFIG_KPROBES=y diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig index 6f0d2be9d926..cf02ad0fc210 100644 --- a/arch/arc/configs/tb10x_defconfig +++ b/arch/arc/configs/tb10x_defconfig @@ -14,6 +14,7 @@ CONFIG_INITRAMFS_SOURCE="../tb10x-rootfs.cpio" CONFIG_INITRAMFS_ROOT_UID=2100 CONFIG_INITRAMFS_ROOT_GID=501 # CONFIG_RD_GZIP is not set +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_KALLSYMS_ALL=y # CONFIG_AIO is not set CONFIG_EMBEDDED=y diff --git a/arch/arc/configs/vdk_hs38_defconfig b/arch/arc/configs/vdk_hs38_defconfig index d3ef189c75f8..922b1b24f518 100644 --- a/arch/arc/configs/vdk_hs38_defconfig +++ b/arch/arc/configs/vdk_hs38_defconfig @@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/vdk_hs38_smp_defconfig b/arch/arc/configs/vdk_hs38_smp_defconfig index 944b347025fd..ed64319f7eb2 100644 --- a/arch/arc/configs/vdk_hs38_smp_defconfig +++ b/arch/arc/configs/vdk_hs38_smp_defconfig @@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 542377cd419d..08d887d1220d 100644 --- 
a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -157,7 +157,7 @@ config MPENTIUM4 config MK6 - bool "K6/K6-II/K6-III" + bool "AMD K6/K6-II/K6-III" depends on X86_32 help Select this for an AMD K6-family processor. Enables use of @@ -165,7 +165,7 @@ config MK6 flags to GCC. config MK7 - bool "Athlon/Duron/K7" + bool "AMD Athlon/Duron/K7" depends on X86_32 help Select this for an AMD Athlon K7-family processor. Enables use of @@ -173,12 +173,106 @@ config MK7 flags to GCC. config MK8 - bool "Opteron/Athlon64/Hammer/K8" + bool "AMD Opteron/Athlon64/Hammer/K8" help Select this for an AMD Opteron or Athlon64 Hammer-family processor. Enables use of some extended instructions, and passes appropriate optimization flags to GCC. +config MK8SSE3 + bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3" + help + Select this for improved AMD Opteron or Athlon64 Hammer-family processors. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. + +config MK10 + bool "AMD 61xx/7x50/PhenomX3/X4/II/K10" + help + Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50, + Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. + +config MBARCELONA + bool "AMD Barcelona" + help + Select this for AMD Family 10h Barcelona processors. + + Enables -march=barcelona + +config MBOBCAT + bool "AMD Bobcat" + help + Select this for AMD Family 14h Bobcat processors. + + Enables -march=btver1 + +config MJAGUAR + bool "AMD Jaguar" + help + Select this for AMD Family 16h Jaguar processors. + + Enables -march=btver2 + +config MBULLDOZER + bool "AMD Bulldozer" + help + Select this for AMD Family 15h Bulldozer processors. + + Enables -march=bdver1 + +config MPILEDRIVER + bool "AMD Piledriver" + help + Select this for AMD Family 15h Piledriver processors. + + Enables -march=bdver2 + +config MSTEAMROLLER + bool "AMD Steamroller" + help + Select this for AMD Family 15h Steamroller processors. + + Enables -march=bdver3 + +config MEXCAVATOR + bool "AMD Excavator" + help + Select this for AMD Family 15h Excavator processors. + + Enables -march=bdver4 + +config MZEN + bool "AMD Zen" + help + Select this for AMD Family 17h Zen processors. + + Enables -march=znver1 + +config MZEN2 + bool "AMD Zen 2" + help + Select this for AMD Family 17h Zen 2 processors. + + Enables -march=znver2 + +config MZEN3 + bool "AMD Zen 3" + depends on (CC_IS_GCC && GCC_VERSION >= 100300) || (CC_IS_CLANG && CLANG_VERSION >= 120000) + help + Select this for AMD Family 19h Zen 3 processors. + + Enables -march=znver3 + +config MZEN4 + bool "AMD Zen 4" + depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) + help + Select this for AMD Family 19h Zen 4 processors. + + Enables -march=znver4 + config MCRUSOE bool "Crusoe" depends on X86_32 @@ -270,7 +364,7 @@ config MPSC in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. config MCORE2 - bool "Core 2/newer Xeon" + bool "Intel Core 2" help Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and @@ -278,6 +372,8 @@ config MCORE2 family in /proc/cpuinfo. Newer ones have 6 and older ones 15 (not a typo) + Enables -march=core2 + config MATOM bool "Intel Atom" help @@ -287,6 +383,202 @@ config MATOM accordingly optimized code. Use a recent GCC with specific Atom support in order to fully benefit from selecting this option. 
+config MNEHALEM + bool "Intel Nehalem" + select X86_P6_NOP + help + + Select this for 1st Gen Core processors in the Nehalem family. + + Enables -march=nehalem + +config MWESTMERE + bool "Intel Westmere" + select X86_P6_NOP + help + + Select this for the Intel Westmere formerly Nehalem-C family. + + Enables -march=westmere + +config MSILVERMONT + bool "Intel Silvermont" + select X86_P6_NOP + help + + Select this for the Intel Silvermont platform. + + Enables -march=silvermont + +config MGOLDMONT + bool "Intel Goldmont" + select X86_P6_NOP + help + + Select this for the Intel Goldmont platform including Apollo Lake and Denverton. + + Enables -march=goldmont + +config MGOLDMONTPLUS + bool "Intel Goldmont Plus" + select X86_P6_NOP + help + + Select this for the Intel Goldmont Plus platform including Gemini Lake. + + Enables -march=goldmont-plus + +config MSANDYBRIDGE + bool "Intel Sandy Bridge" + select X86_P6_NOP + help + + Select this for 2nd Gen Core processors in the Sandy Bridge family. + + Enables -march=sandybridge + +config MIVYBRIDGE + bool "Intel Ivy Bridge" + select X86_P6_NOP + help + + Select this for 3rd Gen Core processors in the Ivy Bridge family. + + Enables -march=ivybridge + +config MHASWELL + bool "Intel Haswell" + select X86_P6_NOP + help + + Select this for 4th Gen Core processors in the Haswell family. + + Enables -march=haswell + +config MBROADWELL + bool "Intel Broadwell" + select X86_P6_NOP + help + + Select this for 5th Gen Core processors in the Broadwell family. + + Enables -march=broadwell + +config MSKYLAKE + bool "Intel Skylake" + select X86_P6_NOP + help + + Select this for 6th Gen Core processors in the Skylake family. + + Enables -march=skylake + +config MSKYLAKEX + bool "Intel Skylake X" + select X86_P6_NOP + help + + Select this for 6th Gen Core processors in the Skylake X family. + + Enables -march=skylake-avx512 + +config MCANNONLAKE + bool "Intel Cannon Lake" + select X86_P6_NOP + help + + Select this for 8th Gen Core processors + + Enables -march=cannonlake + +config MICELAKE + bool "Intel Ice Lake" + select X86_P6_NOP + help + + Select this for 10th Gen Core processors in the Ice Lake family. + + Enables -march=icelake-client + +config MCASCADELAKE + bool "Intel Cascade Lake" + select X86_P6_NOP + help + + Select this for Xeon processors in the Cascade Lake family. + + Enables -march=cascadelake + +config MCOOPERLAKE + bool "Intel Cooper Lake" + depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) + select X86_P6_NOP + help + + Select this for Xeon processors in the Cooper Lake family. + + Enables -march=cooperlake + +config MTIGERLAKE + bool "Intel Tiger Lake" + depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) + select X86_P6_NOP + help + + Select this for third-generation 10 nm process processors in the Tiger Lake family. + + Enables -march=tigerlake + +config MSAPPHIRERAPIDS + bool "Intel Sapphire Rapids" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) + select X86_P6_NOP + help + + Select this for third-generation 10 nm process processors in the Sapphire Rapids family. + + Enables -march=sapphirerapids + +config MROCKETLAKE + bool "Intel Rocket Lake" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) + select X86_P6_NOP + help + + Select this for eleventh-generation processors in the Rocket Lake family. 
+ + Enables -march=rocketlake + +config MALDERLAKE + bool "Intel Alder Lake" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) + select X86_P6_NOP + help + + Select this for twelfth-generation processors in the Alder Lake family. + + Enables -march=alderlake + +config MRAPTORLAKE + bool "Intel Raptor Lake" + depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) + select X86_P6_NOP + help + + Select this for thirteenth-generation processors in the Raptor Lake family. + + Enables -march=raptorlake + +config MMETEORLAKE + bool "Intel Meteor Lake" + depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) + select X86_P6_NOP + help + + Select this for fourteenth-generation processors in the Meteor Lake family. + + Enables -march=meteorlake + config GENERIC_CPU bool "Generic-x86-64" depends on X86_64 @@ -294,6 +586,50 @@ config GENERIC_CPU Generic x86-64 CPU. Run equally well on all x86-64 CPUs. +config GENERIC_CPU2 + bool "Generic-x86-64-v2" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) + depends on X86_64 + help + Generic x86-64 CPU. + Run equally well on all x86-64 CPUs with min support of x86-64-v2. + +config GENERIC_CPU3 + bool "Generic-x86-64-v3" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) + depends on X86_64 + help + Generic x86-64-v3 CPU with v3 instructions. + Run equally well on all x86-64 CPUs with min support of x86-64-v3. + +config GENERIC_CPU4 + bool "Generic-x86-64-v4" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) + depends on X86_64 + help + Generic x86-64 CPU with v4 instructions. + Run equally well on all x86-64 CPUs with min support of x86-64-v4. + +config MNATIVE_INTEL + bool "Intel-Native optimizations autodetected by the compiler" + help + + Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects + the optimum settings to use based on your processor. Do NOT use this + for AMD CPUs. Intel Only! + + Enables -march=native + +config MNATIVE_AMD + bool "AMD-Native optimizations autodetected by the compiler" + help + + Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects + the optimum settings to use based on your processor. Do NOT use this + for Intel CPUs. AMD Only! 
+ + Enables -march=native + endchoice config X86_GENERIC @@ -318,9 +654,17 @@ config X86_INTERNODE_CACHE_SHIFT config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC - default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU + default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || MK8SSE3 || MK10 \ + || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \ + || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \ + || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \ + || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ + || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE \ + || MNATIVE_INTEL || MNATIVE_AMD || X86_GENERIC || GENERIC_CPU || GENERIC_CPU2 || GENERIC_CPU3 \ + || GENERIC_CPU4 default "4" if MELAN || M486SX || M486 || MGEODEGX1 - default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX + default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII \ + || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX config X86_F00F_BUG def_bool y @@ -332,15 +676,27 @@ config X86_INVD_BUG config X86_ALIGNMENT_16 def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1 + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC \ + || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1 config X86_INTEL_USERCOPY def_bool y - depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 + depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC \ + || MK8 || MK7 || MEFFICEON || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \ + || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \ + || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \ + || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL config X86_USE_PPRO_CHECKSUM def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \ + || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX \ + || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \ + || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \ + || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE \ + || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE \ + || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \ + || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL || MNATIVE_AMD # # P6_NOPs are a relatively minor optimization that require a family >= @@ -356,32 +712,62 @@ config X86_USE_PPRO_CHECKSUM config X86_P6_NOP def_bool y depends on X86_64 - depends on (MCORE2 || MPENTIUM4 || MPSC) + depends on (MCORE2 || 
MPENTIUM4 || MPSC || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \ + || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE \ + || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE \ + || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL) config X86_TSC def_bool y - depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 + depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \ + || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 \ + || MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \ + || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \ + || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL \ + || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ + || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL \ + || MNATIVE_AMD) || X86_64 config X86_CMPXCHG64 def_bool y - depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 + depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ + || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 || MK8SSE3 || MK10 \ + || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN \ + || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS \ + || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE \ + || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \ + || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL || MNATIVE_AMD # this should be set for all -march=.. options where the compiler # generates cmov. 
config X86_CMOV def_bool y - depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) + depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ + || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX || MK8SSE3 || MK10 \ + || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR \ + || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \ + || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \ + || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \ + || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL || MNATIVE_AMD) config X86_MINIMUM_CPU_FAMILY int default "64" if X86_64 - default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8) + default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ + || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8 || MK8SSE3 \ + || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \ + || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \ + || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \ + || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ + || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MRAPTORLAKE \ + || MNATIVE_INTEL || MNATIVE_AMD) default "5" if X86_32 && X86_CMPXCHG64 default "4" config X86_DEBUGCTLMSR def_bool y - depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486SX || M486) && !UML + depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 \ + || M486SX || M486) && !UML config IA32_FEAT_CTL def_bool y diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 73ed982d4100..cb4c6620b34a 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -67,7 +67,7 @@ export BITS # # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 # -KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx +KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -O3 -fno-tree-vectorize KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 ifeq ($(CONFIG_X86_KERNEL_IBT),y) @@ -151,8 +151,47 @@ else # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) cflags-$(CONFIG_MK8) += -march=k8 cflags-$(CONFIG_MPSC) += -march=nocona - cflags-$(CONFIG_MCORE2) += -march=core2 - cflags-$(CONFIG_MATOM) += -march=atom + cflags-$(CONFIG_MK8SSE3) += -march=k8-sse3 + cflags-$(CONFIG_MK10) += -march=amdfam10 + cflags-$(CONFIG_MBARCELONA) += -march=barcelona + cflags-$(CONFIG_MBOBCAT) += -march=btver1 + cflags-$(CONFIG_MJAGUAR) += -march=btver2 + cflags-$(CONFIG_MBULLDOZER) += -march=bdver1 + cflags-$(CONFIG_MPILEDRIVER) += -march=bdver2 -mno-tbm + cflags-$(CONFIG_MSTEAMROLLER) += -march=bdver3 -mno-tbm + cflags-$(CONFIG_MEXCAVATOR) += -march=bdver4 -mno-tbm + cflags-$(CONFIG_MZEN) += -march=znver1 + cflags-$(CONFIG_MZEN2) += -march=znver2 + cflags-$(CONFIG_MZEN3) += -march=znver3 + cflags-$(CONFIG_MZEN4) += -march=znver4 + cflags-$(CONFIG_MNATIVE_INTEL) += 
-march=native + cflags-$(CONFIG_MNATIVE_AMD) += -march=native + cflags-$(CONFIG_MATOM) += -march=bonnell + cflags-$(CONFIG_MCORE2) += -march=core2 + cflags-$(CONFIG_MNEHALEM) += -march=nehalem + cflags-$(CONFIG_MWESTMERE) += -march=westmere + cflags-$(CONFIG_MSILVERMONT) += -march=silvermont + cflags-$(CONFIG_MGOLDMONT) += -march=goldmont + cflags-$(CONFIG_MGOLDMONTPLUS) += -march=goldmont-plus + cflags-$(CONFIG_MSANDYBRIDGE) += -march=sandybridge + cflags-$(CONFIG_MIVYBRIDGE) += -march=ivybridge + cflags-$(CONFIG_MHASWELL) += -march=haswell + cflags-$(CONFIG_MBROADWELL) += -march=broadwell + cflags-$(CONFIG_MSKYLAKE) += -march=skylake + cflags-$(CONFIG_MSKYLAKEX) += -march=skylake-avx512 + cflags-$(CONFIG_MCANNONLAKE) += -march=cannonlake + cflags-$(CONFIG_MICELAKE) += -march=icelake-client + cflags-$(CONFIG_MCASCADELAKE) += -march=cascadelake + cflags-$(CONFIG_MCOOPERLAKE) += -march=cooperlake + cflags-$(CONFIG_MTIGERLAKE) += -march=tigerlake + cflags-$(CONFIG_MSAPPHIRERAPIDS) += -march=sapphirerapids + cflags-$(CONFIG_MROCKETLAKE) += -march=rocketlake + cflags-$(CONFIG_MALDERLAKE) += -march=alderlake + cflags-$(CONFIG_MRAPTORLAKE) += -march=raptorlake + cflags-$(CONFIG_MMETEORLAKE) += -march=meteorlake + cflags-$(CONFIG_GENERIC_CPU2) += -march=x86-64-v2 + cflags-$(CONFIG_GENERIC_CPU3) += -march=x86-64-v3 + cflags-$(CONFIG_GENERIC_CPU4) += -march=x86-64-v4 cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic KBUILD_CFLAGS += $(cflags-y) diff --git a/arch/x86/Makefile.postlink b/arch/x86/Makefile.postlink new file mode 100644 index 000000000000..195af937aa4d --- /dev/null +++ b/arch/x86/Makefile.postlink @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: GPL-2.0 +# =========================================================================== +# Post-link x86 pass +# =========================================================================== +# +# 1. Separate relocations from vmlinux into vmlinux.relocs. +# 2. Strip relocations from vmlinux. 
+ +PHONY := __archpost +__archpost: + +-include include/config/auto.conf +include $(srctree)/scripts/Kbuild.include + +CMD_RELOCS = arch/x86/tools/relocs +quiet_cmd_relocs = RELOCS $@.relocs + cmd_relocs = $(CMD_RELOCS) $@ > $@.relocs;$(CMD_RELOCS) --abs-relocs $@ + +quiet_cmd_strip_relocs = RSTRIP $@ + cmd_strip_relocs = $(OBJCOPY) --remove-section='.rel.*' --remove-section='.rel__*' --remove-section='.rela.*' --remove-section='.rela__*' $@ + +# `@true` prevents complaint when there is nothing to be done + +vmlinux: FORCE + @true +ifeq ($(CONFIG_X86_NEED_RELOCS),y) + $(call cmd,relocs) + $(call cmd,strip_relocs) +endif + +%.ko: FORCE + @true + +clean: + @rm -f vmlinux.relocs + +PHONY += FORCE clean + +FORCE: + +.PHONY: $(PHONY) diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore index 25805199a506..b2968175fc27 100644 --- a/arch/x86/boot/compressed/.gitignore +++ b/arch/x86/boot/compressed/.gitignore @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only relocs vmlinux.bin.all -vmlinux.relocs vmlinux.lds mkpiggy piggy.S diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 1acff356d97a..d995595394bb 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -121,14 +121,12 @@ $(obj)/vmlinux.bin: vmlinux FORCE targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs -CMD_RELOCS = arch/x86/tools/relocs -quiet_cmd_relocs = RELOCS $@ - cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $< -$(obj)/vmlinux.relocs: vmlinux FORCE - $(call if_changed,relocs) +# vmlinux.relocs is created by the vmlinux postlink step. +vmlinux.relocs: vmlinux + @true vmlinux.bin.all-y := $(obj)/vmlinux.bin -vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs +vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += vmlinux.relocs $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE $(call if_changed,gzip) diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h index 75884d2cdec3..18021e8c0c28 100644 --- a/arch/x86/include/asm/vermagic.h +++ b/arch/x86/include/asm/vermagic.h @@ -17,6 +17,52 @@ #define MODULE_PROC_FAMILY "586MMX " #elif defined CONFIG_MCORE2 #define MODULE_PROC_FAMILY "CORE2 " +#elif defined CONFIG_MNATIVE_INTEL +#define MODULE_PROC_FAMILY "NATIVE_INTEL " +#elif defined CONFIG_MNATIVE_AMD +#define MODULE_PROC_FAMILY "NATIVE_AMD " +#elif defined CONFIG_MNEHALEM +#define MODULE_PROC_FAMILY "NEHALEM " +#elif defined CONFIG_MWESTMERE +#define MODULE_PROC_FAMILY "WESTMERE " +#elif defined CONFIG_MSILVERMONT +#define MODULE_PROC_FAMILY "SILVERMONT " +#elif defined CONFIG_MGOLDMONT +#define MODULE_PROC_FAMILY "GOLDMONT " +#elif defined CONFIG_MGOLDMONTPLUS +#define MODULE_PROC_FAMILY "GOLDMONTPLUS " +#elif defined CONFIG_MSANDYBRIDGE +#define MODULE_PROC_FAMILY "SANDYBRIDGE " +#elif defined CONFIG_MIVYBRIDGE +#define MODULE_PROC_FAMILY "IVYBRIDGE " +#elif defined CONFIG_MHASWELL +#define MODULE_PROC_FAMILY "HASWELL " +#elif defined CONFIG_MBROADWELL +#define MODULE_PROC_FAMILY "BROADWELL " +#elif defined CONFIG_MSKYLAKE +#define MODULE_PROC_FAMILY "SKYLAKE " +#elif defined CONFIG_MSKYLAKEX +#define MODULE_PROC_FAMILY "SKYLAKEX " +#elif defined CONFIG_MCANNONLAKE +#define MODULE_PROC_FAMILY "CANNONLAKE " +#elif defined CONFIG_MICELAKE +#define MODULE_PROC_FAMILY "ICELAKE " +#elif defined CONFIG_MCASCADELAKE +#define MODULE_PROC_FAMILY "CASCADELAKE " +#elif defined CONFIG_MCOOPERLAKE +#define MODULE_PROC_FAMILY "COOPERLAKE " +#elif defined 
CONFIG_MTIGERLAKE +#define MODULE_PROC_FAMILY "TIGERLAKE " +#elif defined CONFIG_MSAPPHIRERAPIDS +#define MODULE_PROC_FAMILY "SAPPHIRERAPIDS " +#elif defined CONFIG_MROCKETLAKE +#define MODULE_PROC_FAMILY "ROCKETLAKE " +#elif defined CONFIG_MALDERLAKE +#define MODULE_PROC_FAMILY "ALDERLAKE " +#elif defined CONFIG_MRAPTORLAKE +#define MODULE_PROC_FAMILY "RAPTORLAKE " +#elif defined CONFIG_MMETEORLAKE +#define MODULE_PROC_FAMILY "METEORLAKE " #elif defined CONFIG_MATOM #define MODULE_PROC_FAMILY "ATOM " #elif defined CONFIG_M686 @@ -35,6 +81,32 @@ #define MODULE_PROC_FAMILY "K7 " #elif defined CONFIG_MK8 #define MODULE_PROC_FAMILY "K8 " +#elif defined CONFIG_MK8SSE3 +#define MODULE_PROC_FAMILY "K8SSE3 " +#elif defined CONFIG_MK10 +#define MODULE_PROC_FAMILY "K10 " +#elif defined CONFIG_MBARCELONA +#define MODULE_PROC_FAMILY "BARCELONA " +#elif defined CONFIG_MBOBCAT +#define MODULE_PROC_FAMILY "BOBCAT " +#elif defined CONFIG_MBULLDOZER +#define MODULE_PROC_FAMILY "BULLDOZER " +#elif defined CONFIG_MPILEDRIVER +#define MODULE_PROC_FAMILY "PILEDRIVER " +#elif defined CONFIG_MSTEAMROLLER +#define MODULE_PROC_FAMILY "STEAMROLLER " +#elif defined CONFIG_MJAGUAR +#define MODULE_PROC_FAMILY "JAGUAR " +#elif defined CONFIG_MEXCAVATOR +#define MODULE_PROC_FAMILY "EXCAVATOR " +#elif defined CONFIG_MZEN +#define MODULE_PROC_FAMILY "ZEN " +#elif defined CONFIG_MZEN2 +#define MODULE_PROC_FAMILY "ZEN2 " +#elif defined CONFIG_MZEN3 +#define MODULE_PROC_FAMILY "ZEN3 " +#elif defined CONFIG_MZEN4 +#define MODULE_PROC_FAMILY "ZEN4 " #elif defined CONFIG_MELAN #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE diff --git a/drivers/Makefile b/drivers/Makefile index bdf1c66141c9..1e1a0832fb48 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -59,15 +59,8 @@ obj-y += char/ # iommu/ comes before gpu as gpu are using iommu controllers obj-y += iommu/ -# gpu/ comes after char for AGP vs DRM startup and after iommu -obj-y += gpu/ - obj-$(CONFIG_CONNECTOR) += connector/ -# i810fb and intelfb depend on char/agp/ -obj-$(CONFIG_FB_I810) += video/fbdev/i810/ -obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ - obj-$(CONFIG_PARPORT) += parport/ obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ @@ -79,6 +72,14 @@ obj-y += macintosh/ obj-y += scsi/ obj-y += nvme/ obj-$(CONFIG_ATA) += ata/ + +# gpu/ comes after char for AGP vs DRM startup and after iommu +obj-y += gpu/ + +# i810fb and intelfb depend on char/agp/ +obj-$(CONFIG_FB_I810) += video/fbdev/i810/ +obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ + obj-$(CONFIG_TARGET_CORE) += target/ obj-$(CONFIG_MTD) += mtd/ obj-$(CONFIG_SPI) += spi/ diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index a7bfddf08fa7..c9a5fa597950 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -229,6 +229,15 @@ config I2C_CHT_WC combined with a FUSB302 Type-C port-controller as such it is advised to also select CONFIG_TYPEC_FUSB302=m. +config I2C_NCT6775 + tristate "Nuvoton NCT6775 and compatible SMBus controller" + help + If you say yes to this option, support will be included for the + Nuvoton NCT6775 and compatible SMBus controllers. + + This driver can also be built as a module. If so, the module + will be called i2c-nct6775.
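For illustration only, outside the patch hunks: once i2c-nct6775 binds and registers its adapter, user space reaches the NCT67xx SMBus through the generic i2c-dev interface, exercising the same I2C_SMBUS_BYTE_DATA path that nct6775_access() services in the kernel. The sketch below is a minimal, hedged example; the adapter node (/dev/i2c-0), slave address (0x50, e.g. an SPD EEPROM) and register index (0x00) are assumptions that must be adjusted for the actual system.

/* Minimal i2c-dev sketch: one SMBus "read byte data" transaction.
 * Device path, slave address and register below are examples only.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/i2c.h>      /* union i2c_smbus_data, I2C_SMBUS_* constants */
#include <linux/i2c-dev.h>  /* I2C_SLAVE, I2C_SMBUS, struct i2c_smbus_ioctl_data */

int main(void)
{
	int fd = open("/dev/i2c-0", O_RDWR);	/* adapter number is system-specific */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (ioctl(fd, I2C_SLAVE, 0x50) < 0) {	/* target slave address (assumed) */
		perror("I2C_SLAVE");
		close(fd);
		return 1;
	}

	union i2c_smbus_data data;
	struct i2c_smbus_ioctl_data args = {
		.read_write = I2C_SMBUS_READ,
		.command    = 0x00,		/* register/index to read (assumed) */
		.size       = I2C_SMBUS_BYTE_DATA,
		.data       = &data,
	};
	if (ioctl(fd, I2C_SMBUS, &args) < 0) {
		perror("I2C_SMBUS");
		close(fd);
		return 1;
	}

	printf("byte at reg 0x00: 0x%02x\n", data.byte);
	close(fd);
	return 0;
}

An equivalent approach is libi2c's i2c_smbus_read_byte_data() from i2c-tools, which wraps the same I2C_SMBUS ioctl shown here.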
+ config I2C_NFORCE2 tristate "Nvidia nForce2, nForce3 and nForce4" depends on PCI diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile index e73cdb1d2b5a..052ccd05c13c 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_I2C_CHT_WC) += i2c-cht-wc.o obj-$(CONFIG_I2C_I801) += i2c-i801.o obj-$(CONFIG_I2C_ISCH) += i2c-isch.o obj-$(CONFIG_I2C_ISMT) += i2c-ismt.o +obj-$(CONFIG_I2C_NCT6775) += i2c-nct6775.o obj-$(CONFIG_I2C_NFORCE2) += i2c-nforce2.o obj-$(CONFIG_I2C_NFORCE2_S4985) += i2c-nforce2-s4985.o obj-$(CONFIG_I2C_NVIDIA_GPU) += i2c-nvidia-gpu.o diff --git a/drivers/i2c/busses/i2c-nct6775.c b/drivers/i2c/busses/i2c-nct6775.c new file mode 100644 index 000000000000..0462f0952043 --- /dev/null +++ b/drivers/i2c/busses/i2c-nct6775.c @@ -0,0 +1,647 @@ +/* + * i2c-nct6775 - Driver for the SMBus master functionality of + * Nuvoton NCT677x Super-I/O chips + * + * Copyright (C) 2019 Adam Honse + * + * Derived from nct6775 hwmon driver + * Copyright (C) 2012 Guenter Roeck + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRVNAME "i2c-nct6775" + +/* Nuvoton SMBus address offsets */ +#define SMBHSTDAT (0 + nuvoton_nct6793d_smba) +#define SMBBLKSZ (1 + nuvoton_nct6793d_smba) +#define SMBHSTCMD (2 + nuvoton_nct6793d_smba) +#define SMBHSTIDX (3 + nuvoton_nct6793d_smba) //Index field is the Command field on other controllers +#define SMBHSTCTL (4 + nuvoton_nct6793d_smba) +#define SMBHSTADD (5 + nuvoton_nct6793d_smba) +#define SMBHSTERR (9 + nuvoton_nct6793d_smba) +#define SMBHSTSTS (0xE + nuvoton_nct6793d_smba) + +/* Command register */ +#define NCT6793D_READ_BYTE 0 +#define NCT6793D_READ_WORD 1 +#define NCT6793D_READ_BLOCK 2 +#define NCT6793D_BLOCK_WRITE_READ_PROC_CALL 3 +#define NCT6793D_PROC_CALL 4 +#define NCT6793D_WRITE_BYTE 8 +#define NCT6793D_WRITE_WORD 9 +#define NCT6793D_WRITE_BLOCK 10 + +/* Control register */ +#define NCT6793D_MANUAL_START 128 +#define NCT6793D_SOFT_RESET 64 + +/* Error register */ +#define NCT6793D_NO_ACK 32 + +/* Status register */ +#define NCT6793D_FIFO_EMPTY 1 +#define NCT6793D_FIFO_FULL 2 +#define NCT6793D_MANUAL_ACTIVE 4 + +#define NCT6775_LD_SMBUS 0x0B + +/* Other settings */ +#define MAX_RETRIES 400 + +enum kinds { nct6106, nct6775, nct6776, nct6779, nct6791, nct6792, nct6793, + nct6795, nct6796, nct6798 }; + +struct nct6775_sio_data { + int sioreg; + enum kinds kind; +}; + +/* used to set data->name = nct6775_device_names[data->sio_kind] */ +static const char * const nct6775_device_names[] = { + "nct6106", + "nct6775", + "nct6776", + "nct6779", + "nct6791", + "nct6792", + "nct6793", + "nct6795", + "nct6796", + "nct6798", +}; + +static const char * 
const nct6775_sio_names[] __initconst = { + "NCT6106D", + "NCT6775F", + "NCT6776D/F", + "NCT6779D", + "NCT6791D", + "NCT6792D", + "NCT6793D", + "NCT6795D", + "NCT6796D", + "NCT6798D", +}; + +#define SIO_REG_LDSEL 0x07 /* Logical device select */ +#define SIO_REG_DEVID 0x20 /* Device ID (2 bytes) */ +#define SIO_REG_SMBA 0x62 /* SMBus base address register */ + +#define SIO_NCT6106_ID 0xc450 +#define SIO_NCT6775_ID 0xb470 +#define SIO_NCT6776_ID 0xc330 +#define SIO_NCT6779_ID 0xc560 +#define SIO_NCT6791_ID 0xc800 +#define SIO_NCT6792_ID 0xc910 +#define SIO_NCT6793_ID 0xd120 +#define SIO_NCT6795_ID 0xd350 +#define SIO_NCT6796_ID 0xd420 +#define SIO_NCT6798_ID 0xd428 +#define SIO_ID_MASK 0xFFF0 + +static inline void +superio_outb(int ioreg, int reg, int val) +{ + outb(reg, ioreg); + outb(val, ioreg + 1); +} + +static inline int +superio_inb(int ioreg, int reg) +{ + outb(reg, ioreg); + return inb(ioreg + 1); +} + +static inline void +superio_select(int ioreg, int ld) +{ + outb(SIO_REG_LDSEL, ioreg); + outb(ld, ioreg + 1); +} + +static inline int +superio_enter(int ioreg) +{ + /* + * Try to reserve and for exclusive access. + */ + if (!request_muxed_region(ioreg, 2, DRVNAME)) + return -EBUSY; + + outb(0x87, ioreg); + outb(0x87, ioreg); + + return 0; +} + +static inline void +superio_exit(int ioreg) +{ + outb(0xaa, ioreg); + outb(0x02, ioreg); + outb(0x02, ioreg + 1); + release_region(ioreg, 2); +} + +/* + * ISA constants + */ + +#define IOREGION_ALIGNMENT (~7) +#define IOREGION_LENGTH 2 +#define ADDR_REG_OFFSET 0 +#define DATA_REG_OFFSET 1 + +#define NCT6775_REG_BANK 0x4E +#define NCT6775_REG_CONFIG 0x40 + +static struct i2c_adapter *nct6775_adapter; + +struct i2c_nct6775_adapdata { + unsigned short smba; +}; + +/* Return negative errno on error. */ +static s32 nct6775_access(struct i2c_adapter * adap, u16 addr, + unsigned short flags, char read_write, + u8 command, int size, union i2c_smbus_data * data) +{ + struct i2c_nct6775_adapdata *adapdata = i2c_get_adapdata(adap); + unsigned short nuvoton_nct6793d_smba = adapdata->smba; + int i, len, cnt; + union i2c_smbus_data tmp_data; + int timeout = 0; + + tmp_data.word = 0; + cnt = 0; + len = 0; + + outb_p(NCT6793D_SOFT_RESET, SMBHSTCTL); + + switch (size) { + case I2C_SMBUS_QUICK: + outb_p((addr << 1) | read_write, + SMBHSTADD); + break; + case I2C_SMBUS_BYTE_DATA: + tmp_data.byte = data->byte; + case I2C_SMBUS_BYTE: + outb_p((addr << 1) | read_write, + SMBHSTADD); + outb_p(command, SMBHSTIDX); + if (read_write == I2C_SMBUS_WRITE) { + outb_p(tmp_data.byte, SMBHSTDAT); + outb_p(NCT6793D_WRITE_BYTE, SMBHSTCMD); + } + else { + outb_p(NCT6793D_READ_BYTE, SMBHSTCMD); + } + break; + case I2C_SMBUS_WORD_DATA: + outb_p((addr << 1) | read_write, + SMBHSTADD); + outb_p(command, SMBHSTIDX); + if (read_write == I2C_SMBUS_WRITE) { + outb_p(data->word & 0xff, SMBHSTDAT); + outb_p((data->word & 0xff00) >> 8, SMBHSTDAT); + outb_p(NCT6793D_WRITE_WORD, SMBHSTCMD); + } + else { + outb_p(NCT6793D_READ_WORD, SMBHSTCMD); + } + break; + case I2C_SMBUS_BLOCK_DATA: + outb_p((addr << 1) | read_write, + SMBHSTADD); + outb_p(command, SMBHSTIDX); + if (read_write == I2C_SMBUS_WRITE) { + len = data->block[0]; + if (len == 0 || len > I2C_SMBUS_BLOCK_MAX) + return -EINVAL; + outb_p(len, SMBBLKSZ); + + cnt = 1; + if (len >= 4) { + for (i = cnt; i <= 4; i++) { + outb_p(data->block[i], SMBHSTDAT); + } + + len -= 4; + cnt += 4; + } + else { + for (i = cnt; i <= len; i++ ) { + outb_p(data->block[i], SMBHSTDAT); + } + + len = 0; + } + + outb_p(NCT6793D_WRITE_BLOCK, SMBHSTCMD); + } + 
else { + return -ENOTSUPP; + } + break; + default: + dev_warn(&adap->dev, "Unsupported transaction %d\n", size); + return -EOPNOTSUPP; + } + + outb_p(NCT6793D_MANUAL_START, SMBHSTCTL); + + while ((size == I2C_SMBUS_BLOCK_DATA) && (len > 0)) { + if (read_write == I2C_SMBUS_WRITE) { + timeout = 0; + while ((inb_p(SMBHSTSTS) & NCT6793D_FIFO_EMPTY) == 0) + { + if(timeout > MAX_RETRIES) + { + return -ETIMEDOUT; + } + usleep_range(250, 500); + timeout++; + } + + //Load more bytes into FIFO + if (len >= 4) { + for (i = cnt; i <= (cnt + 4); i++) { + outb_p(data->block[i], SMBHSTDAT); + } + + len -= 4; + cnt += 4; + } + else { + for (i = cnt; i <= (cnt + len); i++) { + outb_p(data->block[i], SMBHSTDAT); + } + + len = 0; + } + } + else { + return -ENOTSUPP; + } + + } + + //wait for manual mode to complete + timeout = 0; + while ((inb_p(SMBHSTSTS) & NCT6793D_MANUAL_ACTIVE) != 0) + { + if(timeout > MAX_RETRIES) + { + return -ETIMEDOUT; + } + usleep_range(250, 500); + timeout++; + } + + if ((inb_p(SMBHSTERR) & NCT6793D_NO_ACK) != 0) { + return -ENXIO; + } + else if ((read_write == I2C_SMBUS_WRITE) || (size == I2C_SMBUS_QUICK)) { + return 0; + } + + switch (size) { + case I2C_SMBUS_QUICK: + case I2C_SMBUS_BYTE_DATA: + data->byte = inb_p(SMBHSTDAT); + break; + case I2C_SMBUS_WORD_DATA: + data->word = inb_p(SMBHSTDAT) + (inb_p(SMBHSTDAT) << 8); + break; + } + return 0; +} + +static u32 nct6775_func(struct i2c_adapter *adapter) +{ + return I2C_FUNC_SMBUS_QUICK | I2C_FUNC_SMBUS_BYTE | + I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA | + I2C_FUNC_SMBUS_BLOCK_DATA; +} + +static const struct i2c_algorithm smbus_algorithm = { + .smbus_xfer = nct6775_access, + .functionality = nct6775_func, +}; + +static int nct6775_add_adapter(unsigned short smba, const char *name, struct i2c_adapter **padap) +{ + struct i2c_adapter *adap; + struct i2c_nct6775_adapdata *adapdata; + int retval; + + adap = kzalloc(sizeof(*adap), GFP_KERNEL); + if (adap == NULL) { + return -ENOMEM; + } + + adap->owner = THIS_MODULE; + adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; + adap->algo = &smbus_algorithm; + + adapdata = kzalloc(sizeof(*adapdata), GFP_KERNEL); + if (adapdata == NULL) { + kfree(adap); + return -ENOMEM; + } + + adapdata->smba = smba; + + snprintf(adap->name, sizeof(adap->name), + "SMBus NCT67xx adapter%s at %04x", name, smba); + + i2c_set_adapdata(adap, adapdata); + + retval = i2c_add_adapter(adap); + if (retval) { + kfree(adapdata); + kfree(adap); + return retval; + } + + *padap = adap; + return 0; +} + +static void nct6775_remove_adapter(struct i2c_adapter *adap) +{ + struct i2c_nct6775_adapdata *adapdata = i2c_get_adapdata(adap); + + if (adapdata->smba) { + i2c_del_adapter(adap); + kfree(adapdata); + kfree(adap); + } +} + +//static SIMPLE_DEV_PM_OPS(nct6775_dev_pm_ops, nct6775_suspend, nct6775_resume); + +/* + * when Super-I/O functions move to a separate file, the Super-I/O + * bus will manage the lifetime of the device and this module will only keep + * track of the nct6775 driver. 
But since we use platform_device_alloc(), we + * must keep track of the device + */ +static struct platform_device *pdev[2]; + +static int nct6775_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct nct6775_sio_data *sio_data = dev_get_platdata(dev); + struct resource *res; + + res = platform_get_resource(pdev, IORESOURCE_IO, 0); + if (!devm_request_region(&pdev->dev, res->start, IOREGION_LENGTH, + DRVNAME)) + return -EBUSY; + + switch (sio_data->kind) { + case nct6791: + case nct6792: + case nct6793: + case nct6795: + case nct6796: + case nct6798: + nct6775_add_adapter(res->start, "", &nct6775_adapter); + break; + default: + return -ENODEV; + } + + return 0; +} +/* +static void nct6791_enable_io_mapping(int sioaddr) +{ + int val; + + val = superio_inb(sioaddr, NCT6791_REG_HM_IO_SPACE_LOCK_ENABLE); + if (val & 0x10) { + pr_info("Enabling hardware monitor logical device mappings.\n"); + superio_outb(sioaddr, NCT6791_REG_HM_IO_SPACE_LOCK_ENABLE, + val & ~0x10); + } +}*/ + +static struct platform_driver i2c_nct6775_driver = { + .driver = { + .name = DRVNAME, +// .pm = &nct6775_dev_pm_ops, + }, + .probe = nct6775_probe, +}; + +static void __exit i2c_nct6775_exit(void) +{ + int i; + + if(nct6775_adapter) + nct6775_remove_adapter(nct6775_adapter); + + for (i = 0; i < ARRAY_SIZE(pdev); i++) { + if (pdev[i]) + platform_device_unregister(pdev[i]); + } + platform_driver_unregister(&i2c_nct6775_driver); +} + +/* nct6775_find() looks for a '627 in the Super-I/O config space */ +static int __init nct6775_find(int sioaddr, struct nct6775_sio_data *sio_data) +{ + u16 val; + int err; + int addr; + + err = superio_enter(sioaddr); + if (err) + return err; + + val = (superio_inb(sioaddr, SIO_REG_DEVID) << 8) | + superio_inb(sioaddr, SIO_REG_DEVID + 1); + + switch (val & SIO_ID_MASK) { + case SIO_NCT6106_ID: + sio_data->kind = nct6106; + break; + case SIO_NCT6775_ID: + sio_data->kind = nct6775; + break; + case SIO_NCT6776_ID: + sio_data->kind = nct6776; + break; + case SIO_NCT6779_ID: + sio_data->kind = nct6779; + break; + case SIO_NCT6791_ID: + sio_data->kind = nct6791; + break; + case SIO_NCT6792_ID: + sio_data->kind = nct6792; + break; + case SIO_NCT6793_ID: + sio_data->kind = nct6793; + break; + case SIO_NCT6795_ID: + sio_data->kind = nct6795; + break; + case SIO_NCT6796_ID: + sio_data->kind = nct6796; + break; + case SIO_NCT6798_ID: + sio_data->kind = nct6798; + break; + default: + if (val != 0xffff) + pr_debug("unsupported chip ID: 0x%04x\n", val); + superio_exit(sioaddr); + return -ENODEV; + } + + /* We have a known chip, find the SMBus I/O address */ + superio_select(sioaddr, NCT6775_LD_SMBUS); + val = (superio_inb(sioaddr, SIO_REG_SMBA) << 8) + | superio_inb(sioaddr, SIO_REG_SMBA + 1); + addr = val & IOREGION_ALIGNMENT; + if (addr == 0) { + pr_err("Refusing to enable a Super-I/O device with a base I/O port 0\n"); + superio_exit(sioaddr); + return -ENODEV; + } + + //if (sio_data->kind == nct6791 || sio_data->kind == nct6792 || + // sio_data->kind == nct6793 || sio_data->kind == nct6795 || + // sio_data->kind == nct6796) + // nct6791_enable_io_mapping(sioaddr); + + superio_exit(sioaddr); + pr_info("Found %s or compatible chip at %#x:%#x\n", + nct6775_sio_names[sio_data->kind], sioaddr, addr); + sio_data->sioreg = sioaddr; + + return addr; +} + +static int __init i2c_nct6775_init(void) +{ + int i, err; + bool found = false; + int address; + struct resource res; + struct nct6775_sio_data sio_data; + int sioaddr[2] = { 0x2e, 0x4e }; + + err = 
platform_driver_register(&i2c_nct6775_driver); + if (err) + return err; + + /* + * initialize sio_data->kind and sio_data->sioreg. + * + * when Super-I/O functions move to a separate file, the Super-I/O + * driver will probe 0x2e and 0x4e and auto-detect the presence of a + * nct6775 hardware monitor, and call probe() + */ + for (i = 0; i < ARRAY_SIZE(pdev); i++) { + address = nct6775_find(sioaddr[i], &sio_data); + if (address <= 0) + continue; + + found = true; + + pdev[i] = platform_device_alloc(DRVNAME, address); + if (!pdev[i]) { + err = -ENOMEM; + goto exit_device_unregister; + } + + err = platform_device_add_data(pdev[i], &sio_data, + sizeof(struct nct6775_sio_data)); + if (err) + goto exit_device_put; + + memset(&res, 0, sizeof(res)); + res.name = DRVNAME; + res.start = address; + res.end = address + IOREGION_LENGTH - 1; + res.flags = IORESOURCE_IO; + + err = acpi_check_resource_conflict(&res); + if (err) { + platform_device_put(pdev[i]); + pdev[i] = NULL; + continue; + } + + err = platform_device_add_resources(pdev[i], &res, 1); + if (err) + goto exit_device_put; + + /* platform_device_add calls probe() */ + err = platform_device_add(pdev[i]); + if (err) + goto exit_device_put; + } + if (!found) { + err = -ENODEV; + goto exit_unregister; + } + + return 0; + +exit_device_put: + platform_device_put(pdev[i]); +exit_device_unregister: + while (--i >= 0) { + if (pdev[i]) + platform_device_unregister(pdev[i]); + } +exit_unregister: + platform_driver_unregister(&i2c_nct6775_driver); + return err; +} + +MODULE_AUTHOR("Adam Honse "); +MODULE_DESCRIPTION("SMBus driver for NCT6775F and compatible chips"); +MODULE_LICENSE("GPL"); + +module_init(i2c_nct6775_init); +module_exit(i2c_nct6775_exit); diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c index 809fbd014cd6..d54b35b147ee 100644 --- a/drivers/i2c/busses/i2c-piix4.c +++ b/drivers/i2c/busses/i2c-piix4.c @@ -568,11 +568,11 @@ static int piix4_transaction(struct i2c_adapter *piix4_adapter) if (srvrworks_csb5_delay) /* Extra delay for SERVERWORKS_CSB5 */ usleep_range(2000, 2100); else - usleep_range(250, 500); + usleep_range(25, 50); while ((++timeout < MAX_TIMEOUT) && ((temp = inb_p(SMBHSTSTS)) & 0x01)) - usleep_range(250, 500); + usleep_range(25, 50); /* If the SMBus is still busy, we give up */ if (timeout == MAX_TIMEOUT) { diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index ee269b1d09fa..bd401d6de438 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3211,6 +3211,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } +#ifdef CONFIG_CACHY + set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); + set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); +#endif + ret = crypt_ctr_cipher(ti, argv[0], argv[1]); if (ret < 0) goto bad; diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 494fa46f5767..cdf0c12590da 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3612,6 +3612,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; } +static bool acs_on_downstream; +static bool acs_on_multifunction; + +#define NUM_ACS_IDS 16 +struct acs_on_id { + unsigned short vendor; + unsigned short device; +}; +static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; +static u8 max_acs_id; + +static __init int pcie_acs_override_setup(char *p) +{ + if (!p) + return -EINVAL; + + while (*p) { + if (!strncmp(p, "downstream", 10)) + acs_on_downstream = true; + if (!strncmp(p, "multifunction", 13)) + acs_on_multifunction = 
true; + if (!strncmp(p, "id:", 3)) { + char opt[5]; + int ret; + long val; + + if (max_acs_id >= NUM_ACS_IDS - 1) { + pr_warn("Out of PCIe ACS override slots (%d)\n", + NUM_ACS_IDS); + goto next; + } + + p += 3; + snprintf(opt, 5, "%s", p); + ret = kstrtol(opt, 16, &val); + if (ret) { + pr_warn("PCIe ACS ID parse error %d\n", ret); + goto next; + } + acs_on_ids[max_acs_id].vendor = val; + + p += strcspn(p, ":"); + if (*p != ':') { + pr_warn("PCIe ACS invalid ID\n"); + goto next; + } + + p++; + snprintf(opt, 5, "%s", p); + ret = kstrtol(opt, 16, &val); + if (ret) { + pr_warn("PCIe ACS ID parse error %d\n", ret); + goto next; + } + acs_on_ids[max_acs_id].device = val; + max_acs_id++; + } +next: + p += strcspn(p, ","); + if (*p == ',') + p++; + } + + if (acs_on_downstream || acs_on_multifunction || max_acs_id) + pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); + + return 0; +} +early_param("pcie_acs_override", pcie_acs_override_setup); + +static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) +{ + int i; + + /* Never override ACS for legacy devices or devices with ACS caps */ + if (!pci_is_pcie(dev) || + pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) + return -ENOTTY; + + for (i = 0; i < max_acs_id; i++) + if (acs_on_ids[i].vendor == dev->vendor && + acs_on_ids[i].device == dev->device) + return 1; + + switch (pci_pcie_type(dev)) { + case PCI_EXP_TYPE_DOWNSTREAM: + case PCI_EXP_TYPE_ROOT_PORT: + if (acs_on_downstream) + return 1; + break; + case PCI_EXP_TYPE_ENDPOINT: + case PCI_EXP_TYPE_UPSTREAM: + case PCI_EXP_TYPE_LEG_END: + case PCI_EXP_TYPE_RC_END: + if (acs_on_multifunction && dev->multifunction) + return 1; + } + + return -ENOTTY; +} /* * Some NVIDIA GPU devices do not work with bus reset, SBR needs to be * prevented for those affected devices. @@ -5002,6 +5102,8 @@ static const struct pci_dev_acs_enabled { { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, /* Wangxun nics */ { PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs }, + /* ACS override */ + { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, { 0 } }; diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 1396a839dd8a..823a5ca9a3b3 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1100,6 +1100,20 @@ config WINMATE_FM07_KEYS buttons below the display. This module adds an input device that delivers key events when these buttons are pressed. +config STEAMDECK + tristate "Valve Steam Deck platform driver" + depends on X86_64 + help + Driver exposing various bits and pieces of functionality + provided by Steam Deck specific VLV0100 device presented by + EC firmware. This includes but not limited to: + - CPU/device's fan control + - Read-only access to DDIC registers + - Battery tempreature measurements + - Various display related control knobs + - USB Type-C connector event notification + Say N unless you are running on a Steam Deck. 
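+	  To compile this driver as a module, choose M here: the
+	  module will be called steamdeck.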
+ endif # X86_PLATFORM_DEVICES config P2SB diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile index 1d3d1b02541b..75b30a3face9 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -134,3 +134,6 @@ obj-$(CONFIG_SIEMENS_SIMATIC_IPC) += simatic-ipc.o # Winmate obj-$(CONFIG_WINMATE_FM07_KEYS) += winmate-fm07-keys.o + +# Steam Deck +obj-$(CONFIG_STEAMDECK) += steamdeck.o diff --git a/drivers/platform/x86/steamdeck.c b/drivers/platform/x86/steamdeck.c new file mode 100644 index 000000000000..77a6677ec19e --- /dev/null +++ b/drivers/platform/x86/steamdeck.c @@ -0,0 +1,523 @@ +// SPDX-License-Identifier: GPL-2.0+ + +/* + * Steam Deck ACPI platform driver + * + * Copyright (C) 2021-2022 Valve Corporation + * + */ +#include +#include +#include +#include +#include + +#define ACPI_STEAMDECK_NOTIFY_STATUS 0x80 + +/* 0 - port connected, 1 -port disconnected */ +#define ACPI_STEAMDECK_PORT_CONNECT BIT(0) +/* 0 - Upstream Facing Port, 1 - Downdstream Facing Port */ +#define ACPI_STEAMDECK_CUR_DATA_ROLE BIT(3) +/* + * Debouncing delay to allow negotiation process to settle. 2s value + * was arrived at via trial and error. + */ +#define STEAMDECK_ROLE_SWITCH_DELAY (msecs_to_jiffies(2000)) + +struct steamdeck { + struct acpi_device *adev; + struct device *hwmon; + void *regmap; + long fan_target; + struct delayed_work role_work; + struct extcon_dev *edev; + struct device *dev; +}; + +static ssize_t +steamdeck_simple_store(struct device *dev, const char *buf, size_t count, + const char *method, + unsigned long upper_limit) +{ + struct steamdeck *fan = dev_get_drvdata(dev); + unsigned long value; + + if (kstrtoul(buf, 10, &value) || value >= upper_limit) + return -EINVAL; + + if (ACPI_FAILURE(acpi_execute_simple_method(fan->adev->handle, + (char *)method, value))) + return -EIO; + + return count; +} + +#define STEAMDECK_ATTR_WO(_name, _method, _upper_limit) \ + static ssize_t _name##_store(struct device *dev, \ + struct device_attribute *attr, \ + const char *buf, size_t count) \ + { \ + return steamdeck_simple_store(dev, buf, count, \ + _method, \ + _upper_limit); \ + } \ + static DEVICE_ATTR_WO(_name) + +STEAMDECK_ATTR_WO(target_cpu_temp, "STCT", U8_MAX / 2); +STEAMDECK_ATTR_WO(gain, "SGAN", U16_MAX); +STEAMDECK_ATTR_WO(ramp_rate, "SFRR", U8_MAX); +STEAMDECK_ATTR_WO(hysteresis, "SHTS", U16_MAX); +STEAMDECK_ATTR_WO(maximum_battery_charge_rate, "CHGR", U16_MAX); +STEAMDECK_ATTR_WO(recalculate, "SCHG", U16_MAX); + +STEAMDECK_ATTR_WO(led_brightness, "CHBV", U8_MAX); +STEAMDECK_ATTR_WO(content_adaptive_brightness, "CABC", U8_MAX); +STEAMDECK_ATTR_WO(gamma_set, "GAMA", U8_MAX); +STEAMDECK_ATTR_WO(display_brightness, "WDBV", U8_MAX); +STEAMDECK_ATTR_WO(ctrl_display, "WCDV", U8_MAX); +STEAMDECK_ATTR_WO(cabc_minimum_brightness, "WCMB", U8_MAX); +STEAMDECK_ATTR_WO(memory_data_access_control, "MDAC", U8_MAX); + +#define STEAMDECK_ATTR_WO_NOARG(_name, _method) \ + static ssize_t _name##_store(struct device *dev, \ + struct device_attribute *attr, \ + const char *buf, size_t count) \ + { \ + struct steamdeck *fan = dev_get_drvdata(dev); \ + \ + if (ACPI_FAILURE(acpi_evaluate_object(fan->adev->handle, \ + _method, NULL, NULL))) \ + return -EIO; \ + \ + return count; \ + } \ + static DEVICE_ATTR_WO(_name) + +STEAMDECK_ATTR_WO_NOARG(power_cycle_display, "DPCY"); +STEAMDECK_ATTR_WO_NOARG(display_normal_mode_on, "NORO"); +STEAMDECK_ATTR_WO_NOARG(display_inversion_off, "INOF"); +STEAMDECK_ATTR_WO_NOARG(display_inversion_on, "INON"); +STEAMDECK_ATTR_WO_NOARG(idle_mode_on, 
"WRNE"); + +#define STEAMDECK_ATTR_RO(_name, _method) \ + static ssize_t _name##_show(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + struct steamdeck *jup = dev_get_drvdata(dev); \ + unsigned long long val; \ + \ + if (ACPI_FAILURE(acpi_evaluate_integer( \ + jup->adev->handle, \ + _method, NULL, &val))) \ + return -EIO; \ + \ + return sprintf(buf, "%llu\n", val); \ + } \ + static DEVICE_ATTR_RO(_name) + +STEAMDECK_ATTR_RO(firmware_version, "PDFW"); +STEAMDECK_ATTR_RO(board_id, "BOID"); +STEAMDECK_ATTR_RO(pdcs, "PDCS"); + +static umode_t +steamdeck_is_visible(struct kobject *kobj, struct attribute *attr, int index) +{ + return attr->mode; +} + +static struct attribute *steamdeck_attributes[] = { + &dev_attr_target_cpu_temp.attr, + &dev_attr_gain.attr, + &dev_attr_ramp_rate.attr, + &dev_attr_hysteresis.attr, + &dev_attr_maximum_battery_charge_rate.attr, + &dev_attr_recalculate.attr, + &dev_attr_power_cycle_display.attr, + + &dev_attr_led_brightness.attr, + &dev_attr_content_adaptive_brightness.attr, + &dev_attr_gamma_set.attr, + &dev_attr_display_brightness.attr, + &dev_attr_ctrl_display.attr, + &dev_attr_cabc_minimum_brightness.attr, + &dev_attr_memory_data_access_control.attr, + + &dev_attr_display_normal_mode_on.attr, + &dev_attr_display_inversion_off.attr, + &dev_attr_display_inversion_on.attr, + &dev_attr_idle_mode_on.attr, + + &dev_attr_firmware_version.attr, + &dev_attr_board_id.attr, + &dev_attr_pdcs.attr, + + NULL +}; + +static const struct attribute_group steamdeck_group = { + .attrs = steamdeck_attributes, + .is_visible = steamdeck_is_visible, +}; + +static const struct attribute_group *steamdeck_groups[] = { + &steamdeck_group, + NULL +}; + +static int steamdeck_read_fan_speed(struct steamdeck *jup, long *speed) +{ + unsigned long long val; + + if (ACPI_FAILURE(acpi_evaluate_integer(jup->adev->handle, + "FANR", NULL, &val))) + return -EIO; + + *speed = val; + return 0; +} + +static int +steamdeck_hwmon_read(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, long *out) +{ + struct steamdeck *sd = dev_get_drvdata(dev); + unsigned long long val; + + switch (type) { + case hwmon_temp: + if (attr != hwmon_temp_input) + return -EOPNOTSUPP; + + if (ACPI_FAILURE(acpi_evaluate_integer(sd->adev->handle, + "BATT", NULL, &val))) + return -EIO; + /* + * Assuming BATT returns deg C we need to mutiply it + * by 1000 to convert to mC + */ + *out = val * 1000; + break; + case hwmon_fan: + switch (attr) { + case hwmon_fan_input: + return steamdeck_read_fan_speed(sd, out); + case hwmon_fan_target: + *out = sd->fan_target; + break; + case hwmon_fan_fault: + if (ACPI_FAILURE(acpi_evaluate_integer( + sd->adev->handle, + "FANC", NULL, &val))) + return -EIO; + /* + * FANC (Fan check): + * 0: Abnormal + * 1: Normal + */ + *out = !val; + break; + default: + return -EOPNOTSUPP; + } + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int +steamdeck_hwmon_read_string(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, const char **str) +{ + switch (type) { + case hwmon_temp: + *str = "Battery Temp"; + break; + case hwmon_fan: + *str = "System Fan"; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int +steamdeck_hwmon_write(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, long val) +{ + struct steamdeck *sd = dev_get_drvdata(dev); + + if (type != hwmon_fan || + attr != hwmon_fan_target) + return -EOPNOTSUPP; + + if (val > U16_MAX) + return -EINVAL; + + 
sd->fan_target = val; + + if (ACPI_FAILURE(acpi_execute_simple_method(sd->adev->handle, + "FANS", val))) + return -EIO; + + return 0; +} + +static umode_t +steamdeck_hwmon_is_visible(const void *data, enum hwmon_sensor_types type, + u32 attr, int channel) +{ + if (type == hwmon_fan && + attr == hwmon_fan_target) + return 0644; + + return 0444; +} + +static const struct hwmon_channel_info *steamdeck_info[] = { + HWMON_CHANNEL_INFO(temp, + HWMON_T_INPUT | HWMON_T_LABEL), + HWMON_CHANNEL_INFO(fan, + HWMON_F_INPUT | HWMON_F_LABEL | + HWMON_F_TARGET | HWMON_F_FAULT), + NULL +}; + +static const struct hwmon_ops steamdeck_hwmon_ops = { + .is_visible = steamdeck_hwmon_is_visible, + .read = steamdeck_hwmon_read, + .read_string = steamdeck_hwmon_read_string, + .write = steamdeck_hwmon_write, +}; + +static const struct hwmon_chip_info steamdeck_chip_info = { + .ops = &steamdeck_hwmon_ops, + .info = steamdeck_info, +}; + +#define STEAMDECK_STA_OK \ + (ACPI_STA_DEVICE_ENABLED | \ + ACPI_STA_DEVICE_PRESENT | \ + ACPI_STA_DEVICE_FUNCTIONING) + +static int +steamdeck_ddic_reg_read(void *context, unsigned int reg, unsigned int *val) +{ + union acpi_object obj = { .type = ACPI_TYPE_INTEGER }; + struct acpi_object_list arg_list = { .count = 1, .pointer = &obj, }; + struct steamdeck *sd = context; + unsigned long long _val; + + obj.integer.value = reg; + + if (ACPI_FAILURE(acpi_evaluate_integer(sd->adev->handle, + "RDDI", &arg_list, &_val))) + return -EIO; + + *val = _val; + return 0; +} + +static int steamdeck_read_pdcs(struct steamdeck *sd, unsigned long long *pdcs) +{ + acpi_status status; + + status = acpi_evaluate_integer(sd->adev->handle, "PDCS", NULL, pdcs); + if (ACPI_FAILURE(status)) { + dev_err(sd->dev, "PDCS evaluation failed: %s\n", + acpi_format_exception(status)); + return -EIO; + } + + return 0; +} + +static void steamdeck_usb_role_work(struct work_struct *work) +{ + struct steamdeck *sd = + container_of(work, struct steamdeck, role_work.work); + unsigned long long pdcs; + bool usb_host; + + if (steamdeck_read_pdcs(sd, &pdcs)) + return; + + /* + * We only care about these two + */ + pdcs &= ACPI_STEAMDECK_PORT_CONNECT | ACPI_STEAMDECK_CUR_DATA_ROLE; + + /* + * For "connect" events our role is determined by a bit in + * PDCS, for "disconnect" we switch to being a gadget + * unconditionally. The thinking for the latter is we don't + * want to start acting as a USB host until we get + * confirmation from the firmware that we are a USB host + */ + usb_host = (pdcs & ACPI_STEAMDECK_PORT_CONNECT) ? + pdcs & ACPI_STEAMDECK_CUR_DATA_ROLE : false; + + WARN_ON(extcon_set_state_sync(sd->edev, EXTCON_USB_HOST, + usb_host)); + dev_dbg(sd->dev, "USB role is %s\n", usb_host ? "host" : "device"); +} + +static void steamdeck_notify(acpi_handle handle, u32 event, void *context) +{ + struct device *dev = context; + struct steamdeck *sd = dev_get_drvdata(dev); + unsigned long long pdcs; + unsigned long delay; + + switch (event) { + case ACPI_STEAMDECK_NOTIFY_STATUS: + if (steamdeck_read_pdcs(sd, &pdcs)) + return; + /* + * We process "disconnect" events immediately and + * "connect" events with a delay to give the HW time + * to settle. For example attaching USB hub (at least + * for HW used for testing) will generate intermediary + * event with "host" bit not set, followed by the one + * that does have it set. + */ + delay = (pdcs & ACPI_STEAMDECK_PORT_CONNECT) ? 
+ STEAMDECK_ROLE_SWITCH_DELAY : 0; + + queue_delayed_work(system_long_wq, &sd->role_work, delay); + break; + default: + dev_err(dev, "Unsupported event [0x%x]\n", event); + } +} + +static void steamdeck_remove_notify_handler(void *data) +{ + struct steamdeck *sd = data; + + acpi_remove_notify_handler(sd->adev->handle, ACPI_DEVICE_NOTIFY, + steamdeck_notify); + cancel_delayed_work_sync(&sd->role_work); +} + +static const unsigned int steamdeck_extcon_cable[] = { + EXTCON_USB, + EXTCON_USB_HOST, + EXTCON_CHG_USB_SDP, + EXTCON_CHG_USB_CDP, + EXTCON_CHG_USB_DCP, + EXTCON_CHG_USB_ACA, + EXTCON_NONE, +}; + +static int steamdeck_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct steamdeck *sd; + acpi_status status; + unsigned long long sta; + int ret; + + static const struct regmap_config regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = 255, + .cache_type = REGCACHE_NONE, + .reg_read = steamdeck_ddic_reg_read, + }; + + sd = devm_kzalloc(dev, sizeof(*sd), GFP_KERNEL); + if (!sd) + return -ENOMEM; + sd->adev = ACPI_COMPANION(&pdev->dev); + sd->dev = dev; + platform_set_drvdata(pdev, sd); + INIT_DELAYED_WORK(&sd->role_work, steamdeck_usb_role_work); + + status = acpi_evaluate_integer(sd->adev->handle, "_STA", + NULL, &sta); + if (ACPI_FAILURE(status)) { + dev_err(dev, "Status check failed (0x%x)\n", status); + return -EINVAL; + } + + if ((sta & STEAMDECK_STA_OK) != STEAMDECK_STA_OK) { + dev_err(dev, "Device is not ready\n"); + return -EINVAL; + } + + /* + * Our ACPI interface doesn't expose a method to read current + * fan target, so we use current fan speed as an + * approximation. + */ + if (steamdeck_read_fan_speed(sd, &sd->fan_target)) + dev_warn(dev, "Failed to read fan speed"); + + sd->hwmon = devm_hwmon_device_register_with_info(dev, + "steamdeck", + sd, + &steamdeck_chip_info, + steamdeck_groups); + if (IS_ERR(sd->hwmon)) { + dev_err(dev, "Failed to register HWMON device"); + return PTR_ERR(sd->hwmon); + } + + sd->regmap = devm_regmap_init(dev, NULL, sd, ®map_config); + if (IS_ERR(sd->regmap)) + dev_err(dev, "Failed to register REGMAP"); + + sd->edev = devm_extcon_dev_allocate(dev, steamdeck_extcon_cable); + if (IS_ERR(sd->edev)) + return -ENOMEM; + + ret = devm_extcon_dev_register(dev, sd->edev); + if (ret < 0) { + dev_err(dev, "Failed to register extcon device: %d\n", ret); + return ret; + } + + /* + * Set initial role value + */ + queue_delayed_work(system_long_wq, &sd->role_work, 0); + flush_delayed_work(&sd->role_work); + + status = acpi_install_notify_handler(sd->adev->handle, + ACPI_DEVICE_NOTIFY, + steamdeck_notify, + dev); + if (ACPI_FAILURE(status)) { + dev_err(dev, "Error installing ACPI notify handler\n"); + return -EIO; + } + + ret = devm_add_action_or_reset(dev, steamdeck_remove_notify_handler, + sd); + return ret; +} + +static const struct acpi_device_id steamdeck_device_ids[] = { + { "VLV0100", 0 }, + { "", 0 }, +}; +MODULE_DEVICE_TABLE(acpi, steamdeck_device_ids); + +static struct platform_driver steamdeck_driver = { + .probe = steamdeck_probe, + .driver = { + .name = "steamdeck", + .acpi_match_table = steamdeck_device_ids, + }, +}; +module_platform_driver(steamdeck_driver); + +MODULE_AUTHOR("Andrey Smirnov "); +MODULE_DESCRIPTION("Steam Deck ACPI platform driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 29e1f9e76eb6..a7852e22101f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1178,7 +1178,7 @@ struct readahead_control { ._index = i, \ } -#define 
VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) +#define VM_READAHEAD_PAGES (SZ_8M / PAGE_SIZE) void page_cache_ra_unbounded(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_count); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 45f09bec02c4..87b20e2ee274 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -148,6 +148,8 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, #ifdef CONFIG_USER_NS +extern int unprivileged_userns_clone; + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) @@ -181,6 +183,8 @@ extern bool current_in_userns(const struct user_namespace *target_ns); struct ns_common *ns_get_owner(struct ns_common *ns); #else +#define unprivileged_userns_clone 0 + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { return &init_user_ns; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index db762e35aca9..0336791656eb 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -194,6 +194,7 @@ struct netns_ipv4 { int sysctl_udp_rmem_min; u8 sysctl_fib_notify_on_flag_change; + unsigned int sysctl_tcp_collapse_max_bytes; #ifdef CONFIG_NET_L3_MASTER_DEV u8 sysctl_udp_l3mdev_accept; diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 901b440238d5..7026df84a0f6 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -187,6 +187,13 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust, TP_ARGS(sk) ); +DEFINE_EVENT(tcp_event_sk, tcp_collapse_max_bytes_exceeded, + + TP_PROTO(struct sock *sk), + + TP_ARGS(sk) +); + TRACE_EVENT(tcp_retransmit_synack, TP_PROTO(const struct sock *sk, const struct request_sock *req), diff --git a/init/Kconfig b/init/Kconfig index 44e90b28a30f..748a9491ca12 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -123,6 +123,10 @@ config THREAD_INFO_IN_TASK menu "General setup" +config CACHY + bool "Some kernel tweaks by CachyOS" + default y + config BROKEN bool @@ -348,6 +352,19 @@ config KERNEL_UNCOMPRESSED endchoice +menu "ZSTD compression options" + depends on KERNEL_ZSTD + +config ZSTD_COMPRESSION_LEVEL + int "Compression level (1-22)" + range 1 22 + default "22" + help + Choose a compression level for zstd kernel compression. + Default is 22, which is the maximum. + +endmenu + config DEFAULT_INIT string "Default init path" default "" @@ -1253,6 +1270,22 @@ config USER_NS If unsure, say N. +config USER_NS_UNPRIVILEGED + bool "Allow unprivileged users to create namespaces" + default y + depends on USER_NS + help + When disabled, unprivileged users will not be able to create + new namespaces. Allowing users to create their own namespaces + has been part of several recent local privilege escalation + exploits, so if you need user namespaces but are + paranoid^Wsecurity-conscious you want to disable this. + + This setting can be overridden at runtime via the + kernel.unprivileged_userns_clone sysctl. + + If unsure, say Y. + config PID_NS bool "PID Namespaces" default y @@ -1420,6 +1453,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE with the "-O2" compiler flag for best performance and most helpful compile-time warnings. +config CC_OPTIMIZE_FOR_PERFORMANCE_O3 + bool "Optimize more for performance (-O3)" + help + Choosing this option will pass "-O3" to your compiler to optimize + the kernel yet more for performance. 
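+	  Note that -O3 usually produces a larger kernel image and is
+	  less widely tested than the default -O2.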
+ config CC_OPTIMIZE_FOR_SIZE bool "Optimize for size (-Os)" help diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 38ef6d06888e..0f78364efd4f 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -40,6 +40,27 @@ choice on SMP and NUMA systems and exactly dividing by both PAL and NTSC frame rates for video and multimedia work. + config HZ_500 + bool "500 HZ" + help + 500 Hz is a balanced timer frequency. Provides fast interactivity + on desktops with good smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + + config HZ_600 + bool "600 HZ" + help + 600 Hz is a balanced timer frequency. Provides fast interactivity + on desktops with good smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + + config HZ_750 + bool "750 HZ" + help + 750 Hz is a balanced timer frequency. Provides fast interactivity + on desktops with good smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + config HZ_1000 bool "1000 HZ" help @@ -53,6 +74,9 @@ config HZ default 100 if HZ_100 default 250 if HZ_250 default 300 if HZ_300 + default 500 if HZ_500 + default 600 if HZ_600 + default 750 if HZ_750 default 1000 if HZ_1000 config SCHED_HRTICK diff --git a/kernel/fork.c b/kernel/fork.c index e8808ffbea61..6d3ce6c20cf9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -98,6 +98,10 @@ #include #include +#ifdef CONFIG_USER_NS +#include +#endif + #include #include #include @@ -2033,6 +2037,10 @@ static __latent_entropy struct task_struct *copy_process( if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return ERR_PTR(-EINVAL); + if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. @@ -3183,6 +3191,12 @@ int ksys_unshare(unsigned long unshare_flags) if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; + if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { + err = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto bad_unshare_out; + } + err = check_unshare_flags(unshare_flags); if (err) goto bad_unshare_out; diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 424b3bc58f3f..ecf2798c5ccf 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -219,6 +219,31 @@ config MODULE_COMPRESS_ZSTD endchoice +menu "ZSTD module compression options" + depends on MODULE_COMPRESS_ZSTD + +config MODULE_COMPRESS_ZSTD_LEVEL + int "Compression level (1-19)" + range 1 19 + default 9 + help + Compression level used by zstd for compressing modules. + +config MODULE_COMPRESS_ZSTD_ULTRA + bool "Enable ZSTD ultra compression" + help + Compress modules with ZSTD using the highest possible compression. + +config MODULE_COMPRESS_ZSTD_LEVEL_ULTRA + int "Compression level (20-22)" + depends on MODULE_COMPRESS_ZSTD_ULTRA + range 20 22 + default 20 + help + Ultra compression level used by zstd for compressing modules. 
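+	  With this enabled, modules are compressed using
+	  "zstd --ultra --zstd=wlog=21" at the selected level.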
+ +endmenu + config MODULE_DECOMPRESS bool "Support in-kernel module decompression" depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index ab62074174c3..f1f909bdc30d 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -280,9 +280,9 @@ config RCU_NOCB_CPU_CB_BOOST depends on RCU_NOCB_CPU && RCU_BOOST default y if PREEMPT_RT help - Use this option to invoke offloaded callbacks as SCHED_FIFO + Use this option to invoke offloaded callbacks as SCHED_RR to avoid starvation by heavy SCHED_OTHER background load. - Of course, running as SCHED_FIFO during callback floods will + Of course, running as SCHED_RR during callback floods will cause the rcuo[ps] kthreads to monopolize the CPU for hundreds of milliseconds or more. Therefore, when enabling this option, it is your responsibility to ensure that latency-sensitive diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 634df26a2c27..8c54871cc0a0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2406,7 +2406,7 @@ static int rcutorture_booster_init(unsigned int cpu) t = per_cpu(ksoftirqd, cpu); WARN_ON_ONCE(!t); sp.sched_priority = 2; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + sched_setscheduler_nocheck(t, SCHED_RR, &sp); } /* Don't allow time recalculation while creating a new task. */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cf34a961821a..80cf9824d461 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4443,8 +4443,8 @@ static void __init rcu_start_exp_gp_kworkers(void) return; } - sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); - sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO, + sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_RR, ¶m); + sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_RR, ¶m); } @@ -4482,7 +4482,7 @@ static int __init rcu_spawn_gp_kthread(void) return 0; if (kthread_prio) { sp.sched_priority = kthread_prio; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + sched_setscheduler_nocheck(t, SCHED_RR, &sp); } rnp = rcu_get_root(); raw_spin_lock_irqsave_rcu_node(rnp, flags); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 9e1c8caec5ce..dd39c50ae099 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1465,7 +1465,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) } WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); if (kthread_prio) - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + sched_setscheduler_nocheck(t, SCHED_RR, &sp); } mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); @@ -1476,7 +1476,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) goto end; if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio) - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + sched_setscheduler_nocheck(t, SCHED_RR, &sp); WRITE_ONCE(rdp->nocb_cb_kthread, t); WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7b0fe741a088..77ad9e033358 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1007,7 +1007,7 @@ static void rcu_cpu_kthread_setup(unsigned int cpu) struct sched_param sp; sp.sched_priority = kthread_prio; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + sched_setscheduler_nocheck(current, SCHED_RR, &sp); #endif /* #ifdef CONFIG_RCU_BOOST */ WRITE_ONCE(rdp->rcuc_activity, jiffies); @@ -1206,7 +1206,7 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) rnp->boost_kthread_task 
= t; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); sp.sched_priority = kthread_prio; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + sched_setscheduler_nocheck(t, SCHED_RR, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ out: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 661226e38835..735994022fe0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -69,9 +69,13 @@ * * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) */ +#ifdef CONFIG_CACHY +unsigned int sysctl_sched_latency = 3000000ULL; +static unsigned int normalized_sysctl_sched_latency = 3000000ULL; +#else unsigned int sysctl_sched_latency = 6000000ULL; static unsigned int normalized_sysctl_sched_latency = 6000000ULL; - +#endif /* * The initial- and re-scaling of tunables is configurable * @@ -90,8 +94,13 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; * * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ +#ifdef CONFIG_CACHY +unsigned int sysctl_sched_min_granularity = 400000ULL; +static unsigned int normalized_sysctl_sched_min_granularity = 400000ULL; +#else unsigned int sysctl_sched_min_granularity = 750000ULL; static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; +#endif /* * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks. @@ -121,8 +130,13 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; * * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) */ +#ifdef CONFIG_CACHY +unsigned int sysctl_sched_wakeup_granularity = 500000UL; +static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; +#else unsigned int sysctl_sched_wakeup_granularity = 1000000UL; static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; +#endif const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -175,8 +189,12 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ +#ifdef CONFIG_CACHY +static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; +#else static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +#endif #ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. 
*/ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1c240d2c99bc..98e1a7472fd2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -96,6 +96,9 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); #ifdef CONFIG_PERF_EVENTS static const int six_hundred_forty_kb = 640 * 1024; #endif +#ifdef CONFIG_USER_NS +#include +#endif static const int ngroups_max = NGROUPS_MAX; @@ -1645,6 +1648,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_USER_NS + { + .procname = "unprivileged_userns_clone", + .data = &unprivileged_userns_clone, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 54211dbd516c..16ca0c151629 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -22,6 +22,13 @@ #include #include +/* sysctl */ +#ifdef CONFIG_USER_NS_UNPRIVILEGED +int unprivileged_userns_clone = 1; +#else +int unprivileged_userns_clone; +#endif + static struct kmem_cache *user_ns_cachep __read_mostly; static DEFINE_MUTEX(userns_state_mutex); diff --git a/mm/Kconfig b/mm/Kconfig index ff7b209dec05..bf317c39ed2d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -602,7 +602,7 @@ config COMPACTION config COMPACT_UNEVICTABLE_DEFAULT int depends on COMPACTION - default 0 if PREEMPT_RT + default 0 if PREEMPT_RT || CACHY default 1 # diff --git a/mm/compaction.c b/mm/compaction.c index 8238e83385a7..d0b16a5b30f7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2717,7 +2717,11 @@ static void compact_nodes(void) * aggressively the kernel should compact memory in the * background. It takes values in the range [0, 100]. */ +#ifdef CONFIG_CACHY +unsigned int __read_mostly sysctl_compaction_proactiveness; +#else unsigned int __read_mostly sysctl_compaction_proactiveness = 20; +#endif int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ad608ef2a243..178cfd5490b1 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -71,7 +71,11 @@ static long ratelimit_pages = 32; /* * Start background writeback (via writeback threads) at this percentage */ +#ifdef CONFIG_CACHY +static int dirty_background_ratio = 5; +#else static int dirty_background_ratio = 10; +#endif /* * dirty_background_bytes starts at 0 (disabled) so that it is a function of @@ -99,7 +103,11 @@ static unsigned long vm_dirty_bytes; /* * The interval between `kupdate'-style writebacks */ +#ifdef CONFIG_CACHY +unsigned int dirty_writeback_interval = 10 * 100; /* centiseconds */ +#else unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ +#endif EXPORT_SYMBOL_GPL(dirty_writeback_interval); diff --git a/mm/swap.c b/mm/swap.c index 70e2063ef43a..79ab9b1c3910 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1134,6 +1134,10 @@ EXPORT_SYMBOL(pagevec_lookup_range_tag); */ void __init swap_setup(void) { +#ifdef CONFIG_CACHY + /* Only swap-in pages requested, avoid readahead */ + page_cluster = 0; +#else unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ @@ -1145,4 +1149,5 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ +#endif } diff --git a/mm/vmpressure.c b/mm/vmpressure.c index b52644771cc4..11a4b0e3b583 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -43,7 +43,11 @@ 
static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; * essence, they are percents: the higher the value, the more number * unsuccessful reclaims there were. */ +#ifdef CONFIG_CACHY +static const unsigned int vmpressure_level_med = 65; +#else static const unsigned int vmpressure_level_med = 60; +#endif static const unsigned int vmpressure_level_critical = 95; /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 5b7b8d4f5297..160acbbdf111 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -190,7 +190,11 @@ struct scan_control { /* * From 0 .. 200. Higher means more swappy. */ +#ifdef CONFIG_CACHY +int vm_swappiness = 20; +#else int vm_swappiness = 60; +#endif static void set_task_reclaim_state(struct task_struct *task, struct reclaim_state *rs) @@ -4559,7 +4563,11 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned } /* to protect the working set of the last N jiffies */ +#ifdef CONFIG_CACHY +static unsigned long lru_gen_min_ttl __read_mostly = HZ; +#else static unsigned long lru_gen_min_ttl __read_mostly; +#endif static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 40fe70fc2015..3028e27897d9 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1470,6 +1470,13 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &tcp_plb_max_cong_thresh, }, + { + .procname = "tcp_collapse_max_bytes", + .data = &init_net.ipv4.sysctl_tcp_collapse_max_bytes, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 754e0212c951..b6d7faeb737a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5414,6 +5414,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb) static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb) { struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); @@ -5425,6 +5426,39 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb) if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; + /* For context and additional information about this patch, see the + * blog post at + * + * sysctl: net.ipv4.tcp_collapse_max_bytes + * + * If tcp_collapse_max_bytes is non-zero, attempt to collapse the + * queue to free up memory if the current amount of memory allocated + * is less than tcp_collapse_max_bytes. Otherwise, the packet is + * dropped without attempting to collapse the queue. + * + * If tcp_collapse_max_bytes is zero, this feature is disabled + * and the default Linux behavior is used. The default Linux + * behavior is to always perform the attempt to collapse the + * queue to free up memory. + * + * When the receive queue is small, we want to collapse the + * queue. There are two reasons for this: (a) the latency of + * performing the collapse will be small on a small queue, and + * (b) we want to avoid sending a congestion signal (via a + * packet drop) to the sender when the receive queue is small. + * + * The result is that we avoid latency spikes caused by the + * time it takes to perform the collapse logic when the receive + * queue is large and full, while preserving existing behavior + * and performance for all other cases. 
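+	 *
+	 * For example, setting net.ipv4.tcp_collapse_max_bytes to 6291456
+	 * (6MB) means that a receive queue using more than 6MB of memory is
+	 * no longer collapsed; the incoming packet is dropped instead.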
+ */ + if (net->ipv4.sysctl_tcp_collapse_max_bytes && + (atomic_read(&sk->sk_rmem_alloc) > net->ipv4.sysctl_tcp_collapse_max_bytes)) { + /* We are dropping the packet */ + trace_tcp_collapse_max_bytes_exceeded(sk); + goto do_not_collapse; + } + tcp_collapse_ofo_queue(sk); if (!skb_queue_empty(&sk->sk_receive_queue)) tcp_collapse(sk, &sk->sk_receive_queue, NULL, @@ -5443,6 +5477,8 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb) if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; +do_not_collapse: + /* If we are really being abused, tell the caller to silently * drop receive data on the floor. It will get retransmitted * and hopefully then we'll have sufficient space. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 339a9cea9047..34bd711a1e7c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3274,6 +3274,8 @@ static int __net_init tcp_sk_init(struct net *net) else net->ipv4.tcp_congestion_control = &tcp_reno; + net->ipv4.sysctl_tcp_collapse_max_bytes = 0; + return 0; } diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 4a4a5f67c1a6..993e4578c0f2 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -557,14 +557,21 @@ quiet_cmd_xzmisc = XZMISC $@ # decompression is used, like initramfs decompression, zstd22 should likely not # be used because it would require zstd to allocate a 128 MB buffer. +ifdef CONFIG_ZSTD_COMPRESSION_LEVEL +zstd_comp_val := $(CONFIG_ZSTD_COMPRESSION_LEVEL) +ifeq ($(shell test $(zstd_comp_val) -gt 19; echo $$?),0) +zstd_comp_val += --ultra +endif +endif + quiet_cmd_zstd = ZSTD $@ - cmd_zstd = cat $(real-prereqs) | $(ZSTD) -19 > $@ + cmd_zstd = cat $(real-prereqs) | $(ZSTD) -T0 -19 > $@ quiet_cmd_zstd22 = ZSTD22 $@ - cmd_zstd22 = cat $(real-prereqs) | $(ZSTD) -22 --ultra > $@ + cmd_zstd22 = cat $(real-prereqs) | $(ZSTD) -T0 -22 --ultra > $@ quiet_cmd_zstd22_with_size = ZSTD22 $@ - cmd_zstd22_with_size = { cat $(real-prereqs) | $(ZSTD) -22 --ultra; $(size_append); } > $@ + cmd_zstd22_with_size = { cat $(real-prereqs) | $(ZSTD) -T0 -$(zstd_comp_val); $(size_append); } > $@ # ASM offsets # --------------------------------------------------------------------------- diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst index 4815a8e32227..6a3c36713045 100644 --- a/scripts/Makefile.modinst +++ b/scripts/Makefile.modinst @@ -100,8 +100,13 @@ quiet_cmd_gzip = GZIP $@ cmd_gzip = $(KGZIP) -n -f $< quiet_cmd_xz = XZ $@ cmd_xz = $(XZ) --lzma2=dict=2MiB -f $< +ifdef CONFIG_MODULE_COMPRESS_ZSTD_ULTRA quiet_cmd_zstd = ZSTD $@ - cmd_zstd = $(ZSTD) -T0 --rm -f -q $< + cmd_zstd = $(ZSTD) -$(CONFIG_MODULE_COMPRESS_ZSTD_LEVEL_ULTRA) --ultra --zstd=wlog=21 -T0 --rm -f -q $< +else +quiet_cmd_zstd = ZSTD $@ + cmd_zstd = $(ZSTD) -$(CONFIG_MODULE_COMPRESS_ZSTD_LEVEL) --zstd=wlog=21 -T0 --rm -f -q $< +endif $(dst)/%.ko.gz: $(dst)/%.ko FORCE $(call cmd,gzip) -- 2.40.0 From 5e20c6e278ccac0b0774a4644c2aacdd924d3584 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Fri, 10 Mar 2023 18:00:48 +0100 Subject: [PATCH 05/15] clr Signed-off-by: Peter Jung --- arch/x86/kernel/tsc.c | 3 + arch/x86/mm/fault.c | 4 +- drivers/cpufreq/intel_pstate.c | 7 + drivers/idle/intel_idle.c | 50 ++-- drivers/input/serio/i8042.c | 10 +- drivers/net/dummy.c | 2 +- drivers/pci/pci.c | 2 +- drivers/powercap/intel_rapl_common.c | 2 +- drivers/thermal/intel/intel_powerclamp.c | 10 + fs/xattr.c | 15 +- include/linux/jbd2.h | 2 +- include/linux/rcuref.h | 89 +++++++ include/linux/types.h | 6 + include/linux/wait.h | 2 + 
include/net/dst.h | 21 +- include/net/sock.h | 2 +- include/uapi/linux/if_bonding.h | 2 +- init/do_mounts.c | 16 +- kernel/locking/rwsem.c | 4 +- kernel/sched/wait.c | 24 ++ kernel/watchdog.c | 2 +- lib/Makefile | 2 +- lib/raid6/algos.c | 4 +- lib/rcuref.c | 311 +++++++++++++++++++++++ mm/ksm.c | 11 +- net/bridge/br_nf_core.c | 2 +- net/core/dst.c | 26 +- net/core/rtnetlink.c | 2 +- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/tcp.c | 4 +- net/ipv6/route.c | 6 +- net/netfilter/ipvs/ip_vs_xmit.c | 4 +- 32 files changed, 559 insertions(+), 90 deletions(-) create mode 100644 include/linux/rcuref.h create mode 100644 lib/rcuref.c diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a78e73da4a74..bab8a98080cf 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1569,6 +1569,9 @@ unsigned long calibrate_delay_is_known(void) if (!constant_tsc || !mask) return 0; + if (cpu != 0) + return cpu_data(0).loops_per_jiffy; + sibling = cpumask_any_but(mask, cpu); if (sibling < nr_cpu_ids) return cpu_data(sibling).loops_per_jiffy; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7b0d4ab894c8..1a14f52added 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -799,9 +799,9 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", + printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx cpu %i", loglvl, tsk->comm, task_pid_nr(tsk), address, - (void *)regs->ip, (void *)regs->sp, error_code); + (void *)regs->ip, (void *)regs->sp, error_code, raw_smp_processor_id()); print_vma_addr(KERN_CONT " in ", regs->ip); diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index fd73d6d2b808..0c0071ab3966 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -366,6 +366,13 @@ static void intel_pstate_set_itmt_prio(int cpu) * update them at any time after it has been called. */ sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu); + /* + * On some systems with overclocking enabled, CPPC.highest_perf is hardcoded to 0xff. + * In this case we can't use CPPC.highest_perf to enable ITMT. + * In this case we can look at MSR_HWP_CAPABILITIES bits [8:0] to decide. 
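+	 * hwp_cap_cached holds the most recently read MSR_HWP_CAPABILITIES
+	 * value for this CPU, so no extra MSR read is needed here.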
+ */ + if (cppc_perf.highest_perf == 0xff) + cppc_perf.highest_perf = HWP_HIGHEST_PERF(READ_ONCE(all_cpu_data[cpu]->hwp_cap_cached)); if (max_highest_perf <= min_highest_perf) { if (cppc_perf.highest_perf > max_highest_perf) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index cfeb24d40d37..8d1945afa973 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -578,7 +578,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, - .target_residency = 20, + .target_residency = 120, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -586,7 +586,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x10", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 33, - .target_residency = 100, + .target_residency = 900, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -594,7 +594,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 133, - .target_residency = 400, + .target_residency = 1000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -602,7 +602,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x32", .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 166, - .target_residency = 500, + .target_residency = 1500, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -610,7 +610,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x40", .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 300, - .target_residency = 900, + .target_residency = 2000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -618,7 +618,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x50", .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 600, - .target_residency = 1800, + .target_residency = 5000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -626,7 +626,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x60", .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 2600, - .target_residency = 7700, + .target_residency = 9000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -646,7 +646,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, - .target_residency = 20, + .target_residency = 120, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -654,7 +654,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x10", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 40, - .target_residency = 100, + .target_residency = 1000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -662,7 +662,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 133, - .target_residency = 400, + .target_residency = 1000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -670,7 +670,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x32", .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 166, - .target_residency = 500, + .target_residency = 2000, .enter = &intel_idle, 
.enter_s2idle = intel_idle_s2idle, }, { @@ -678,7 +678,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x40", .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 300, - .target_residency = 900, + .target_residency = 4000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -686,7 +686,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x50", .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 600, - .target_residency = 1800, + .target_residency = 7000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -694,7 +694,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x60", .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 2600, - .target_residency = 7700, + .target_residency = 9000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -715,7 +715,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, - .target_residency = 20, + .target_residency = 120, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -723,7 +723,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x10", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 70, - .target_residency = 100, + .target_residency = 1000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -731,7 +731,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 85, - .target_residency = 200, + .target_residency = 600, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -739,7 +739,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x33", .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 124, - .target_residency = 800, + .target_residency = 3000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -747,7 +747,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x40", .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 200, - .target_residency = 800, + .target_residency = 3200, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -755,7 +755,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x50", .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 480, - .target_residency = 5000, + .target_residency = 9000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -763,7 +763,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x60", .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 890, - .target_residency = 5000, + .target_residency = 9000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -784,7 +784,7 @@ static struct cpuidle_state skx_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, - .target_residency = 20, + .target_residency = 300, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -813,7 +813,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 4, - .target_residency = 4, + .target_residency = 40, 
.enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -821,7 +821,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 170, - .target_residency = 600, + .target_residency = 900, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -987,7 +987,7 @@ static struct cpuidle_state spr_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 2, - .target_residency = 4, + .target_residency = 40, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c index 6dac7c1853a5..fab04cd8a7a0 100644 --- a/drivers/input/serio/i8042.c +++ b/drivers/input/serio/i8042.c @@ -621,7 +621,7 @@ static int i8042_enable_kbd_port(void) if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { i8042_ctr &= ~I8042_CTR_KBDINT; i8042_ctr |= I8042_CTR_KBDDIS; - pr_err("Failed to enable KBD port\n"); + pr_info("Failed to enable KBD port\n"); return -EIO; } @@ -640,7 +640,7 @@ static int i8042_enable_aux_port(void) if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { i8042_ctr &= ~I8042_CTR_AUXINT; i8042_ctr |= I8042_CTR_AUXDIS; - pr_err("Failed to enable AUX port\n"); + pr_info("Failed to enable AUX port\n"); return -EIO; } @@ -732,7 +732,7 @@ static int i8042_check_mux(void) i8042_ctr &= ~I8042_CTR_AUXINT; if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { - pr_err("Failed to disable AUX port, can't use MUX\n"); + pr_info("Failed to disable AUX port, can't use MUX\n"); return -EIO; } @@ -955,7 +955,7 @@ static int i8042_controller_selftest(void) do { if (i8042_command(¶m, I8042_CMD_CTL_TEST)) { - pr_err("i8042 controller selftest timeout\n"); + pr_info("i8042 controller selftest timeout\n"); return -ENODEV; } @@ -977,7 +977,7 @@ static int i8042_controller_selftest(void) pr_info("giving up on controller selftest, continuing anyway...\n"); return 0; #else - pr_err("i8042 controller selftest failed\n"); + pr_info("i8042 controller selftest failed\n"); return -EIO; #endif } diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c index c4b1b0aa438a..06b00f7a8eab 100644 --- a/drivers/net/dummy.c +++ b/drivers/net/dummy.c @@ -43,7 +43,7 @@ #define DRV_NAME "dummy" -static int numdummies = 1; +static int numdummies = 0; /* fake multicast ability */ static void set_multicast_list(struct net_device *dev) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 7a67611dc5f4..48b350fe09d8 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -62,7 +62,7 @@ struct pci_pme_device { struct pci_dev *dev; }; -#define PME_TIMEOUT 1000 /* How long between PME checks */ +#define PME_TIMEOUT 4000 /* How long between PME checks */ static void pci_dev_d3_sleep(struct pci_dev *dev) { diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 26d00b1853b4..3e239d6548b5 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -1518,7 +1518,7 @@ static int __init rapl_init(void) id = x86_match_cpu(rapl_ids); if (!id) { - pr_err("driver does not support CPU family %d model %d\n", + pr_info("driver does not support CPU family %d model %d\n", boot_cpu_data.x86, boot_cpu_data.x86_model); return -ENODEV; diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c index 2f4cbfdf26a0..2d297a1cfa34 100644 --- a/drivers/thermal/intel/intel_powerclamp.c +++ 
b/drivers/thermal/intel/intel_powerclamp.c @@ -636,6 +636,11 @@ static const struct thermal_cooling_device_ops powerclamp_cooling_ops = { .set_cur_state = powerclamp_set_cur_state, }; +static const struct x86_cpu_id amd_cpu[] = { + { X86_VENDOR_AMD }, + {}, +}; + static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = { X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL), {} @@ -645,6 +650,11 @@ MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids); static int __init powerclamp_probe(void) { + if (x86_match_cpu(amd_cpu)){ + pr_info("Intel PowerClamp does not support AMD CPUs\n"); + return -ENODEV; + } + if (!x86_match_cpu(intel_powerclamp_ids)) { pr_err("CPU does not support MWAIT\n"); return -ENODEV; diff --git a/fs/xattr.c b/fs/xattr.c index adab9a70b536..4ada829a3b1b 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -139,16 +139,17 @@ xattr_permission(struct user_namespace *mnt_userns, struct inode *inode, } /* - * In the user.* namespace, only regular files and directories can have - * extended attributes. For sticky directories, only the owner and - * privileged users can write attributes. + * In the user.* namespace, only regular files, symbolic links, and + * directories can have extended attributes. For symbolic links and + * sticky directories, only the owner and privileged users can write + * attributes. */ if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { - if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode)) return (mask & MAY_WRITE) ? -EPERM : -ENODATA; - if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && - (mask & MAY_WRITE) && - !inode_owner_or_capable(mnt_userns, inode)) + if (((S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX)) + || S_ISLNK(inode->i_mode)) && (mask & MAY_WRITE) + && !inode_owner_or_capable(mnt_userns, inode)) return -EPERM; } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 2170e0cc279d..e8fa79f5bb34 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -45,7 +45,7 @@ /* * The default maximum commit age, in seconds. */ -#define JBD2_DEFAULT_MAX_COMMIT_AGE 5 +#define JBD2_DEFAULT_MAX_COMMIT_AGE 30 #ifdef CONFIG_JBD2_DEBUG /* diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h new file mode 100644 index 000000000000..57ffb3c02ace --- /dev/null +++ b/include/linux/rcuref.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _LINUX_RCUREF_H +#define _LINUX_RCUREF_H + +#include +#include +#include +#include +#include +#include + +#define RCUREF_NOREF 0x00000000 +#define RCUREF_ONEREF 0x00000001 +#define RCUREF_MAXREF 0x7FFFFFFF +#define RCUREF_SATURATED 0xA0000000 +#define RCUREF_RELEASED 0xC0000000 +#define RCUREF_DEAD 0xE0000000 + +/** + * rcuref_init - Initialize a rcuref reference count with the given reference count + * @ref: Pointer to the reference count + * @cnt: The initial reference count typically '1' + */ +static inline void rcuref_init(rcuref_t *ref, unsigned int cnt) +{ + atomic_set(&ref->refcnt, cnt); +} + +/** + * rcuref_read - Read the number of held reference counts of a rcuref + * @ref: Pointer to the reference count + * + * Return: The number of held references (0 ... N) + */ +static inline unsigned int rcuref_read(rcuref_t *ref) +{ + unsigned int c = atomic_read(&ref->refcnt); + + /* Return 0 if within the DEAD zone. */ + return c >= RCUREF_RELEASED ? 
0 : c; +} + +extern __must_check bool rcuref_get_slowpath(rcuref_t *ref, unsigned int new); + +/** + * rcuref_get - Acquire one reference on a rcuref reference count + * @ref: Pointer to the reference count + * + * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See documentation in lib/rcuref.c + * + * Return: + * False if the attempt to acquire a reference failed. This happens + * when the last reference has been put already + * + * True if a reference was successfully acquired + */ +static inline __must_check bool rcuref_get(rcuref_t *ref) +{ + /* + * Unconditionally increase the reference count. The saturation and + * dead zones provide enough tolerance for this. + */ + unsigned int old = atomic_fetch_add_relaxed(1, &ref->refcnt); + + /* + * If the old value is less than RCUREF_MAXREF, this is a valid + * reference. + * + * In case the original value was RCUREF_NOREF the above + * unconditional increment raced with a concurrent put() operation + * dropping the last reference. That racing put() operation + * subsequently fails to mark the reference count dead because the + * count is now elevated again and the concurrent caller is + * therefore not allowed to deconstruct the object. + */ + if (likely(old < RCUREF_MAXREF)) + return true; + + /* Handle the cases inside the saturation and dead zones */ + return rcuref_get_slowpath(ref, old); +} + +extern __must_check bool rcuref_put(rcuref_t *ref); + +#endif diff --git a/include/linux/types.h b/include/linux/types.h index ea8cf60a8a79..419baa980529 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -175,6 +175,12 @@ typedef struct { } atomic64_t; #endif +typedef struct { + atomic_t refcnt; +} rcuref_t; + +#define RCUREF_INIT(i) { .refcnt = ATOMIC_INIT(i) } + struct list_head { struct list_head *next, *prev; }; diff --git a/include/linux/wait.h b/include/linux/wait.h index a0307b516b09..edc21128f387 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -165,6 +165,7 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); +extern void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); @@ -1192,6 +1193,7 @@ do { \ */ void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); +void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); diff --git a/include/net/dst.h b/include/net/dst.h index d67fda89cd0f..0909a3306902 100644 --- a/include/net/dst.h +++ b/include/net/dst.h 
@@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -65,19 +66,29 @@ struct dst_entry { * input/output/ops or performance tanks badly */ #ifdef CONFIG_64BIT - atomic_t __refcnt; /* 64-bit offset 64 */ + rcuref_t __refcnt; /* 64-bit offset 64 */ #endif int __use; unsigned long lastuse; - struct lwtunnel_state *lwtstate; struct rcu_head rcu_head; short error; short __pad; __u32 tclassid; #ifndef CONFIG_64BIT - atomic_t __refcnt; /* 32-bit offset 64 */ + struct lwtunnel_state *lwtstate; + rcuref_t __refcnt; /* 32-bit offset 64 */ #endif netdevice_tracker dev_tracker; +#ifdef CONFIG_64BIT + /* + * Ensure that lwtstate is not in the same cache line as __refcnt, + * because that would lead to false sharing under high contention + * of __refcnt. This also ensures that rtable::rt_genid is not + * sharing the same cache-line. + */ + int pad2[6]; + struct lwtunnel_state *lwtstate; +#endif }; struct dst_metrics { @@ -228,7 +239,7 @@ static inline void dst_hold(struct dst_entry *dst) * the placement of __refcnt in struct dst_entry */ BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63); - WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0); + WARN_ON(!rcuref_get(&dst->__refcnt)); } static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) @@ -292,7 +303,7 @@ static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb */ static inline bool dst_hold_safe(struct dst_entry *dst) { - return atomic_inc_not_zero(&dst->__refcnt); + return rcuref_get(&dst->__refcnt); } /** diff --git a/include/net/sock.h b/include/net/sock.h index c6584a352463..dbf85161c0c7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2159,7 +2159,7 @@ sk_dst_get(struct sock *sk) rcu_read_lock(); dst = rcu_dereference(sk->sk_dst_cache); - if (dst && !atomic_inc_not_zero(&dst->__refcnt)) + if (dst && !rcuref_get(&dst->__refcnt)) dst = NULL; rcu_read_unlock(); return dst; diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h index d174914a837d..bf8e2af101a3 100644 --- a/include/uapi/linux/if_bonding.h +++ b/include/uapi/linux/if_bonding.h @@ -82,7 +82,7 @@ #define BOND_STATE_ACTIVE 0 /* link is active */ #define BOND_STATE_BACKUP 1 /* link is backup */ -#define BOND_DEFAULT_MAX_BONDS 1 /* Default maximum number of devices to support */ +#define BOND_DEFAULT_MAX_BONDS 0 /* Default maximum number of devices to support */ #define BOND_DEFAULT_TX_QUEUES 16 /* Default number of tx queues per device */ diff --git a/init/do_mounts.c b/init/do_mounts.c index 811e94daf0a8..06fef7f97c02 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -283,8 +283,18 @@ dev_t name_to_dev_t(const char *name) if (strcmp(name, "/dev/ram") == 0) return Root_RAM0; #ifdef CONFIG_BLOCK - if (strncmp(name, "PARTUUID=", 9) == 0) - return devt_from_partuuid(name + 9); + if (strncmp(name, "PARTUUID=", 9) == 0) { + dev_t res; + int needtowait = 40<<1; + res = devt_from_partuuid(name + 9); + while (!res && needtowait) { + /* waiting 0.5 sec */ + msleep(500); + res = devt_from_partuuid(name + 9); + needtowait--; + } + return res; + } if (strncmp(name, "PARTLABEL=", 10) == 0) return devt_from_partlabel(name + 10); if (strncmp(name, "/dev/", 5) == 0) @@ -612,7 +622,9 @@ void __init prepare_namespace(void) * For example, it is not atypical to wait 5 seconds here * for the touchpad of a laptop to initialize. 
*/ + async_synchronize_full(); wait_for_device_probe(); + async_synchronize_full(); md_run_setup(); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 84d5b649b95f..e341ca8731f7 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -754,6 +754,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) struct task_struct *new, *owner; unsigned long flags, new_flags; enum owner_state state; + int i = 0; lockdep_assert_preemption_disabled(); @@ -790,7 +791,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) break; } - cpu_relax(); + if (i++ > 1000) + cpu_relax(); } return state; diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 133b74730738..1647fb8662eb 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -47,6 +47,17 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_ } EXPORT_SYMBOL_GPL(add_wait_queue_priority); +void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) +{ + unsigned long flags; + + wq_entry->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&wq_head->lock, flags); + __add_wait_queue(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue_exclusive_lifo); + void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; @@ -293,6 +304,19 @@ prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_ent } EXPORT_SYMBOL(prepare_to_wait_exclusive); +void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) +{ + unsigned long flags; + + wq_entry->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&wq_head->lock, flags); + if (list_empty(&wq_entry->entry)) + __add_wait_queue(wq_head, wq_entry); + set_current_state(state); + spin_unlock_irqrestore(&wq_head->lock, flags); +} +EXPORT_SYMBOL(prepare_to_wait_exclusive_lifo); + void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) { wq_entry->flags = flags; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 8e61f21e7e33..be1439d38f26 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -41,7 +41,7 @@ unsigned long __read_mostly watchdog_enabled; int __read_mostly watchdog_user_enabled = 1; int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; int __read_mostly soft_watchdog_user_enabled = 1; -int __read_mostly watchdog_thresh = 10; +int __read_mostly watchdog_thresh = 40; static int __read_mostly nmi_watchdog_available; struct cpumask watchdog_cpumask __read_mostly; diff --git a/lib/Makefile b/lib/Makefile index 4d9461bfea42..71c9627153b8 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -47,7 +47,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \ list_sort.o uuid.o iov_iter.o clz_ctz.o \ bsearch.o find_bit.o llist.o memweight.o kfifo.o \ percpu-refcount.o rhashtable.o base64.o \ - once.o refcount.o usercopy.o errseq.o bucket_locks.o \ + once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \ generic-radix-tree.o obj-$(CONFIG_STRING_SELFTEST) += test_string.o obj-y += string_helpers.o diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index a22a05c9af8a..a70bcbbd1673 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -126,8 +126,10 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void) for (best = NULL, algo = raid6_recov_algos; *algo; algo++) if (!best || (*algo)->priority > best->priority) - if (!(*algo)->valid || (*algo)->valid()) + if (!(*algo)->valid || 
(*algo)->valid()) { best = *algo; + break; + } if (best) { raid6_2data_recov = best->data2; diff --git a/lib/rcuref.c b/lib/rcuref.c new file mode 100644 index 000000000000..34fa40618fca --- /dev/null +++ b/lib/rcuref.c @@ -0,0 +1,311 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * rcuref - A scalable reference count implementation for RCU managed objects + * + * rcuref is provided to replace open coded reference count implementations + * based on atomic_t. It protects explicitly RCU managed objects which can + * be visible even after the last reference has been dropped and the object + * is heading towards destruction. + * + * A common usage pattern is: + * + * get() + * rcu_read_lock(); + * p = get_ptr(); + * if (p && !atomic_inc_not_zero(&p->refcnt)) + * p = NULL; + * rcu_read_unlock(); + * return p; + * + * put() + * if (!atomic_dec_return(&p->refcnt)) { + * remove_ptr(p); + * kfree_rcu(p, rcu); + * } + * + * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has + * O(N^2) behaviour under contention with N concurrent operations. + * + * rcuref uses atomic_fetch_add_relaxed() and atomic_fetch_sub_release() + * for the fast path, which scale better under contention. + * + * Why not refcount? + * ================= + * + * In principle it should be possible to make refcount use the rcuref + * scheme, but the destruction race described below cannot be prevented + * unless the protected object is RCU managed. + * + * Theory of operation + * =================== + * + * rcuref uses an unsigned integer reference counter. As long as the + * counter value is greater than or equal to RCUREF_ONEREF and not larger + * than RCUREF_MAXREF the reference is alive: + * + * NOREF ONEREF MAXREF SATURATED RELEASED DEAD + * 0 1 0x7FFFFFFF 0x8000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF + * <---valid ------------> <-------saturation zone-------> <-----------dead zone----------> + * + * The get() and put() operations do unconditional increments and + * decrements. The result is checked after the operation. This optimizes + * for the fast path. + * + * If the reference count is saturated or dead, then the increments and + * decrements are not harmful as the reference count still stays in the + * respective zones and is always set back to SATURATED resp. DEAD. The + * zones have room for 2^28 racing operations in each direction, which + * makes it practically impossible to escape the zones. + * + * Once the last reference is dropped the reference count becomes + * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The + * slowpath then tries to set the reference count from RCUREF_NOREF to + * RCUREF_DEAD via a cmpxchg(). This opens a small window where a + * concurrent rcuref_get() can acquire the reference count and bring it + * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD. + * + * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in + * DEAD + 1, which is inside the dead zone. If that happens the reference + * count is put back to DEAD. + * + * The actual race is possible due to the unconditional increments and + * decrements in rcuref_get() and rcuref_put(): + * + * T1 T2 + * get() put() + * if (atomic_fetch_sub(1, &ref->refcnt) >= 0) + * succeeds-> atomic_try_cmpxchg(&ref->refcnt, -1, DEAD); + * + * old = atomic_fetch_add(1, &ref->refcnt); <- Elevates refcount to DEAD + 1 + * + * As @old observed by T1 is within the dead zone the T1 get() fails.
+ * + * Possible critical states: + * + * Context Counter References Operation + * T1 1 1 init() + * T2 2 2 get() + * T1 1 1 put() + * T2 0 0 put() tries to mark dead + * T1 1 1 get() + * T2 1 1 put() mark dead fails + * T1 0 0 put() tries to mark dead + * T1 DEAD 0 put() mark dead succeeds + * T2 DEAD+1 0 get() fails and puts it back to DEAD + * + * Of course there are more complex scenarios, but the above illustrates + * the working principle. The rest is left to the imagination of the + * reader. + * + * Deconstruction race + * =================== + * + * The release operation must be protected by prohibiting a grace period in + * order to prevent a possible use after free: + * + * T1 T2 + * put() get() + * // ref->refcnt = ONEREF + * if (atomic_fetch_sub(1, &ref->cnt) > ONEREF) + * return false; <- Not taken + * + * // ref->refcnt == NOREF + * --> preemption + * // Elevates ref->c to ONEREF + * if (!atomic_fetch_add(1, &ref->refcnt) >= NOREF) + * return true; <- taken + * + * if (put(&p->ref)) { <-- Succeeds + * remove_pointer(p); + * kfree_rcu(p, rcu); + * } + * + * RCU grace period ends, object is freed + * + * atomic_cmpxchg(&ref->refcnt, NONE, DEAD); <- UAF + * + * This is prevented by disabling preemption around the put() operation as + * that's in most kernel configurations cheaper than a rcu_read_lock() / + * rcu_read_unlock() pair and in many cases even a NOOP. In any case it + * prevents the grace period which keeps the object alive until all put() + * operations complete. + * + * Saturation protection + * ===================== + * + * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX). + * Once this is exceeded the reference count becomes stale by setting it + * to RCUREF_SATURATED, which will cause a memory leak, but it prevents + * wrap arounds which obviously cause worse problems than a memory + * leak. When saturation is reached a warning is emitted. + * + * Race conditions + * =============== + * + * All reference count increment/decrement operations are unconditional and + * only verified after the fact. This optimizes for the good case and takes + * the occasional race vs. a dead or already saturated refcount into + * account. The saturation and dead zones are large enough to accommodate + * for that. + * + * Memory ordering + * =============== + * + * Memory ordering rules are slightly relaxed wrt regular atomic_t functions + * and provide only what is strictly required for refcounts. + * + * The increments are fully relaxed; these will not provide ordering. The + * rationale is that whatever is used to obtain the object to increase the + * reference count on will provide the ordering. For locked data + * structures, it's the lock acquire, for RCU/lockless data structures it's + * the dependent load. + * + * rcuref_get() provides a control dependency ordering future stores which + * ensures that the object is not modified when acquiring a reference + * fails. + * + * rcuref_put() provides release order, i.e. all prior loads and stores + * will be issued before. It also provides a control dependency ordering + * against the subsequent destruction of the object. + * + * If rcuref_put() successfully dropped the last reference and marked the + * object DEAD it also provides acquire ordering.
+ */ + +#include +#include + +/** + * rcuref_get_slowpath - Slowpath of rcuref_get() + * @ref: Pointer to the reference count + * @old: The reference count before the unconditional increment + * operation in rcuref_get() + * + * Invoked when the reference count is outside of the valid zone. + * + * Return: + * False if the reference count was already marked dead + * + * True if the reference count is saturated, which prevents the + * object from being deconstructed ever. + */ +bool rcuref_get_slowpath(rcuref_t *ref, unsigned int old) +{ + /* + * If the reference count was already marked dead, undo the + * increment so it stays in the middle of the dead zone and return + * fail. + */ + if (old >= RCUREF_RELEASED) { + atomic_set(&ref->refcnt, RCUREF_DEAD); + return false; + } + + /* + * If it was saturated, warn and mark it so. In case the increment + * was already on a saturated value restore the saturation + * marker. This keeps it in the middle of the saturation zone and + * prevents the reference count from overflowing. This leaks the + * object memory, but prevents the obvious reference count overflow + * damage. + */ + WARN_ONCE(old >= RCUREF_MAXREF, "rcuref saturated - leaking memory"); + atomic_set(&ref->refcnt, RCUREF_SATURATED); + return true; +} +EXPORT_SYMBOL_GPL(rcuref_get_slowpath); + +static __must_check bool __rcuref_put(rcuref_t *ref) +{ + /* + * Unconditionally decrement the reference count. The saturation and + * dead zones provide enough tolerance for this. + */ + unsigned int old = atomic_fetch_sub_release(1, &ref->refcnt); + + /* + * If the old value is in the valid range and is greater than + * RCUREF_ONEREF, nothing to do. + */ + if (likely(old > RCUREF_ONEREF && old <= RCUREF_MAXREF)) + return false; + + /* Did this drop the last reference? */ + if (likely(old == RCUREF_ONEREF)) { + /* + * Carefully try to set the reference count to RCUREF_DEAD. + * + * This can fail if a concurrent get() operation has + * elevated it again or the corresponding put() even marked + * it dead already. Both are valid situations and do not + * require a retry. If this fails the caller is not + * allowed to deconstruct the object. + */ + if (atomic_cmpxchg_release(&ref->refcnt, RCUREF_NOREF, RCUREF_DEAD) != RCUREF_NOREF) + return false; + + /* + * The caller can safely schedule the object for + * deconstruction. Provide acquire ordering. + */ + smp_acquire__after_ctrl_dep(); + return true; + } + + /* + * If the reference count was already in the dead zone, then this + * put() operation is imbalanced. Warn, put the reference count back to + * DEAD and tell the caller to not deconstruct the object. + */ + if (WARN_ONCE(old >= RCUREF_RELEASED, "rcuref - imbalanced put()")) { + atomic_set(&ref->refcnt, RCUREF_DEAD); + return false; + } + + /* + * This is a put() operation on a saturated refcount. Restore the + * mean saturation value and tell the caller to not deconstruct the + * object. + */ + atomic_set(&ref->refcnt, RCUREF_SATURATED); + return false; +} + +/** + * rcuref_put -- Release one reference for a rcuref reference count + * @ref: Pointer to the reference count + * + * Can be invoked from any context. + * + * Provides release memory ordering, such that prior loads and stores are done + * before, and provides an acquire ordering on success such that free() + * must come after. + * + * Return: + * + * True if this was the last reference with no future references + * possible. 
This signals the caller that it can safely schedule the + * object, which is protected by the reference counter, for + * deconstruction. + * + * False if there are still active references or the put() raced + * with a concurrent get()/put() pair. Caller is not allowed to + * deconstruct the protected object. + */ +bool rcuref_put(rcuref_t *ref) +{ + bool released; + + /* + * Protect against a concurrent get()/put() pair which marks the + * reference count DEAD and schedules it for RCU free. This + * prevents a grace period and is cheaper than + * rcu_read_lock()/unlock(). + */ + preempt_disable(); + released = __rcuref_put(ref); + preempt_enable(); + return released; +} +EXPORT_SYMBOL_GPL(rcuref_put); diff --git a/mm/ksm.c b/mm/ksm.c index 83450d32e109..739814145818 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2461,9 +2461,14 @@ static int ksm_scan_thread(void *nothing) if (ksmd_should_run()) { sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); - wait_event_interruptible_timeout(ksm_iter_wait, - sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), - msecs_to_jiffies(sleep_ms)); + if (sleep_ms >= 1000) + wait_event_interruptible_timeout(ksm_iter_wait, + sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), + msecs_to_jiffies(round_jiffies_relative(sleep_ms))); + else + wait_event_interruptible_timeout(ksm_iter_wait, + sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), + msecs_to_jiffies(sleep_ms)); } else { wait_event_freezable(ksm_thread_wait, ksmd_should_run() || kthread_should_stop()); diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c index 8c69f0c95a8e..c2b628e3cc7f 100644 --- a/net/bridge/br_nf_core.c +++ b/net/bridge/br_nf_core.c @@ -73,7 +73,7 @@ void br_netfilter_rtable_init(struct net_bridge *br) { struct rtable *rt = &br->fake_rtable; - atomic_set(&rt->dst.__refcnt, 1); + rcuref_init(&rt->dst.__refcnt, 1); rt->dst.dev = br->dev; dst_init_metrics(&rt->dst, br_dst_default_metrics, true); rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; diff --git a/net/core/dst.c b/net/core/dst.c index 6d2dd03dafa8..750440803883 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -66,7 +66,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, dst->tclassid = 0; #endif dst->lwtstate = NULL; - atomic_set(&dst->__refcnt, initial_ref); + rcuref_init(&dst->__refcnt, initial_ref); dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; @@ -166,31 +166,15 @@ EXPORT_SYMBOL(dst_dev_put); void dst_release(struct dst_entry *dst) { - if (dst) { - int newrefcnt; - - newrefcnt = atomic_dec_return(&dst->__refcnt); - if (WARN_ONCE(newrefcnt < 0, "dst_release underflow")) - net_warn_ratelimited("%s: dst:%p refcnt:%d\n", - __func__, dst, newrefcnt); - if (!newrefcnt) - call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); - } + if (dst && rcuref_put(&dst->__refcnt)) + call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); } EXPORT_SYMBOL(dst_release); void dst_release_immediate(struct dst_entry *dst) { - if (dst) { - int newrefcnt; - - newrefcnt = atomic_dec_return(&dst->__refcnt); - if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow")) - net_warn_ratelimited("%s: dst:%p refcnt:%d\n", - __func__, dst, newrefcnt); - if (!newrefcnt) - dst_destroy(dst); - } + if (dst && rcuref_put(&dst->__refcnt)) + dst_destroy(dst); } EXPORT_SYMBOL(dst_release_immediate); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f5114b2395ae..6a7be9d3dd10 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -840,7 +840,7 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, if (dst) 
{ ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); ci.rta_used = dst->__use; - ci.rta_clntref = atomic_read(&dst->__refcnt); + ci.rta_clntref = rcuref_read(&dst->__refcnt); } if (expires) { unsigned long clock; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f2c43f67187d..9885bfb429a2 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -606,7 +606,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) * having to remove and re-insert us on the wait queue. */ for (;;) { - prepare_to_wait_exclusive(sk_sleep(sk), &wait, + prepare_to_wait_exclusive_lifo(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); release_sock(sk); if (reqsk_queue_empty(&icsk->icsk_accept_queue)) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e9e8040d6491..f9b56123b3b8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4815,8 +4815,8 @@ void __init tcp_init(void) tcp_init_mem(); /* Set per-socket limits to no more than 1/128 the pressure threshold */ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); - max_wshare = min(4UL*1024*1024, limit); - max_rshare = min(6UL*1024*1024, limit); + max_wshare = min(16UL*1024*1024, limit); + max_rshare = min(16UL*1024*1024, limit); init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE; init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a6983a13dd20..8b5e3d57b08d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -293,7 +293,7 @@ static const struct fib6_info fib6_null_entry_template = { static const struct rt6_info ip6_null_entry_template = { .dst = { - .__refcnt = ATOMIC_INIT(1), + .__refcnt = RCUREF_INIT(1), .__use = 1, .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -ENETUNREACH, @@ -307,7 +307,7 @@ static const struct rt6_info ip6_null_entry_template = { static const struct rt6_info ip6_prohibit_entry_template = { .dst = { - .__refcnt = ATOMIC_INIT(1), + .__refcnt = RCUREF_INIT(1), .__use = 1, .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EACCES, @@ -319,7 +319,7 @@ static const struct rt6_info ip6_prohibit_entry_template = { static const struct rt6_info ip6_blk_hole_entry_template = { .dst = { - .__refcnt = ATOMIC_INIT(1), + .__refcnt = RCUREF_INIT(1), .__use = 1, .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EINVAL, diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 029171379884..bc9dc51828f7 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -339,7 +339,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", &dest->addr.ip, &dest_dst->dst_saddr.ip, - atomic_read(&rt->dst.__refcnt)); + rcuref_read(&rt->dst.__refcnt)); } if (ret_saddr) *ret_saddr = dest_dst->dst_saddr.ip; @@ -507,7 +507,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", &dest->addr.in6, &dest_dst->dst_saddr.in6, - atomic_read(&rt->dst.__refcnt)); + rcuref_read(&rt->dst.__refcnt)); } if (ret_saddr) *ret_saddr = dest_dst->dst_saddr.in6; -- 2.40.0 From c0762e6bc0c17ce7f3d4c6d75626d574266d4f02 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Thu, 13 Apr 2023 18:16:34 +0200 Subject: [PATCH 06/15] fixes Signed-off-by: Peter Jung --- Documentation/ABI/stable/sysfs-block | 10 + .../testing/sysfs-class-led-trigger-blkdev | 78 ++ Documentation/admin-guide/mm/ksm.rst | 7 + 
Documentation/leds/index.rst | 1 + Documentation/leds/ledtrig-blkdev.rst | 158 +++ Documentation/x86/topology.rst | 26 + arch/x86/boot/compressed/Makefile | 2 +- arch/x86/events/rapl.c | 20 +- arch/x86/include/asm/cacheinfo.h | 1 + arch/x86/kernel/cpu/amd.c | 1 + arch/x86/kernel/cpu/cacheinfo.c | 36 + arch/x86/kernel/cpu/hygon.c | 1 + arch/x86/mm/tlb.c | 2 +- arch/x86/net/bpf_jit_comp.c | 5 +- drivers/bluetooth/btusb.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 3 +- .../drm/amd/display/dc/bios/bios_parser2.c | 7 +- .../drm/amd/display/dc/dcn20/dcn20_resource.c | 2 +- .../drm/amd/display/dc/dcn21/dcn21_resource.c | 2 +- .../gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c | 3 +- drivers/gpu/drm/scheduler/sched_main.c | 3 +- drivers/leds/trigger/Kconfig | 9 + drivers/leds/trigger/Makefile | 1 + drivers/leds/trigger/ledtrig-blkdev.c | 1221 +++++++++++++++++ .../net/wireless/mediatek/mt76/mt7921/init.c | 7 +- fs/eventpoll.c | 188 ++- fs/proc/base.c | 1 + include/linux/mm_types.h | 7 +- include/linux/pageblock-flags.h | 2 +- kernel/kheaders.c | 10 +- kernel/kthread.c | 5 + kernel/padata.c | 4 +- lib/string.c | 10 +- mm/compaction.c | 75 +- mm/internal.h | 6 +- mm/ksm.c | 185 ++- mm/page_alloc.c | 22 +- mm/z3fold.c | 2 - mm/zsmalloc.c | 3 - scripts/Makefile.vmlinux_o | 2 +- sound/pci/hda/cs35l41_hda.c | 2 +- .../selftests/vm/ksm_functional_tests.c | 96 +- 42 files changed, 2041 insertions(+), 187 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-class-led-trigger-blkdev create mode 100644 Documentation/leds/ledtrig-blkdev.rst create mode 100644 drivers/leds/trigger/ledtrig-blkdev.c diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index cd14ecb3c9a5..ad47337ac75a 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -101,6 +101,16 @@ Description: devices that support receiving integrity metadata. +What: /sys/block//linked_leds +Date: January 2023 +Contact: Ian Pilcher +Description: + Directory that contains symbolic links to all LEDs that + are associated with (linked to) this block device by the + blkdev LED trigger. Only present when at least one LED + is linked. (See Documentation/leds/ledtrig-blkdev.rst.) + + What: /sys/block///alignment_offset Date: April 2009 Contact: Martin K. Petersen diff --git a/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev new file mode 100644 index 000000000000..28ce8c814fb7 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev @@ -0,0 +1,78 @@ +What: /sys/class/leds//blink_time +Date: January 2023 +Contact: Ian Pilcher +Description: + Time (in milliseconds) that the LED will be on during a single + "blink". + +What: /sys/class/leds//check_interval +Date: January 2023 +Contact: Ian Pilcher +Description: + Interval (in milliseconds) between checks of the block devices + linked to this LED. The LED will be blinked if the correct type + of activity (see blink_on_{read,write,discard,flush} attributes) + has occurred on any of the linked devices since the previous + check. + +What: /sys/class/leds//blink_on_read +Date: January 2023 +Contact: Ian Pilcher +Description: + Boolean that determines whether the LED will blink in response + to read activity on any of its linked block devices. 
+ +What: /sys/class/leds//blink_on_write +Date: January 2023 +Contact: Ian Pilcher +Description: + Boolean that determines whether the LED will blink in response + to write activity on any of its linked block devices. + +What: /sys/class/leds//blink_on_discard +Date: January 2023 +Contact: Ian Pilcher +Description: + Boolean that determines whether the LED will blink in response + to discard activity on any of its linked block devices. + +What: /sys/class/leds//blink_on_flush +Date: January 2023 +Contact: Ian Pilcher +Description: + Boolean that determines whether the LED will blink in response + to cache flush activity on any of its linked block devices. + +What: /sys/class/leds//link_dev_by_path +Date: January 2023 +Contact: Ian Pilcher +Description: + Associate a block device with this LED by writing the path to + the device special file (e.g. /dev/sda) to this attribute. + Symbolic links are followed. + +What: /sys/class/leds//unlink_dev_by_path +Date: January 2023 +Contact: Ian Pilcher +Description: + Remove the association between this LED and a block device by + writing the path to the device special file (e.g. /dev/sda) to + this attribute. Symbolic links are followed. + +What: /sys/class/leds//unlink_dev_by_name +Date: January 2023 +Contact: Ian Pilcher +Description: + Remove the association between this LED and a block device by + writing the kernel name of the device (e.g. sda) to this + attribute. + +What: /sys/class/leds//linked_devices +Date: January 2023 +Contact: Ian Pilcher +Description: + Directory containing links to all block devices that are + associated with this LED. (Note that the names of the + symbolic links in this directory are *kernel* names, which + may not match the device special file paths written to + link_dev_by_path and unlink_dev_by_path.) diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst index fb6ba2002a4b..f160f9487a90 100644 --- a/Documentation/admin-guide/mm/ksm.rst +++ b/Documentation/admin-guide/mm/ksm.rst @@ -173,6 +173,13 @@ stable_node_chains the number of KSM pages that hit the ``max_page_sharing`` limit stable_node_dups number of duplicated KSM pages +zero_pages_sharing + how many empty pages are sharing kernel zero page(s) instead of + with each other, as would happen normally. Only effective when + the ``use_zero_pages`` knob is enabled. + +When enabling ``use_zero_pages``, the sum of ``pages_sharing`` + +``zero_pages_sharing`` represents how much memory is really saved by KSM. A high ratio of ``pages_sharing`` to ``pages_shared`` indicates good sharing, but a high ratio of ``pages_unshared`` to ``pages_sharing`` diff --git a/Documentation/leds/index.rst b/Documentation/leds/index.rst index e5d63b940045..e3c24e468cbc 100644 --- a/Documentation/leds/index.rst +++ b/Documentation/leds/index.rst @@ -10,6 +10,7 @@ LEDs leds-class leds-class-flash leds-class-multicolor + ledtrig-blkdev ledtrig-oneshot ledtrig-transient ledtrig-usbport diff --git a/Documentation/leds/ledtrig-blkdev.rst b/Documentation/leds/ledtrig-blkdev.rst new file mode 100644 index 000000000000..9ff5b99de451 --- /dev/null +++ b/Documentation/leds/ledtrig-blkdev.rst @@ -0,0 +1,158 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================= +Block Device (blkdev) LED Trigger +================================= + +Available when ``CONFIG_LEDS_TRIGGER_BLKDEV=y`` or +``CONFIG_LEDS_TRIGGER_BLKDEV=m``.
+ +See also: + +* ``Documentation/ABI/testing/sysfs-class-led-trigger-blkdev`` +* ``Documentation/ABI/stable/sysfs-block`` (``/sys/block//linked_leds``) + +Overview +======== + +.. note:: + The examples below use ```` to refer to the name of a + system-specific LED. If no suitable LED is available on a test + system (in a virtual machine, for example), it is possible to + use a userspace LED. (See ``Documentation/leds/uleds.rst``.) + +Verify that the ``blkdev`` LED trigger is available:: + + # grep blkdev /sys/class/leds//trigger + ... rfkill-none blkdev + +(If the previous command produces no output, you may need to load the trigger +module - ``modprobe ledtrig_blkdev``. If the module is not available, check +the value of ``CONFIG_LEDS_TRIGGER_BLKDEV`` in your kernel configuration.) + +Associate the LED with the ``blkdev`` LED trigger:: + + # echo blkdev > /sys/class/leds//trigger + + # cat /sys/class/leds//trigger + ... rfkill-none [blkdev] + +Note that several new device attributes are available in the +``/sys/class/leds/`` directory. + +* ``link_dev_by_path``, ``unlink_dev_by_path``, and ``unlink_dev_by_name`` are + used to manage the set of block devices associated with this LED. The LED + will blink when activity occurs on any of its linked devices. + +* ``blink_on_read``, ``blink_on_write``, ``blink_on_discard``, and + ``blink_on_flush`` are boolean values that determine whether the LED will + blink when a particular type of activity is detected on one of its linked + block devices. + +* ``blink_time`` is the duration (in milliseconds) of each blink of this LED. + (The minimum value is 10 milliseconds.) + +* ``check_interval`` is the frequency (in milliseconds) with which block devices + linked to this LED will be checked for activity and the LED blinked (if the + correct type of activity has occurred). + +* The ``linked_devices`` directory will contain a symbolic link to every device + that is associated with this LED. + +Link a block device to the LED:: + + # echo /dev/sda > /sys/class/leds//link_dev_by_path + + # ls /sys/class/leds//linked_devices + sda + +(The value written to ``link_dev_by_path`` must be the path of the device +special file, such as ``/dev/sda``, that represents the block device - or the +path of a symbolic link to such a device special file.) + +Activity on the device will now cause the LED to blink. The duration of each +blink (in milliseconds) can be adjusted by setting +``/sys/class/leds//blink_time``. (But see **check_interval and +blink_time** below.) + +Associate a second device with the LED:: + + # echo /dev/sdb > /sys/class/leds//link_dev_by_path + + # ls /sys/class/leds//linked_devices + sda sdb + +When a block device is linked to one or more LEDs, the LEDs are linked from +the device's ``linked_leds`` directory:: + + # ls /sys/class/block/sd{a,b}/linked_leds + /sys/class/block/sda/linked_leds: + + + /sys/class/block/sdb/linked_leds: + + +(The ``linked_leds`` directory only exists when the block device is linked to +at least one LED.) + +``check_interval`` and ``blink_time`` +===================================== + +* By default, linked block devices are checked for activity every 100 + milliseconds. This frequency can be changed for an LED via the + ``/sys/class/leds//check_interval`` attribute. (The minimum value is 25 + milliseconds.) 
+ +* All block devices associated with an LED are checked for activity every + ``check_interval`` milliseconds, and a blink is triggered if the correct type + of activity (as determined by the LED's ``blink_on_*`` attributes) is + detected. The duration of an LED's blink is determined by its ``blink_time`` + attribute. Thus (when the correct type of activity is detected), the LED will + be on for ``blink_time`` milliseconds and off for + ``check_interval - blink_time`` milliseconds. + +* The LED subsystem ignores new blink requests for an LED that is already + in the process of blinking, so setting a ``blink_time`` greater than or equal + to ``check_interval`` will cause some blinks to be missed. + +* Because of processing times, scheduling latencies, etc., avoiding missed + blinks actually requires a difference of at least a few milliseconds between + the ``blink_time`` and ``check_interval``. The required difference is likely + to vary from system to system. As a reference, a Thecus N5550 NAS requires a + difference of 7 milliseconds (e.g. ``check_interval == 100``, + ``blink_time == 93``). + +* The default values (``check_interval == 100``, ``blink_time == 75``) cause the + LED associated with a continuously active device to blink rapidly. For a more + "always on" effect, increase the ``blink_time`` (but not too much; see the + previous bullet). + +Other Notes +=========== + +* Many (possibly all) types of block devices work with this trigger, including: + + * SCSI (including SATA and USB) hard disk drives and SSDs + * SCSI (including SATA and USB) optical drives + * NVMe SSDs + * SD cards + * loopback block devices (``/dev/loop*``) + * device mapper devices, such as LVM logical volumes + * MD RAID devices + * zRAM compressed RAM-disks + * partitions on block devices that support them + +* The names of the symbolic links in ``/sys/class/leds//linked_devices`` + are **kernel** names, which may not match the paths used for + ``link_dev_by_path`` and ``unlink_dev_by_path``. This is most likely when a + symbolic link is used to refer to the device (as is common with logical + volumes), but it can be true for any device, because nothing prevents the + creation of device special files with arbitrary names (e.g. + ``sudo mknod /foo b 8 0``). + + Kernel names can be used to unlink block devices from LEDs by writing them to + the LED's ``unlink_dev_by_name`` attribute. + +* The ``blkdev`` LED trigger supports many-to-many device/LED associations. + A device can be associated with multiple LEDs, and an LED can be associated + with multiple devices. diff --git a/Documentation/x86/topology.rst b/Documentation/x86/topology.rst index 7f58010ea86a..9de14f3f7783 100644 --- a/Documentation/x86/topology.rst +++ b/Documentation/x86/topology.rst @@ -33,6 +33,7 @@ historical nature and should be cleaned up. The topology of a system is described in the units of: - packages + - cluster - cores - threads @@ -90,6 +91,22 @@ Package-related topology information in the kernel: Cache. In general, it is a number identifying an LLC uniquely on the system. +Clusters +======== +A cluster consists of threads of one or more cores sharing the same L2 cache. + +Cluster-related topology information in the kernel: + + - cluster_id: + + A per-CPU variable containing: + + - Upper bits extracted from the APIC ID. CPUs which have the same value + in these bits share an L2 and have the same cluster_id. + + CPUs for which cluster information is unavailable will show 65535 + (BAD_APICID) as the cluster_id.
+ Cores ===== A core consists of 1 or more threads. It does not matter whether the threads @@ -125,6 +142,11 @@ Thread-related topology information in the kernel: The number of online threads is also printed in /proc/cpuinfo "siblings." + - topology_cluster_cpumask(): + + The cpumask contains all online threads in the cluster to which a thread + belongs. + - topology_sibling_cpumask(): The cpumask contains all online threads in the core to which a thread @@ -138,6 +160,10 @@ Thread-related topology information in the kernel: The physical package ID to which a thread belongs. + - topology_cluster_id(); + + The ID of the cluster to which a thread belongs. + - topology_core_id(); The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index d995595394bb..19d1fb601796 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -50,7 +50,7 @@ KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=) KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_CFLAGS += -D__DISABLE_EXPORTS # Disable relocation relaxation in case the link is not PIE. -KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no) +KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no) KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h # sev.c indirectly inludes inat-table.h which is generated during diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 52e6e7ed4f78..f000cc16d128 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -343,14 +343,15 @@ static int rapl_pmu_event_init(struct perf_event *event) if (event->cpu < 0) return -EINVAL; - event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; - if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) return -EINVAL; cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1); bit = cfg - 1; + if (bit != PERF_RAPL_PP0) + event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; + /* check event supported */ if (!(rapl_cntr_mask & (1 << bit))) return -EINVAL; @@ -363,7 +364,15 @@ static int rapl_pmu_event_init(struct perf_event *event) pmu = cpu_to_rapl_pmu(event->cpu); if (!pmu) return -EINVAL; - event->cpu = pmu->cpu; + + /* + * FIXME: RAPL PMU considers events are uncore and MSRs can be read from + * the first available CPU of the die. But this is not true for energy-cores + * event. Therefore as a workaround don't consider pmu->cpu here for PERF_RAPL_PP0. 
+ */ + if (event->event_caps & PERF_EV_CAP_READ_ACTIVE_PKG) + event->cpu = pmu->cpu; + event->pmu_private = pmu; event->hw.event_base = rapl_msrs[bit].msr; event->hw.config = cfg; @@ -537,7 +546,7 @@ static struct perf_msr intel_rapl_spr_msrs[] = { * - want to use same event codes across both architectures */ static struct perf_msr amd_rapl_msrs[] = { - [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, 0, false, 0 }, + [PERF_RAPL_PP0] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK }, [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, 0, false, 0 }, [PERF_RAPL_PP1] = { 0, &rapl_events_gpu_group, 0, false, 0 }, @@ -764,7 +773,8 @@ static struct rapl_model model_spr = { }; static struct rapl_model model_amd_hygon = { - .events = BIT(PERF_RAPL_PKG), + .events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG), .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, .rapl_msrs = amd_rapl_msrs, }; diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h index ce9685fc78d8..2034cd556c07 100644 --- a/arch/x86/include/asm/cacheinfo.h +++ b/arch/x86/include/asm/cacheinfo.h @@ -7,6 +7,7 @@ extern unsigned int memory_caching_control; #define CACHE_MTRR 0x01 #define CACHE_PAT 0x02 +void cacheinfo_topoext_init_l2c_id(struct cpuinfo_x86 *c, int cpu); void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu); void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 06f2ede1544f..84c250027a50 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -358,6 +358,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c) if (!err) c->x86_coreid_bits = get_count_order(c->x86_max_cores); + cacheinfo_topoext_init_l2c_id(c, cpu); cacheinfo_amd_init_llc_id(c, cpu); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index f4e5aa27eec6..bed7b9633451 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -659,6 +659,42 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) return i; } +void cacheinfo_topoext_init_l2c_id(struct cpuinfo_x86 *c, int cpu) +{ + u32 eax, ebx, ecx, edx, num_sharing_cache; + int i = 0, bits; + + /* Check if L2 cache identifiers exists. */ + if (!cpuid_ecx(0x80000006)) + return; + + while (true) { + u32 level; + + cpuid_count(0x8000001d, i, &eax, &ebx, &ecx, &edx); + if (!eax) + return; + + /* + * Check if the current leaf is for L2 cache using + * eax[7:5] used to describe the cache level. + */ + level = (eax >> 5) & 0x7; + if (level == 2) + break; + + ++i; + } + + /* + * L2 ID is calculated from the number of threads + * sharing the L2 cache. + */ + num_sharing_cache = ((eax >> 14) & 0xfff) + 1; + bits = get_count_order(num_sharing_cache); + per_cpu(cpu_l2c_id, cpu) = c->apicid >> bits; +} + void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu) { /* diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 5a2962c492d3..cb0025b4a2fd 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -89,6 +89,7 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) /* Socket ID is ApicId[6] for these processors. 
*/ c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT; + cacheinfo_topoext_init_l2c_id(c, cpu); cacheinfo_hygon_init_llc_id(c, cpu); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index c1e31e9a85d7..92d73ccede70 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -1205,7 +1205,7 @@ void __flush_tlb_all(void) */ VM_WARN_ON_ONCE(preemptible()); - if (boot_cpu_has(X86_FEATURE_PGE)) { + if (cpu_feature_enabled(X86_FEATURE_PGE)) { __flush_tlb_global(); } else { /* diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index b808be77635e..6e696c6b7018 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -343,9 +343,10 @@ static int emit_call(u8 **pprog, void *func, void *ip) static int emit_rsb_call(u8 **pprog, void *func, void *ip) { + void *adjusted_ip; OPTIMIZER_HIDE_VAR(func); - x86_call_depth_emit_accounting(pprog, func); - return emit_patch(pprog, func, ip, 0xE8); + adjusted_ip = (u8 *)ip + x86_call_depth_emit_accounting(pprog, func); + return emit_patch(pprog, func, adjusted_ip, 0xE8); } static int emit_jump(u8 **pprog, void *func, void *ip) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 5c536151ef83..5a80379253a7 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -912,7 +912,7 @@ static void btusb_qca_cmd_timeout(struct hci_dev *hdev) } gpiod_set_value_cansleep(reset_gpio, 0); - msleep(200); + usleep_range(USEC_PER_SEC / 2, USEC_PER_SEC); gpiod_set_value_cansleep(reset_gpio, 1); return; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 9fa1d814508a..43d6a9d6a538 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -453,7 +453,8 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, /* Limit maximum size to 2GiB due to SG table limitations */ size = min(remaining_size, 2ULL << 30); - if (size >= (u64)pages_per_block << PAGE_SHIFT) + if ((size >= (u64)pages_per_block << PAGE_SHIFT) && + !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) min_block_size = (u64)pages_per_block << PAGE_SHIFT; cur_size = size; diff --git a/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c b/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c index 074e70a5c458..e507d2e1410b 100644 --- a/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c +++ b/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c @@ -516,11 +516,8 @@ static enum bp_result get_gpio_i2c_info( info->i2c_slave_address = record->i2c_slave_addr; /* TODO: check how to get register offset for en, Y, etc. 
*/ - info->gpio_info.clk_a_register_index = - le16_to_cpu( - header->gpio_pin[table_index].data_a_reg_index); - info->gpio_info.clk_a_shift = - header->gpio_pin[table_index].gpio_bitshift; + info->gpio_info.clk_a_register_index = le16_to_cpu(pin->data_a_reg_index); + info->gpio_info.clk_a_shift = pin->gpio_bitshift; return BP_RESULT_OK; } diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c index 8a0dd0d7134b..481a15b02126 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c @@ -714,7 +714,7 @@ static const struct dc_debug_options debug_defaults_drv = { .timing_trace = false, .clock_trace = true, .disable_pplib_clock_request = true, - .pipe_split_policy = MPC_SPLIT_AVOID_MULT_DISP, + .pipe_split_policy = MPC_SPLIT_DYNAMIC, .force_single_disp_pipe_split = false, .disable_dcc = DCC_ENABLE, .vsr_support = true, diff --git a/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c b/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c index fbcf0afeae0d..ec30d171e7de 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c @@ -642,7 +642,7 @@ static const struct dc_debug_options debug_defaults_drv = { .clock_trace = true, .disable_pplib_clock_request = true, .min_disp_clk_khz = 100000, - .pipe_split_policy = MPC_SPLIT_AVOID_MULT_DISP, + .pipe_split_policy = MPC_SPLIT_DYNAMIC, .force_single_disp_pipe_split = false, .disable_dcc = DCC_ENABLE, .vsr_support = true, diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c index 0bcd4fe0ef17..5b7a780cbd54 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c @@ -304,7 +304,8 @@ navi10_get_allowed_feature_mask(struct smu_context *smu, | FEATURE_MASK(FEATURE_GFX_SS_BIT) | FEATURE_MASK(FEATURE_APCC_DFLL_BIT) | FEATURE_MASK(FEATURE_FW_CTF_BIT) - | FEATURE_MASK(FEATURE_OUT_OF_BAND_MONITOR_BIT); + | FEATURE_MASK(FEATURE_OUT_OF_BAND_MONITOR_BIT) + | FEATURE_MASK(FEATURE_TEMP_DEPENDENT_VMIN_BIT); if (adev->pm.pp_feature & PP_SCLK_DPM_MASK) *(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_DPM_GFXCLK_BIT); diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index fd22d753b4ed..fcd4bfef7415 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -308,7 +308,8 @@ static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched) */ void drm_sched_fault(struct drm_gpu_scheduler *sched) { - mod_delayed_work(sched->timeout_wq, &sched->work_tdr, 0); + if (sched->ready) + mod_delayed_work(sched->timeout_wq, &sched->work_tdr, 0); } EXPORT_SYMBOL(drm_sched_fault); diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig index dc6816d36d06..bda249068182 100644 --- a/drivers/leds/trigger/Kconfig +++ b/drivers/leds/trigger/Kconfig @@ -154,4 +154,13 @@ config LEDS_TRIGGER_TTY When build as a module this driver will be called ledtrig-tty. +config LEDS_TRIGGER_BLKDEV + tristate "LED Trigger for block devices" + depends on BLOCK + help + The blkdev LED trigger allows LEDs to be controlled by block device + activity (reads and writes). + + See Documentation/leds/ledtrig-blkdev.rst. 
+ endif # LEDS_TRIGGERS diff --git a/drivers/leds/trigger/Makefile b/drivers/leds/trigger/Makefile index 25c4db97cdd4..d53bab5d93f1 100644 --- a/drivers/leds/trigger/Makefile +++ b/drivers/leds/trigger/Makefile @@ -16,3 +16,4 @@ obj-$(CONFIG_LEDS_TRIGGER_NETDEV) += ledtrig-netdev.o obj-$(CONFIG_LEDS_TRIGGER_PATTERN) += ledtrig-pattern.o obj-$(CONFIG_LEDS_TRIGGER_AUDIO) += ledtrig-audio.o obj-$(CONFIG_LEDS_TRIGGER_TTY) += ledtrig-tty.o +obj-$(CONFIG_LEDS_TRIGGER_BLKDEV) += ledtrig-blkdev.o diff --git a/drivers/leds/trigger/ledtrig-blkdev.c b/drivers/leds/trigger/ledtrig-blkdev.c new file mode 100644 index 000000000000..067eedb003b5 --- /dev/null +++ b/drivers/leds/trigger/ledtrig-blkdev.c @@ -0,0 +1,1221 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Block device LED trigger + * + * Copyright 2021-2022 Ian Pilcher + */ + +#include +#include +#include +#include +#include + +/** + * DOC: Overview + * + * The ``blkdev`` LED trigger works by periodically checking the activity + * counters of block devices that have been linked to one or more LEDs and + * blinking those LED(s) if the correct type of activity has occurred. The + * periodic check is scheduled with the Linux kernel's deferred work facility. + * + * Trigger-specific data about block devices and LEDs is stored in two data + * structures --- &struct blkdev_trig_bdev (a "BTB") and &struct blkdev_trig_led + * (a "BTL"). Each structure contains a &struct xarray that holds links to any + * linked devices of the other type. I.e. &blkdev_trig_bdev.linked_btls + * contains links to all BTLs whose LEDs have been linked to the BTB's block + * device, and &blkdev_trig_led.linked_btbs contains links to all BTBs whose + * block devices have been linked to the BTL's LED. Thus, a block device can + * be linked to more than one LED, and an LED can be linked to more than one + * block device. + */ + +/* Default, minimum & maximum blink duration (milliseconds) */ +#define BLKDEV_TRIG_BLINK_DEF 75 +#define BLKDEV_TRIG_BLINK_MIN 10 +#define BLKDEV_TRIG_BLINK_MAX 86400000 /* 24 hours */ + +/* Default, minimum & maximum activity check interval (milliseconds) */ +#define BLKDEV_TRIG_CHECK_DEF 100 +#define BLKDEV_TRIG_CHECK_MIN 25 +#define BLKDEV_TRIG_CHECK_MAX 86400000 /* 24 hours */ + +/* + * If blkdev_trig_check() can't lock the mutex, how long to wait before trying + * again (milliseconds) + */ +#define BLKDEV_TRIG_CHECK_RETRY 5 + +/* Mode argument for calls to blkdev_get_by_path() and blkdev_put() */ +#define BLKDEV_TRIG_FMODE 0 + +/** + * struct blkdev_trig_bdev - Trigger-specific data about a block device. + * @last_checked: Time (in jiffies) at which the trigger last checked this + * block device for activity. + * @last_activity: Time (in jiffies) at which the trigger last detected + * activity of each type. + * @ios: Activity counter values for each type, corresponding to + * the timestamps in &last_activity. + * @index: &xarray index, so the BTB can be included in one or more + * &blkdev_trig_led.linked_btbs. + * @bdev: The block device. + * @linked_btls: The BTLs that represent the LEDs linked to the BTB's + * block device. + * + * Every block device linked to at least one LED gets a "BTB." A BTB is created + * when a block device that is not currently linked to any LEDs is linked to an + * LED. + * + * A BTB is freed when one of the following occurs: + * + * * The number of LEDs linked to the block device becomes zero, because it has + * been unlinked from its last LED using the trigger's &sysfs interface. 
+ *
+ * * The number of LEDs linked to the block device becomes zero, because the
+ *   last LED to which it was linked has been disassociated from the trigger
+ *   (which happens automatically if the LED device is removed from the system).
+ *
+ * * The BTB's block device is removed from the system.  To accommodate this
+ *   scenario, BTBs are created as device resources, so that the release
+ *   function will be called by the driver core when the device is removed.
+ */
+struct blkdev_trig_bdev {
+	unsigned long last_checked;
+	unsigned long last_activity[NR_STAT_GROUPS];
+	unsigned long ios[NR_STAT_GROUPS];
+	unsigned long index;
+	struct block_device *bdev;
+	struct xarray linked_btls;
+};
+
+/**
+ * struct blkdev_trig_led - Trigger-specific data about an LED.
+ * @last_checked:	Time (in jiffies) at which the trigger last checked
+ *			the block devices linked to this LED for activity.
+ * @index:		&xarray index, so the BTL can be included in one or more
+ *			&blkdev_trig_bdev.linked_btls.
+ * @mode:		Bitmask for types of block device activity that will
+ *			cause this LED to blink --- reads, writes, discards,
+ *			etc.
+ * @led:		The LED device.
+ * @blink_msec:		Duration of a blink (milliseconds).
+ * @check_jiffies:	Frequency with which block devices linked to this LED
+ *			should be checked for activity (jiffies).
+ * @linked_btbs:	The BTBs that represent the block devices linked to the
+ *			BTL's LED.
+ * @all_btls_node:	The BTL's node in the module's list of all BTLs.
+ *
+ * Every LED associated with the block device trigger gets a "BTL."  A BTL is
+ * created when the trigger is "activated" on an LED (usually by writing
+ * ``blkdev`` to the LED's &sysfs &trigger attribute).  A BTL is freed when its
+ * LED is disassociated from the trigger, either through the trigger's &sysfs
+ * interface or because the LED device is removed from the system.
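+ *
+ * For example (LED name and device path are illustrative), writing ``blkdev``
+ * to ``/sys/class/leds/example-led/trigger`` creates a BTL for that LED, and
+ * writing ``/dev/sda`` to the LED's ``link_dev_by_path`` attribute then links
+ * it to that block device (creating a BTB for the device if needed).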
+ */ +struct blkdev_trig_led { + unsigned long last_checked; + unsigned long index; + unsigned long mode; /* must be ulong for atomic bit ops */ + struct led_classdev *led; + unsigned int blink_msec; + unsigned int check_jiffies; + struct xarray linked_btbs; + struct hlist_node all_btls_node; +}; + +/* Protects everything except atomic LED attributes */ +static DEFINE_MUTEX(blkdev_trig_mutex); + +/* BTB device resource release function */ +static void blkdev_trig_btb_release(struct device *dev, void *res); + +/* Index for next BTB or BTL */ +static unsigned long blkdev_trig_next_index; + +/* All LEDs associated with the trigger */ +static HLIST_HEAD(blkdev_trig_all_btls); + +/* Delayed work to periodically check for activity & blink LEDs */ +static void blkdev_trig_check(struct work_struct *work); +static DECLARE_DELAYED_WORK(blkdev_trig_work, blkdev_trig_check); + +/* When is the delayed work scheduled to run next (jiffies) */ +static unsigned long blkdev_trig_next_check; + +/* Total number of BTB-to-BTL links */ +static unsigned int blkdev_trig_link_count; + +/* Empty sysfs attribute list for next 2 declarations */ +static struct attribute *blkdev_trig_attrs_empty[] = { NULL }; + +/* linked_leds sysfs directory for block devs linked to 1 or more LEDs */ +static const struct attribute_group blkdev_trig_linked_leds = { + .name = "linked_leds", + .attrs = blkdev_trig_attrs_empty, +}; + +/* linked_devices sysfs directory for each LED associated with the trigger */ +static const struct attribute_group blkdev_trig_linked_devs = { + .name = "linked_devices", + .attrs = blkdev_trig_attrs_empty, +}; + + +/* + * + * Delayed work to check for activity & blink LEDs + * + */ + +/** + * blkdev_trig_blink() - Blink an LED, if the correct type of activity has + * occurred on the block device. + * @btl: The BTL that represents the LED + * @btb: The BTB that represents the block device + * + * Context: Process context. Caller must hold &blkdev_trig_mutex. + * Return: &true if the LED is blinked, &false if not. + */ +static bool blkdev_trig_blink(const struct blkdev_trig_led *btl, + const struct blkdev_trig_bdev *btb) +{ + unsigned long mode, mask, delay_on, delay_off; + enum stat_group i; + + mode = READ_ONCE(btl->mode); + + for (i = STAT_READ, mask = 1; i <= STAT_FLUSH; ++i, mask <<= 1) { + + if (!(mode & mask)) + continue; + + if (time_before_eq(btb->last_activity[i], btl->last_checked)) + continue; + + delay_on = READ_ONCE(btl->blink_msec); + delay_off = 1; /* 0 leaves LED turned on */ + + led_blink_set_oneshot(btl->led, &delay_on, &delay_off, 0); + return true; + } + + return false; +} + +/** + * blkdev_trig_update_btb() - Update a BTB's activity counters and timestamps. + * @btb: The BTB + * @now: Timestamp (in jiffies) + * + * Context: Process context. Caller must hold &blkdev_trig_mutex. + */ +static void blkdev_trig_update_btb(struct blkdev_trig_bdev *btb, + unsigned long now) +{ + unsigned long new_ios; + enum stat_group i; + + for (i = STAT_READ; i <= STAT_FLUSH; ++i) { + + new_ios = part_stat_read(btb->bdev, ios[i]); + + if (new_ios != btb->ios[i]) { + btb->ios[i] = new_ios; + btb->last_activity[i] = now; + } + } + + btb->last_checked = now; +} + +/** + * blkdev_trig_check() - Check linked devices for activity and blink LEDs. + * @work: Delayed work (&blkdev_trig_work) + * + * Context: Process context. Takes and releases &blkdev_trig_mutex. 
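+ *
+ * The work is always rescheduled for the nearest per-LED deadline.  For
+ * example (intervals illustrative), with one linked LED checked every 100 ms
+ * and another every 1000 ms, the delay chosen below is the smallest value of
+ * (last_checked + check_jiffies - now) across all BTLs, so the check runs at
+ * least as often as the most frequently checked LED requires.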
+ */ +static void blkdev_trig_check(struct work_struct *work) +{ + struct blkdev_trig_led *btl; + struct blkdev_trig_bdev *btb; + unsigned long index, delay, now, led_check, led_delay; + bool blinked; + + if (!mutex_trylock(&blkdev_trig_mutex)) { + delay = msecs_to_jiffies(BLKDEV_TRIG_CHECK_RETRY); + goto exit_reschedule; + } + + now = jiffies; + delay = ULONG_MAX; + + hlist_for_each_entry (btl, &blkdev_trig_all_btls, all_btls_node) { + + led_check = btl->last_checked + btl->check_jiffies; + + if (time_before_eq(led_check, now)) { + + blinked = false; + + xa_for_each (&btl->linked_btbs, index, btb) { + + if (btb->last_checked != now) + blkdev_trig_update_btb(btb, now); + if (!blinked) + blinked = blkdev_trig_blink(btl, btb); + } + + btl->last_checked = now; + led_delay = btl->check_jiffies; + + } else { + led_delay = led_check - now; + } + + if (led_delay < delay) + delay = led_delay; + } + + mutex_unlock(&blkdev_trig_mutex); + +exit_reschedule: + WARN_ON_ONCE(delay == ULONG_MAX); + WARN_ON_ONCE(!schedule_delayed_work(&blkdev_trig_work, delay)); +} + +/** + * blkdev_trig_sched_led() - Set the schedule of the delayed work when a new + * LED is added to the schedule. + * @btl: The BTL that represents the LED + * + * Called when the number of block devices to which an LED is linked becomes + * non-zero. + * + * Context: Process context. Caller must hold &blkdev_trig_mutex. + */ +static void blkdev_trig_sched_led(const struct blkdev_trig_led *btl) +{ + unsigned long delay = READ_ONCE(btl->check_jiffies); + unsigned long check_by = jiffies + delay; + + /* + * If no other LED-to-block device links exist, simply schedule the + * delayed work according to this LED's check_interval attribute + * (check_jiffies). + */ + if (blkdev_trig_link_count == 0) { + WARN_ON(!schedule_delayed_work(&blkdev_trig_work, delay)); + blkdev_trig_next_check = check_by; + return; + } + + /* + * If the next check is already scheduled to occur soon enough to + * accomodate this LED's check_interval, the schedule doesn't need + * to be changed. + */ + if (time_after_eq(check_by, blkdev_trig_next_check)) + return; + + /* + * Modify the schedule, so that the delayed work runs soon enough for + * this LED. + */ + WARN_ON(!mod_delayed_work(system_wq, &blkdev_trig_work, delay)); + blkdev_trig_next_check = check_by; +} + + +/* + * + * Linking and unlinking LEDs and block devices + * + */ + +/** + * blkdev_trig_link() - Link a block device to an LED. + * @btl: The BTL that represents the LED + * @btb: The BTB that represents the block device + * + * Context: Process context. Caller must hold &blkdev_trig_mutex. + * Return: &0 on success, negative &errno on error. 
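+ *
+ * On success the link is visible from both directions in &sysfs.  For example,
+ * linking LED ``example-led`` to ``/dev/sda`` (names illustrative) creates the
+ * following symlinks:
+ *
+ *	/sys/class/block/sda/linked_leds/example-led
+ *	/sys/class/leds/example-led/linked_devices/sda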
+ */ +static int blkdev_trig_link(struct blkdev_trig_led *btl, + struct blkdev_trig_bdev *btb) +{ + bool led_first_link; + int err; + + led_first_link = xa_empty(&btl->linked_btbs); + + err = xa_insert(&btb->linked_btls, btl->index, btl, GFP_KERNEL); + if (err) + return err; + + err = xa_insert(&btl->linked_btbs, btb->index, btb, GFP_KERNEL); + if (err) + goto error_erase_btl; + + /* Create /sys/class/block//linked_leds/ symlink */ + err = sysfs_add_link_to_group(bdev_kobj(btb->bdev), + blkdev_trig_linked_leds.name, + &btl->led->dev->kobj, btl->led->name); + if (err) + goto error_erase_btb; + + /* Create /sys/class/leds//linked_devices/ symlink */ + err = sysfs_add_link_to_group(&btl->led->dev->kobj, + blkdev_trig_linked_devs.name, + bdev_kobj(btb->bdev), + dev_name(&btb->bdev->bd_device)); + if (err) + goto error_remove_symlink; + + /* + * If this is the first block device linked to this LED, the delayed + * work schedule may need to be changed. + */ + if (led_first_link) + blkdev_trig_sched_led(btl); + + ++blkdev_trig_link_count; + + return 0; + +error_remove_symlink: + sysfs_remove_link_from_group(bdev_kobj(btb->bdev), + blkdev_trig_linked_leds.name, + btl->led->name); +error_erase_btb: + xa_erase(&btl->linked_btbs, btb->index); +error_erase_btl: + xa_erase(&btb->linked_btls, btl->index); + return err; +} + +/** + * blkdev_trig_put_btb() - Remove and free a BTB, if it is no longer needed. + * @btb: The BTB + * + * Does nothing if the BTB (block device) is still linked to at least one LED. + * + * Context: Process context. Caller must hold &blkdev_trig_mutex. + */ +static void blkdev_trig_put_btb(struct blkdev_trig_bdev *btb) +{ + struct block_device *bdev = btb->bdev; + int err; + + if (xa_empty(&btb->linked_btls)) { + + sysfs_remove_group(bdev_kobj(bdev), &blkdev_trig_linked_leds); + err = devres_destroy(&bdev->bd_device, blkdev_trig_btb_release, + NULL, NULL); + WARN_ON(err); + } +} + +/** + * _blkdev_trig_unlink_always() - Perform the unconditionally required steps of + * unlinking a block device from an LED. + * @btl: The BTL that represents the LED + * @btb: The BTB that represents the block device + * + * When a block device is unlinked from an LED, certain steps must be performed + * only if the block device is **not** being released. This function performs + * those steps that are **always** required, whether or not the block device is + * being released. + * + * Context: Process context. Caller must hold &blkdev_trig_mutex. + */ +static void _blkdev_trig_unlink_always(struct blkdev_trig_led *btl, + struct blkdev_trig_bdev *btb) +{ + --blkdev_trig_link_count; + + if (blkdev_trig_link_count == 0) + WARN_ON(!cancel_delayed_work_sync(&blkdev_trig_work)); + + xa_erase(&btb->linked_btls, btl->index); + xa_erase(&btl->linked_btbs, btb->index); + + /* Remove /sys/class/leds//linked_devices/ symlink */ + sysfs_remove_link_from_group(&btl->led->dev->kobj, + blkdev_trig_linked_devs.name, + dev_name(&btb->bdev->bd_device)); +} + +/** + * blkdev_trig_unlink_norelease() - Unlink an LED from a block device that is + * **not** being released. + * @btl: The BTL that represents the LED. + * @btb: The BTB that represents the block device. + * + * Context: Process context. Caller must hold &blkdev_trig_mutex. 
+ */ +static void blkdev_trig_unlink_norelease(struct blkdev_trig_led *btl, + struct blkdev_trig_bdev *btb) +{ + _blkdev_trig_unlink_always(btl, btb); + + /* Remove /sys/class/block//linked_leds/ symlink */ + sysfs_remove_link_from_group(bdev_kobj(btb->bdev), + blkdev_trig_linked_leds.name, + btl->led->name); + + blkdev_trig_put_btb(btb); +} + +/** + * blkdev_trig_unlink_release() - Unlink an LED from a block device that is + * being released. + * @btl: The BTL that represents the LED + * @btb: The BTB that represents the block device + * + * Context: Process context. Caller must hold &blkdev_trig_mutex. + */ +static void blkdev_trig_unlink_release(struct blkdev_trig_led *btl, + struct blkdev_trig_bdev *btb) +{ + _blkdev_trig_unlink_always(btl, btb); + + /* + * If the BTB is being released, the driver core has already removed the + * device's attribute groups, and the BTB will be freed automatically, + * so there's nothing else to do. + */ +} + + +/* + * + * BTB creation + * + */ + +/** + * blkdev_trig_btb_release() - BTB device resource release function. + * @dev: The block device + * @res: The BTB + * + * Called by the driver core when a block device with a BTB is removed. + * + * Context: Process context. Takes and releases &blkdev_trig_mutex. + */ +static void blkdev_trig_btb_release(struct device *dev, void *res) +{ + struct blkdev_trig_bdev *btb = res; + struct blkdev_trig_led *btl; + unsigned long index; + + mutex_lock(&blkdev_trig_mutex); + + xa_for_each (&btb->linked_btls, index, btl) + blkdev_trig_unlink_release(btl, btb); + + mutex_unlock(&blkdev_trig_mutex); +} + +/** + * blkdev_trig_get_bdev() - Get a block device by path. + * @path: The value written to an LED's &link_dev_by_path or + * &unlink_dev_by_path attribute, which should be the path to a + * special file that represents a block device + * @len: The number of characters in &path (not including its + * terminating null) + * + * The caller must call blkdev_put() when finished with the device. + * + * Context: Process context. + * Return: The block device, or an error pointer. + */ +static struct block_device *blkdev_trig_get_bdev(const char *path, size_t len) +{ + struct block_device *bdev; + char *buf; + + buf = kmemdup(path, len + 1, GFP_KERNEL); /* +1 to include null */ + if (buf == NULL) + return ERR_PTR(-ENOMEM); + + bdev = blkdev_get_by_path(strim(buf), BLKDEV_TRIG_FMODE, THIS_MODULE); + kfree(buf); + return bdev; +} + +/** + * blkdev_trig_get_btb() - Find or create the BTB for a block device. + * @path: The value written to an LED's &link_dev_by_path attribute, + * which should be the path to a special file that represents a + * block device + * @len: The number of characters in &path + * + * If a new BTB is created, because the block device was not previously linked + * to any LEDs, the block device's &linked_leds &sysfs directory is created. + * + * Context: Process context. Caller must hold &blkdev_trig_mutex. + * Return: Pointer to the BTB, error pointer on error. 
+ */ +static struct blkdev_trig_bdev *blkdev_trig_get_btb(const char *path, + size_t len) +{ + struct block_device *bdev; + struct blkdev_trig_bdev *btb; + int err; + + bdev = blkdev_trig_get_bdev(path, len); + if (IS_ERR(bdev)) + return ERR_CAST(bdev); + + btb = devres_find(&bdev->bd_device, blkdev_trig_btb_release, + NULL, NULL); + if (btb != NULL) { + err = 0; + goto exit_put_bdev; + } + + if (blkdev_trig_next_index == ULONG_MAX) { + err = -EOVERFLOW; + goto exit_put_bdev; + } + + btb = devres_alloc(blkdev_trig_btb_release, sizeof(*btb), GFP_KERNEL); + if (btb == NULL) { + err = -ENOMEM; + goto exit_put_bdev; + } + + err = sysfs_create_group(bdev_kobj(bdev), &blkdev_trig_linked_leds); + if (err) + goto exit_free_btb; + + btb->index = blkdev_trig_next_index++; + btb->bdev = bdev; + xa_init(&btb->linked_btls); + + /* Populate BTB activity counters */ + blkdev_trig_update_btb(btb, jiffies); + + devres_add(&bdev->bd_device, btb); + +exit_free_btb: + if (err) + devres_free(btb); +exit_put_bdev: + blkdev_put(bdev, BLKDEV_TRIG_FMODE); + return err ? ERR_PTR(err) : btb; +} + + +/* + * + * Activating and deactivating the trigger on an LED + * + */ + +/** + * blkdev_trig_activate() - Called by the LEDs subsystem when an LED is + * associated with the trigger. + * @led: The LED + * + * Context: Process context. Takes and releases &blkdev_trig_mutex. + * Return: &0 on success, negative &errno on error. + */ +static int blkdev_trig_activate(struct led_classdev *led) +{ + struct blkdev_trig_led *btl; + int err; + + btl = kzalloc(sizeof(*btl), GFP_KERNEL); + if (btl == NULL) + return -ENOMEM; + + err = mutex_lock_interruptible(&blkdev_trig_mutex); + if (err) + goto exit_free; + + if (blkdev_trig_next_index == ULONG_MAX) { + err = -EOVERFLOW; + goto exit_unlock; + } + + btl->index = blkdev_trig_next_index++; + btl->last_checked = jiffies; + btl->mode = -1; /* set all bits */ + btl->led = led; + btl->blink_msec = BLKDEV_TRIG_BLINK_DEF; + btl->check_jiffies = msecs_to_jiffies(BLKDEV_TRIG_CHECK_DEF); + xa_init(&btl->linked_btbs); + + hlist_add_head(&btl->all_btls_node, &blkdev_trig_all_btls); + led_set_trigger_data(led, btl); + +exit_unlock: + mutex_unlock(&blkdev_trig_mutex); +exit_free: + if (err) + kfree(btl); + return err; +} + +/** + * blkdev_trig_deactivate() - Called by the the LEDs subsystem when an LED is + * disassociated from the trigger. + * @led: The LED + * + * The LEDs subsystem also calls this function when an LED associated with the + * trigger is removed or when the trigger is unregistered (if the module is + * unloaded). + * + * Context: Process context. Takes and releases &blkdev_trig_mutex. + */ +static void blkdev_trig_deactivate(struct led_classdev *led) +{ + struct blkdev_trig_led *btl = led_get_trigger_data(led); + struct blkdev_trig_bdev *btb; + unsigned long index; + + mutex_lock(&blkdev_trig_mutex); + + xa_for_each (&btl->linked_btbs, index, btb) + blkdev_trig_unlink_norelease(btl, btb); + + hlist_del(&btl->all_btls_node); + kfree(btl); + + mutex_unlock(&blkdev_trig_mutex); +} + + +/* + * + * Link-related attribute store functions + * + */ + +/** + * link_dev_by_path_store() - &link_dev_by_path device attribute store function. + * @dev: The LED device + * @attr: The &link_dev_by_path attribute (&dev_attr_link_dev_by_path) + * @buf: The value written to the attribute, which should be the path to + * a special file that represents a block device to be linked to + * the LED (e.g. ``/dev/sda``) + * @count: The number of characters in &buf + * + * Context: Process context. 
Takes and releases &blkdev_trig_mutex. + * Return: &count on success, negative &errno on error. + */ +static ssize_t link_dev_by_path_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); + struct blkdev_trig_bdev *btb; + int err; + + err = mutex_lock_interruptible(&blkdev_trig_mutex); + if (err) + return err; + + btb = blkdev_trig_get_btb(buf, count); + if (IS_ERR(btb)) { + err = PTR_ERR(btb); + goto exit_unlock; + } + + if (xa_load(&btb->linked_btls, btl->index) != NULL) { + err = -EEXIST; + goto exit_put_btb; + } + + err = blkdev_trig_link(btl, btb); + +exit_put_btb: + if (err) + blkdev_trig_put_btb(btb); +exit_unlock: + mutex_unlock(&blkdev_trig_mutex); + return err ? : count; +} + +/** + * unlink_dev_by_path_store() - &unlink_dev_by_path device attribute store + * function. + * @dev: The LED device + * @attr: The &unlink_dev_by_path attribute (&dev_attr_unlink_dev_by_path) + * @buf: The value written to the attribute, which should be the path to + * a special file that represents a block device to be unlinked + * from the LED (e.g. ``/dev/sda``) + * @count: The number of characters in &buf + * + * Context: Process context. Takes and releases &blkdev_trig_mutex. + * Return: &count on success, negative &errno on error. + */ +static ssize_t unlink_dev_by_path_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); + struct block_device *bdev; + struct blkdev_trig_bdev *btb; + int err; + + bdev = blkdev_trig_get_bdev(buf, count); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + err = mutex_lock_interruptible(&blkdev_trig_mutex); + if (err) + goto exit_put_bdev; + + btb = devres_find(&bdev->bd_device, blkdev_trig_btb_release, + NULL, NULL); + if (btb == NULL) { + err = -EUNATCH; /* bdev isn't linked to any LED */ + goto exit_unlock; + } + + if (xa_load(&btb->linked_btls, btl->index) == NULL) { + err = -EUNATCH; /* bdev isn't linked to this LED */ + goto exit_unlock; + } + + blkdev_trig_unlink_norelease(btl, btb); + +exit_unlock: + mutex_unlock(&blkdev_trig_mutex); +exit_put_bdev: + blkdev_put(bdev, BLKDEV_TRIG_FMODE); + return err ? : count; +} + +/** + * unlink_dev_by_name_store() - &unlink_dev_by_name device attribute store + * function. + * @dev: The LED device + * @attr: The &unlink_dev_by_name attribute (&dev_attr_unlink_dev_by_name) + * @buf: The value written to the attribute, which should be the kernel + * name of a block device to be unlinked from the LED (e.g. + * ``sda``) + * @count: The number of characters in &buf + * + * Context: Process context. Takes and releases &blkdev_trig_mutex. + * Return: &count on success, negative &errno on error. + */ +static ssize_t unlink_dev_by_name_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); + struct blkdev_trig_bdev *btb; + unsigned long index; + int err; + + err = mutex_lock_interruptible(&blkdev_trig_mutex); + if (err) + return err; + + err = -EUNATCH; + + xa_for_each (&btl->linked_btbs, index, btb) { + + if (sysfs_streq(dev_name(&btb->bdev->bd_device), buf)) { + blkdev_trig_unlink_norelease(btl, btb); + err = 0; + break; + } + } + + mutex_unlock(&blkdev_trig_mutex); + return err ? : count; +} + + +/* + * + * Atomic attribute show & store functions + * + */ + +/** + * blink_time_show() - &blink_time device attribute show function. 
+ * @dev: The LED device + * @attr: The &blink_time attribute (&dev_attr_blink_time) + * @buf: Output buffer + * + * Writes the value of &blkdev_trig_led.blink_msec to &buf. + * + * Context: Process context. + * Return: The number of characters written to &buf. + */ +static ssize_t blink_time_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + const struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); + + return sysfs_emit(buf, "%u\n", READ_ONCE(btl->blink_msec)); +} + +/** + * blink_time_store() - &blink_time device attribute store function. + * @dev: The LED device + * @attr: The &blink_time attribute (&dev_attr_blink_time) + * @buf: The new value (as written to the &sysfs attribute) + * @count: The number of characters in &buf + * + * Sets &blkdev_trig_led.blink_msec to the value in &buf. + * + * Context: Process context. + * Return: &count on success, negative &errno on error. + */ +static ssize_t blink_time_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); + unsigned int value; + int err; + + err = kstrtouint(buf, 0, &value); + if (err) + return err; + + if (value < BLKDEV_TRIG_BLINK_MIN || value > BLKDEV_TRIG_BLINK_MAX) + return -ERANGE; + + WRITE_ONCE(btl->blink_msec, value); + return count; +} + +/** + * check_interval_show() - &check_interval device attribute show function. + * @dev: The LED device + * @attr: The &check_interval attribute (&dev_attr_check_interval) + * @buf: Output buffer + * + * Writes the value of &blkdev_trig_led.check_jiffies (converted to + * milliseconds) to &buf. + * + * Context: Process context. + * Return: The number of characters written to &buf. + */ +static ssize_t check_interval_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); + + return sysfs_emit(buf, "%u\n", + jiffies_to_msecs(READ_ONCE(btl->check_jiffies))); +} + +/** + * check_interval_store() - &check_interval device attribute store function + * @dev: The LED device + * @attr: The &check_interval attribute (&dev_attr_check_interval) + * @buf: The new value (as written to the &sysfs attribute) + * @count: The number of characters in &buf + * + * Sets &blkdev_trig_led.check_jiffies to the value in &buf (after converting + * from milliseconds). + * + * Context: Process context. + * Return: &count on success, negative &errno on error. + */ +static ssize_t check_interval_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct blkdev_trig_led *led = led_trigger_get_drvdata(dev); + unsigned int value; + int err; + + err = kstrtouint(buf, 0, &value); + if (err) + return err; + + if (value < BLKDEV_TRIG_CHECK_MIN || value > BLKDEV_TRIG_CHECK_MAX) + return -ERANGE; + + WRITE_ONCE(led->check_jiffies, msecs_to_jiffies(value)); + + return count; +} + +/** + * blkdev_trig_mode_show() - Helper for boolean attribute show functions. + * @led: The LED + * @buf: Output buffer + * @bit: Which bit to show + * + * Context: Process context. + * Return: The number of characters written to &buf. + */ +static int blkdev_trig_mode_show(const struct blkdev_trig_led *led, char *buf, + enum stat_group bit) +{ + return sysfs_emit(buf, + READ_ONCE(led->mode) & (1 << bit) ? "Y\n" : "N\n"); +} + +/** + * blkdev_trig_mode_store() - Helper for boolean attribute store functions. 
+ * @led: The LED + * @buf: The new value (as written to the &sysfs attribute) + * @count: The number of characters in &buf + * @bit: Which bit to set + * + * Context: Process context. + * Return: &count on success, negative &errno on error. + */ +static int blkdev_trig_mode_store(struct blkdev_trig_led *led, + const char *buf, size_t count, + enum stat_group bit) +{ + bool set; + int err; + + err = kstrtobool(buf, &set); + if (err) + return err; + + if (set) + set_bit(bit, &led->mode); + else + clear_bit(bit, &led->mode); + + return count; +} + +/** + * blink_on_read_show() - &blink_on_read device attribute show function. + * @dev: The LED device + * @attr: The &blink_on_read attribute (&dev_attr_blink_on_read) + * @buf: Output buffer + * + * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_READ bit in + * &blkdev_trig_led.mode is set or cleared. + * + * Context: Process context. + * Return: The number of characters written to &buf. + */ +static ssize_t blink_on_read_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return blkdev_trig_mode_show(led_trigger_get_drvdata(dev), + buf, STAT_READ); +} + +/** + * blink_on_read_store() - &blink_on_read device attribute store function. + * @dev: The LED device + * @attr: The &blink_on_read attribute (&dev_attr_blink_on_read) + * @buf: The new value (as written to the &sysfs attribute) + * @count: The number of characters in &buf + * + * Sets the &STAT_READ bit in &blkdev_trig_led.mode to the value in &buf + * (interpretted as a boolean). + * + * Context: Process context. + * Return: &count on success, negative &errno on error. + */ +static ssize_t blink_on_read_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + return blkdev_trig_mode_store(led_trigger_get_drvdata(dev), + buf, count, STAT_READ); +} + +/** + * blink_on_write_show() - &blink_on_write device attribute show function. + * @dev: The LED device + * @attr: The &blink_on_write attribute (&dev_attr_blink_on_write) + * @buf: Output buffer + * + * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_WRITE bit in + * in &blkdev_trig_led.mode is set or cleared. + * + * Context: Process context. + * Return: The number of characters written to &buf. + */ +static ssize_t blink_on_write_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return blkdev_trig_mode_show(led_trigger_get_drvdata(dev), + buf, STAT_WRITE); +} + +/** + * blink_on_write_store() - &blink_on_write device attribute store function. + * @dev: The LED device + * @attr: The &blink_on_write attribute (&dev_attr_blink_on_write) + * @buf: The new value (as written to the &sysfs attribute) + * @count: The number of characters in &buf + * + * Sets the &STAT_WRITE bit in &blkdev_trig_led.mode to the value in &buf + * (interpretted as a boolean). + * + * Context: Process context. + * Return: &count on success, negative &errno on error. + */ +static ssize_t blink_on_write_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + return blkdev_trig_mode_store(led_trigger_get_drvdata(dev), + buf, count, STAT_WRITE); +} + +/** + * blink_on_flush_show() - &blink_on_flush device attribute show function. + * @dev: The LED device + * @attr: The &blink_on_flush attribute (&dev_attr_blink_on_flush) + * @buf: Output buffer + * + * Writes ``Y`` or ``N`` to &buf, depending whether the &STAT_FLUSH bit in + * &blkdev_trig_led.mode is set or cleared. + * + * Context: Process context. 
+ * Return: The number of characters written to &buf. + */ +static ssize_t blink_on_flush_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return blkdev_trig_mode_show(led_trigger_get_drvdata(dev), + buf, STAT_FLUSH); +} + +/** + * blink_on_flush_store() - &blink_on_flush device attribute store function. + * @dev: The LED device + * @attr: The &blink_on_flush attribute (&dev_attr_blink_on_flush) + * @buf: The new value (as written to the &sysfs attribute) + * @count: The number of characters in &buf + * + * Sets the &STAT_FLUSH bit in &blkdev_trig_led.mode to the value in &buf + * (interpretted as a boolean). + * + * Context: Process context. + * Return: &count on success, negative &errno on error. + */ +static ssize_t blink_on_flush_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + return blkdev_trig_mode_store(led_trigger_get_drvdata(dev), + buf, count, STAT_FLUSH); +} + +/** + * blink_on_discard_show() - &blink_on_discard device attribute show function. + * @dev: The LED device + * @attr: The &blink_on_discard attribute (&dev_attr_blink_on_discard) + * @buf: Output buffer + * + * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_DISCARD bit in + * &blkdev_trig_led.mode is set or cleared. + * + * Context: Process context. + * Return: The number of characters written to &buf. + */ +static ssize_t blink_on_discard_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return blkdev_trig_mode_show(led_trigger_get_drvdata(dev), + buf, STAT_DISCARD); +} + +/** + * blink_on_discard_store() - &blink_on_discard device attribute store function. + * @dev: The LED device + * @attr: The &blink_on_discard attribute (&dev_attr_blink_on_discard) + * @buf: The new value (as written to the &sysfs attribute) + * @count: The number of characters in &buf + * + * Sets the &STAT_DISCARD bit in &blkdev_trig_led.mode to the value in &buf + * (interpretted as a boolean). + * + * Context: Process context. + * Return: &count on success, negative &errno on error. + */ +static ssize_t blink_on_discard_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + return blkdev_trig_mode_store(led_trigger_get_drvdata(dev), + buf, count, STAT_DISCARD); +} + +/* Device attributes */ +static DEVICE_ATTR_WO(link_dev_by_path); +static DEVICE_ATTR_WO(unlink_dev_by_path); +static DEVICE_ATTR_WO(unlink_dev_by_name); +static DEVICE_ATTR_RW(blink_time); +static DEVICE_ATTR_RW(check_interval); +static DEVICE_ATTR_RW(blink_on_read); +static DEVICE_ATTR_RW(blink_on_write); +static DEVICE_ATTR_RW(blink_on_flush); +static DEVICE_ATTR_RW(blink_on_discard); + +/* Device attributes in LED directory (/sys/class/leds//...) */ +static struct attribute *blkdev_trig_attrs[] = { + &dev_attr_link_dev_by_path.attr, + &dev_attr_unlink_dev_by_path.attr, + &dev_attr_unlink_dev_by_name.attr, + &dev_attr_blink_time.attr, + &dev_attr_check_interval.attr, + &dev_attr_blink_on_read.attr, + &dev_attr_blink_on_write.attr, + &dev_attr_blink_on_flush.attr, + &dev_attr_blink_on_discard.attr, + NULL +}; + +/* Unnamed attribute group == no subdirectory */ +static const struct attribute_group blkdev_trig_attr_group = { + .attrs = blkdev_trig_attrs, +}; + +/* Attribute groups for the trigger */ +static const struct attribute_group *blkdev_trig_attr_groups[] = { + &blkdev_trig_attr_group, /* /sys/class/leds//... 
*/ + &blkdev_trig_linked_devs, /* /sys/class/leds//linked_devices/ */ + NULL +}; + +/* Trigger registration data */ +static struct led_trigger blkdev_trig_trigger = { + .name = "blkdev", + .activate = blkdev_trig_activate, + .deactivate = blkdev_trig_deactivate, + .groups = blkdev_trig_attr_groups, +}; + +/** + * blkdev_trig_init() - Block device LED trigger initialization. + * + * Registers the ``blkdev`` LED trigger. + * + * Return: &0 on success, negative &errno on failure. + */ +static int __init blkdev_trig_init(void) +{ + return led_trigger_register(&blkdev_trig_trigger); +} +module_init(blkdev_trig_init); + +/** + * blkdev_trig_exit() - Block device LED trigger module exit. + * + * Unregisters the ``blkdev`` LED trigger. + */ +static void __exit blkdev_trig_exit(void) +{ + led_trigger_unregister(&blkdev_trig_trigger); +} +module_exit(blkdev_trig_exit); + +MODULE_DESCRIPTION("Block device LED trigger"); +MODULE_AUTHOR("Ian Pilcher "); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/init.c b/drivers/net/wireless/mediatek/mt76/mt7921/init.c index d4b681d7e1d2..f2c6ec4d8e2e 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/init.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/init.c @@ -162,12 +162,12 @@ mt7921_mac_init_band(struct mt7921_dev *dev, u8 band) u8 mt7921_check_offload_capability(struct device *dev, const char *fw_wm) { - struct mt7921_fw_features *features = NULL; const struct mt76_connac2_fw_trailer *hdr; struct mt7921_realease_info *rel_info; const struct firmware *fw; int ret, i, offset = 0; const u8 *data, *end; + u8 offload_caps = 0; ret = request_firmware(&fw, fw_wm, dev); if (ret) @@ -199,7 +199,10 @@ u8 mt7921_check_offload_capability(struct device *dev, const char *fw_wm) data += sizeof(*rel_info); if (rel_info->tag == MT7921_FW_TAG_FEATURE) { + struct mt7921_fw_features *features; + features = (struct mt7921_fw_features *)data; + offload_caps = features->data; break; } @@ -209,7 +212,7 @@ u8 mt7921_check_offload_capability(struct device *dev, const char *fw_wm) out: release_firmware(fw); - return features ? features->data : 0; + return offload_caps; } EXPORT_SYMBOL_GPL(mt7921_check_offload_capability); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 64659b110973..4cad490028ab 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -57,13 +57,7 @@ * we need a lock that will allow us to sleep. This lock is a * mutex (ep->mtx). It is acquired during the event transfer loop, * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file(). - * Then we also need a global mutex to serialize eventpoll_release_file() - * and ep_free(). - * This mutex is acquired by ep_free() during the epoll file - * cleanup path and it is also acquired by eventpoll_release_file() - * if a file has been pushed inside an epoll set and it is then - * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL). - * It is also acquired when inserting an epoll fd onto another epoll + * The epmutex is acquired when inserting an epoll fd onto another epoll * fd. We do this so that we walk the epoll tree and ensure that this * insertion does not create a cycle of epoll file descriptors, which * could lead to deadlock. 
We need a global mutex to prevent two @@ -153,6 +147,13 @@ struct epitem { /* The file descriptor information this item refers to */ struct epoll_filefd ffd; + /* + * Protected by file->f_lock, true for to-be-released epitem already + * removed from the "struct file" items list; together with + * eventpoll->refcount orchestrates "struct eventpoll" disposal + */ + bool dying; + /* List containing poll wait queues */ struct eppoll_entry *pwqlist; @@ -217,6 +218,12 @@ struct eventpoll { u64 gen; struct hlist_head refs; + /* + * usage count, used together with epitem->dying to + * orchestrate the disposal of this struct + */ + refcount_t refcount; + #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ unsigned int napi_id; @@ -240,9 +247,7 @@ struct ep_pqueue { /* Maximum number of epoll watched descriptors, per user */ static long max_user_watches __read_mostly; -/* - * This mutex is used to serialize ep_free() and eventpoll_release_file(). - */ +/* Used for cycles detection */ static DEFINE_MUTEX(epmutex); static u64 loop_check_gen = 0; @@ -557,8 +562,7 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq) /* * This function unregisters poll callbacks from the associated file - * descriptor. Must be called with "mtx" held (or "epmutex" if called from - * ep_free). + * descriptor. Must be called with "mtx" held. */ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) { @@ -681,11 +685,40 @@ static void epi_rcu_free(struct rcu_head *head) kmem_cache_free(epi_cache, epi); } +static void ep_get(struct eventpoll *ep) +{ + refcount_inc(&ep->refcount); +} + +/* + * Returns true if the event poll can be disposed + */ +static bool ep_refcount_dec_and_test(struct eventpoll *ep) +{ + if (!refcount_dec_and_test(&ep->refcount)) + return false; + + WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root)); + return true; +} + +static void ep_free(struct eventpoll *ep) +{ + mutex_destroy(&ep->mtx); + free_uid(ep->user); + wakeup_source_unregister(ep->ws); + kfree(ep); +} + /* * Removes a "struct epitem" from the eventpoll RB tree and deallocates * all the associated resources. Must be called with "mtx" held. + * If the dying flag is set, do the removal only if force is true. + * This prevents ep_clear_and_put() from dropping all the ep references + * while running concurrently with eventpoll_release_file(). + * Returns true if the eventpoll can be disposed. 
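+ *
+ * A minimal sketch of the caller pattern used below (eventpoll_release_file()
+ * is one such caller); "ep" and "epi" are whatever the caller already holds:
+ *
+ *	bool dispose;
+ *
+ *	mutex_lock(&ep->mtx);
+ *	dispose = __ep_remove(ep, epi, true);
+ *	mutex_unlock(&ep->mtx);
+ *	if (dispose)
+ *		ep_free(ep);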
*/ -static int ep_remove(struct eventpoll *ep, struct epitem *epi) +static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) { struct file *file = epi->ffd.file; struct epitems_head *to_free; @@ -700,6 +733,11 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_lock); + if (epi->dying && !force) { + spin_unlock(&file->f_lock); + return false; + } + to_free = NULL; head = file->f_ep; if (head->first == &epi->fllink && !epi->fllink.next) { @@ -733,28 +771,28 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) call_rcu(&epi->rcu, epi_rcu_free); percpu_counter_dec(&ep->user->epoll_watches); + return ep_refcount_dec_and_test(ep); +} - return 0; +/* + * ep_remove variant for callers owing an additional reference to the ep + */ +static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) +{ + WARN_ON_ONCE(__ep_remove(ep, epi, false)); } -static void ep_free(struct eventpoll *ep) +static void ep_clear_and_put(struct eventpoll *ep) { struct rb_node *rbp; struct epitem *epi; + bool dispose; /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) ep_poll_safewake(ep, NULL, 0); - /* - * We need to lock this because we could be hit by - * eventpoll_release_file() while we're freeing the "struct eventpoll". - * We do not need to hold "ep->mtx" here because the epoll file - * is on the way to be removed and no one has references to it - * anymore. The only hit might come from eventpoll_release_file() but - * holding "epmutex" is sufficient here. - */ - mutex_lock(&epmutex); + mutex_lock(&ep->mtx); /* * Walks through the whole tree by unregistering poll callbacks. @@ -768,25 +806,21 @@ static void ep_free(struct eventpoll *ep) /* * Walks through the whole tree by freeing each "struct epitem". At this - * point we are sure no poll callbacks will be lingering around, and also by - * holding "epmutex" we can be sure that no file cleanup code will hit - * us during this operation. So we can avoid the lock on "ep->lock". - * We do not need to lock ep->mtx, either, we only do it to prevent - * a lockdep warning. + * point we are sure no poll callbacks will be lingering around. + * Since we still own a reference to the eventpoll struct, the loop can't + * dispose it. */ - mutex_lock(&ep->mtx); while ((rbp = rb_first_cached(&ep->rbr)) != NULL) { epi = rb_entry(rbp, struct epitem, rbn); - ep_remove(ep, epi); + ep_remove_safe(ep, epi); cond_resched(); } + + dispose = ep_refcount_dec_and_test(ep); mutex_unlock(&ep->mtx); - mutex_unlock(&epmutex); - mutex_destroy(&ep->mtx); - free_uid(ep->user); - wakeup_source_unregister(ep->ws); - kfree(ep); + if (dispose) + ep_free(ep); } static int ep_eventpoll_release(struct inode *inode, struct file *file) @@ -794,7 +828,7 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) struct eventpoll *ep = file->private_data; if (ep) - ep_free(ep); + ep_clear_and_put(ep); return 0; } @@ -906,33 +940,34 @@ void eventpoll_release_file(struct file *file) { struct eventpoll *ep; struct epitem *epi; - struct hlist_node *next; + bool dispose; /* - * We don't want to get "file->f_lock" because it is not - * necessary. It is not necessary because we're in the "struct file" - * cleanup path, and this means that no one is using this file anymore. - * So, for example, epoll_ctl() cannot hit here since if we reach this - * point, the file counter already went to zero and fget() would fail. 
- * The only hit might come from ep_free() but by holding the mutex - * will correctly serialize the operation. We do need to acquire - * "ep->mtx" after "epmutex" because ep_remove() requires it when called - * from anywhere but ep_free(). - * - * Besides, ep_remove() acquires the lock, so we can't hold it here. + * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from + * touching the epitems list before eventpoll_release_file() can access + * the ep->mtx. */ - mutex_lock(&epmutex); - if (unlikely(!file->f_ep)) { - mutex_unlock(&epmutex); - return; - } - hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) { +again: + spin_lock(&file->f_lock); + if (file->f_ep && file->f_ep->first) { + epi = hlist_entry(file->f_ep->first, struct epitem, fllink); + epi->dying = true; + spin_unlock(&file->f_lock); + + /* + * ep access is safe as we still own a reference to the ep + * struct + */ ep = epi->ep; - mutex_lock_nested(&ep->mtx, 0); - ep_remove(ep, epi); + mutex_lock(&ep->mtx); + dispose = __ep_remove(ep, epi, true); mutex_unlock(&ep->mtx); + + if (dispose) + ep_free(ep); + goto again; } - mutex_unlock(&epmutex); + spin_unlock(&file->f_lock); } static int ep_alloc(struct eventpoll **pep) @@ -955,6 +990,7 @@ static int ep_alloc(struct eventpoll **pep) ep->rbr = RB_ROOT_CACHED; ep->ovflist = EP_UNACTIVE_PTR; ep->user = user; + refcount_set(&ep->refcount, 1); *pep = ep; @@ -1223,10 +1259,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v */ list_del_init(&wait->entry); /* - * ->whead != NULL protects us from the race with ep_free() - * or ep_remove(), ep_remove_wait_queue() takes whead->lock - * held by the caller. Once we nullify it, nothing protects - * ep/epi or even wait. + * ->whead != NULL protects us from the race with + * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue() + * takes whead->lock held by the caller. Once we nullify it, + * nothing protects ep/epi or even wait. */ smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); } @@ -1496,16 +1532,22 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, if (tep) mutex_unlock(&tep->mtx); + /* + * ep_remove_safe() calls in the later error paths can't lead to + * ep_free() as the ep file itself still holds an ep reference. + */ + ep_get(ep); + /* now check if we've created too many backpaths */ if (unlikely(full_check && reverse_path_check())) { - ep_remove(ep, epi); + ep_remove_safe(ep, epi); return -EINVAL; } if (epi->event.events & EPOLLWAKEUP) { error = ep_create_wakeup_source(epi); if (error) { - ep_remove(ep, epi); + ep_remove_safe(ep, epi); return error; } } @@ -1529,7 +1571,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, * high memory pressure. 
*/ if (unlikely(!epq.epi)) { - ep_remove(ep, epi); + ep_remove_safe(ep, epi); return -ENOMEM; } @@ -1760,7 +1802,7 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, { int ret = default_wake_function(wq_entry, mode, sync, key); - list_del_init(&wq_entry->entry); + list_del_init_careful(&wq_entry->entry); return ret; } @@ -2025,7 +2067,7 @@ static int do_epoll_create(int flags) out_free_fd: put_unused_fd(fd); out_free_ep: - ep_free(ep); + ep_clear_and_put(ep); return error; } @@ -2167,10 +2209,16 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, error = -EEXIST; break; case EPOLL_CTL_DEL: - if (epi) - error = ep_remove(ep, epi); - else + if (epi) { + /* + * The eventpoll itself is still alive: the refcount + * can't go to zero here. + */ + ep_remove_safe(ep, epi); + error = 0; + } else { error = -ENOENT; + } break; case EPOLL_CTL_MOD: if (epi) { diff --git a/fs/proc/base.c b/fs/proc/base.c index 9e479d7d202b..ac9ebe972be0 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3207,6 +3207,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, mm = get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); + seq_printf(m, "zero_pages_sharing %lu\n", mm->ksm_zero_pages_sharing); mmput(mm); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0e41359d364a..87063f915363 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -776,7 +776,7 @@ struct mm_struct { #ifdef CONFIG_KSM /* * Represent how many pages of this process are involved in KSM - * merging. + * merging (not including ksm_zero_pages_sharing). */ unsigned long ksm_merging_pages; /* @@ -784,6 +784,11 @@ struct mm_struct { * including merged and not merged. */ unsigned long ksm_rmap_items; + /* + * Represent how many empty pages are merged with kernel zero + * pages when enabling KSM use_zero_pages. 
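+	 *
+	 * The value is exported next to ksm_rmap_items in /proc/<pid>/ksm_stat;
+	 * example output (numbers illustrative):
+	 *
+	 *	ksm_rmap_items 64
+	 *	zero_pages_sharing 16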
+ */ + unsigned long ksm_zero_pages_sharing; #endif #ifdef CONFIG_LRU_GEN struct { diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 5f1ae07d724b..97cda629c9e9 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -48,7 +48,7 @@ extern unsigned int pageblock_order; #else /* CONFIG_HUGETLB_PAGE */ /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */ -#define pageblock_order (MAX_ORDER-1) +#define pageblock_order PAGE_ALLOC_COSTLY_ORDER #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/kernel/kheaders.c b/kernel/kheaders.c index 8f69772af77b..42163c9e94e5 100644 --- a/kernel/kheaders.c +++ b/kernel/kheaders.c @@ -26,15 +26,15 @@ asm ( " .popsection \n" ); -extern char kernel_headers_data; -extern char kernel_headers_data_end; +extern char kernel_headers_data[]; +extern char kernel_headers_data_end[]; static ssize_t ikheaders_read(struct file *file, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) { - memcpy(buf, &kernel_headers_data + off, len); + memcpy(buf, &kernel_headers_data[off], len); return len; } @@ -48,8 +48,8 @@ static struct bin_attribute kheaders_attr __ro_after_init = { static int __init ikheaders_init(void) { - kheaders_attr.size = (&kernel_headers_data_end - - &kernel_headers_data); + kheaders_attr.size = (kernel_headers_data_end - + kernel_headers_data); return sysfs_create_bin_file(kernel_kobj, &kheaders_attr); } diff --git a/kernel/kthread.c b/kernel/kthread.c index f97fd01a2932..7e6751b29101 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1382,6 +1382,10 @@ EXPORT_SYMBOL_GPL(kthread_flush_worker); * Flush and destroy @worker. The simple flush is enough because the kthread * worker API is used only in trivial scenarios. There are no multi-step state * machines needed. + * + * Note that this function is not responsible for handling delayed work, so + * caller should be responsible for queuing or canceling all delayed work items + * before invoke this function. 
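+ *
+ * A minimal sketch of the expected teardown order (the delayed work item and
+ * the worker pointer are the caller's own):
+ *
+ *	kthread_cancel_delayed_work_sync(&my_delayed_work);
+ *	kthread_destroy_worker(worker);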
*/ void kthread_destroy_worker(struct kthread_worker *worker) { @@ -1393,6 +1397,7 @@ void kthread_destroy_worker(struct kthread_worker *worker) kthread_flush_worker(worker); kthread_stop(task); + WARN_ON(!list_empty(&worker->delayed_work_list)); WARN_ON(!list_empty(&worker->work_list)); kfree(worker); } diff --git a/kernel/padata.c b/kernel/padata.c index e007b8a4b738..7c80301ab084 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -45,7 +45,7 @@ struct padata_mt_job_state { }; static void padata_free_pd(struct parallel_data *pd); -static void __init padata_mt_helper(struct work_struct *work); +static void padata_mt_helper(struct work_struct *work); static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) { @@ -438,7 +438,7 @@ static int padata_setup_cpumasks(struct padata_instance *pinst) return err; } -static void __init padata_mt_helper(struct work_struct *w) +static void padata_mt_helper(struct work_struct *w) { struct padata_work *pw = container_of(w, struct padata_work, pw_work); struct padata_mt_job_state *ps = pw->pw_data; diff --git a/lib/string.c b/lib/string.c index 4fb566ea610f..3d55ef890106 100644 --- a/lib/string.c +++ b/lib/string.c @@ -480,13 +480,11 @@ EXPORT_SYMBOL(strcspn); */ char *strpbrk(const char *cs, const char *ct) { - const char *sc1, *sc2; + const char *sc; - for (sc1 = cs; *sc1 != '\0'; ++sc1) { - for (sc2 = ct; *sc2 != '\0'; ++sc2) { - if (*sc1 == *sc2) - return (char *)sc1; - } + for (sc = cs; *sc != '\0'; ++sc) { + if (strchr(ct, *sc)) + return (char *)sc; } return NULL; } diff --git a/mm/compaction.c b/mm/compaction.c index d0b16a5b30f7..3613d7f174dc 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -122,7 +122,6 @@ bool PageMovable(struct page *page) return false; } -EXPORT_SYMBOL(PageMovable); void __SetPageMovable(struct page *page, const struct movable_operations *mops) { @@ -1102,12 +1101,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* * Avoid isolating too much unless this block is being - * rescanned (e.g. dirty/writeback pages, parallel allocation) + * fully scanned (e.g. dirty/writeback pages, parallel allocation) * or a lock is contended. For contention, isolate quickly to * potentially remove one source of contention. */ if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX && - !cc->rescan && !cc->contended) { + !cc->finish_pageblock && !cc->contended) { ++low_pfn; break; } @@ -1172,14 +1171,14 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } /* - * Updated the cached scanner pfn once the pageblock has been scanned + * Update the cached scanner pfn once the pageblock has been scanned. * Pages will either be migrated in which case there is no point * scanning in the near future or migration failed in which case the * failure reason may persist. The block is marked for skipping if * there were no pages isolated in the block or if the block is * rescanned twice in a row. */ - if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) { + if (low_pfn == end_pfn && (!nr_isolated || cc->finish_pageblock)) { if (valid_page && !skip_updated) set_pageblock_skip(valid_page); update_cached_migrate(cc, low_pfn); @@ -1762,6 +1761,13 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) if (cc->ignore_skip_hint) return pfn; + /* + * If the pageblock should be finished then do not select a different + * pageblock. 
+ */ + if (cc->finish_pageblock) + return pfn; + /* * If the migrate_pfn is not at the start of a zone or the start * of a pageblock then assume this is a continuation of a previous @@ -1839,7 +1845,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) pfn = cc->zone->zone_start_pfn; cc->fast_search_fail = 0; found_block = true; - set_pageblock_skip(freepage); break; } } @@ -2375,19 +2380,20 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) unsigned long iteration_start_pfn = cc->migrate_pfn; /* - * Avoid multiple rescans which can happen if a page cannot be - * isolated (dirty/writeback in async mode) or if the migrated - * pages are being allocated before the pageblock is cleared. - * The first rescan will capture the entire pageblock for - * migration. If it fails, it'll be marked skip and scanning - * will proceed as normal. + * Avoid multiple rescans of the same pageblock which can + * happen if a page cannot be isolated (dirty/writeback in + * async mode) or if the migrated pages are being allocated + * before the pageblock is cleared. The first rescan will + * capture the entire pageblock for migration. If it fails, + * it'll be marked skip and scanning will proceed as normal. */ - cc->rescan = false; + cc->finish_pageblock = false; if (pageblock_start_pfn(last_migrated_pfn) == pageblock_start_pfn(iteration_start_pfn)) { - cc->rescan = true; + cc->finish_pageblock = true; } +rescan: switch (isolate_migratepages(cc)) { case ISOLATE_ABORT: ret = COMPACT_CONTENDED; @@ -2430,18 +2436,37 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) goto out; } /* - * We failed to migrate at least one page in the current - * order-aligned block, so skip the rest of it. + * If an ASYNC or SYNC_LIGHT fails to migrate a page + * within the current order-aligned block, scan the + * remainder of the pageblock. This will mark the + * pageblock "skip" to avoid rescanning in the near + * future. This will isolate more pages than necessary + * for the request but avoid loops due to + * fast_find_migrateblock revisiting blocks that were + * recently partially scanned. */ - if (cc->direct_compaction && - (cc->mode == MIGRATE_ASYNC)) { - cc->migrate_pfn = block_end_pfn( - cc->migrate_pfn - 1, cc->order); - /* Draining pcplists is useless in this case */ - last_migrated_pfn = 0; + if (cc->direct_compaction && !cc->finish_pageblock && + (cc->mode < MIGRATE_SYNC)) { + cc->finish_pageblock = true; + + /* + * Draining pcplists does not help THP if + * any page failed to migrate. Even after + * drain, the pageblock will not be free. 
+ */ + if (cc->order == COMPACTION_HPAGE_ORDER) + last_migrated_pfn = 0; + + goto rescan; } } + /* Stop if a page has been captured */ + if (capc && capc->page) { + ret = COMPACT_SUCCESS; + break; + } + check_drain: /* * Has the migration scanner moved away from the previous @@ -2460,12 +2485,6 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) last_migrated_pfn = 0; } } - - /* Stop if a page has been captured */ - if (capc && capc->page) { - ret = COMPACT_SUCCESS; - break; - } } out: diff --git a/mm/internal.h b/mm/internal.h index bcf75a8b032d..21466d0ab22f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -422,7 +422,11 @@ struct compact_control { bool proactive_compaction; /* kcompactd proactive compaction */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock contention */ - bool rescan; /* Rescanning the same pageblock */ + bool finish_pageblock; /* Scan the remainder of a pageblock. Used + * when there are potentially transient + * isolation or migration failures to + * ensure forward progress. + */ bool alloc_contig; /* alloc_contig_range allocation */ }; diff --git a/mm/ksm.c b/mm/ksm.c index 739814145818..ee60890cf9b1 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -214,6 +214,7 @@ struct ksm_rmap_item { #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ #define STABLE_FLAG 0x200 /* is listed from the stable tree */ +#define ZERO_PAGE_FLAG 0x400 /* is zero page placed by KSM */ /* The stable and unstable tree heads */ static struct rb_root one_stable_tree[1] = { RB_ROOT }; @@ -275,6 +276,9 @@ static unsigned int zero_checksum __read_mostly; /* Whether to merge empty (zeroed) pages with actual zero pages */ static bool ksm_use_zero_pages __read_mostly; +/* The number of zero pages placed by KSM use_zero_pages */ +static unsigned long ksm_zero_pages_sharing; + #ifdef CONFIG_NUMA /* Zeroed when merging across nodes is not allowed */ static unsigned int ksm_merge_across_nodes = 1; @@ -420,6 +424,11 @@ static inline bool ksm_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } +enum break_ksm_pmd_entry_return_flag { + HAVE_KSM_PAGE = 1, + HAVE_ZERO_PAGE +}; + static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { @@ -427,6 +436,7 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex spinlock_t *ptl; pte_t *pte; int ret; + bool is_zero_page = false; if (pmd_leaf(*pmd) || !pmd_present(*pmd)) return 0; @@ -434,6 +444,8 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (pte_present(*pte)) { page = vm_normal_page(walk->vma, addr, *pte); + if (!page) + is_zero_page = is_zero_pfn(pte_pfn(*pte)); } else if (!pte_none(*pte)) { swp_entry_t entry = pte_to_swp_entry(*pte); @@ -444,7 +456,14 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex if (is_migration_entry(entry)) page = pfn_swap_entry_to_page(entry); } - ret = page && PageKsm(page); + + if (page && PageKsm(page)) + ret = HAVE_KSM_PAGE; + else if (is_zero_page) + ret = HAVE_ZERO_PAGE; + else + ret = 0; + pte_unmap_unlock(pte, ptl); return ret; } @@ -466,19 +485,22 @@ static const struct mm_walk_ops break_ksm_ops = { * of the process that owns 'vma'. We also do not want to enforce * protection keys here anyway. 
*/ -static int break_ksm(struct vm_area_struct *vma, unsigned long addr) +static int break_ksm(struct vm_area_struct *vma, unsigned long addr, + bool unshare_zero_page) { vm_fault_t ret = 0; do { - int ksm_page; + int walk_result; cond_resched(); - ksm_page = walk_page_range_vma(vma, addr, addr + 1, + walk_result = walk_page_range_vma(vma, addr, addr + 1, &break_ksm_ops, NULL); - if (WARN_ON_ONCE(ksm_page < 0)) - return ksm_page; - if (!ksm_page) + if (WARN_ON_ONCE(walk_result < 0)) + return walk_result; + if (!walk_result) + return 0; + if (walk_result == HAVE_ZERO_PAGE && !unshare_zero_page) return 0; ret = handle_mm_fault(vma, addr, FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, @@ -539,7 +561,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item) mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (vma) - break_ksm(vma, addr); + break_ksm(vma, addr, false); mmap_read_unlock(mm); } @@ -764,6 +786,33 @@ static struct page *get_ksm_page(struct ksm_stable_node *stable_node, return NULL; } +/* + * Cleaning the rmap_item's ZERO_PAGE_FLAG + * This function will be called when unshare or writing on zero pages. + */ +static inline void clean_rmap_item_zero_flag(struct ksm_rmap_item *rmap_item) +{ + if (rmap_item->address & ZERO_PAGE_FLAG) { + ksm_zero_pages_sharing--; + rmap_item->mm->ksm_zero_pages_sharing--; + rmap_item->address &= PAGE_MASK; + } +} + +/* Only called when rmap_item is going to be freed */ +static inline void unshare_zero_pages(struct ksm_rmap_item *rmap_item) +{ + struct vm_area_struct *vma; + + if (rmap_item->address & ZERO_PAGE_FLAG) { + vma = vma_lookup(rmap_item->mm, rmap_item->address); + if (vma && !ksm_test_exit(rmap_item->mm)) + break_ksm(vma, rmap_item->address, true); + } + /* Put at last. */ + clean_rmap_item_zero_flag(rmap_item); +} + /* * Removing rmap_item from stable or unstable tree. * This function will clean the information from the stable/unstable tree. @@ -824,6 +873,7 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list) struct ksm_rmap_item *rmap_item = *rmap_list; *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); + unshare_zero_pages(rmap_item); free_rmap_item(rmap_item); } } @@ -853,7 +903,7 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, if (signal_pending(current)) err = -ERESTARTSYS; else - err = break_ksm(vma, addr); + err = break_ksm(vma, addr, false); } return err; } @@ -2051,6 +2101,42 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, rmap_item->mm->ksm_merging_pages++; } +static int try_to_merge_with_kernel_zero_page(struct ksm_rmap_item *rmap_item, + struct page *page) +{ + struct mm_struct *mm = rmap_item->mm; + int err = 0; + + /* + * It should not take ZERO_PAGE_FLAG because on one hand, + * get_next_rmap_item don't return zero pages' rmap_item. + * On the other hand, even if zero page was writen as + * anonymous page, rmap_item has been cleaned after + * stable_tree_search + */ + if (!WARN_ON_ONCE(rmap_item->address & ZERO_PAGE_FLAG)) { + struct vm_area_struct *vma; + + mmap_read_lock(mm); + vma = find_mergeable_vma(mm, rmap_item->address); + if (vma) { + err = try_to_merge_one_page(vma, page, + ZERO_PAGE(rmap_item->address)); + if (!err) { + rmap_item->address |= ZERO_PAGE_FLAG; + ksm_zero_pages_sharing++; + rmap_item->mm->ksm_zero_pages_sharing++; + } + } else { + /* If the vma is out of date, we do not need to continue. 
*/ + err = 0; + } + mmap_read_unlock(mm); + } + + return err; +} + /* * cmp_and_merge_page - first see if page can be merged into the stable tree; * if not, compare checksum to previous and if it's the same, see if page can @@ -2062,7 +2148,6 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, */ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) { - struct mm_struct *mm = rmap_item->mm; struct ksm_rmap_item *tree_rmap_item; struct page *tree_page = NULL; struct ksm_stable_node *stable_node; @@ -2099,6 +2184,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite } remove_rmap_item_from_tree(rmap_item); + clean_rmap_item_zero_flag(rmap_item); if (kpage) { if (PTR_ERR(kpage) == -EBUSY) @@ -2135,29 +2221,16 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite * Same checksum as an empty page. We attempt to merge it with the * appropriate zero page if the user enabled this via sysfs. */ - if (ksm_use_zero_pages && (checksum == zero_checksum)) { - struct vm_area_struct *vma; - - mmap_read_lock(mm); - vma = find_mergeable_vma(mm, rmap_item->address); - if (vma) { - err = try_to_merge_one_page(vma, page, - ZERO_PAGE(rmap_item->address)); - } else { + if (ksm_use_zero_pages) { + if (checksum == zero_checksum) /* - * If the vma is out of date, we do not need to - * continue. + * In case of failure, the page was not really empty, so we + * need to continue. Otherwise we're done. */ - err = 0; - } - mmap_read_unlock(mm); - /* - * In case of failure, the page was not really empty, so we - * need to continue. Otherwise we're done. - */ - if (!err) - return; + if (!try_to_merge_with_kernel_zero_page(rmap_item, page)) + return; } + tree_rmap_item = unstable_tree_search_insert(rmap_item, page, &tree_page); if (tree_rmap_item) { @@ -2221,23 +2294,39 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite } } -static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, - struct ksm_rmap_item **rmap_list, - unsigned long addr) +static struct ksm_rmap_item *try_to_get_old_rmap_item(unsigned long addr, + struct ksm_rmap_item **rmap_list) { - struct ksm_rmap_item *rmap_item; - while (*rmap_list) { - rmap_item = *rmap_list; + struct ksm_rmap_item *rmap_item = *rmap_list; + if ((rmap_item->address & PAGE_MASK) == addr) return rmap_item; if (rmap_item->address > addr) break; *rmap_list = rmap_item->rmap_list; + /* + * If we end up here, the VMA is MADV_UNMERGEABLE or its page + * is ineligible or discarded, e.g. MADV_DONTNEED. + */ remove_rmap_item_from_tree(rmap_item); + unshare_zero_pages(rmap_item); free_rmap_item(rmap_item); } + return NULL; +} + +static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, + struct ksm_rmap_item **rmap_list, + unsigned long addr) +{ + struct ksm_rmap_item *rmap_item; + + rmap_item = try_to_get_old_rmap_item(addr, rmap_list); + if (rmap_item) + return rmap_item; + rmap_item = alloc_rmap_item(); if (rmap_item) { /* It has already been zeroed */ @@ -2344,6 +2433,22 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) } if (is_zone_device_page(*page)) goto next_page; + if (is_zero_pfn(page_to_pfn(*page))) { + /* + * To monitor ksm zero pages which becomes non-anonymous, + * we have to save each rmap_item of zero pages by + * try_to_get_old_rmap_item() walking on + * ksm_scan.rmap_list, otherwise their rmap_items will be + * freed by the next turn of get_next_rmap_item(). 
The + * function get_next_rmap_item() will free all "skipped" + * rmap_items because it thinks its areas as UNMERGEABLE. + */ + rmap_item = try_to_get_old_rmap_item(ksm_scan.address, + ksm_scan.rmap_list); + if (rmap_item && (rmap_item->address & ZERO_PAGE_FLAG)) + ksm_scan.rmap_list = &rmap_item->rmap_list; + goto next_page; + } if (PageAnon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); @@ -3145,6 +3250,13 @@ static ssize_t pages_volatile_show(struct kobject *kobj, } KSM_ATTR_RO(pages_volatile); +static ssize_t zero_pages_sharing_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%ld\n", ksm_zero_pages_sharing); +} +KSM_ATTR_RO(zero_pages_sharing); + static ssize_t stable_node_dups_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -3200,6 +3312,7 @@ static struct attribute *ksm_attrs[] = { &pages_sharing_attr.attr, &pages_unshared_attr.attr, &pages_volatile_attr.attr, + &zero_pages_sharing_attr.attr, &full_scans_attr.attr, #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dab67b14e178..65d01266a95b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3120,6 +3120,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, { unsigned long flags; int i, allocated = 0; + struct list_head *prev_tail = list->prev; + struct page *pos, *n; spin_lock_irqsave(&zone->lock, flags); for (i = 0; i < count; ++i) { @@ -3128,9 +3130,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, if (unlikely(page == NULL)) break; - if (unlikely(check_pcp_refill(page, order))) - continue; - /* * Split buddy pages returned by expand() are received here in * physical page order. The page is added to the tail of @@ -3142,7 +3141,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages are ordered properly. */ list_add_tail(&page->pcp_list, list); - allocated++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); @@ -3156,6 +3154,22 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, */ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock_irqrestore(&zone->lock, flags); + + /* + * Pages are appended to the pcp list without checking to reduce the + * time holding the zone lock. Checking the appended pages happens right + * after the critical section while still holding the pcp lock. 
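+	 *
+	 * Illustrative walk-through (editor's sketch, not part of this
+	 * change): if four pages were appended after prev_tail and one of
+	 * them fails check_pcp_refill(), the loop below unlinks that page
+	 * from the pcp list and returns allocated == 3, so callers only
+	 * ever see pages that passed the deferred check.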
+ */ + pos = list_first_entry(prev_tail, struct page, pcp_list); + list_for_each_entry_safe_from(pos, n, list, pcp_list) { + if (unlikely(check_pcp_refill(pos, order))) { + list_del(&pos->pcp_list); + continue; + } + + allocated++; + } + return allocated; } diff --git a/mm/z3fold.c b/mm/z3fold.c index a4de0c317ac7..0cef845d397b 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1450,7 +1450,6 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) struct z3fold_header *zhdr; struct z3fold_pool *pool; - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(PageIsolated(page), page); if (test_bit(PAGE_HEADLESS, &page->private)) @@ -1490,7 +1489,6 @@ static int z3fold_page_migrate(struct page *newpage, struct page *page, struct z3fold_header *zhdr, *new_zhdr; struct z3fold_pool *pool; - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 702bc3fd687a..9d27d9b00bce 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -2056,7 +2056,6 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) * Page is locked so zspage couldn't be destroyed. For detail, look at * lock_zspage in free_zspage. */ - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(PageIsolated(page), page); zspage = get_zspage(page); @@ -2088,7 +2087,6 @@ static int zs_page_migrate(struct page *newpage, struct page *page, if (mode == MIGRATE_SYNC_NO_COPY) return -EINVAL; - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); /* The page is locked, so this pointer must remain valid */ @@ -2153,7 +2151,6 @@ static void zs_page_putback(struct page *page) { struct zspage *zspage; - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); zspage = get_zspage(page); diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o index 0edfdb40364b..ae52d3b3f063 100644 --- a/scripts/Makefile.vmlinux_o +++ b/scripts/Makefile.vmlinux_o @@ -19,7 +19,7 @@ quiet_cmd_gen_initcalls_lds = GEN $@ .tmp_initcalls.lds: $(srctree)/scripts/generate_initcall_order.pl \ vmlinux.a $(KBUILD_VMLINUX_LIBS) FORCE - $(call if_changed,gen_initcalls_lds) + +$(call if_changed,gen_initcalls_lds) targets := .tmp_initcalls.lds diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c index f7815ee24f83..e94b0a6b96df 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -1240,7 +1240,7 @@ static int cs35l41_no_acpi_dsd(struct cs35l41_hda *cs35l41, struct device *physd if (strncmp(hid, "CLSA0100", 8) == 0) { hw_cfg->bst_type = CS35L41_EXT_BOOST_NO_VSPK_SWITCH; - } else if (strncmp(hid, "CLSA0101", 8) == 0) { + } else if (strncmp(hid, "CLSA0101", 8) == 0 || strncmp(hid, "CSC3551", 7) == 0) { hw_cfg->bst_type = CS35L41_EXT_BOOST; hw_cfg->gpio1.func = CS35l41_VSPK_SWITCH; hw_cfg->gpio1.valid = true; diff --git a/tools/testing/selftests/vm/ksm_functional_tests.c b/tools/testing/selftests/vm/ksm_functional_tests.c index b11b7e5115dc..3033cd6ed3b4 100644 --- a/tools/testing/selftests/vm/ksm_functional_tests.c +++ b/tools/testing/selftests/vm/ksm_functional_tests.c @@ -24,9 +24,12 @@ #define KiB 1024u #define MiB (1024 * KiB) +#define PageSize (4 * KiB) static int ksm_fd; static int ksm_full_scans_fd; +static int ksm_zero_pages_fd; +static int ksm_use_zero_pages_fd; static int pagemap_fd; static size_t pagesize; @@ -57,6 +60,21 @@ static bool 
range_maps_duplicates(char *addr, unsigned long size) return false; } +static long ksm_get_zero_pages(void) +{ + char buf[20]; + ssize_t read_size; + unsigned long ksm_zero_pages; + + read_size = pread(ksm_zero_pages_fd, buf, sizeof(buf) - 1, 0); + if (read_size < 0) + return -errno; + buf[read_size] = 0; + ksm_zero_pages = strtol(buf, NULL, 10); + + return ksm_zero_pages; +} + static long ksm_get_full_scans(void) { char buf[10]; @@ -70,15 +88,12 @@ static long ksm_get_full_scans(void) return strtol(buf, NULL, 10); } -static int ksm_merge(void) +static int wait_two_full_scans(void) { long start_scans, end_scans; - /* Wait for two full scans such that any possible merging happened. */ start_scans = ksm_get_full_scans(); if (start_scans < 0) - return start_scans; - if (write(ksm_fd, "1", 1) != 1) return -errno; do { end_scans = ksm_get_full_scans(); @@ -89,6 +104,34 @@ static int ksm_merge(void) return 0; } +static inline int ksm_merge(void) +{ + /* Wait for two full scans such that any possible merging happened. */ + if (write(ksm_fd, "1", 1) != 1) + return -errno; + + return wait_two_full_scans(); +} + +static int unmerge_zero_page(char *start, unsigned long size) +{ + int ret; + + ret = madvise(start, size, MADV_UNMERGEABLE); + if (ret) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + return ret; + } + + /* + * Wait for two full scans such that any possible unmerging of zero + * pages happened. Why? Because the unmerge action of zero pages is not + * done in the context of madvise(), but in the context of + * unshare_zero_pages() of the ksmd thread. + */ + return wait_two_full_scans(); +} + static char *mmap_and_merge_range(char val, unsigned long size) { char *map; @@ -146,6 +189,48 @@ static void test_unmerge(void) munmap(map, size); } +static void test_unmerge_zero_pages(void) +{ + const unsigned int size = 2 * MiB; + char *map; + unsigned long pages_expected; + + ksft_print_msg("[RUN] %s\n", __func__); + + /* Confirm the interfaces*/ + if (ksm_zero_pages_fd < 0) { + ksft_test_result_skip("open(\"/sys/kernel/mm/ksm/zero_pages_sharing\") failed\n"); + return; + } + if (ksm_use_zero_pages_fd < 0) { + ksft_test_result_skip("open \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n"); + return; + } + if (write(ksm_use_zero_pages_fd, "1", 1) != 1) { + ksft_test_result_skip("write \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n"); + return; + } + + /* Mmap zero pages*/ + map = mmap_and_merge_range(0x00, size); + if (map == MAP_FAILED) + return; + + if (unmerge_zero_page(map + size / 2, size / 2)) + goto unmap; + + /* Check if zero_pages_sharing can be update correctly when unmerge */ + pages_expected = (size / 2) / PageSize; + ksft_test_result(pages_expected == ksm_get_zero_pages(), + "zero page count react to unmerge\n"); + + /* Check if ksm zero pages are really unmerged */ + ksft_test_result(!range_maps_duplicates(map + size / 2, size / 2), + "KSM zero pages were unmerged\n"); +unmap: + munmap(map, size); +} + static void test_unmerge_discarded(void) { const unsigned int size = 2 * MiB; @@ -264,8 +349,11 @@ int main(int argc, char **argv) pagemap_fd = open("/proc/self/pagemap", O_RDONLY); if (pagemap_fd < 0) ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); + ksm_zero_pages_fd = open("/sys/kernel/mm/ksm/zero_pages_sharing", O_RDONLY); + ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR); test_unmerge(); + test_unmerge_zero_pages(); test_unmerge_discarded(); #ifdef __NR_userfaultfd test_unmerge_uffd_wp(); -- 2.40.0 From 
dc7a1604c3a8761a1f980e5dd5f1b2901b775a84 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Thu, 6 Apr 2023 17:34:29 +0200 Subject: [PATCH 07/15] fs-patches Signed-off-by: Peter Jung --- Documentation/admin-guide/xfs.rst | 2 +- Documentation/filesystems/ext4/blockgroup.rst | 6 +- block/blk-merge.c | 3 +- fs/btrfs/Makefile | 6 +- fs/btrfs/backref.c | 33 +- fs/btrfs/bio.c | 557 ++++++++++++-- fs/btrfs/bio.h | 67 +- fs/btrfs/block-group.c | 261 ++++++- fs/btrfs/block-group.h | 24 +- fs/btrfs/btrfs_inode.h | 23 +- fs/btrfs/compression.c | 276 ++----- fs/btrfs/compression.h | 3 - fs/btrfs/ctree.c | 62 +- fs/btrfs/ctree.h | 15 + fs/btrfs/defrag.c | 4 +- fs/btrfs/delayed-inode.c | 2 +- fs/btrfs/delayed-ref.c | 24 +- fs/btrfs/delayed-ref.h | 2 +- fs/btrfs/discard.c | 18 +- fs/btrfs/disk-io.c | 222 +----- fs/btrfs/disk-io.h | 14 +- fs/btrfs/extent-io-tree.c | 10 +- fs/btrfs/extent-io-tree.h | 1 - fs/btrfs/extent-tree.c | 181 ++--- fs/btrfs/extent-tree.h | 81 ++ fs/btrfs/extent_io.c | 582 ++------------ fs/btrfs/extent_io.h | 36 +- fs/btrfs/file-item.c | 72 +- fs/btrfs/file-item.h | 8 +- fs/btrfs/file.c | 13 +- fs/btrfs/free-space-tree.c | 2 +- fs/btrfs/fs.h | 5 +- fs/btrfs/inode.c | 715 ++++-------------- fs/btrfs/ioctl.c | 3 +- fs/btrfs/lru_cache.c | 166 ++++ fs/btrfs/lru_cache.h | 80 ++ fs/btrfs/lzo.c | 2 +- fs/btrfs/messages.c | 30 - fs/btrfs/messages.h | 34 - fs/btrfs/ordered-data.c | 71 +- fs/btrfs/ordered-data.h | 10 +- fs/btrfs/qgroup.c | 2 +- fs/btrfs/raid56.c | 334 +++----- fs/btrfs/raid56.h | 4 +- fs/btrfs/relocation.c | 2 +- fs/btrfs/scrub.c | 2 +- fs/btrfs/send.c | 684 +++++++++-------- fs/btrfs/super.c | 3 +- fs/btrfs/sysfs.c | 54 +- fs/btrfs/tests/extent-map-tests.c | 2 +- fs/btrfs/transaction.c | 29 + fs/btrfs/transaction.h | 31 + fs/btrfs/tree-log.c | 87 +-- fs/btrfs/tree-log.h | 9 +- fs/btrfs/volumes.c | 116 +-- fs/btrfs/volumes.h | 18 - fs/btrfs/zoned.c | 146 ++-- fs/btrfs/zoned.h | 20 +- fs/ext4/ext4.h | 2 +- fs/ext4/extents.c | 2 +- fs/ext4/file.c | 34 +- fs/ext4/inode.c | 429 ++++------- fs/ext4/ioctl.c | 3 - fs/ext4/namei.c | 4 +- fs/ext4/page-io.c | 10 +- fs/ext4/super.c | 57 +- fs/ext4/sysfs.c | 4 +- fs/ext4/xattr.c | 126 +-- fs/gfs2/bmap.c | 38 +- fs/iomap/buffered-io.c | 91 ++- fs/iomap/direct-io.c | 10 +- fs/jbd2/journal.c | 9 +- fs/xfs/libxfs/xfs_alloc.c | 32 +- fs/xfs/libxfs/xfs_bmap.c | 32 +- fs/xfs/libxfs/xfs_bmap.h | 5 +- fs/xfs/libxfs/xfs_btree.c | 18 +- fs/xfs/libxfs/xfs_refcount.c | 96 ++- fs/xfs/libxfs/xfs_refcount.h | 4 +- fs/xfs/libxfs/xfs_rmap.c | 50 +- fs/xfs/libxfs/xfs_rmap.h | 6 +- fs/xfs/xfs_aops.c | 21 +- fs/xfs/xfs_bmap_item.c | 137 ++-- fs/xfs/xfs_error.c | 2 +- fs/xfs/xfs_error.h | 12 +- fs/xfs/xfs_extfree_item.c | 99 +-- fs/xfs/xfs_fsmap.c | 1 + fs/xfs/xfs_globals.c | 3 +- fs/xfs/xfs_iomap.c | 4 +- fs/xfs/xfs_qm.c | 40 +- fs/xfs/xfs_refcount_item.c | 110 ++- fs/xfs/xfs_rmap_item.c | 142 ++-- fs/xfs/xfs_sysfs.c | 12 +- fs/xfs/xfs_sysfs.h | 10 +- fs/xfs/xfs_trace.h | 15 +- include/linux/bio.h | 4 + include/linux/iomap.h | 30 +- include/linux/jbd2.h | 8 + include/trace/events/btrfs.h | 127 +++- include/trace/events/ext4.h | 7 - include/uapi/linux/btrfs.h | 12 +- 100 files changed, 3338 insertions(+), 3789 deletions(-) create mode 100644 fs/btrfs/lru_cache.c create mode 100644 fs/btrfs/lru_cache.h diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index 8de008c0c5ad..e2561416391c 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -296,7 +296,7 @@ The following sysctls are available for 
the XFS filesystem: XFS_ERRLEVEL_LOW: 1 XFS_ERRLEVEL_HIGH: 5 - fs.xfs.panic_mask (Min: 0 Default: 0 Max: 256) + fs.xfs.panic_mask (Min: 0 Default: 0 Max: 511) Causes certain error conditions to call BUG(). Value is a bitmask; OR together the tags which represent errors which should cause panics: diff --git a/Documentation/filesystems/ext4/blockgroup.rst b/Documentation/filesystems/ext4/blockgroup.rst index 46d78f860623..ed5a5cac6d40 100644 --- a/Documentation/filesystems/ext4/blockgroup.rst +++ b/Documentation/filesystems/ext4/blockgroup.rst @@ -105,9 +105,9 @@ descriptors. Instead, the superblock and a single block group descriptor block is placed at the beginning of the first, second, and last block groups in a meta-block group. A meta-block group is a collection of block groups which can be described by a single block group descriptor -block. Since the size of the block group descriptor structure is 32 -bytes, a meta-block group contains 32 block groups for filesystems with -a 1KB block size, and 128 block groups for filesystems with a 4KB +block. Since the size of the block group descriptor structure is 64 +bytes, a meta-block group contains 16 block groups for filesystems with +a 1KB block size, and 64 block groups for filesystems with a 4KB blocksize. Filesystems can either be created using this new block group descriptor layout, or existing filesystems can be resized on-line, and the field s_first_meta_bg in the superblock will indicate the first diff --git a/block/blk-merge.c b/block/blk-merge.c index 808b58129d3e..1ac782fdc55c 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -276,7 +276,7 @@ static bool bvec_split_segs(const struct queue_limits *lim, * responsible for ensuring that @bs is only destroyed after processing of the * split bio has finished. 
*/ -static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, +struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, unsigned *segs, struct bio_set *bs, unsigned max_bytes) { struct bio_vec bv, bvprv, *bvprvp = NULL; @@ -336,6 +336,7 @@ static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, bio_clear_polled(bio); return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs); } +EXPORT_SYMBOL_GPL(bio_split_rw); /** * __bio_split_to_limits - split a bio to fit the queue limits diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 555c962fdad6..90d53209755b 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -11,7 +11,8 @@ condflags := \ $(call cc-option, -Wunused-but-set-variable) \ $(call cc-option, -Wunused-const-variable) \ $(call cc-option, -Wpacked-not-aligned) \ - $(call cc-option, -Wstringop-truncation) + $(call cc-option, -Wstringop-truncation) \ + $(call cc-option, -Wmaybe-uninitialized) subdir-ccflags-y += $(condflags) # The following turn off the warnings enabled by -Wextra subdir-ccflags-y += -Wno-missing-field-initializers @@ -31,7 +32,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o + subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \ + lru_cache.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 0d7ae20e39c9..e54f0884802a 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1252,8 +1252,12 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct struct btrfs_root *root, u64 bytenr, int level, bool *is_shared) { + const struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_backref_shared_cache_entry *entry; + if (!current->journal_info) + lockdep_assert_held(&fs_info->commit_root_sem); + if (!ctx->use_path_cache) return false; @@ -1288,7 +1292,7 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct * could be a snapshot sharing this extent buffer. 
*/ if (entry->is_shared && - entry->gen != btrfs_get_last_root_drop_gen(root->fs_info)) + entry->gen != btrfs_get_last_root_drop_gen(fs_info)) return false; *is_shared = entry->is_shared; @@ -1318,9 +1322,13 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx struct btrfs_root *root, u64 bytenr, int level, bool is_shared) { + const struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_backref_shared_cache_entry *entry; u64 gen; + if (!current->journal_info) + lockdep_assert_held(&fs_info->commit_root_sem); + if (!ctx->use_path_cache) return; @@ -1336,7 +1344,7 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx ASSERT(level >= 0); if (is_shared) - gen = btrfs_get_last_root_drop_gen(root->fs_info); + gen = btrfs_get_last_root_drop_gen(fs_info); else gen = btrfs_root_last_snapshot(&root->root_item); @@ -1864,6 +1872,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, .have_delayed_delete_refs = false, }; int level; + bool leaf_cached; + bool leaf_is_shared; for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) { if (ctx->prev_extents_cache[i].bytenr == bytenr) @@ -1885,6 +1895,23 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, walk_ctx.time_seq = elem.seq; } + ctx->use_path_cache = true; + + /* + * We may have previously determined that the current leaf is shared. + * If it is, then we have a data extent that is shared due to a shared + * subtree (caused by snapshotting) and we don't need to check for data + * backrefs. If the leaf is not shared, then we must do backref walking + * to determine if the data extent is shared through reflinks. + */ + leaf_cached = lookup_backref_shared_cache(ctx, root, + ctx->curr_leaf_bytenr, 0, + &leaf_is_shared); + if (leaf_cached && leaf_is_shared) { + ret = 1; + goto out_trans; + } + walk_ctx.ignore_extent_item_pos = true; walk_ctx.trans = trans; walk_ctx.fs_info = fs_info; @@ -1893,7 +1920,6 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, /* -1 means we are in the bytenr of the data extent. */ level = -1; ULIST_ITER_INIT(&uiter); - ctx->use_path_cache = true; while (1) { const unsigned long prev_ref_count = ctx->refs.nnodes; @@ -2005,6 +2031,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, ctx->prev_extents_cache_slot = slot; } +out_trans: if (trans) { btrfs_put_tree_mod_seq(fs_info, &elem); btrfs_end_transaction(trans); diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index e775d68834ea..726592868e9c 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -14,19 +14,31 @@ #include "dev-replace.h" #include "rcu-string.h" #include "zoned.h" +#include "file-item.h" static struct bio_set btrfs_bioset; +static struct bio_set btrfs_clone_bioset; +static struct bio_set btrfs_repair_bioset; +static mempool_t btrfs_failed_bio_pool; + +struct btrfs_failed_bio { + struct btrfs_bio *bbio; + int num_copies; + atomic_t repair_count; +}; /* * Initialize a btrfs_bio structure. This skips the embedded bio itself as it * is already initialized by the block layer. 
*/ -static inline void btrfs_bio_init(struct btrfs_bio *bbio, - btrfs_bio_end_io_t end_io, void *private) +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, + btrfs_bio_end_io_t end_io, void *private) { memset(bbio, 0, offsetof(struct btrfs_bio, bio)); + bbio->inode = inode; bbio->end_io = end_io; bbio->private = private; + atomic_set(&bbio->pending_ios, 1); } /* @@ -37,32 +49,235 @@ static inline void btrfs_bio_init(struct btrfs_bio *bbio, * a mempool. */ struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + struct btrfs_inode *inode, btrfs_bio_end_io_t end_io, void *private) { struct bio *bio; bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); - btrfs_bio_init(btrfs_bio(bio), end_io, private); + btrfs_bio_init(btrfs_bio(bio), inode, end_io, private); return bio; } -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, - btrfs_bio_end_io_t end_io, void *private) +static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, + struct bio *orig, u64 map_length, + bool use_append) { + struct btrfs_bio *orig_bbio = btrfs_bio(orig); struct bio *bio; - struct btrfs_bio *bbio; - ASSERT(offset <= UINT_MAX && size <= UINT_MAX); + if (use_append) { + unsigned int nr_segs; + + bio = bio_split_rw(orig, &fs_info->limits, &nr_segs, + &btrfs_clone_bioset, map_length); + } else { + bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS, + &btrfs_clone_bioset); + } + btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio); - bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); - bbio = btrfs_bio(bio); - btrfs_bio_init(bbio, end_io, private); + btrfs_bio(bio)->file_offset = orig_bbio->file_offset; + if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED)) + orig_bbio->file_offset += map_length; - bio_trim(bio, offset >> 9, size >> 9); - bbio->iter = bio->bi_iter; + atomic_inc(&orig_bbio->pending_ios); return bio; } +static void btrfs_orig_write_end_io(struct bio *bio); + +static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, + struct btrfs_bio *orig_bbio) +{ + /* + * For writes we tolerate nr_mirrors - 1 write failures, so we can't + * just blindly propagate a write failure here. Instead increment the + * error count in the original I/O context so that it is guaranteed to + * be larger than the error tolerance. 
+ */ + if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { + struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; + struct btrfs_io_context *orig_bioc = orig_stripe->bioc; + + atomic_add(orig_bioc->max_errors, &orig_bioc->error); + } else { + orig_bbio->bio.bi_status = bbio->bio.bi_status; + } +} + +static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) +{ + if (bbio->bio.bi_pool == &btrfs_clone_bioset) { + struct btrfs_bio *orig_bbio = bbio->private; + + if (bbio->bio.bi_status) + btrfs_bbio_propagate_error(bbio, orig_bbio); + bio_put(&bbio->bio); + bbio = orig_bbio; + } + + if (atomic_dec_and_test(&bbio->pending_ios)) + bbio->end_io(bbio); +} + +static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) +{ + if (cur_mirror == fbio->num_copies) + return cur_mirror + 1 - fbio->num_copies; + return cur_mirror + 1; +} + +static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) +{ + if (cur_mirror == 1) + return fbio->num_copies; + return cur_mirror - 1; +} + +static void btrfs_repair_done(struct btrfs_failed_bio *fbio) +{ + if (atomic_dec_and_test(&fbio->repair_count)) { + btrfs_orig_bbio_end_io(fbio->bbio); + mempool_free(fbio, &btrfs_failed_bio_pool); + } +} + +static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, + struct btrfs_device *dev) +{ + struct btrfs_failed_bio *fbio = repair_bbio->private; + struct btrfs_inode *inode = repair_bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); + int mirror = repair_bbio->mirror_num; + + if (repair_bbio->bio.bi_status || + !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { + bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); + repair_bbio->bio.bi_iter = repair_bbio->saved_iter; + + mirror = next_repair_mirror(fbio, mirror); + if (mirror == fbio->bbio->mirror_num) { + btrfs_debug(fs_info, "no mirror left"); + fbio->bbio->bio.bi_status = BLK_STS_IOERR; + goto done; + } + + btrfs_submit_bio(&repair_bbio->bio, mirror); + return; + } + + do { + mirror = prev_repair_mirror(fbio, mirror); + btrfs_repair_io_failure(fs_info, btrfs_ino(inode), + repair_bbio->file_offset, fs_info->sectorsize, + repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, + bv->bv_page, bv->bv_offset, mirror); + } while (mirror != fbio->bbio->mirror_num); + +done: + btrfs_repair_done(fbio); + bio_put(&repair_bbio->bio); +} + +/* + * Try to kick off a repair read to the next available mirror for a bad sector. + * + * This primarily tries to recover good data to serve the actual read request, + * but also tries to write the good data back to the bad mirror(s) when a + * read succeeded to restore the redundancy. 
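+ *
+ * Example of the mirror rotation (editor's illustration, not part of
+ * this change): if the original read used mirror 2 of num_copies == 3,
+ * next_repair_mirror() tries mirror 3 and then mirror 1, and the repair
+ * gives up once the rotation wraps back to mirror 2. After a good read,
+ * prev_repair_mirror() walks the same ring backwards so that
+ * btrfs_repair_io_failure() rewrites each mirror tried before the good
+ * copy was found.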
+ */ +static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, + u32 bio_offset, + struct bio_vec *bv, + struct btrfs_failed_bio *fbio) +{ + struct btrfs_inode *inode = failed_bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 sectorsize = fs_info->sectorsize; + const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT); + struct btrfs_bio *repair_bbio; + struct bio *repair_bio; + int num_copies; + int mirror; + + btrfs_debug(fs_info, "repair read error: read error at %llu", + failed_bbio->file_offset + bio_offset); + + num_copies = btrfs_num_copies(fs_info, logical, sectorsize); + if (num_copies == 1) { + btrfs_debug(fs_info, "no copy to repair from"); + failed_bbio->bio.bi_status = BLK_STS_IOERR; + return fbio; + } + + if (!fbio) { + fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS); + fbio->bbio = failed_bbio; + fbio->num_copies = num_copies; + atomic_set(&fbio->repair_count, 1); + } + + atomic_inc(&fbio->repair_count); + + repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, + &btrfs_repair_bioset); + repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; + bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); + + repair_bbio = btrfs_bio(repair_bio); + btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio); + repair_bbio->file_offset = failed_bbio->file_offset + bio_offset; + + mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); + btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); + btrfs_submit_bio(repair_bio, mirror); + return fbio; +} + +static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev) +{ + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u32 sectorsize = fs_info->sectorsize; + struct bvec_iter *iter = &bbio->saved_iter; + blk_status_t status = bbio->bio.bi_status; + struct btrfs_failed_bio *fbio = NULL; + u32 offset = 0; + + /* + * Hand off repair bios to the repair code as there is no upper level + * submitter for them. + */ + if (bbio->bio.bi_pool == &btrfs_repair_bioset) { + btrfs_end_repair_bio(bbio, dev); + return; + } + + /* Clear the I/O error. A failed repair will reset it. */ + bbio->bio.bi_status = BLK_STS_OK; + + while (iter->bi_size) { + struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter); + + bv.bv_len = min(bv.bv_len, sectorsize); + if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv)) + fbio = repair_one_sector(bbio, offset, &bv, fbio); + + bio_advance_iter_single(&bbio->bio, iter, sectorsize); + offset += sectorsize; + } + + if (bbio->csum != bbio->csum_inline) + kfree(bbio->csum); + + if (fbio) + btrfs_repair_done(fbio); + else + btrfs_orig_bbio_end_io(bbio); +} + static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) { if (!dev || !dev->bdev) @@ -90,24 +305,31 @@ static void btrfs_end_bio_work(struct work_struct *work) { struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); - bbio->end_io(bbio); + /* Metadata reads are checked and repaired by the submitter. 
*/ + if (bbio->bio.bi_opf & REQ_META) + bbio->end_io(bbio); + else + btrfs_check_read_bio(bbio, bbio->bio.bi_private); } static void btrfs_simple_end_io(struct bio *bio) { - struct btrfs_fs_info *fs_info = bio->bi_private; struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_device *dev = bio->bi_private; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; btrfs_bio_counter_dec(fs_info); if (bio->bi_status) - btrfs_log_dev_io_error(bio, bbio->device); + btrfs_log_dev_io_error(bio, dev); if (bio_op(bio) == REQ_OP_READ) { INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } else { - bbio->end_io(bbio); + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + btrfs_record_physical_zoned(bbio); + btrfs_orig_bbio_end_io(bbio); } } @@ -118,7 +340,10 @@ static void btrfs_raid56_end_io(struct bio *bio) btrfs_bio_counter_dec(bioc->fs_info); bbio->mirror_num = bioc->mirror_num; - bbio->end_io(bbio); + if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META)) + btrfs_check_read_bio(bbio, NULL); + else + btrfs_orig_bbio_end_io(bbio); btrfs_put_bioc(bioc); } @@ -145,7 +370,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) else bio->bi_status = BLK_STS_OK; - bbio->end_io(bbio); + btrfs_orig_bbio_end_io(bbio); btrfs_put_bioc(bioc); } @@ -181,16 +406,10 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) */ if (bio_op(bio) == REQ_OP_ZONE_APPEND) { u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 zone_start = round_down(physical, dev->fs_info->zone_size); - if (btrfs_dev_is_sequential(dev, physical)) { - u64 zone_start = round_down(physical, - dev->fs_info->zone_size); - - bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; - } else { - bio->bi_opf &= ~REQ_OP_ZONE_APPEND; - bio->bi_opf |= REQ_OP_WRITE; - } + ASSERT(btrfs_dev_is_sequential(dev, physical)); + bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; } btrfs_debug_in_rcu(dev->fs_info, "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", @@ -224,41 +443,21 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); } -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) +static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) { - u64 logical = bio->bi_iter.bi_sector << 9; - u64 length = bio->bi_iter.bi_size; - u64 map_length = length; - struct btrfs_io_context *bioc = NULL; - struct btrfs_io_stripe smap; - int ret; - - btrfs_bio_counter_inc_blocked(fs_info); - ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num, 1); - if (ret) { - btrfs_bio_counter_dec(fs_info); - btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); - return; - } - - if (map_length < length) { - btrfs_crit(fs_info, - "mapping failed logical %llu bio len %llu len %llu", - logical, length, map_length); - BUG(); - } + /* Do not leak our private flag into the block layer. */ + bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED; if (!bioc) { - /* Single mirror read/write fast path */ + /* Single mirror read/write fast path. 
*/ btrfs_bio(bio)->mirror_num = mirror_num; - btrfs_bio(bio)->device = smap.dev; - bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - bio->bi_private = fs_info; + bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; + bio->bi_private = smap->dev; bio->bi_end_io = btrfs_simple_end_io; - btrfs_submit_dev_bio(smap.dev, bio); + btrfs_submit_dev_bio(smap->dev, bio); } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - /* Parity RAID write or read recovery */ + /* Parity RAID write or read recovery. */ bio->bi_private = bioc; bio->bi_end_io = btrfs_raid56_end_io; if (bio_op(bio) == REQ_OP_READ) @@ -266,16 +465,233 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror else raid56_parity_write(bio, bioc); } else { - /* Write to multiple mirrors */ + /* Write to multiple mirrors. */ int total_devs = bioc->num_stripes; - int dev_nr; bioc->orig_bio = bio; - for (dev_nr = 0; dev_nr < total_devs; dev_nr++) + for (int dev_nr = 0; dev_nr < total_devs; dev_nr++) btrfs_submit_mirrored_bio(bioc, dev_nr); } } +static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio) +{ + if (bbio->bio.bi_opf & REQ_META) + return btree_csum_one_bio(bbio); + return btrfs_csum_one_bio(bbio); +} + +/* + * Async submit bios are used to offload expensive checksumming onto the worker + * threads. + */ +struct async_submit_bio { + struct btrfs_bio *bbio; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe smap; + int mirror_num; + struct btrfs_work work; +}; + +/* + * In order to insert checksums into the metadata in large chunks, we wait + * until bio submission time. All the pages in the bio are checksummed and + * sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record are + * inserted into the btree. + */ +static void run_one_async_start(struct btrfs_work *work) +{ + struct async_submit_bio *async = + container_of(work, struct async_submit_bio, work); + blk_status_t ret; + + ret = btrfs_bio_csum(async->bbio); + if (ret) + async->bbio->bio.bi_status = ret; +} + +/* + * In order to insert checksums into the metadata in large chunks, we wait + * until bio submission time. All the pages in the bio are checksummed and + * sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record are + * inserted into the tree. + */ +static void run_one_async_done(struct btrfs_work *work) +{ + struct async_submit_bio *async = + container_of(work, struct async_submit_bio, work); + struct bio *bio = &async->bbio->bio; + + /* If an error occurred we just want to clean up the bio and move on. */ + if (bio->bi_status) { + btrfs_orig_bbio_end_io(async->bbio); + return; + } + + /* + * All of the bios that pass through here are from async helpers. + * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. + * This changes nothing when cgroups aren't in use. + */ + bio->bi_opf |= REQ_CGROUP_PUNT; + __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); +} + +static void run_one_async_free(struct btrfs_work *work) +{ + kfree(container_of(work, struct async_submit_bio, work)); +} + +static bool should_async_write(struct btrfs_bio *bbio) +{ + /* + * If the I/O is not issued by fsync and friends, (->sync_writers != 0), + * then try to defer the submission to a workqueue to parallelize the + * checksum calculation. 
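+	 *
+	 * (Editor's clarification, not part of this change: a non-zero
+	 * ->sync_writers means the I/O *is* being issued by fsync and
+	 * friends, which is why the check below falls back to synchronous
+	 * submission in that case.)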
+ */ + if (atomic_read(&bbio->inode->sync_writers)) + return false; + + /* + * Submit metadata writes synchronously if the checksum implementation + * is fast, or we are on a zoned device that wants I/O to be submitted + * in order. + */ + if (bbio->bio.bi_opf & REQ_META) { + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + + if (btrfs_is_zoned(fs_info)) + return false; + if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) + return false; + } + + return true; +} + +/* + * Submit bio to an async queue. + * + * Return true if the work has been succesfuly submitted, else false. + */ +static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, + struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) +{ + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + struct async_submit_bio *async; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return false; + + async->bbio = bbio; + async->bioc = bioc; + async->smap = *smap; + async->mirror_num = mirror_num; + + btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, + run_one_async_free); + if (op_is_sync(bbio->bio.bi_opf)) + btrfs_queue_work(fs_info->hipri_workers, &async->work); + else + btrfs_queue_work(fs_info->workers, &async->work); + return true; +} + +static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) +{ + struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_bio *orig_bbio = bbio; + u64 logical = bio->bi_iter.bi_sector << 9; + u64 length = bio->bi_iter.bi_size; + u64 map_length = length; + bool use_append = btrfs_use_zone_append(bbio); + struct btrfs_io_context *bioc = NULL; + struct btrfs_io_stripe smap; + blk_status_t ret; + int error; + + btrfs_bio_counter_inc_blocked(fs_info); + error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, + &bioc, &smap, &mirror_num, 1); + if (error) { + ret = errno_to_blk_status(error); + goto fail; + } + + map_length = min(map_length, length); + if (use_append) + map_length = min(map_length, fs_info->max_zone_append_size); + + if (map_length < length) { + bio = btrfs_split_bio(fs_info, bio, map_length, use_append); + bbio = btrfs_bio(bio); + } + + /* + * Save the iter for the end_io handler and preload the checksums for + * data reads. + */ + if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) { + bbio->saved_iter = bio->bi_iter; + ret = btrfs_lookup_bio_sums(bbio); + if (ret) + goto fail_put_bio; + } + + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { + if (use_append) { + bio->bi_opf &= ~REQ_OP_WRITE; + bio->bi_opf |= REQ_OP_ZONE_APPEND; + ret = btrfs_extract_ordered_extent(btrfs_bio(bio)); + if (ret) + goto fail_put_bio; + } + + /* + * Csum items for reloc roots have already been cloned at this + * point, so they are handled as part of the no-checksum case. 
+ */ + if (!(inode->flags & BTRFS_INODE_NODATASUM) && + !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && + !btrfs_is_data_reloc_root(inode->root)) { + if (should_async_write(bbio) && + btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num)) + goto done; + + ret = btrfs_bio_csum(bbio); + if (ret) + goto fail_put_bio; + } + } + + __btrfs_submit_bio(bio, bioc, &smap, mirror_num); +done: + return map_length == length; + +fail_put_bio: + if (map_length < length) + bio_put(bio); +fail: + btrfs_bio_counter_dec(fs_info); + btrfs_bio_end_io(orig_bbio, ret); + /* Do not submit another chunk */ + return true; +} + +void btrfs_submit_bio(struct bio *bio, int mirror_num) +{ + while (!btrfs_submit_chunk(bio, mirror_num)) + ; +} + /* * Submit a repair write. * @@ -283,7 +699,7 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror * RAID setup. Here we only want to write the one bad copy, so we do the * mapping ourselves and submit the bio directly. * - * The I/O is issued sychronously to block the repair read completion from + * The I/O is issued synchronously to block the repair read completion from * freeing the bio. */ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, @@ -381,10 +797,31 @@ int __init btrfs_bioset_init(void) offsetof(struct btrfs_bio, bio), BIOSET_NEED_BVECS)) return -ENOMEM; + if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_bio, bio), 0)) + goto out_free_bioset; + if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_bio, bio), + BIOSET_NEED_BVECS)) + goto out_free_clone_bioset; + if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE, + sizeof(struct btrfs_failed_bio))) + goto out_free_repair_bioset; return 0; + +out_free_repair_bioset: + bioset_exit(&btrfs_repair_bioset); +out_free_clone_bioset: + bioset_exit(&btrfs_clone_bioset); +out_free_bioset: + bioset_exit(&btrfs_bioset); + return -ENOMEM; } void __cold btrfs_bioset_exit(void) { + mempool_exit(&btrfs_failed_bio_pool); + bioset_exit(&btrfs_repair_bioset); + bioset_exit(&btrfs_clone_bioset); bioset_exit(&btrfs_bioset); } diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index b12f84b3b341..873ff85817f0 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -26,32 +26,23 @@ struct btrfs_fs_info; typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); /* - * Additional info to pass along bio. - * - * Mostly for btrfs specific features like csum and mirror_num. + * Highlevel btrfs I/O structure. It is allocated by btrfs_bio_alloc and + * passed to btrfs_submit_bio for mapping to the physical devices. */ struct btrfs_bio { - unsigned int mirror_num:7; - - /* - * Extra indicator for metadata bios. - * For some btrfs bios they use pages without a mapping, thus - * we can not rely on page->mapping->host to determine if - * it's a metadata bio. - */ - unsigned int is_metadata:1; - struct bvec_iter iter; - - /* for direct I/O */ + /* Inode and offset into it that this I/O operates on. */ + struct btrfs_inode *inode; u64 file_offset; - /* @device is for stripe IO submission. */ - struct btrfs_device *device; union { - /* For data checksum verification. */ + /* + * Data checksumming and original I/O information for internal + * use in the btrfs_submit_bio machinery. + */ struct { u8 *csum; u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; + struct bvec_iter saved_iter; }; /* For metadata parentness verification. 
*/ @@ -62,7 +53,9 @@ struct btrfs_bio { btrfs_bio_end_io_t end_io; void *private; - /* For read end I/O handling */ + /* For internal use in read end I/O handling */ + unsigned int mirror_num; + atomic_t pending_ios; struct work_struct end_io_work; /* @@ -80,11 +73,11 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio) int __init btrfs_bioset_init(void); void __cold btrfs_bioset_exit(void); +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, + btrfs_bio_end_io_t end_io, void *private); struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + struct btrfs_inode *inode, btrfs_bio_end_io_t end_io, void *private); -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, - btrfs_bio_end_io_t end_io, void *private); - static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { @@ -92,34 +85,10 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) bbio->end_io(bbio); } -static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) -{ - if (bbio->is_metadata) - return; - if (bbio->csum != bbio->csum_inline) { - kfree(bbio->csum); - bbio->csum = NULL; - } -} +/* Bio only refers to one ordered extent. */ +#define REQ_BTRFS_ONE_ORDERED REQ_DRV -/* - * Iterate through a btrfs_bio (@bbio) on a per-sector basis. - * - * bvl - struct bio_vec - * bbio - struct btrfs_bio - * iters - struct bvec_iter - * bio_offset - unsigned int - */ -#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ - for ((iter) = (bbio)->iter, (bio_offset) = 0; \ - (iter).bi_size && \ - (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ - (bio_offset) += fs_info->sectorsize, \ - bio_advance_iter_single(&(bbio)->bio, &(iter), \ - (fs_info)->sectorsize)) - -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num); +void btrfs_submit_bio(struct bio *bio, int mirror_num); int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index c70a888bf8bf..5fc670c27f86 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include "misc.h" #include "ctree.h" @@ -539,6 +540,143 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end return total_added; } +/* + * Get an arbitrary extent item index / max_index through the block group + * + * @block_group the block group to sample from + * @index: the integral step through the block group to grab from + * @max_index: the granularity of the sampling + * @key: return value parameter for the item we find + * + * Pre-conditions on indices: + * 0 <= index <= max_index + * 0 < max_index + * + * Returns: 0 on success, 1 if the search didn't yield a useful item, negative + * error code on error. 
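+ *
+ * Worked example (editor's illustration, not part of this change): the
+ * caller passes max_index == 5 and index == 0..4, so for a 1GiB block
+ * group the search starts roughly 0, 205MiB, 410MiB, 614MiB and 819MiB
+ * past block_group->start, i.e. at index * (length / max_index) bytes.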
+ */ +static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl, + struct btrfs_block_group *block_group, + int index, int max_index, + struct btrfs_key *found_key) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_root *extent_root; + u64 search_offset; + u64 search_end = block_group->start + block_group->length; + struct btrfs_path *path; + struct btrfs_key search_key; + int ret = 0; + + ASSERT(index >= 0); + ASSERT(index <= max_index); + ASSERT(max_index > 0); + lockdep_assert_held(&caching_ctl->mutex); + lockdep_assert_held_read(&fs_info->commit_root_sem); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, + BTRFS_SUPER_INFO_OFFSET)); + + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = READA_FORWARD; + + search_offset = index * div_u64(block_group->length, max_index); + search_key.objectid = block_group->start + search_offset; + search_key.type = BTRFS_EXTENT_ITEM_KEY; + search_key.offset = 0; + + btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) { + /* Success; sampled an extent item in the block group */ + if (found_key->type == BTRFS_EXTENT_ITEM_KEY && + found_key->objectid >= block_group->start && + found_key->objectid + found_key->offset <= search_end) + break; + + /* We can't possibly find a valid extent item anymore */ + if (found_key->objectid >= search_end) { + ret = 1; + break; + } + } + + lockdep_assert_held(&caching_ctl->mutex); + lockdep_assert_held_read(&fs_info->commit_root_sem); + btrfs_free_path(path); + return ret; +} + +/* + * Best effort attempt to compute a block group's size class while caching it. + * + * @block_group: the block group we are caching + * + * We cannot infer the size class while adding free space extents, because that + * logic doesn't care about contiguous file extents (it doesn't differentiate + * between a 100M extent and 100 contiguous 1M extents). So we need to read the + * file extent items. Reading all of them is quite wasteful, because usually + * only a handful are enough to give a good answer. Therefore, we just grab 5 of + * them at even steps through the block group and pick the smallest size class + * we see. Since size class is best effort, and not guaranteed in general, + * inaccuracy is acceptable. + * + * To be more explicit about why this algorithm makes sense: + * + * If we are caching in a block group from disk, then there are three major cases + * to consider: + * 1. the block group is well behaved and all extents in it are the same size + * class. + * 2. the block group is mostly one size class with rare exceptions for last + * ditch allocations + * 3. the block group was populated before size classes and can have a totally + * arbitrary mix of size classes. + * + * In case 1, looking at any extent in the block group will yield the correct + * result. For the mixed cases, taking the minimum size class seems like a good + * approximation, since gaps from frees will be usable to the size class. For + * 2., a small handful of file extents is likely to yield the right answer. For + * 3, we can either read every file extent, or admit that this is best effort + * anyway and try to stay fast. + * + * Returns: 0 on success, negative error code on error. 
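+ *
+ * Concrete example (editor's illustration, not part of this change): if
+ * the five sampled extents are 4M, 4M, 64K, 4M and 4M, min_size ends up
+ * as 64K and the block group is classified BTRFS_BG_SZ_SMALL (<= 128K),
+ * the smallest size class seen, rather than BTRFS_BG_SZ_MEDIUM.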
+ */ +static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl, + struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_key key; + int i; + u64 min_size = block_group->length; + enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE; + int ret; + + if (!btrfs_block_group_should_use_size_class(block_group)) + return 0; + + lockdep_assert_held(&caching_ctl->mutex); + lockdep_assert_held_read(&fs_info->commit_root_sem); + for (i = 0; i < 5; ++i) { + ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key); + if (ret < 0) + goto out; + if (ret > 0) + continue; + min_size = min_t(u64, min_size, key.offset); + size_class = btrfs_calc_block_group_size_class(min_size); + } + if (size_class != BTRFS_BG_SZ_NONE) { + spin_lock(&block_group->lock); + block_group->size_class = size_class; + spin_unlock(&block_group->lock); + } +out: + return ret; +} + static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group = caching_ctl->block_group; @@ -683,6 +821,7 @@ static noinline void caching_thread(struct btrfs_work *work) mutex_lock(&caching_ctl->mutex); down_read(&fs_info->commit_root_sem); + load_block_group_size_class(caching_ctl, block_group); if (btrfs_test_opt(fs_info, SPACE_CACHE)) { ret = load_free_space_cache(block_group); if (ret == 1) { @@ -1811,7 +1950,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * * @fs_info: the filesystem * @chunk_start: logical address of block group - * @bdev: physical device to resolve, can be NULL to indicate any device * @physical: physical address to map to logical addresses * @logical: return array of logical addresses which map to @physical * @naddrs: length of @logical @@ -1822,8 +1960,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * block copies. */ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - struct block_device *bdev, u64 physical, u64 **logical, - int *naddrs, int *stripe_len) + u64 physical, u64 **logical, int *naddrs, int *stripe_len) { struct extent_map *em; struct map_lookup *map; @@ -1863,9 +2000,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, data_stripe_length)) continue; - if (bdev && map->stripes[i].dev->bdev != bdev) - continue; - stripe_nr = physical - map->stripes[i].physical; stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); @@ -1922,7 +2056,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - ret = btrfs_rmap_block(fs_info, cache->start, NULL, + ret = btrfs_rmap_block(fs_info, cache->start, bytenr, &logical, &nr, &stripe_len); if (ret) return ret; @@ -3387,6 +3521,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, space_info->disk_used -= num_bytes * factor; reclaim = should_reclaim_block_group(cache, num_bytes); + spin_unlock(&cache->lock); spin_unlock(&space_info->lock); @@ -3441,32 +3576,42 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, * reservation and return -EAGAIN, otherwise this function always succeeds. 
*/ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, - u64 ram_bytes, u64 num_bytes, int delalloc) + u64 ram_bytes, u64 num_bytes, int delalloc, + bool force_wrong_size_class) { struct btrfs_space_info *space_info = cache->space_info; + enum btrfs_block_group_size_class size_class; int ret = 0; spin_lock(&space_info->lock); spin_lock(&cache->lock); if (cache->ro) { ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - trace_btrfs_space_reservation(cache->fs_info, "space_info", - space_info->flags, num_bytes, 1); - btrfs_space_info_update_bytes_may_use(cache->fs_info, - space_info, -ram_bytes); - if (delalloc) - cache->delalloc_bytes += num_bytes; + goto out; + } - /* - * Compression can use less space than we reserved, so wake - * tickets if that happens - */ - if (num_bytes < ram_bytes) - btrfs_try_granting_tickets(cache->fs_info, space_info); + if (btrfs_block_group_should_use_size_class(cache)) { + size_class = btrfs_calc_block_group_size_class(num_bytes); + ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class); + if (ret) + goto out; } + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + trace_btrfs_space_reservation(cache->fs_info, "space_info", + space_info->flags, num_bytes, 1); + btrfs_space_info_update_bytes_may_use(cache->fs_info, + space_info, -ram_bytes); + if (delalloc) + cache->delalloc_bytes += num_bytes; + + /* + * Compression can use less space than we reserved, so wake tickets if + * that happens. + */ + if (num_bytes < ram_bytes) + btrfs_try_granting_tickets(cache->fs_info, space_info); +out: spin_unlock(&cache->lock); spin_unlock(&space_info->lock); return ret; @@ -4226,3 +4371,73 @@ void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount bg->swap_extents -= amount; spin_unlock(&bg->lock); } + +enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size) +{ + if (size <= SZ_128K) + return BTRFS_BG_SZ_SMALL; + if (size <= SZ_8M) + return BTRFS_BG_SZ_MEDIUM; + return BTRFS_BG_SZ_LARGE; +} + +/* + * Handle a block group allocating an extent in a size class + * + * @bg: The block group we allocated in. + * @size_class: The size class of the allocation. + * @force_wrong_size_class: Whether we are desperate enough to allow + * mismatched size classes. + * + * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the + * case of a race that leads to the wrong size class without + * force_wrong_size_class set. + * + * find_free_extent will skip block groups with a mismatched size class until + * it really needs to avoid ENOSPC. In that case it will set + * force_wrong_size_class. However, if a block group is newly allocated and + * doesn't yet have a size class, then it is possible for two allocations of + * different sizes to race and both try to use it. The loser is caught here and + * has to retry. + */ +int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, + enum btrfs_block_group_size_class size_class, + bool force_wrong_size_class) +{ + ASSERT(size_class != BTRFS_BG_SZ_NONE); + + /* The new allocation is in the right size class, do nothing */ + if (bg->size_class == size_class) + return 0; + /* + * The new allocation is in a mismatched size class. + * This means one of two things: + * + * 1. Two tasks in find_free_extent for different size_classes raced + * and hit the same empty block_group. Make the loser try again. + * 2. 
A call to find_free_extent got desperate enough to set
+ * 'force_wrong_size_class'. Don't change the size_class, but allow the
+ * allocation.
+ */
+ if (bg->size_class != BTRFS_BG_SZ_NONE) {
+ if (force_wrong_size_class)
+ return 0;
+ return -EAGAIN;
+ }
+ /*
+ * The happy new block group case: the new allocation is the first
+ * one in the block_group so we set size_class.
+ */
+ bg->size_class = size_class;
+
+ return 0;
+}
+
+bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg)
+{
+ if (btrfs_is_zoned(bg->fs_info))
+ return false;
+ if (!btrfs_is_block_group_data_only(bg))
+ return false;
+ return true;
+}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index a02ea76fd6cf..6e4a0b429ac3 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -12,6 +12,17 @@ enum btrfs_disk_cache_state {
 BTRFS_DC_SETUP,
 };
 
+enum btrfs_block_group_size_class {
+ /* Unset */
+ BTRFS_BG_SZ_NONE,
+ /* 0 < size <= 128K */
+ BTRFS_BG_SZ_SMALL,
+ /* 128K < size <= 8M */
+ BTRFS_BG_SZ_MEDIUM,
+ /* 8M < size < BG_LENGTH */
+ BTRFS_BG_SZ_LARGE,
+};
+
 /*
 * This describes the state of the block_group for async discard. This is due
 * to the two pass nature of it where extent discarding is prioritized over
@@ -233,6 +244,7 @@ struct btrfs_block_group {
 struct list_head active_bg_list;
 struct work_struct zone_finish_work;
 struct extent_buffer *last_eb;
+ enum btrfs_block_group_size_class size_class;
 };
 
 static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
@@ -302,7 +314,8 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);
 int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr,
 u64 num_bytes, bool alloc);
 int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
- u64 ram_bytes, u64 num_bytes, int delalloc);
+ u64 ram_bytes, u64 num_bytes, int delalloc,
+ bool force_wrong_size_class);
 void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
 u64 num_bytes, int delalloc);
 int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
@@ -315,8 +328,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
 void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
- struct block_device *bdev, u64 physical, u64 **logical,
- int *naddrs, int *stripe_len);
+ u64 physical, u64 **logical, int *naddrs, int *stripe_len);
 
 static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
 {
@@ -346,4 +358,10 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *cache);
 bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg);
 void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount);
 
+enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size);
+int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
+ enum btrfs_block_group_size_class size_class,
+ bool force_wrong_size_class);
+bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg);
+
 #endif /* BTRFS_BLOCK_GROUP_H */
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 195c09e20609..87020aa58121 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -93,12 +93,6 @@ struct btrfs_inode {
 /* the io_tree does range state (DIRTY, LOCKED etc) */
 struct extent_io_tree io_tree;
 
- /* special utility tree used to record which mirrors have already been
- * tried when checksums fail for a given block
- 
*/ - struct rb_root io_failure_tree; - spinlock_t io_failure_lock; - /* * Keep track of where the inode has extent items mapped in order to * make sure the i_size adjustments are accurate @@ -411,21 +405,11 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes -void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); -void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type); -void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); -blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); -blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, - struct bio *bio, - u64 dio_file_offset); int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); -int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff); -unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, - u64 start, u64 end); +blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio); +bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, + u32 bio_offset, struct bio_vec *bv); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes, bool nowait, bool strict); @@ -532,6 +516,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before); struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, + struct btrfs_ordered_extent **ordered_extent, size_t done_before); extern const struct dentry_operations btrfs_dentry_operations; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5122ca79f7ea..f42f31f22d13 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -141,12 +141,15 @@ static int compression_decompress(int type, struct list_head *ws, static int btrfs_decompress_bio(struct compressed_bio *cb); -static void finish_compressed_bio_read(struct compressed_bio *cb) +static void end_compressed_bio_read(struct btrfs_bio *bbio) { + struct compressed_bio *cb = bbio->private; unsigned int index; struct page *page; - if (cb->status == BLK_STS_OK) + if (bbio->bio.bi_status) + cb->status = bbio->bio.bi_status; + else cb->status = errno_to_blk_status(btrfs_decompress_bio(cb)); /* Release the compressed pages */ @@ -162,54 +165,6 @@ static void finish_compressed_bio_read(struct compressed_bio *cb) /* Finally free the cb struct */ kfree(cb->compressed_pages); kfree(cb); -} - -/* - * Verify the checksums and kick off repair if needed on the uncompressed data - * before decompressing it into the original bio and freeing the uncompressed - * pages. 
- */ -static void end_compressed_bio_read(struct btrfs_bio *bbio) -{ - struct compressed_bio *cb = bbio->private; - struct inode *inode = cb->inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_inode *bi = BTRFS_I(inode); - bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) && - !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); - blk_status_t status = bbio->bio.bi_status; - struct bvec_iter iter; - struct bio_vec bv; - u32 offset; - - btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { - u64 start = bbio->file_offset + offset; - - if (!status && - (!csum || !btrfs_check_data_csum(bi, bbio, offset, - bv.bv_page, bv.bv_offset))) { - btrfs_clean_io_failure(bi, start, bv.bv_page, - bv.bv_offset); - } else { - int ret; - - refcount_inc(&cb->pending_ios); - ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, - bv.bv_page, bv.bv_offset, - true); - if (ret) { - refcount_dec(&cb->pending_ios); - status = errno_to_blk_status(ret); - } - } - } - - if (status) - cb->status = status; - - if (refcount_dec_and_test(&cb->pending_ios)) - finish_compressed_bio_read(cb); - btrfs_bio_free_csum(bbio); bio_put(&bbio->bio); } @@ -303,68 +258,12 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) static void end_compressed_bio_write(struct btrfs_bio *bbio) { struct compressed_bio *cb = bbio->private; - - if (bbio->bio.bi_status) - cb->status = bbio->bio.bi_status; - - if (refcount_dec_and_test(&cb->pending_ios)) { - struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - - btrfs_record_physical_zoned(cb->inode, cb->start, &bbio->bio); - queue_work(fs_info->compressed_write_workers, &cb->write_end_work); - } - bio_put(&bbio->bio); -} - -/* - * Allocate a compressed_bio, which will be used to read/write on-disk - * (aka, compressed) * data. - * - * @cb: The compressed_bio structure, which records all the needed - * information to bind the compressed data to the uncompressed - * page cache. - * @disk_byten: The logical bytenr where the compressed data will be read - * from or written to. - * @endio_func: The endio function to call after the IO for compressed data - * is finished. - * @next_stripe_start: Return value of logical bytenr of where next stripe starts. - * Let the caller know to only fill the bio up to the stripe - * boundary. 
- */ - - -static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr, - blk_opf_t opf, - btrfs_bio_end_io_t endio_func, - u64 *next_stripe_start) -{ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - struct btrfs_io_geometry geom; - struct extent_map *em; - struct bio *bio; - int ret; - bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, endio_func, cb); - bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + cb->status = bbio->bio.bi_status; + queue_work(fs_info->compressed_write_workers, &cb->write_end_work); - em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize); - if (IS_ERR(em)) { - bio_put(bio); - return ERR_CAST(em); - } - - if (bio_op(bio) == REQ_OP_ZONE_APPEND) - bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev); - - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom); - free_extent_map(em); - if (ret < 0) { - bio_put(bio); - return ERR_PTR(ret); - } - *next_stripe_start = disk_bytenr + geom.len; - refcount_inc(&cb->pending_ios); - return bio; + bio_put(&bbio->bio); } /* @@ -389,18 +288,13 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, struct bio *bio = NULL; struct compressed_bio *cb; u64 cur_disk_bytenr = disk_start; - u64 next_stripe_start; blk_status_t ret = BLK_STS_OK; - int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; - const bool use_append = btrfs_use_zone_append(inode, disk_start); - const enum req_op bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(len, fs_info->sectorsize)); cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); if (!cb) return BLK_STS_RESOURCE; - refcount_set(&cb->pending_ios, 1); cb->status = BLK_STS_OK; cb->inode = &inode->vfs_inode; cb->start = start; @@ -411,8 +305,16 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); cb->nr_pages = nr_pages; - if (blkcg_css) + if (blkcg_css) { kthread_associate_blkcg(blkcg_css); + write_flags |= REQ_CGROUP_PUNT; + } + + write_flags |= REQ_BTRFS_ONE_ORDERED; + bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_WRITE | write_flags, + BTRFS_I(cb->inode), end_compressed_bio_write, cb); + bio->bi_iter.bi_sector = cur_disk_bytenr >> SECTOR_SHIFT; + btrfs_bio(bio)->file_offset = start; while (cur_disk_bytenr < disk_start + compressed_len) { u64 offset = cur_disk_bytenr - disk_start; @@ -420,77 +322,30 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned int real_size; unsigned int added; struct page *page = compressed_pages[index]; - bool submit = false; - - /* Allocate new bio if submitted or not yet allocated */ - if (!bio) { - bio = alloc_compressed_bio(cb, cur_disk_bytenr, - bio_op | write_flags, end_compressed_bio_write, - &next_stripe_start); - if (IS_ERR(bio)) { - ret = errno_to_blk_status(PTR_ERR(bio)); - break; - } - if (blkcg_css) - bio->bi_opf |= REQ_CGROUP_PUNT; - } - /* - * We should never reach next_stripe_start start as we will - * submit comp_bio when reach the boundary immediately. 
- */ - ASSERT(cur_disk_bytenr != next_stripe_start); /* * We have various limits on the real read size: - * - stripe boundary * - page boundary * - compressed length boundary */ - real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr); - real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); real_size = min_t(u64, real_size, compressed_len - offset); ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); - if (use_append) - added = bio_add_zone_append_page(bio, page, real_size, - offset_in_page(offset)); - else - added = bio_add_page(bio, page, real_size, - offset_in_page(offset)); - /* Reached zoned boundary */ - if (added == 0) - submit = true; - + added = bio_add_page(bio, page, real_size, offset_in_page(offset)); + /* + * Maximum compressed extent is smaller than bio size limit, + * thus bio_add_page() should always success. + */ + ASSERT(added == real_size); cur_disk_bytenr += added; - /* Reached stripe boundary */ - if (cur_disk_bytenr == next_stripe_start) - submit = true; - - /* Finished the range */ - if (cur_disk_bytenr == disk_start + compressed_len) - submit = true; - - if (submit) { - if (!skip_sum) { - ret = btrfs_csum_one_bio(inode, bio, start, true); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - break; - } - } - - ASSERT(bio->bi_iter.bi_size); - btrfs_submit_bio(fs_info, bio, 0); - bio = NULL; - } - cond_resched(); } + /* Finished the range. */ + ASSERT(bio->bi_iter.bi_size); + btrfs_submit_bio(bio, 0); if (blkcg_css) kthread_associate_blkcg(NULL); - - if (refcount_dec_and_test(&cb->pending_ios)) - finish_compressed_bio_write(cb); return ret; } @@ -667,10 +522,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct extent_map_tree *em_tree; struct compressed_bio *cb; unsigned int compressed_len; - struct bio *comp_bio = NULL; + struct bio *comp_bio; const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 cur_disk_byte = disk_bytenr; - u64 next_stripe_start; u64 file_offset; u64 em_len; u64 em_start; @@ -703,7 +557,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, goto out; } - refcount_set(&cb->pending_ios, 1); cb->status = BLK_STS_OK; cb->inode = inode; @@ -737,37 +590,23 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; + comp_bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, BTRFS_I(cb->inode), + end_compressed_bio_read, cb); + comp_bio->bi_iter.bi_sector = (cur_disk_byte >> SECTOR_SHIFT); + while (cur_disk_byte < disk_bytenr + compressed_len) { u64 offset = cur_disk_byte - disk_bytenr; unsigned int index = offset >> PAGE_SHIFT; unsigned int real_size; unsigned int added; struct page *page = cb->compressed_pages[index]; - bool submit = false; - - /* Allocate new bio if submitted or not yet allocated */ - if (!comp_bio) { - comp_bio = alloc_compressed_bio(cb, cur_disk_byte, - REQ_OP_READ, end_compressed_bio_read, - &next_stripe_start); - if (IS_ERR(comp_bio)) { - cb->status = errno_to_blk_status(PTR_ERR(comp_bio)); - break; - } - } - /* - * We should never reach next_stripe_start start as we will - * submit comp_bio when reach the boundary immediately. 
- */ - ASSERT(cur_disk_byte != next_stripe_start); + /* * We have various limit on the real read size: - * - stripe boundary * - page boundary * - compressed length boundary */ - real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte); - real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); real_size = min_t(u64, real_size, compressed_len - offset); ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); @@ -778,45 +617,20 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, */ ASSERT(added == real_size); cur_disk_byte += added; - - /* Reached stripe boundary, need to submit */ - if (cur_disk_byte == next_stripe_start) - submit = true; - - /* Has finished the range, need to submit */ - if (cur_disk_byte == disk_bytenr + compressed_len) - submit = true; - - if (submit) { - /* Save the original iter for read repair */ - if (bio_op(comp_bio) == REQ_OP_READ) - btrfs_bio(comp_bio)->iter = comp_bio->bi_iter; - - /* - * Save the initial offset of this chunk, as there - * is no direct correlation between compressed pages and - * the original file offset. The field is only used for - * priting error messages. - */ - btrfs_bio(comp_bio)->file_offset = file_offset; - - ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL); - if (ret) { - btrfs_bio_end_io(btrfs_bio(comp_bio), ret); - break; - } - - ASSERT(comp_bio->bi_iter.bi_size); - btrfs_submit_bio(fs_info, comp_bio, mirror_num); - comp_bio = NULL; - } } if (memstall) psi_memstall_leave(&pflags); - if (refcount_dec_and_test(&cb->pending_ios)) - finish_compressed_bio_read(cb); + /* + * Stash the initial offset of this chunk, as there is no direct + * correlation between compressed pages and the original file offset. + * The field is only used for printing error messages anyway. + */ + btrfs_bio(comp_bio)->file_offset = file_offset; + + ASSERT(comp_bio->bi_iter.bi_size); + btrfs_submit_bio(comp_bio, mirror_num); return; fail: @@ -1609,7 +1423,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, index_end = end >> PAGE_SHIFT; /* Don't miss unaligned end */ - if (!IS_ALIGNED(end, PAGE_SIZE)) + if (!PAGE_ALIGNED(end)) index_end++; curr_sample_pos = 0; @@ -1642,7 +1456,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, * * For now is's a naive and optimistic 'return true', we'll extend the logic to * quickly (compared to direct compression) detect data characteristics - * (compressible/uncompressible) to avoid wasting CPU time on uncompressible + * (compressible/incompressible) to avoid wasting CPU time on incompressible * data. 
* * The following types of analysis can be performed: diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 6209d40a1e08..a5e3377db9ad 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -31,9 +31,6 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); #define BTRFS_ZLIB_DEFAULT_LEVEL 3 struct compressed_bio { - /* Number of outstanding bios */ - refcount_t pending_ios; - /* Number of compressed pages in the array */ unsigned int nr_pages; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4754c9101a4c..a5b6bb54545f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -484,7 +484,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (ret) return ret; } - btrfs_clean_tree_block(buf); + btrfs_clear_buffer_dirty(trans, buf); *last_ref = 1; } return 0; @@ -853,8 +853,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, /* * Search for a key in the given extent_buffer. * - * The lower boundary for the search is specified by the slot number @low. Use a - * value of 0 to search over the whole extent buffer. + * The lower boundary for the search is specified by the slot number @first_slot. + * Use a value of 0 to search over the whole extent buffer. * * The slot in the extent buffer is returned via @slot. If the key exists in the * extent buffer, then @slot will point to the slot where the key is, otherwise @@ -863,18 +863,23 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, * Slot may point to the total number of items (i.e. one position beyond the last * key) if the key is bigger than the last key in the extent buffer. */ -static noinline int generic_bin_search(struct extent_buffer *eb, int low, - const struct btrfs_key *key, int *slot) +int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, + const struct btrfs_key *key, int *slot) { unsigned long p; int item_size; - int high = btrfs_header_nritems(eb); + /* + * Use unsigned types for the low and high slots, so that we get a more + * efficient division in the search loop below. + */ + u32 low = first_slot; + u32 high = btrfs_header_nritems(eb); int ret; const int key_size = sizeof(struct btrfs_disk_key); - if (low > high) { + if (unlikely(low > high)) { btrfs_err(eb->fs_info, - "%s: low (%d) > high (%d) eb %llu owner %llu level %d", + "%s: low (%u) > high (%u) eb %llu owner %llu level %d", __func__, low, high, eb->start, btrfs_header_owner(eb), btrfs_header_level(eb)); return -EINVAL; @@ -925,16 +930,6 @@ static noinline int generic_bin_search(struct extent_buffer *eb, int low, return 1; } -/* - * Simple binary search on an extent buffer. Works for both leaves and nodes, and - * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). 
- */ -int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, - int *slot) -{ - return generic_bin_search(eb, 0, key, slot); -} - static void root_add_used(struct btrfs_root *root, u32 size) { spin_lock(&root->accounting_lock); @@ -1054,7 +1049,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, path->locks[level] = 0; path->nodes[level] = NULL; - btrfs_clean_tree_block(mid); + btrfs_clear_buffer_dirty(trans, mid); btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); @@ -1115,7 +1110,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret < 0 && wret != -ENOSPC) ret = wret; if (btrfs_header_nritems(right) == 0) { - btrfs_clean_tree_block(right); + btrfs_clear_buffer_dirty(trans, right); btrfs_tree_unlock(right); del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); @@ -1161,7 +1156,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BUG_ON(wret == 1); } if (btrfs_header_nritems(mid) == 0) { - btrfs_clean_tree_block(mid); + btrfs_clear_buffer_dirty(trans, mid); btrfs_tree_unlock(mid); del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); @@ -1869,7 +1864,7 @@ static inline int search_for_key_slot(struct extent_buffer *eb, return 0; } - return generic_bin_search(eb, search_low_slot, key, slot); + return btrfs_generic_bin_search(eb, search_low_slot, key, slot); } static int search_leaf(struct btrfs_trans_handle *trans, @@ -3041,7 +3036,8 @@ noinline int btrfs_leaf_free_space(struct extent_buffer *leaf) * min slot controls the lowest index we're willing to push to the * right. We'll push up to and including min_slot, but no lower */ -static noinline int __push_leaf_right(struct btrfs_path *path, +static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, + struct btrfs_path *path, int data_size, int empty, struct extent_buffer *right, int free_space, u32 left_nritems, @@ -3139,7 +3135,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, if (left_nritems) btrfs_mark_buffer_dirty(left); else - btrfs_clean_tree_block(left); + btrfs_clear_buffer_dirty(trans, left); btrfs_mark_buffer_dirty(right); @@ -3151,7 +3147,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, if (path->slots[0] >= left_nritems) { path->slots[0] -= left_nritems; if (btrfs_header_nritems(path->nodes[0]) == 0) - btrfs_clean_tree_block(path->nodes[0]); + btrfs_clear_buffer_dirty(trans, path->nodes[0]); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3243,8 +3239,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root return 0; } - return __push_leaf_right(path, min_data_size, empty, - right, free_space, left_nritems, min_slot); + return __push_leaf_right(trans, path, min_data_size, empty, right, + free_space, left_nritems, min_slot); out_unlock: btrfs_tree_unlock(right); free_extent_buffer(right); @@ -3259,7 +3255,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root * item at 'max_slot' won't be touched. 
Use (u32)-1 to make us do all the * items */ -static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, +static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, + struct btrfs_path *path, int data_size, int empty, struct extent_buffer *left, int free_space, u32 right_nritems, u32 max_slot) @@ -3363,7 +3360,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, if (right_nritems) btrfs_mark_buffer_dirty(right); else - btrfs_clean_tree_block(right); + btrfs_clear_buffer_dirty(trans, right); btrfs_item_key(right, &disk_key, 0); fixup_low_keys(path, &disk_key, 1); @@ -3449,9 +3446,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root ret = -EUCLEAN; goto out; } - return __push_leaf_left(path, min_data_size, - empty, left, free_space, right_nritems, - max_slot); + return __push_leaf_left(trans, path, min_data_size, empty, left, + free_space, right_nritems, max_slot); out: btrfs_tree_unlock(left); free_extent_buffer(left); @@ -4400,7 +4396,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (leaf == root->node) { btrfs_set_header_level(leaf, 0); } else { - btrfs_clean_tree_block(leaf); + btrfs_clear_buffer_dirty(trans, leaf); btrfs_del_leaf(trans, root, path, leaf); } } else { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6965703a81b6..97897107fab5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -507,6 +507,21 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); /* ctree.c */ int __init btrfs_ctree_init(void); void __cold btrfs_ctree_exit(void); + +int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, + const struct btrfs_key *key, int *slot); + +/* + * Simple binary search on an extent buffer. Works for both leaves and nodes, and + * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). + */ +static inline int btrfs_bin_search(struct extent_buffer *eb, + const struct btrfs_key *key, + int *slot) +{ + return btrfs_generic_bin_search(eb, 0, key, slot); +} + int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index d81b764a7644..8065341d831a 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -765,7 +765,7 @@ static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t i break; unlock_page(page); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); lock_page(page); /* @@ -999,7 +999,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, } #define CLUSTER_SIZE (SZ_256K) -static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); +static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); /* * Defrag one contiguous target range. diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0095c6e4c3d1..6b457b010cbc 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1048,7 +1048,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, * so there is only one iref. The case that several irefs are * in the same item doesn't exist. 
*/ - btrfs_del_item(trans, root, path); + ret = btrfs_del_item(trans, root, path); out: btrfs_release_delayed_iref(node); btrfs_release_path(path); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 573ebab886e2..886ffb232eac 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -437,8 +437,7 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, return 0; } -static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, +static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { @@ -452,8 +451,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, atomic_dec(&delayed_refs->num_entries); } -static bool merge_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, +static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref, u64 seq) @@ -482,10 +480,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, mod = -next->ref_mod; } - drop_delayed_ref(trans, delayed_refs, head, next); + drop_delayed_ref(delayed_refs, head, next); ref->ref_mod += mod; if (ref->ref_mod == 0) { - drop_delayed_ref(trans, delayed_refs, head, ref); + drop_delayed_ref(delayed_refs, head, ref); done = true; } else { /* @@ -499,11 +497,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, return done; } -void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, +void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { - struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_node *ref; struct rb_node *node; u64 seq = 0; @@ -524,7 +521,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); if (seq && ref->seq >= seq) continue; - if (merge_ref(trans, delayed_refs, head, ref, seq)) + if (merge_ref(delayed_refs, head, ref, seq)) goto again; } } @@ -601,8 +598,7 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, * Return 0 for insert. * Return >0 for merge. 
*/ -static int insert_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *root, +static int insert_delayed_ref(struct btrfs_delayed_ref_root *root, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *ref) { @@ -641,7 +637,7 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans, /* remove existing tail if its ref_mod is zero */ if (exist->ref_mod == 0) - drop_delayed_ref(trans, root, href, exist); + drop_delayed_ref(root, href, exist); spin_unlock(&href->lock); return ret; inserted: @@ -978,7 +974,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); + ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* @@ -1070,7 +1066,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); + ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index d6304b690ec4..2eb34abf700f 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -357,7 +357,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); -void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, +void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 317aeff6c1da..c48abc817ed2 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -56,11 +56,9 @@ #define BTRFS_DISCARD_DELAY (120ULL * NSEC_PER_SEC) #define BTRFS_DISCARD_UNUSED_DELAY (10ULL * NSEC_PER_SEC) -/* Target completion latency of discarding all discardable extents */ -#define BTRFS_DISCARD_TARGET_MSEC (6 * 60 * 60UL * MSEC_PER_SEC) #define BTRFS_DISCARD_MIN_DELAY_MSEC (1UL) #define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL) -#define BTRFS_DISCARD_MAX_IOPS (10U) +#define BTRFS_DISCARD_MAX_IOPS (1000U) /* Monotonically decreasing minimum length filters after index 0 */ static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { @@ -577,6 +575,7 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) s32 discardable_extents; s64 discardable_bytes; u32 iops_limit; + unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC; unsigned long delay; discardable_extents = atomic_read(&discard_ctl->discardable_extents); @@ -607,13 +606,16 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) } iops_limit = READ_ONCE(discard_ctl->iops_limit); - if (iops_limit) + + if (iops_limit) { delay = MSEC_PER_SEC / iops_limit; - else - delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents; + } else { + /* unset iops_limit means go as fast as possible, so allow a delay of 0 */ + delay = 0; + min_delay = 0; + } - delay = clamp(delay, BTRFS_DISCARD_MIN_DELAY_MSEC, - BTRFS_DISCARD_MAX_DELAY_MSEC); + delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC); discard_ctl->delay_ms = delay; spin_unlock(&discard_ctl->lock); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 174d196d6960..9e1596bb208d 100644 --- 
a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -78,23 +78,6 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) crypto_free_shash(fs_info->csum_shash); } -/* - * async submit bios are used to offload expensive checksumming - * onto the worker threads. They checksum file and metadata bios - * just before they are sent down the IO stack. - */ -struct async_submit_bio { - struct btrfs_inode *inode; - struct bio *bio; - enum btrfs_wq_submit_cmd submit_cmd; - int mirror_num; - - /* Optional parameter for used by direct io */ - u64 dio_file_offset; - struct btrfs_work work; - blk_status_t status; -}; - /* * Compute the csum of a btree block and store the result to provided buffer. */ @@ -455,6 +438,22 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec return csum_one_extent_buffer(eb); } +blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) +{ + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + struct bvec_iter iter; + struct bio_vec bv; + int ret = 0; + + bio_for_each_segment(bv, &bbio->bio, iter) { + ret = csum_dirty_buffer(fs_info, &bv); + if (ret) + break; + } + + return errno_to_blk_status(ret); +} + static int check_tree_block_fsid(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; @@ -700,172 +699,6 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, return ret; } -static void run_one_async_start(struct btrfs_work *work) -{ - struct async_submit_bio *async; - blk_status_t ret; - - async = container_of(work, struct async_submit_bio, work); - switch (async->submit_cmd) { - case WQ_SUBMIT_METADATA: - ret = btree_submit_bio_start(async->bio); - break; - case WQ_SUBMIT_DATA: - ret = btrfs_submit_bio_start(async->inode, async->bio); - break; - case WQ_SUBMIT_DATA_DIO: - ret = btrfs_submit_bio_start_direct_io(async->inode, - async->bio, async->dio_file_offset); - break; - } - if (ret) - async->status = ret; -} - -/* - * In order to insert checksums into the metadata in large chunks, we wait - * until bio submission time. All the pages in the bio are checksummed and - * sums are attached onto the ordered extent record. - * - * At IO completion time the csums attached on the ordered extent record are - * inserted into the tree. - */ -static void run_one_async_done(struct btrfs_work *work) -{ - struct async_submit_bio *async = - container_of(work, struct async_submit_bio, work); - struct btrfs_inode *inode = async->inode; - struct btrfs_bio *bbio = btrfs_bio(async->bio); - - /* If an error occurred we just want to clean up the bio and move on */ - if (async->status) { - btrfs_bio_end_io(bbio, async->status); - return; - } - - /* - * All of the bios that pass through here are from async helpers. - * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. - * This changes nothing when cgroups aren't in use. - */ - async->bio->bi_opf |= REQ_CGROUP_PUNT; - btrfs_submit_bio(inode->root->fs_info, async->bio, async->mirror_num); -} - -static void run_one_async_free(struct btrfs_work *work) -{ - struct async_submit_bio *async; - - async = container_of(work, struct async_submit_bio, work); - kfree(async); -} - -/* - * Submit bio to an async queue. 
- * - * Retrun: - * - true if the work has been succesfuly submitted - * - false in case of error - */ -bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct async_submit_bio *async; - - async = kmalloc(sizeof(*async), GFP_NOFS); - if (!async) - return false; - - async->inode = inode; - async->bio = bio; - async->mirror_num = mirror_num; - async->submit_cmd = cmd; - - btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, - run_one_async_free); - - async->dio_file_offset = dio_file_offset; - - async->status = 0; - - if (op_is_sync(bio->bi_opf)) - btrfs_queue_work(fs_info->hipri_workers, &async->work); - else - btrfs_queue_work(fs_info->workers, &async->work); - return true; -} - -static blk_status_t btree_csum_one_bio(struct bio *bio) -{ - struct bio_vec *bvec; - struct btrfs_root *root; - int ret = 0; - struct bvec_iter_all iter_all; - - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { - root = BTRFS_I(bvec->bv_page->mapping->host)->root; - ret = csum_dirty_buffer(root->fs_info, bvec); - if (ret) - break; - } - - return errno_to_blk_status(ret); -} - -blk_status_t btree_submit_bio_start(struct bio *bio) -{ - /* - * when we're called for a write, we're already in the async - * submission context. Just jump into btrfs_submit_bio. - */ - return btree_csum_one_bio(bio); -} - -static bool should_async_write(struct btrfs_fs_info *fs_info, - struct btrfs_inode *bi) -{ - if (btrfs_is_zoned(fs_info)) - return false; - if (atomic_read(&bi->sync_writers)) - return false; - if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) - return false; - return true; -} - -void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_bio *bbio = btrfs_bio(bio); - blk_status_t ret; - - bio->bi_opf |= REQ_META; - bbio->is_metadata = 1; - - if (btrfs_op(bio) != BTRFS_MAP_WRITE) { - btrfs_submit_bio(fs_info, bio, mirror_num); - return; - } - - /* - * Kthread helpers are used to submit writes so that checksumming can - * happen in parallel across all CPUs. 
- */ - if (should_async_write(fs_info, inode) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA)) - return; - - ret = btree_csum_one_bio(bio); - if (ret) { - btrfs_bio_end_io(bbio, ret); - return; - } - - btrfs_submit_bio(fs_info, bio, mirror_num); -} - #ifdef CONFIG_MIGRATION static int btree_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) @@ -1035,22 +868,6 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, } -void btrfs_clean_tree_block(struct extent_buffer *buf) -{ - struct btrfs_fs_info *fs_info = buf->fs_info; - if (btrfs_header_generation(buf) == - fs_info->running_transaction->transid) { - btrfs_assert_tree_write_locked(buf); - - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { - percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, - -buf->len, - fs_info->dirty_metadata_batch); - clear_extent_buffer_dirty(buf); - } - } -} - static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { @@ -5176,11 +4993,12 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, start += fs_info->nodesize; if (!eb) continue; + + btrfs_tree_lock(eb); wait_on_extent_buffer_writeback(eb); + btrfs_clear_buffer_dirty(NULL, eb); + btrfs_tree_unlock(eb); - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, - &eb->bflags)) - clear_extent_buffer_dirty(eb); free_extent_buffer_stale(eb); } } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index f2f295eb6103..4d5772330110 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -39,7 +39,8 @@ struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, int level); -void btrfs_clean_tree_block(struct extent_buffer *buf); +void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *buf); void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, @@ -86,7 +87,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror); -void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif @@ -114,15 +114,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int btrfs_read_extent_buffer(struct extent_buffer *buf, struct btrfs_tree_parent_check *check); -enum btrfs_wq_submit_cmd { - WQ_SUBMIT_METADATA, - WQ_SUBMIT_DATA, - WQ_SUBMIT_DATA_DIO, -}; - -bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd); -blk_status_t btree_submit_bio_start(struct bio *bio); +blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 3c7766dfaa69..29a225836e28 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -972,8 +972,8 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, { struct extent_state *state; struct extent_state *prealloc = NULL; - struct rb_node **p; - struct 
rb_node *parent; + struct rb_node **p = NULL; + struct rb_node *parent = NULL; int err = 0; u64 last_start; u64 last_end; @@ -1218,8 +1218,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, { struct extent_state *state; struct extent_state *prealloc = NULL; - struct rb_node **p; - struct rb_node *parent; + struct rb_node **p = NULL; + struct rb_node *parent = NULL; int err = 0; u64 last_start; u64 last_end; @@ -1625,7 +1625,7 @@ u64 count_range_bits(struct extent_io_tree *tree, } /* - * Searche a range in the state tree for a given mask. If 'filled' == 1, this + * Search a range in the state tree for a given mask. If 'filled' == 1, this * returns 1 only if every extent in the tree has the bits set. Otherwise, 1 * is returned if any bit in the range is found set. */ diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index e3eeec380844..21766e49ec02 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -6,7 +6,6 @@ #include "misc.h" struct extent_changeset; -struct io_failure_record; /* Bits for the extent state */ enum { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 72ba13b027a9..824c657f59e8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -16,7 +16,8 @@ #include #include #include -#include "misc.h" +#include "ctree.h" +#include "extent-tree.h" #include "tree-log.h" #include "disk-io.h" #include "print-tree.h" @@ -31,14 +32,12 @@ #include "space-info.h" #include "block-rsv.h" #include "delalloc-space.h" -#include "block-group.h" #include "discard.h" #include "rcu-string.h" #include "zoned.h" #include "dev-replace.h" #include "fs.h" #include "accessors.h" -#include "extent-tree.h" #include "root-tree.h" #include "file-item.h" #include "orphan.h" @@ -1966,7 +1965,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, cond_resched(); spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); + btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); } return 0; @@ -2013,7 +2012,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, * insert_inline_extent_backref()). */ spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); + btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &actual_count); @@ -3385,7 +3384,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) enum btrfs_loop_type { LOOP_CACHING_NOWAIT, LOOP_CACHING_WAIT, + LOOP_UNSET_SIZE_CLASS, LOOP_ALLOC_CHUNK, + LOOP_WRONG_SIZE_CLASS, LOOP_NO_EMPTY_SIZE, }; @@ -3453,81 +3454,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache, btrfs_put_block_group(cache); } -enum btrfs_extent_allocation_policy { - BTRFS_EXTENT_ALLOC_CLUSTERED, - BTRFS_EXTENT_ALLOC_ZONED, -}; - -/* - * Structure used internally for find_free_extent() function. Wraps needed - * parameters. 
- */ -struct find_free_extent_ctl { - /* Basic allocation info */ - u64 ram_bytes; - u64 num_bytes; - u64 min_alloc_size; - u64 empty_size; - u64 flags; - int delalloc; - - /* Where to start the search inside the bg */ - u64 search_start; - - /* For clustered allocation */ - u64 empty_cluster; - struct btrfs_free_cluster *last_ptr; - bool use_cluster; - - bool have_caching_bg; - bool orig_have_caching_bg; - - /* Allocation is called for tree-log */ - bool for_treelog; - - /* Allocation is called for data relocation */ - bool for_data_reloc; - - /* RAID index, converted from flags */ - int index; - - /* - * Current loop number, check find_free_extent_update_loop() for details - */ - int loop; - - /* - * Whether we're refilling a cluster, if true we need to re-search - * current block group but don't try to refill the cluster again. - */ - bool retry_clustered; - - /* - * Whether we're updating free space cache, if true we need to re-search - * current block group but don't try updating free space cache again. - */ - bool retry_unclustered; - - /* If current block group is cached */ - int cached; - - /* Max contiguous hole found */ - u64 max_extent_size; - - /* Total free space from free space cache, not always contiguous */ - u64 total_free_space; - - /* Found result */ - u64 found_offset; - - /* Hint where to start looking for an empty space */ - u64 hint_byte; - - /* Allocation policy */ - enum btrfs_extent_allocation_policy policy; -}; - - /* * Helper function for find_free_extent(). * @@ -3559,8 +3485,7 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, if (offset) { /* We have a block, we're done */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(cluster_bg, - ffe_ctl->search_start, ffe_ctl->num_bytes); + trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl); *cluster_bg_ret = cluster_bg; ffe_ctl->found_offset = offset; return 0; @@ -3610,10 +3535,8 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, if (offset) { /* We found one, proceed */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(bg, - ffe_ctl->search_start, - ffe_ctl->num_bytes); ffe_ctl->found_offset = offset; + trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); return 0; } } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && @@ -4028,24 +3951,6 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info, } } -static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) -{ - switch (ffe_ctl->policy) { - case BTRFS_EXTENT_ALLOC_CLUSTERED: - /* - * If we can't allocate a new chunk we've already looped through - * at least once, move on to the NO_EMPTY_SIZE case. - */ - ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; - return 0; - case BTRFS_EXTENT_ALLOC_ZONED: - /* Give up here */ - return -ENOSPC; - default: - BUG(); - } -} - /* * Return >0 means caller needs to re-search for free extent * Return 0 means we have the needed free extent. 
@@ -4079,31 +3984,28 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking * caching kthreads as we move along * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching + * LOOP_UNSET_SIZE_CLASS, allow unset size class * LOOP_ALLOC_CHUNK, force a chunk allocation and try again * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try * again */ if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { ffe_ctl->index = 0; - if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { - /* - * We want to skip the LOOP_CACHING_WAIT step if we - * don't have any uncached bgs and we've already done a - * full search through. - */ - if (ffe_ctl->orig_have_caching_bg || !full_search) - ffe_ctl->loop = LOOP_CACHING_WAIT; - else - ffe_ctl->loop = LOOP_ALLOC_CHUNK; - } else { + /* + * We want to skip the LOOP_CACHING_WAIT step if we don't have + * any uncached bgs and we've already done a full search + * through. + */ + if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && + (!ffe_ctl->orig_have_caching_bg && full_search)) ffe_ctl->loop++; - } + ffe_ctl->loop++; if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { struct btrfs_trans_handle *trans; int exist = 0; - /*Check if allocation policy allows to create a new chunk */ + /* Check if allocation policy allows to create a new chunk */ ret = can_allocate_chunk(fs_info, ffe_ctl); if (ret) return ret; @@ -4123,8 +4025,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. */ - if (ret == -ENOSPC) - ret = chunk_allocation_failed(ffe_ctl); + if (ret == -ENOSPC) { + ret = 0; + ffe_ctl->loop++; + } else if (ret < 0) btrfs_abort_transaction(trans, ret); else @@ -4154,6 +4058,21 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return -ENOSPC; } +static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group *bg) +{ + if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) + return true; + if (!btrfs_block_group_should_use_size_class(bg)) + return true; + if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) + return true; + if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && + bg->size_class == BTRFS_BG_SZ_NONE) + return true; + return ffe_ctl->size_class == bg->size_class; +} + static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, @@ -4288,6 +4207,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl->total_free_space = 0; ffe_ctl->found_offset = 0; ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; + ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes); if (btrfs_is_zoned(fs_info)) ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; @@ -4296,8 +4216,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size, - ffe_ctl->flags); + trace_find_free_extent(root, ffe_ctl); space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); if (!space_info) { @@ -4340,6 +4259,7 @@ static noinline int find_free_extent(struct btrfs_root *root, block_group->flags); btrfs_lock_block_group(block_group, ffe_ctl->delalloc); + ffe_ctl->hinted = true; goto have_block_group; } } else if (block_group) { @@ -4347,6 +4267,7 @@ static noinline int find_free_extent(struct btrfs_root *root, } } search: + 
trace_find_free_extent_search_loop(root, ffe_ctl); ffe_ctl->have_caching_bg = false; if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || ffe_ctl->index == 0) @@ -4356,6 +4277,7 @@ static noinline int find_free_extent(struct btrfs_root *root, &space_info->block_groups[ffe_ctl->index], list) { struct btrfs_block_group *bg_ret; + ffe_ctl->hinted = false; /* If the block group is read-only, we can skip it entirely. */ if (unlikely(block_group->ro)) { if (ffe_ctl->for_treelog) @@ -4397,6 +4319,7 @@ static noinline int find_free_extent(struct btrfs_root *root, } have_block_group: + trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); ffe_ctl->cached = btrfs_block_group_done(block_group); if (unlikely(!ffe_ctl->cached)) { ffe_ctl->have_caching_bg = true; @@ -4421,6 +4344,9 @@ static noinline int find_free_extent(struct btrfs_root *root, if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) goto loop; + if (!find_free_extent_check_size_class(ffe_ctl, block_group)) + goto loop; + bg_ret = NULL; ret = do_allocation(block_group, ffe_ctl, &bg_ret); if (ret == 0) { @@ -4455,7 +4381,8 @@ static noinline int find_free_extent(struct btrfs_root *root, ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, ffe_ctl->num_bytes, - ffe_ctl->delalloc); + ffe_ctl->delalloc, + ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS); if (ret == -EAGAIN) { btrfs_add_free_space_unused(block_group, ffe_ctl->found_offset, @@ -4468,8 +4395,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ins->objectid = ffe_ctl->search_start; ins->offset = ffe_ctl->num_bytes; - trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start, - ffe_ctl->num_bytes); + trace_btrfs_reserve_extent(block_group, ffe_ctl); btrfs_release_block_group(block_group, ffe_ctl->delalloc); break; loop: @@ -4912,7 +4838,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); __btrfs_tree_lock(buf, nest); - btrfs_clean_tree_block(buf); + btrfs_clear_buffer_dirty(trans, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); @@ -5542,13 +5468,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, } } } - /* make block locked assertion in btrfs_clean_tree_block happy */ - if (!path->locks[level] && - btrfs_header_generation(eb) == trans->transid) { + /* Make block locked assertion in btrfs_clear_buffer_dirty happy. 
*/ + if (!path->locks[level]) { btrfs_tree_lock(eb); path->locks[level] = BTRFS_WRITE_LOCK; } - btrfs_clean_tree_block(eb); + btrfs_clear_buffer_dirty(trans, eb); } if (eb == root->node) { diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index ae5425253603..0c958fc1b3b8 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -3,6 +3,87 @@ #ifndef BTRFS_EXTENT_TREE_H #define BTRFS_EXTENT_TREE_H +#include "misc.h" +#include "block-group.h" + +struct btrfs_free_cluster; + +enum btrfs_extent_allocation_policy { + BTRFS_EXTENT_ALLOC_CLUSTERED, + BTRFS_EXTENT_ALLOC_ZONED, +}; + +struct find_free_extent_ctl { + /* Basic allocation info */ + u64 ram_bytes; + u64 num_bytes; + u64 min_alloc_size; + u64 empty_size; + u64 flags; + int delalloc; + + /* Where to start the search inside the bg */ + u64 search_start; + + /* For clustered allocation */ + u64 empty_cluster; + struct btrfs_free_cluster *last_ptr; + bool use_cluster; + + bool have_caching_bg; + bool orig_have_caching_bg; + + /* Allocation is called for tree-log */ + bool for_treelog; + + /* Allocation is called for data relocation */ + bool for_data_reloc; + + /* RAID index, converted from flags */ + int index; + + /* + * Current loop number, check find_free_extent_update_loop() for details + */ + int loop; + + /* + * Whether we're refilling a cluster, if true we need to re-search + * current block group but don't try to refill the cluster again. + */ + bool retry_clustered; + + /* + * Whether we're updating free space cache, if true we need to re-search + * current block group but don't try updating free space cache again. + */ + bool retry_unclustered; + + /* If current block group is cached */ + int cached; + + /* Max contiguous hole found */ + u64 max_extent_size; + + /* Total free space from free space cache, not always contiguous */ + u64 total_free_space; + + /* Found result */ + u64 found_offset; + + /* Hint where to start looking for an empty space */ + u64 hint_byte; + + /* Allocation policy */ + enum btrfs_extent_allocation_policy policy; + + /* Whether or not the allocator is currently following a hint */ + bool hinted; + + /* Size class of block groups to prefer in early loops */ + enum btrfs_block_group_size_class size_class; +}; + enum btrfs_inline_ref_type { BTRFS_REF_TYPE_INVALID, BTRFS_REF_TYPE_BLOCK, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3bbf8703db2a..c25fa74d7615 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -36,6 +36,7 @@ #include "file.h" #include "dev-replace.h" #include "super.h" +#include "transaction.h" static struct kmem_cache *extent_buffer_cache; @@ -99,7 +100,6 @@ struct btrfs_bio_ctrl { struct bio *bio; int mirror_num; enum btrfs_compression_type compress_type; - u32 len_to_stripe_boundary; u32 len_to_oe_boundary; btrfs_bio_end_io_t end_io_func; @@ -126,7 +126,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { struct bio *bio; struct bio_vec *bv; - struct btrfs_inode *inode; + struct inode *inode; int mirror_num; if (!bio_ctrl->bio) @@ -134,15 +134,13 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) bio = bio_ctrl->bio; bv = bio_first_bvec_all(bio); - inode = BTRFS_I(bv->bv_page->mapping->host); + inode = bv->bv_page->mapping->host; mirror_num = bio_ctrl->mirror_num; /* Caller should ensure the bio has at least some range added */ ASSERT(bio->bi_iter.bi_size); - btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; - - if (!is_data_inode(&inode->vfs_inode)) { + if (!is_data_inode(inode)) { if 
(btrfs_op(bio) != BTRFS_MAP_WRITE) { /* * For metadata read, we should have the parent_check, @@ -153,14 +151,15 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) bio_ctrl->parent_check, sizeof(struct btrfs_tree_parent_check)); } - btrfs_submit_metadata_bio(inode, bio, mirror_num); - } else if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - btrfs_submit_data_write_bio(inode, bio, mirror_num); - } else { - btrfs_submit_data_read_bio(inode, bio, mirror_num, - bio_ctrl->compress_type); + bio->bi_opf |= REQ_META; } + if (btrfs_op(bio) == BTRFS_MAP_READ && + bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) + btrfs_submit_compressed_read(inode, bio, mirror_num); + else + btrfs_submit_bio(bio, mirror_num); + /* The bio is owned by the end_io handler now */ bio_ctrl->bio = NULL; } @@ -515,266 +514,6 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, start, end, page_ops, NULL); } -static int insert_failrec(struct btrfs_inode *inode, - struct io_failure_record *failrec) -{ - struct rb_node *exist; - - spin_lock(&inode->io_failure_lock); - exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr, - &failrec->rb_node); - spin_unlock(&inode->io_failure_lock); - - return (exist == NULL) ? 0 : -EEXIST; -} - -static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start) -{ - struct rb_node *node; - struct io_failure_record *failrec = ERR_PTR(-ENOENT); - - spin_lock(&inode->io_failure_lock); - node = rb_simple_search(&inode->io_failure_tree, start); - if (node) - failrec = rb_entry(node, struct io_failure_record, rb_node); - spin_unlock(&inode->io_failure_lock); - return failrec; -} - -static void free_io_failure(struct btrfs_inode *inode, - struct io_failure_record *rec) -{ - spin_lock(&inode->io_failure_lock); - rb_erase(&rec->rb_node, &inode->io_failure_tree); - spin_unlock(&inode->io_failure_lock); - - kfree(rec); -} - -static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) -{ - if (cur_mirror == failrec->num_copies) - return cur_mirror + 1 - failrec->num_copies; - return cur_mirror + 1; -} - -static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror) -{ - if (cur_mirror == 1) - return failrec->num_copies; - return cur_mirror - 1; -} - -/* - * each time an IO finishes, we do a fast check in the IO failure tree - * to see if we need to process or clean up an io_failure_record - */ -int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, - struct page *page, unsigned int pg_offset) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_io_tree *io_tree = &inode->io_tree; - u64 ino = btrfs_ino(inode); - u64 locked_start, locked_end; - struct io_failure_record *failrec; - int mirror; - int ret; - - failrec = get_failrec(inode, start); - if (IS_ERR(failrec)) - return 0; - - BUG_ON(!failrec->this_mirror); - - if (sb_rdonly(fs_info->sb)) - goto out; - - ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start, - &locked_end, EXTENT_LOCKED, NULL); - if (ret || locked_start > failrec->bytenr || - locked_end < failrec->bytenr + failrec->len - 1) - goto out; - - mirror = failrec->this_mirror; - do { - mirror = prev_mirror(failrec, mirror); - btrfs_repair_io_failure(fs_info, ino, start, failrec->len, - failrec->logical, page, pg_offset, mirror); - } while (mirror != failrec->failed_mirror); - -out: - free_io_failure(inode, failrec); - return 0; -} - -/* - * Can be called when - * - hold extent lock - * - under ordered extent - * - the inode is freeing - */ -void 
btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) -{ - struct io_failure_record *failrec; - struct rb_node *node, *next; - - if (RB_EMPTY_ROOT(&inode->io_failure_tree)) - return; - - spin_lock(&inode->io_failure_lock); - node = rb_simple_search_first(&inode->io_failure_tree, start); - while (node) { - failrec = rb_entry(node, struct io_failure_record, rb_node); - if (failrec->bytenr > end) - break; - - next = rb_next(node); - rb_erase(&failrec->rb_node, &inode->io_failure_tree); - kfree(failrec); - - node = next; - } - spin_unlock(&inode->io_failure_lock); -} - -static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, - struct btrfs_bio *bbio, - unsigned int bio_offset) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - u64 start = bbio->file_offset + bio_offset; - struct io_failure_record *failrec; - const u32 sectorsize = fs_info->sectorsize; - int ret; - - failrec = get_failrec(BTRFS_I(inode), start); - if (!IS_ERR(failrec)) { - btrfs_debug(fs_info, - "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu", - failrec->logical, failrec->bytenr, failrec->len); - /* - * when data can be on disk more than twice, add to failrec here - * (e.g. with a list for failed_mirror) to make - * clean_io_failure() clean all those errors at once. - */ - ASSERT(failrec->this_mirror == bbio->mirror_num); - ASSERT(failrec->len == fs_info->sectorsize); - return failrec; - } - - failrec = kzalloc(sizeof(*failrec), GFP_NOFS); - if (!failrec) - return ERR_PTR(-ENOMEM); - - RB_CLEAR_NODE(&failrec->rb_node); - failrec->bytenr = start; - failrec->len = sectorsize; - failrec->failed_mirror = bbio->mirror_num; - failrec->this_mirror = bbio->mirror_num; - failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset; - - btrfs_debug(fs_info, - "new io failure record logical %llu start %llu", - failrec->logical, start); - - failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize); - if (failrec->num_copies == 1) { - /* - * We only have a single copy of the data, so don't bother with - * all the retry and error correction code that follows. No - * matter what the error is, it is very likely to persist. 
- */ - btrfs_debug(fs_info, - "cannot repair logical %llu num_copies %d", - failrec->logical, failrec->num_copies); - kfree(failrec); - return ERR_PTR(-EIO); - } - - /* Set the bits in the private failure tree */ - ret = insert_failrec(BTRFS_I(inode), failrec); - if (ret) { - kfree(failrec); - return ERR_PTR(ret); - } - - return failrec; -} - -int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, - u32 bio_offset, struct page *page, unsigned int pgoff, - bool submit_buffered) -{ - u64 start = failed_bbio->file_offset + bio_offset; - struct io_failure_record *failrec; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct bio *failed_bio = &failed_bbio->bio; - const int icsum = bio_offset >> fs_info->sectorsize_bits; - struct bio *repair_bio; - struct btrfs_bio *repair_bbio; - - btrfs_debug(fs_info, - "repair read error: read error at %llu", start); - - BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - - failrec = btrfs_get_io_failure_record(&inode->vfs_inode, failed_bbio, bio_offset); - if (IS_ERR(failrec)) - return PTR_ERR(failrec); - - /* - * There are two premises: - * a) deliver good data to the caller - * b) correct the bad sectors on disk - * - * Since we're only doing repair for one sector, we only need to get - * a good copy of the failed sector and if we succeed, we have setup - * everything for btrfs_repair_io_failure to do the rest for us. - */ - failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); - if (failrec->this_mirror == failrec->failed_mirror) { - btrfs_debug(fs_info, - "failed to repair num_copies %d this_mirror %d failed_mirror %d", - failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); - free_io_failure(inode, failrec); - return -EIO; - } - - repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io, - failed_bbio->private); - repair_bbio = btrfs_bio(repair_bio); - repair_bbio->file_offset = start; - repair_bio->bi_iter.bi_sector = failrec->logical >> 9; - - if (failed_bbio->csum) { - const u32 csum_size = fs_info->csum_size; - - repair_bbio->csum = repair_bbio->csum_inline; - memcpy(repair_bbio->csum, - failed_bbio->csum + csum_size * icsum, csum_size); - } - - bio_add_page(repair_bio, page, failrec->len, pgoff); - repair_bbio->iter = repair_bio->bi_iter; - - btrfs_debug(fs_info, - "repair read error: submitting new read to mirror %d", - failrec->this_mirror); - - /* - * At this point we have a bio, so any errors from bio submission will - * be handled by the endio on the repair_bio, so we can't return an - * error here. 
- */ - if (submit_buffered) - btrfs_submit_data_read_bio(inode, repair_bio, - failrec->this_mirror, 0); - else - btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); - - return BLK_STS_OK; -} - static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); @@ -803,79 +542,6 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) btrfs_subpage_end_reader(fs_info, page, start, len); } -static void end_sector_io(struct page *page, u64 offset, bool uptodate) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - const u32 sectorsize = inode->root->fs_info->sectorsize; - - end_page_read(page, uptodate, offset, sectorsize); - unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, NULL); -} - -static void submit_data_read_repair(struct inode *inode, - struct btrfs_bio *failed_bbio, - u32 bio_offset, const struct bio_vec *bvec, - unsigned int error_bitmap) -{ - const unsigned int pgoff = bvec->bv_offset; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct page *page = bvec->bv_page; - const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset; - const u64 end = start + bvec->bv_len - 1; - const u32 sectorsize = fs_info->sectorsize; - const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; - int i; - - BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE); - - /* This repair is only for data */ - ASSERT(is_data_inode(inode)); - - /* We're here because we had some read errors or csum mismatch */ - ASSERT(error_bitmap); - - /* - * We only get called on buffered IO, thus page must be mapped and bio - * must not be cloned. - */ - ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED)); - - /* Iterate through all the sectors in the range */ - for (i = 0; i < nr_bits; i++) { - const unsigned int offset = i * sectorsize; - bool uptodate = false; - int ret; - - if (!(error_bitmap & (1U << i))) { - /* - * This sector has no error, just end the page read - * and unlock the range. - */ - uptodate = true; - goto next; - } - - ret = btrfs_repair_one_sector(BTRFS_I(inode), failed_bbio, - bio_offset + offset, page, pgoff + offset, - true); - if (!ret) { - /* - * We have submitted the read repair, the page release - * will be handled by the endio function of the - * submitted repair bio. - * Thus we don't need to do any thing here. - */ - continue; - } - /* - * Continue on failed repair, otherwise the remaining sectors - * will not be properly unlocked. 
- */ -next: - end_sector_io(page, start + offset, uptodate); - } -} - /* lots and lots of room for performance fixes in the end_bio funcs */ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) @@ -919,7 +585,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) u64 start; u64 end; struct bvec_iter_all iter_all; - bool first_bvec = true; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { @@ -941,11 +606,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) start = page_offset(page) + bvec->bv_offset; end = start + bvec->bv_len - 1; - if (first_bvec) { - btrfs_record_physical_zoned(inode, start, bio); - first_bvec = false; - } - end_extent_writepage(page, error, start, end); btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len); @@ -1093,8 +753,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; - unsigned int error_bitmap = (unsigned int)-1; - bool repair = false; u64 start; u64 end; u32 len; @@ -1126,25 +784,14 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) len = bvec->bv_len; mirror = bbio->mirror_num; - if (likely(uptodate)) { - if (is_data_inode(inode)) { - error_bitmap = btrfs_verify_data_csum(bbio, - bio_offset, page, start, end); - if (error_bitmap) - uptodate = false; - } else { - if (btrfs_validate_metadata_buffer(bbio, - page, start, end, mirror)) - uptodate = false; - } - } + if (uptodate && !is_data_inode(inode) && + btrfs_validate_metadata_buffer(bbio, page, start, end, mirror)) + uptodate = false; if (likely(uptodate)) { loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_SHIFT; - btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0); - /* * Zero out the remaining part if this range straddles * i_size. @@ -1161,19 +808,7 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) zero_user_segment(page, zero_start, offset_in_page(end) + 1); } - } else if (is_data_inode(inode)) { - /* - * Only try to repair bios that actually made it to a - * device. If the bio failed to be submitted mirror - * is 0 and we need to fail it without retrying. - * - * This also includes the high level bios for compressed - * extents - these never make it to a device and repair - * is already handled on the lower compressed bio. - */ - if (mirror > 0) - repair = true; - } else { + } else if (!is_data_inode(inode)) { struct extent_buffer *eb; eb = find_extent_buffer_readpage(fs_info, page, start); @@ -1182,19 +817,10 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) atomic_dec(&eb->io_pages); } - if (repair) { - /* - * submit_data_read_repair() will handle all the good - * and bad sectors, we just continue to the next bvec. - */ - submit_data_read_repair(inode, bbio, bio_offset, bvec, - error_bitmap); - } else { - /* Update page status and unlock */ - end_page_read(page, uptodate, start, len); - endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, PageUptodate(page)); - } + /* Update page status and unlock. 
*/ + end_page_read(page, uptodate, start, len); + endio_readpage_release_extent(&processed, BTRFS_I(inode), + start, end, PageUptodate(page)); ASSERT(bio_offset + len > bio_offset); bio_offset += len; @@ -1202,7 +828,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); - btrfs_bio_free_csum(bbio); bio_put(bio); } @@ -1270,11 +895,10 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, u32 real_size; const sector_t sector = disk_bytenr >> SECTOR_SHIFT; bool contig = false; - int ret; ASSERT(bio); /* The limit should be calculated when bio_ctrl->bio is allocated */ - ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); + ASSERT(bio_ctrl->len_to_oe_boundary); if (bio_ctrl->compress_type != compress_type) return 0; @@ -1310,9 +934,7 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, if (!contig) return 0; - real_size = min(bio_ctrl->len_to_oe_boundary, - bio_ctrl->len_to_stripe_boundary) - bio_size; - real_size = min(real_size, size); + real_size = min(bio_ctrl->len_to_oe_boundary - bio_size, size); /* * If real_size is 0, never call bio_add_*_page(), as even size is 0, @@ -1321,82 +943,45 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, if (real_size == 0) return 0; - if (bio_op(bio) == REQ_OP_ZONE_APPEND) - ret = bio_add_zone_append_page(bio, page, real_size, pg_offset); - else - ret = bio_add_page(bio, page, real_size, pg_offset); - - return ret; + return bio_add_page(bio, page, real_size, pg_offset); } -static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, - struct btrfs_inode *inode, u64 file_offset) +static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, + struct btrfs_inode *inode, u64 file_offset) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_io_geometry geom; struct btrfs_ordered_extent *ordered; - struct extent_map *em; - u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT); - int ret; /* - * Pages for compressed extent are never submitted to disk directly, - * thus it has no real boundary, just set them to U32_MAX. - * - * The split happens for real compressed bio, which happens in - * btrfs_submit_compressed_read/write(). + * Limit the extent to the ordered boundary for Zone Append. + * Compressed bios aren't submitted directly, so it doesn't apply to + * them. 
*/ - if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { - bio_ctrl->len_to_oe_boundary = U32_MAX; - bio_ctrl->len_to_stripe_boundary = U32_MAX; - return 0; - } - em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); - if (IS_ERR(em)) - return PTR_ERR(em); - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio), - logical, &geom); - free_extent_map(em); - if (ret < 0) { - return ret; - } - if (geom.len > U32_MAX) - bio_ctrl->len_to_stripe_boundary = U32_MAX; - else - bio_ctrl->len_to_stripe_boundary = (u32)geom.len; - - if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { - bio_ctrl->len_to_oe_boundary = U32_MAX; - return 0; - } - - /* Ordered extent not yet created, so we're good */ - ordered = btrfs_lookup_ordered_extent(inode, file_offset); - if (!ordered) { - bio_ctrl->len_to_oe_boundary = U32_MAX; - return 0; + if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE && + btrfs_use_zone_append(btrfs_bio(bio_ctrl->bio))) { + ordered = btrfs_lookup_ordered_extent(inode, file_offset); + if (ordered) { + bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, + ordered->file_offset + + ordered->disk_num_bytes - file_offset); + btrfs_put_ordered_extent(ordered); + return; + } } - bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, - ordered->disk_bytenr + ordered->disk_num_bytes - logical); - btrfs_put_ordered_extent(ordered); - return 0; + bio_ctrl->len_to_oe_boundary = U32_MAX; } -static int alloc_new_bio(struct btrfs_inode *inode, - struct btrfs_bio_ctrl *bio_ctrl, - struct writeback_control *wbc, - blk_opf_t opf, - u64 disk_bytenr, u32 offset, u64 file_offset, - enum btrfs_compression_type compress_type) +static void alloc_new_bio(struct btrfs_inode *inode, + struct btrfs_bio_ctrl *bio_ctrl, + struct writeback_control *wbc, blk_opf_t opf, + u64 disk_bytenr, u32 offset, u64 file_offset, + enum btrfs_compression_type compress_type) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio; - int ret; - ASSERT(bio_ctrl->end_io_func); - - bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL); + bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, inode, bio_ctrl->end_io_func, + NULL); /* * For compressed page range, its disk_bytenr is always @disk_bytenr * passed in, no matter if we have added any range into previous bio. @@ -1405,48 +990,21 @@ static int alloc_new_bio(struct btrfs_inode *inode, bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; else bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; + btrfs_bio(bio)->file_offset = file_offset; bio_ctrl->bio = bio; bio_ctrl->compress_type = compress_type; - ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); - if (ret < 0) - goto error; + calc_bio_boundaries(bio_ctrl, inode, file_offset); if (wbc) { /* - * For Zone append we need the correct block_device that we are - * going to write to set in the bio to be able to respect the - * hardware limitation. Look it up here: + * Pick the last added device to support cgroup writeback. For + * multi-device file systems this means blk-cgroup policies have + * to always be set on the last added/replaced device. + * This is a bit odd but has been like that for a long time. */ - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct btrfs_device *dev; - - dev = btrfs_zoned_get_device(fs_info, disk_bytenr, - fs_info->sectorsize); - if (IS_ERR(dev)) { - ret = PTR_ERR(dev); - goto error; - } - - bio_set_dev(bio, dev->bdev); - } else { - /* - * Otherwise pick the last added device to support - * cgroup writeback. 
For multi-device file systems this - * means blk-cgroup policies have to always be set on the - * last added/replaced device. This is a bit odd but has - * been like that for a long time. - */ - bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); - } + bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); wbc_init_bio(wbc, bio); - } else { - ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND); } - return 0; -error: - bio_ctrl->bio = NULL; - btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); - return ret; } /* @@ -1472,7 +1030,6 @@ static int submit_extent_page(blk_opf_t opf, enum btrfs_compression_type compress_type, bool force_bio_submit) { - int ret = 0; struct btrfs_inode *inode = BTRFS_I(page->mapping->host); unsigned int cur = pg_offset; @@ -1492,12 +1049,9 @@ static int submit_extent_page(blk_opf_t opf, /* Allocate new bio if needed */ if (!bio_ctrl->bio) { - ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, - disk_bytenr, offset, - page_offset(page) + cur, - compress_type); - if (ret < 0) - return ret; + alloc_new_bio(inode, bio_ctrl, wbc, opf, disk_bytenr, + offset, page_offset(page) + cur, + compress_type); } /* * We must go through btrfs_bio_add_page() to ensure each @@ -2054,10 +1608,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * find_next_dirty_byte() are all exclusive */ iosize = min(min(em_end, end + 1), dirty_range_end) - cur; - - if (btrfs_use_zone_append(inode, em->block_start)) - op = REQ_OP_ZONE_APPEND; - free_extent_map(em); em = NULL; @@ -2360,13 +1910,6 @@ static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) */ mapping_set_error(page->mapping, -EIO); - /* - * If we error out, we should add back the dirty_metadata_bytes - * to make it consistent. - */ - percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, - eb->len, fs_info->dirty_metadata_batch); - /* * If writeback for a btree extent that doesn't belong to a log tree * failed, increment the counter transaction->eb_write_errors. 
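The extent_io.c and extent-tree.c hunks around this point, together with the walk_up_proc() and free-space-tree callers earlier in the series, replace the btrfs_clean_tree_block()/clear_extent_buffer_dirty() pair with btrfs_clear_buffer_dirty(): the new helper takes the transaction handle, skips buffers whose header generation does not match the running transaction, and only subtracts from dirty_metadata_bytes when it actually clears the EXTENT_BUFFER_DIRTY bit, which is why the counter fixup in set_btree_ioerr() can go away. A minimal standalone C sketch of that invariant follows; the toy_* names are illustrative stand-ins, not btrfs APIs.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Toy stand-ins for the kernel structures touched by these hunks. */
    struct toy_buffer {
            bool dirty;          /* models EXTENT_BUFFER_DIRTY */
            uint64_t generation; /* models btrfs_header_generation(eb) */
            uint64_t len;
    };

    static int64_t dirty_metadata_bytes; /* models the percpu counter */

    /*
     * Sketch of the btrfs_clear_buffer_dirty() logic: only clear buffers
     * dirtied by the running transaction (trans_id == 0 means "no handle"),
     * and adjust the dirty-bytes accounting exactly once, guarded by the
     * test-and-clear of the dirty flag.
     */
    static void toy_clear_buffer_dirty(uint64_t trans_id, struct toy_buffer *eb)
    {
            if (trans_id && eb->generation != trans_id)
                    return;
            if (!eb->dirty)
                    return;
            eb->dirty = false;
            dirty_metadata_bytes -= (int64_t)eb->len;
    }

    int main(void)
    {
            struct toy_buffer eb = { .dirty = true, .generation = 42, .len = 16384 };

            dirty_metadata_bytes = (int64_t)eb.len;

            toy_clear_buffer_dirty(7, &eb);   /* stale generation: no-op */
            toy_clear_buffer_dirty(42, &eb);  /* clears and accounts once */
            toy_clear_buffer_dirty(42, &eb);  /* repeated call is a no-op */

            printf("dirty=%d remaining=%lld\n", eb.dirty,
                   (long long)dirty_metadata_bytes);
            return 0;
    }

Compiled on its own, the model clears the buffer and drops the counter exactly once; repeated calls, or calls against a buffer from an older transaction, are no-ops, mirroring the early returns added in btrfs_clear_buffer_dirty() in the next hunk.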
@@ -4724,12 +4267,25 @@ static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) WARN_ON(atomic_read(&eb->refs) == 0); } -void clear_extent_buffer_dirty(const struct extent_buffer *eb) +void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; int i; int num_pages; struct page *page; + btrfs_assert_tree_write_locked(eb); + + if (trans && btrfs_header_generation(eb) != trans->transid) + return; + + if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) + return; + + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, + fs_info->dirty_metadata_batch); + if (eb->fs_info->nodesize < PAGE_SIZE) return clear_subpage_extent_buffer_dirty(eb); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a2c82448b2e0..4341ad978fb8 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -11,6 +11,8 @@ #include "ulist.h" #include "misc.h" +struct btrfs_trans_handle; + enum { EXTENT_BUFFER_UPTODATE, EXTENT_BUFFER_DIRTY, @@ -60,11 +62,9 @@ enum { #define BITMAP_LAST_BYTE_MASK(nbits) \ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) -struct btrfs_bio; struct btrfs_root; struct btrfs_inode; struct btrfs_fs_info; -struct io_failure_record; struct extent_io_tree; struct btrfs_tree_parent_check; @@ -262,7 +262,6 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star void extent_buffer_bitmap_clear(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len); -void clear_extent_buffer_dirty(const struct extent_buffer *eb); bool set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); @@ -274,40 +273,13 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, u32 bits_to_clear, unsigned long page_ops); int extent_invalidate_folio(struct extent_io_tree *tree, struct folio *folio, size_t offset); +void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *buf); int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); -/* - * When IO fails, either with EIO or csum verification fails, we - * try other mirrors that might have a good copy of the data. This - * io_failure_record is used to record state as we go through all the - * mirrors. If another mirror has good data, the sector is set up to date - * and things continue. If a good mirror can't be found, the original - * bio end_io callback is called to indicate things have failed. 
- */ -struct io_failure_record { - /* Use rb_simple_node for search/insert */ - struct { - struct rb_node rb_node; - u64 bytenr; - }; - struct page *page; - u64 len; - u64 logical; - int this_mirror; - int failed_mirror; - int num_copies; -}; - -int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, - u32 bio_offset, struct page *page, unsigned int pgoff, - bool submit_buffered); -void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); -int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, - struct page *page, unsigned int pg_offset); - #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 5de73466b2ca..41c77a100853 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -380,32 +380,25 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, /* * Lookup the checksum for the read bio in csum tree. * - * @inode: inode that the bio is for. - * @bio: bio to look up. - * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return - * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If - * NULL, the checksum buffer is allocated and returned in - * btrfs_bio(bio)->csum instead. - * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. */ -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst) +blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_bio *bbio = NULL; + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + struct bio *bio = &bbio->bio; struct btrfs_path *path; const u32 sectorsize = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; u32 orig_len = bio->bi_iter.bi_size; u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 cur_disk_bytenr; - u8 *csum; const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; int count = 0; blk_status_t ret = BLK_STS_OK; - if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || + if ((inode->flags & BTRFS_INODE_NODATASUM) || test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) return BLK_STS_OK; @@ -426,21 +419,14 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst if (!path) return BLK_STS_RESOURCE; - if (!dst) { - bbio = btrfs_bio(bio); - - if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { - bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); - if (!bbio->csum) { - btrfs_free_path(path); - return BLK_STS_RESOURCE; - } - } else { - bbio->csum = bbio->csum_inline; + if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { + bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); + if (!bbio->csum) { + btrfs_free_path(path); + return BLK_STS_RESOURCE; } - csum = bbio->csum; } else { - csum = dst; + bbio->csum = bbio->csum_inline; } /* @@ -456,7 +442,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst * read from the commit root and sidestep a nasty deadlock * between reading the free space cache and updating the csum tree. 
*/ - if (btrfs_is_free_space_inode(BTRFS_I(inode))) { + if (btrfs_is_free_space_inode(inode)) { path->search_commit_root = 1; path->skip_locking = 1; } @@ -479,14 +465,15 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX); sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >> fs_info->sectorsize_bits; - csum_dst = csum + sector_offset * csum_size; + csum_dst = bbio->csum + sector_offset * csum_size; count = search_csum_tree(fs_info, path, cur_disk_bytenr, search_len, csum_dst); if (count < 0) { ret = errno_to_blk_status(count); - if (bbio) - btrfs_bio_free_csum(bbio); + if (bbio->csum != bbio->csum_inline) + kfree(bbio->csum); + bbio->csum = NULL; break; } @@ -504,12 +491,13 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst memset(csum_dst, 0, csum_size); count = 1; - if (BTRFS_I(inode)->root->root_key.objectid == + if (inode->root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { u64 file_offset; int ret; - ret = search_file_offset_in_bio(bio, inode, + ret = search_file_offset_in_bio(bio, + &inode->vfs_inode, cur_disk_bytenr, &file_offset); if (ret) set_extent_bits(io_tree, file_offset, @@ -784,23 +772,16 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, /* * Calculate checksums of the data contained inside a bio. - * - * @inode: Owner of the data inside the bio - * @bio: Contains the data to be checksummed - * @offset: If (u64)-1, @bio may contain discontiguous bio vecs, so the - * file offsets are determined from the page offsets in the bio. - * Otherwise, this is the starting file offset of the bio vecs in - * @bio, which must be contiguous. - * @one_ordered: If true, @bio only refers to one ordered extent. 
*/ -blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 offset, bool one_ordered) +blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) { + struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + struct bio *bio = &bbio->bio; + u64 offset = bbio->file_offset; struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered = NULL; - const bool use_page_offsets = (offset == (u64)-1); char *data; struct bvec_iter iter; struct bio_vec bvec; @@ -828,9 +809,6 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, shash->tfm = fs_info->csum_shash; bio_for_each_segment(bvec, bio, iter) { - if (use_page_offsets) - offset = page_offset(bvec.bv_page) + bvec.bv_offset; - if (!ordered) { ordered = btrfs_lookup_ordered_extent(inode, offset); /* @@ -852,7 +830,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - 1); for (i = 0; i < blockcount; i++) { - if (!one_ordered && + if (!(bio->bi_opf & REQ_BTRFS_ONE_ORDERED) && !in_range(offset, ordered->file_offset, ordered->num_bytes)) { unsigned long bytes_left; diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 031225668434..cd7f2ae515c0 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -38,7 +38,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); +blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio); int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, u64 num_bytes); @@ -49,8 +49,10 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); -blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 offset, bool one_ordered); +blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio); +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait); int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, bool nowait); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index af046d22300e..ec5c5355906b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1017,7 +1017,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, unlock_page(pages[i]); put_page(pages[i]); } - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); return -EAGAIN; } @@ -1465,6 +1465,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) ssize_t err; unsigned int ilock_flags = 0; struct iomap_dio *dio; + struct btrfs_ordered_extent *ordered_extent = NULL; if (iocb->ki_flags & IOCB_NOWAIT) ilock_flags |= BTRFS_ILOCK_TRY; @@ -1526,7 +1527,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) * got -EFAULT, faulting in the pages before the retry. 
*/ from->nofault = true; - dio = btrfs_dio_write(iocb, from, written); + dio = btrfs_dio_write(iocb, from, &ordered_extent, written); from->nofault = false; /* @@ -1569,6 +1570,14 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) goto relock; } } + /* + * We can't loop back to btrfs_dio_write, so we can drop the cached + * ordered extent. Typically btrfs_dio_iomap_end will run and put the + * ordered_extent, but this is needed to clean up in case of an error + * path breaking out of iomap_iter before the final iomap_end call. + */ + if (ordered_extent) + btrfs_put_ordered_extent(ordered_extent); /* * If 'err' is -ENOTBLK or we have not written all data, then it means diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index c667e878ef1a..4d155a48ec59 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1283,7 +1283,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) list_del(&free_space_root->dirty_list); btrfs_tree_lock(free_space_root->node); - btrfs_clean_tree_block(free_space_root->node); + btrfs_clear_buffer_dirty(trans, free_space_root->node); btrfs_tree_unlock(free_space_root->node); btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), free_space_root->node, 0, 1); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index f180ca061aef..24cd49229408 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -3,6 +3,7 @@ #ifndef BTRFS_FS_H #define BTRFS_FS_H +#include #include #include #include @@ -745,8 +746,10 @@ struct btrfs_fs_info { */ u64 zone_size; - /* Max size to emit ZONE_APPEND write command */ + /* Constraints for ZONE_APPEND commands: */ + struct queue_limits limits; u64 max_zone_append_size; + struct mutex zoned_meta_io_lock; spinlock_t treelog_bg_lock; u64 treelog_bg; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 98a800b8bd43..6aaa892474be 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -81,30 +81,16 @@ struct btrfs_dio_data { struct extent_changeset *data_reserved; bool data_space_reserved; bool nocow_done; + struct btrfs_ordered_extent *ordered; }; struct btrfs_dio_private { - struct btrfs_inode *inode; - - /* - * Since DIO can use anonymous page, we cannot use page_offset() to - * grab the file offset, thus need a dedicated member for file offset. - */ + /* Range of I/O */ u64 file_offset; - /* Used for bio::bi_size */ u32 bytes; - /* - * References to this structure. There is one reference per in-flight - * bio plus one while we're still setting up. - */ - refcount_t refs; - - /* Array of checksums */ - u8 *csums; - /* This must be last */ - struct bio bio; + struct btrfs_bio bbio; }; static struct bio_set btrfs_dio_bioset; @@ -228,7 +214,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start, page_end; + u64 page_start = 0, page_end = 0; struct page *page; if (locked_page) { @@ -2535,19 +2521,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, } } -/* - * in order to insert checksums into the metadata in large chunks, - * we wait until bio submission time. All the pages in the bio are - * checksummed and sums are attached onto the ordered extent record. 
- * - * At IO completion time the cums attached on the ordered extent record - * are inserted into the btree - */ -blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio) -{ - return btrfs_csum_one_bio(inode, bio, (u64)-1, false); -} - /* * Split an extent_map at [start, start + len] * @@ -2663,19 +2636,19 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, return ret; } -static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, - struct bio *bio, loff_t file_offset) +blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio) { + u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + u64 len = bbio->bio.bi_iter.bi_size; + struct btrfs_inode *inode = bbio->inode; struct btrfs_ordered_extent *ordered; - u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 file_len; - u64 len = bio->bi_iter.bi_size; u64 end = start + len; u64 ordered_end; u64 pre, post; int ret = 0; - ordered = btrfs_lookup_ordered_extent(inode, file_offset); + ordered = btrfs_lookup_ordered_extent(inode, bbio->file_offset); if (WARN_ON_ONCE(!ordered)) return BLK_STS_IOERR; @@ -2715,7 +2688,7 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, ret = btrfs_split_ordered_extent(ordered, pre, post); if (ret) goto out; - ret = split_zoned_em(inode, file_offset, file_len, pre, post); + ret = split_zoned_em(inode, bbio->file_offset, file_len, pre, post); out: btrfs_put_ordered_extent(ordered); @@ -2723,75 +2696,6 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, return errno_to_blk_status(ret); } -void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - blk_status_t ret; - - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - ret = extract_ordered_extent(inode, bio, - page_offset(bio_first_bvec_all(bio)->bv_page)); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - } - - /* - * If we need to checksum, and the I/O is not issued by fsync and - * friends, that is ->sync_writers != 0, defer the submission to a - * workqueue to parallelize it. - * - * Csum items for reloc roots have already been cloned at this point, - * so they are handled as part of the no-checksum case. - */ - if (!(inode->flags & BTRFS_INODE_NODATASUM) && - !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && - !btrfs_is_data_reloc_root(inode->root)) { - if (!atomic_read(&inode->sync_writers) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) - return; - - ret = btrfs_csum_one_bio(inode, bio, (u64)-1, false); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - } - btrfs_submit_bio(fs_info, bio, mirror_num); -} - -void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - blk_status_t ret; - - if (compress_type != BTRFS_COMPRESS_NONE) { - /* - * btrfs_submit_compressed_read will handle completing the bio - * if there were any errors, so just return here. - */ - btrfs_submit_compressed_read(&inode->vfs_inode, bio, mirror_num); - return; - } - - /* Save the original iter for read repair */ - btrfs_bio(bio)->iter = bio->bi_iter; - - /* - * Lookup bio sums does extra checks around whether we need to csum or - * not, which is why we ignore skip_sum here. 
- */ - ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - - btrfs_submit_bio(fs_info, bio, mirror_num); -} - /* * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. @@ -2969,7 +2873,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); unlock_page(page); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } @@ -3259,15 +3163,13 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - /* A valid bdev implies a write on a sequential zone */ - if (ordered_extent->bdev) { + /* A valid ->physical implies a write on a sequential zone. */ + if (ordered_extent->physical != (u64)-1) { btrfs_rewrite_logical_zoned(ordered_extent); btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); } - btrfs_free_io_failure_record(inode, start, end); - if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; logical_len = ordered_extent->truncated_len; @@ -3474,109 +3376,55 @@ static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 of } /* - * check_data_csum - verify checksum of one sector of uncompressed data - * @inode: inode - * @bbio: btrfs_bio which contains the csum + * Verify the checksum of a single data sector. + * + * @bbio: btrfs_io_bio which contains the csum + * @dev: device the sector is on * @bio_offset: offset to the beginning of the bio (in bytes) - * @page: page where is the data to be verified - * @pgoff: offset inside the page + * @bv: bio_vec to check * - * The length of such check is always one sector size. + * Check if the checksum on a data block is valid. When a checksum mismatch is + * detected, report the error and fill the corrupted range with zero. * - * When csum mismatch is detected, we will also report the error and fill the - * corrupted range with zero. (Thus it needs the extra parameters) + * Return %true if the sector is ok or had no checksum to start with, else %false. 
*/ -int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff) +bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, + u32 bio_offset, struct bio_vec *bv) { + struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - u32 len = fs_info->sectorsize; + u64 file_offset = bbio->file_offset + bio_offset; + u64 end = file_offset + bv->bv_len - 1; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; - ASSERT(pgoff + len <= PAGE_SIZE); + ASSERT(bv->bv_len == fs_info->sectorsize); - csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); + if (!bbio->csum) + return true; - if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected)) + if (btrfs_is_data_reloc_root(inode->root) && + test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, + 1, NULL)) { + /* Skip the range without csum for data reloc inode */ + clear_extent_bits(&inode->io_tree, file_offset, end, + EXTENT_NODATASUM); + return true; + } + + csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); + if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, + csum_expected)) goto zeroit; - return 0; + return true; zeroit: - btrfs_print_data_csum_error(inode, bbio->file_offset + bio_offset, - csum, csum_expected, bbio->mirror_num); - if (bbio->device) - btrfs_dev_stat_inc_and_print(bbio->device, - BTRFS_DEV_STAT_CORRUPTION_ERRS); - memzero_page(page, pgoff, len); - return -EIO; -} - -/* - * When reads are done, we need to check csums to verify the data is correct. - * if there's a match, we allow the bio to finish. If not, the code in - * extent_io.c will try to find good copies for us. - * - * @bio_offset: offset to the beginning of the bio (in bytes) - * @start: file offset of the range start - * @end: file offset of the range end (inclusive) - * - * Return a bitmap where bit set means a csum mismatch, and bit not set means - * csum match. - */ -unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, - u64 start, u64 end) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct extent_io_tree *io_tree = &inode->io_tree; - const u32 sectorsize = root->fs_info->sectorsize; - u32 pg_off; - unsigned int result = 0; - - /* - * This only happens for NODATASUM or compressed read. - * Normally this should be covered by above check for compressed read - * or the next check for NODATASUM. Just do a quicker exit here. 
- */ - if (bbio->csum == NULL) - return 0; - - if (inode->flags & BTRFS_INODE_NODATASUM) - return 0; - - if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))) - return 0; - - ASSERT(page_offset(page) <= start && - end <= page_offset(page) + PAGE_SIZE - 1); - for (pg_off = offset_in_page(start); - pg_off < offset_in_page(end); - pg_off += sectorsize, bio_offset += sectorsize) { - u64 file_offset = pg_off + page_offset(page); - int ret; - - if (btrfs_is_data_reloc_root(root) && - test_range_bit(io_tree, file_offset, - file_offset + sectorsize - 1, - EXTENT_NODATASUM, 1, NULL)) { - /* Skip the range without csum for data reloc inode */ - clear_extent_bits(io_tree, file_offset, - file_offset + sectorsize - 1, - EXTENT_NODATASUM); - continue; - } - ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); - if (ret < 0) { - const int nr_bit = (pg_off - offset_in_page(start)) >> - root->fs_info->sectorsize_bits; - - result |= (1U << nr_bit); - } - } - return result; + btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected, + bbio->mirror_num); + if (dev) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); + memzero_bvec(bv); + return false; } /* @@ -4987,7 +4835,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, unlock_extent(io_tree, block_start, block_end, &cached_state); unlock_page(page); put_page(page); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } @@ -5466,8 +5314,6 @@ void btrfs_evict_inode(struct inode *inode) if (is_bad_inode(inode)) goto no_delete; - btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); - if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) goto no_delete; @@ -7131,6 +6977,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, } static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, + struct btrfs_dio_data *dio_data, const u64 start, const u64 len, const u64 orig_start, @@ -7141,7 +6988,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, const int type) { struct extent_map *em = NULL; - int ret; + struct btrfs_ordered_extent *ordered; if (type != BTRFS_ORDERED_NOCOW) { em = create_io_em(inode, start, len, orig_start, block_start, @@ -7151,18 +6998,21 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, if (IS_ERR(em)) goto out; } - ret = btrfs_add_ordered_extent(inode, start, len, len, block_start, - block_len, 0, - (1 << type) | - (1 << BTRFS_ORDERED_DIRECT), - BTRFS_COMPRESS_NONE); - if (ret) { + ordered = btrfs_alloc_ordered_extent(inode, start, len, len, + block_start, block_len, 0, + (1 << type) | + (1 << BTRFS_ORDERED_DIRECT), + BTRFS_COMPRESS_NONE); + if (IS_ERR(ordered)) { if (em) { free_extent_map(em); btrfs_drop_extent_map_range(inode, start, start + len - 1, false); } - em = ERR_PTR(ret); + em = ERR_PTR(PTR_ERR(ordered)); + } else { + ASSERT(!dio_data->ordered); + dio_data->ordered = ordered; } out: @@ -7170,6 +7020,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, } static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, + struct btrfs_dio_data *dio_data, u64 start, u64 len) { struct btrfs_root *root = inode->root; @@ -7185,7 +7036,8 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, if (ret) return ERR_PTR(ret); - em = btrfs_create_dio_extent(inode, start, ins.offset, start, + em = btrfs_create_dio_extent(inode, 
dio_data, + start, ins.offset, start, ins.objectid, ins.offset, ins.offset, ins.offset, BTRFS_ORDERED_REGULAR); btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -7392,7 +7244,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, */ if (writing || test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); else ret = nowait ? -EAGAIN : -ENOTBLK; btrfs_put_ordered_extent(ordered); @@ -7530,7 +7382,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, } space_reserved = true; - em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, + em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len, orig_start, block_start, len, orig_block_len, ram_bytes, type); @@ -7572,7 +7424,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, goto out; space_reserved = true; - em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); + em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -7676,6 +7528,17 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, } } + if (dio_data->ordered) { + ASSERT(write); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, + dio_data->ordered->file_offset, + dio_data->ordered->bytes_left); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto err; + } + goto map_iomap; + } memset(dio_data, 0, sizeof(*dio_data)); /* @@ -7817,6 +7680,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, else free_extent_state(cached_state); +map_iomap: /* * Translate extent map information to iomap. * We trim the extents (and move the addr) even though iomap code does @@ -7833,10 +7697,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->offset = start; iomap->bdev = fs_info->fs_devices->latest_dev->bdev; iomap->length = len; - - if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) - iomap->flags |= IOMAP_F_ZONE_APPEND; - free_extent_map(em); return 0; @@ -7874,13 +7734,25 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, if (submitted < length) { pos += submitted; length -= submitted; - if (write) - btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, - pos, length, false); - else + if (write) { + if (submitted == 0) { + btrfs_mark_ordered_io_finished(BTRFS_I(inode), + NULL, pos, + length, false); + btrfs_put_ordered_extent(dio_data->ordered); + dio_data->ordered = NULL; + } + } else { unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, NULL); + } ret = -ENOTBLK; + } else { + /* On the last bio, release our cached ordered_extent. */ + if (write) { + btrfs_put_ordered_extent(dio_data->ordered); + dio_data->ordered = NULL; + } } if (write) @@ -7888,267 +7760,47 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, return ret; } -static void btrfs_dio_private_put(struct btrfs_dio_private *dip) -{ - /* - * This implies a barrier so that stores to dio_bio->bi_status before - * this and loads of dio_bio->bi_status after this are fully ordered. 
- */ - if (!refcount_dec_and_test(&dip->refs)) - return; - - if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { - btrfs_mark_ordered_io_finished(dip->inode, NULL, - dip->file_offset, dip->bytes, - !dip->bio.bi_status); - } else { - unlock_extent(&dip->inode->io_tree, - dip->file_offset, - dip->file_offset + dip->bytes - 1, NULL); - } - - kfree(dip->csums); - bio_endio(&dip->bio); -} - -void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) +static void btrfs_dio_end_io(struct btrfs_bio *bbio) { - struct btrfs_dio_private *dip = btrfs_bio(bio)->private; - - BUG_ON(bio_op(bio) == REQ_OP_WRITE); - - refcount_inc(&dip->refs); - btrfs_submit_bio(inode->root->fs_info, bio, mirror_num); -} - -static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, - struct btrfs_bio *bbio, - const bool uptodate) -{ - struct inode *inode = &dip->inode->vfs_inode; - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); - blk_status_t err = BLK_STS_OK; - struct bvec_iter iter; - struct bio_vec bv; - u32 offset; - - btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { - u64 start = bbio->file_offset + offset; - - if (uptodate && - (!csum || !btrfs_check_data_csum(BTRFS_I(inode), bbio, offset, - bv.bv_page, bv.bv_offset))) { - btrfs_clean_io_failure(BTRFS_I(inode), start, - bv.bv_page, bv.bv_offset); - } else { - int ret; - - ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, - bv.bv_page, bv.bv_offset, false); - if (ret) - err = errno_to_blk_status(ret); - } - } - - return err; -} - -blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, - struct bio *bio, - u64 dio_file_offset) -{ - return btrfs_csum_one_bio(inode, bio, dio_file_offset, false); -} - -static void btrfs_end_dio_bio(struct btrfs_bio *bbio) -{ - struct btrfs_dio_private *dip = bbio->private; + struct btrfs_dio_private *dip = + container_of(bbio, struct btrfs_dio_private, bbio); + struct btrfs_inode *inode = bbio->inode; struct bio *bio = &bbio->bio; - blk_status_t err = bio->bi_status; - - if (err) - btrfs_warn(dip->inode->root->fs_info, - "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", - btrfs_ino(dip->inode), bio_op(bio), - bio->bi_opf, bio->bi_iter.bi_sector, - bio->bi_iter.bi_size, err); - - if (bio_op(bio) == REQ_OP_READ) - err = btrfs_check_read_dio_bio(dip, bbio, !err); - if (err) - dip->bio.bi_status = err; - - btrfs_record_physical_zoned(&dip->inode->vfs_inode, bbio->file_offset, bio); - - bio_put(bio); - btrfs_dio_private_put(dip); -} - -static void btrfs_submit_dio_bio(struct bio *bio, struct btrfs_inode *inode, - u64 file_offset, int async_submit) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_dio_private *dip = btrfs_bio(bio)->private; - blk_status_t ret; - - /* Save the original iter for read repair */ - if (btrfs_op(bio) == BTRFS_MAP_READ) - btrfs_bio(bio)->iter = bio->bi_iter; - - if (inode->flags & BTRFS_INODE_NODATASUM) - goto map; + if (bio->bi_status) { + btrfs_warn(inode->root->fs_info, + "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d", + btrfs_ino(inode), bio->bi_opf, + dip->file_offset, dip->bytes, bio->bi_status); + } - if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - /* Check btrfs_submit_data_write_bio() for async submit rules */ - if (async_submit && !atomic_read(&inode->sync_writers) && - btrfs_wq_submit_bio(inode, bio, 0, file_offset, - WQ_SUBMIT_DATA_DIO)) - return; + if (btrfs_op(bio) == 
BTRFS_MAP_WRITE) + btrfs_mark_ordered_io_finished(inode, NULL, dip->file_offset, + dip->bytes, !bio->bi_status); + else + unlock_extent(&inode->io_tree, dip->file_offset, + dip->file_offset + dip->bytes - 1, NULL); - /* - * If we aren't doing async submit, calculate the csum of the - * bio now. - */ - ret = btrfs_csum_one_bio(inode, bio, file_offset, false); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - } else { - btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums, - file_offset - dip->file_offset); - } -map: - btrfs_submit_bio(fs_info, bio, 0); + bbio->bio.bi_private = bbio->private; + iomap_dio_bio_end_io(bio); } -static void btrfs_submit_direct(const struct iomap_iter *iter, - struct bio *dio_bio, loff_t file_offset) +static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, + loff_t file_offset) { + struct btrfs_bio *bbio = btrfs_bio(bio); struct btrfs_dio_private *dip = - container_of(dio_bio, struct btrfs_dio_private, bio); - struct inode *inode = iter->inode; - const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const bool raid56 = (btrfs_data_alloc_profile(fs_info) & - BTRFS_BLOCK_GROUP_RAID56_MASK); - struct bio *bio; - u64 start_sector; - int async_submit = 0; - u64 submit_len; - u64 clone_offset = 0; - u64 clone_len; - u64 logical; - int ret; - blk_status_t status; - struct btrfs_io_geometry geom; + container_of(bbio, struct btrfs_dio_private, bbio); struct btrfs_dio_data *dio_data = iter->private; - struct extent_map *em = NULL; - - dip->inode = BTRFS_I(inode); - dip->file_offset = file_offset; - dip->bytes = dio_bio->bi_iter.bi_size; - refcount_set(&dip->refs, 1); - dip->csums = NULL; - - if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - unsigned int nr_sectors = - (dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits); - - /* - * Load the csums up front to reduce csum tree searches and - * contention when submitting bios. - */ - status = BLK_STS_RESOURCE; - dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS); - if (!dip->csums) - goto out_err; - - status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); - if (status != BLK_STS_OK) - goto out_err; - } - - start_sector = dio_bio->bi_iter.bi_sector; - submit_len = dio_bio->bi_iter.bi_size; - - do { - logical = start_sector << 9; - em = btrfs_get_chunk_map(fs_info, logical, submit_len); - if (IS_ERR(em)) { - status = errno_to_blk_status(PTR_ERR(em)); - em = NULL; - goto out_err_em; - } - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), - logical, &geom); - if (ret) { - status = errno_to_blk_status(ret); - goto out_err_em; - } - - clone_len = min(submit_len, geom.len); - ASSERT(clone_len <= UINT_MAX); - - /* - * This will never fail as it's passing GPF_NOFS and - * the allocation is backed by btrfs_bioset. - */ - bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len, - btrfs_end_dio_bio, dip); - btrfs_bio(bio)->file_offset = file_offset; - - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - status = extract_ordered_extent(BTRFS_I(inode), bio, - file_offset); - if (status) { - bio_put(bio); - goto out_err; - } - } - - ASSERT(submit_len >= clone_len); - submit_len -= clone_len; - - /* - * Increase the count before we submit the bio so we know - * the end IO handler won't happen before we increase the - * count. Otherwise, the dip might get freed before we're - * done setting it up. 
- * - * We transfer the initial reference to the last bio, so we - * don't need to increment the reference count for the last one. - */ - if (submit_len > 0) { - refcount_inc(&dip->refs); - /* - * If we are submitting more than one bio, submit them - * all asynchronously. The exception is RAID 5 or 6, as - * asynchronous checksums make it difficult to collect - * full stripe writes. - */ - if (!raid56) - async_submit = 1; - } - btrfs_submit_dio_bio(bio, BTRFS_I(inode), file_offset, async_submit); + btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private); + bbio->file_offset = file_offset; - dio_data->submitted += clone_len; - clone_offset += clone_len; - start_sector += clone_len >> 9; - file_offset += clone_len; - - free_extent_map(em); - } while (submit_len > 0); - return; + dip->file_offset = file_offset; + dip->bytes = bio->bi_iter.bi_size; -out_err_em: - free_extent_map(em); -out_err: - dio_bio->bi_status = status; - btrfs_dio_private_put(dip); + dio_data->submitted += bio->bi_iter.bi_size; + btrfs_submit_bio(bio, 0); } static const struct iomap_ops btrfs_dio_iomap_ops = { @@ -8157,25 +7809,30 @@ static const struct iomap_ops btrfs_dio_iomap_ops = { }; static const struct iomap_dio_ops btrfs_dio_ops = { - .submit_io = btrfs_submit_direct, + .submit_io = btrfs_dio_submit_io, .bio_set = &btrfs_dio_bioset, }; ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) { - struct btrfs_dio_data data; + struct btrfs_dio_data data = { 0 }; return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, IOMAP_DIO_PARTIAL, &data, done_before); } struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, + struct btrfs_ordered_extent **ordered_extent, size_t done_before) { - struct btrfs_dio_data data; + struct btrfs_dio_data dio_data = { .ordered = *ordered_extent }; + struct iomap_dio *dio; - return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, &data, done_before); + dio = __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, &dio_data, done_before); + if (!IS_ERR_OR_NULL(dio)) + *ordered_extent = dio_data.ordered; + return dio; } static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -8552,7 +8209,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) unlock_extent(io_tree, page_start, page_end, &cached_state); unlock_page(page); up_read(&BTRFS_I(inode)->i_mmap_lock); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } @@ -8850,7 +8507,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_log_commit = 0; spin_lock_init(&ei->lock); - spin_lock_init(&ei->io_failure_lock); ei->outstanding_extents = 0; if (sb->s_magic != BTRFS_TEST_MAGIC) btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, @@ -8870,7 +8526,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->io_tree.inode = ei; extent_io_tree_init(fs_info, &ei->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); - ei->io_failure_tree = RB_ROOT; atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); @@ -8994,7 +8649,7 @@ int __init btrfs_init_cachep(void) goto fail; if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_dio_private, bio), + offsetof(struct btrfs_dio_private, bbio.bio), BIOSET_NEED_BVECS)) goto fail; @@ -10289,65 +9944,13 @@ struct btrfs_encoded_read_private { wait_queue_head_t 
wait; atomic_t pending; blk_status_t status; - bool skip_csum; }; -static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, - struct bio *bio, int mirror_num) -{ - struct btrfs_encoded_read_private *priv = btrfs_bio(bio)->private; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - blk_status_t ret; - - if (!priv->skip_csum) { - ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); - if (ret) - return ret; - } - - atomic_inc(&priv->pending); - btrfs_submit_bio(fs_info, bio, mirror_num); - return BLK_STS_OK; -} - -static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) -{ - const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK); - struct btrfs_encoded_read_private *priv = bbio->private; - struct btrfs_inode *inode = priv->inode; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - u32 sectorsize = fs_info->sectorsize; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - u32 bio_offset = 0; - - if (priv->skip_csum || !uptodate) - return bbio->bio.bi_status; - - bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { - unsigned int i, nr_sectors, pgoff; - - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); - pgoff = bvec->bv_offset; - for (i = 0; i < nr_sectors; i++) { - ASSERT(pgoff < PAGE_SIZE); - if (btrfs_check_data_csum(inode, bbio, bio_offset, - bvec->bv_page, pgoff)) - return BLK_STS_IOERR; - bio_offset += sectorsize; - pgoff += sectorsize; - } - } - return BLK_STS_OK; -} - static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) { struct btrfs_encoded_read_private *priv = bbio->private; - blk_status_t status; - status = btrfs_encoded_read_verify_csum(bbio); - if (status) { + if (bbio->bio.bi_status) { /* * The memory barrier implied by the atomic_dec_return() here * pairs with the memory barrier implied by the @@ -10356,11 +9959,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) * write is observed before the load of status in * btrfs_encoded_read_regular_fill_pages(). */ - WRITE_ONCE(priv->status, status); + WRITE_ONCE(priv->status, bbio->bio.bi_status); } if (!atomic_dec_return(&priv->pending)) wake_up(&priv->wait); - btrfs_bio_free_csum(bbio); bio_put(&bbio->bio); } @@ -10368,47 +9970,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 disk_io_size, struct page **pages) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_encoded_read_private priv = { .inode = inode, .file_offset = file_offset, .pending = ATOMIC_INIT(1), - .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM), }; unsigned long i = 0; u64 cur = 0; - int ret; init_waitqueue_head(&priv.wait); - /* - * Submit bios for the extent, splitting due to bio or stripe limits as - * necessary. - */ + /* Submit bios for the extent, splitting due to bio limits as necessary. 
*/ while (cur < disk_io_size) { - struct extent_map *em; - struct btrfs_io_geometry geom; struct bio *bio = NULL; - u64 remaining; + u64 remaining = disk_io_size - cur; - em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur, - disk_io_size - cur); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - } else { - ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ, - disk_bytenr + cur, &geom); - free_extent_map(em); - } - if (ret) { - WRITE_ONCE(priv.status, errno_to_blk_status(ret)); - break; - } - remaining = min(geom.len, disk_io_size - cur); while (bio || remaining) { size_t bytes = min_t(u64, remaining, PAGE_SIZE); if (!bio) { bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, + inode, btrfs_encoded_read_endio, &priv); bio->bi_iter.bi_sector = @@ -10417,14 +9998,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, if (!bytes || bio_add_page(bio, pages[i], bytes, 0) < bytes) { - blk_status_t status; - - status = submit_encoded_read_bio(inode, bio, 0); - if (status) { - WRITE_ONCE(priv.status, status); - bio_put(bio); - goto out; - } + atomic_inc(&priv.pending); + btrfs_submit_bio(bio, 0); bio = NULL; continue; } @@ -10435,7 +10010,6 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, } } -out: if (atomic_dec_return(&priv.pending)) io_wait_event(priv.wait, !atomic_read(&priv.pending)); /* See btrfs_encoded_read_endio() for ordering. */ @@ -10995,9 +10569,8 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, return 0; max_pages = sis->max - bsi->nr_pages; - first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; - next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, - PAGE_SIZE) >> PAGE_SHIFT; + first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT; + next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT; if (first_ppage >= next_ppage) return 0; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c232636ecdfe..f9dbf41fc803 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -707,7 +707,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, * exists). */ btrfs_tree_lock(leaf); - btrfs_clean_tree_block(leaf); + btrfs_clear_buffer_dirty(trans, leaf); btrfs_tree_unlock(leaf); btrfs_free_tree_block(trans, objectid, leaf, 0, 1); free_extent_buffer(leaf); @@ -2859,6 +2859,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, di_args->bytes_used = btrfs_device_get_bytes_used(dev); di_args->total_bytes = btrfs_device_get_total_bytes(dev); memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); + memcpy(di_args->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE); if (dev->name) strscpy(di_args->path, btrfs_dev_name(dev), sizeof(di_args->path)); else diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c new file mode 100644 index 000000000000..0fe0ae54ac67 --- /dev/null +++ b/fs/btrfs/lru_cache.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "lru_cache.h" +#include "messages.h" + +/* + * Initialize a cache object. + * + * @cache: The cache. + * @max_size: Maximum size (number of entries) for the cache. + * Use 0 for unlimited size, it's the user's responsability to + * trim the cache in that case. 
+ */ +void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size) +{ + INIT_LIST_HEAD(&cache->lru_list); + mt_init(&cache->entries); + cache->size = 0; + cache->max_size = max_size; +} + +static struct btrfs_lru_cache_entry *match_entry(struct list_head *head, u64 key, + u64 gen) +{ + struct btrfs_lru_cache_entry *entry; + + list_for_each_entry(entry, head, list) { + if (entry->key == key && entry->gen == gen) + return entry; + } + + return NULL; +} + +/* + * Lookup for an entry in the cache. + * + * @cache: The cache. + * @key: The key of the entry we are looking for. + * @gen: Generation associated to the key. + * + * Returns the entry associated with the key or NULL if none found. + */ +struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache, + u64 key, u64 gen) +{ + struct list_head *head; + struct btrfs_lru_cache_entry *entry; + + head = mtree_load(&cache->entries, key); + if (!head) + return NULL; + + entry = match_entry(head, key, gen); + if (entry) + list_move_tail(&entry->lru_list, &cache->lru_list); + + return entry; +} + +/* + * Remove an entry from the cache. + * + * @cache: The cache to remove from. + * @entry: The entry to remove from the cache. + * + * Note: this also frees the memory used by the entry. + */ +void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *entry) +{ + struct list_head *prev = entry->list.prev; + + ASSERT(cache->size > 0); + ASSERT(!mtree_empty(&cache->entries)); + + list_del(&entry->list); + list_del(&entry->lru_list); + + if (list_empty(prev)) { + struct list_head *head; + + /* + * If previous element in the list entry->list is now empty, it + * means it's a head entry not pointing to any cached entries, + * so remove it from the maple tree and free it. + */ + head = mtree_erase(&cache->entries, entry->key); + ASSERT(head == prev); + kfree(head); + } + + kfree(entry); + cache->size--; +} + +/* + * Store an entry in the cache. + * + * @cache: The cache. + * @entry: The entry to store. + * + * Returns 0 on success and < 0 on error. + */ +int btrfs_lru_cache_store(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *new_entry, + gfp_t gfp) +{ + const u64 key = new_entry->key; + struct list_head *head; + int ret; + + head = kmalloc(sizeof(*head), gfp); + if (!head) + return -ENOMEM; + + ret = mtree_insert(&cache->entries, key, head, gfp); + if (ret == 0) { + INIT_LIST_HEAD(head); + list_add_tail(&new_entry->list, head); + } else if (ret == -EEXIST) { + kfree(head); + head = mtree_load(&cache->entries, key); + ASSERT(head != NULL); + if (match_entry(head, key, new_entry->gen) != NULL) + return -EEXIST; + list_add_tail(&new_entry->list, head); + } else if (ret < 0) { + kfree(head); + return ret; + } + + if (cache->max_size > 0 && cache->size == cache->max_size) { + struct btrfs_lru_cache_entry *lru_entry; + + lru_entry = list_first_entry(&cache->lru_list, + struct btrfs_lru_cache_entry, + lru_list); + btrfs_lru_cache_remove(cache, lru_entry); + } + + list_add_tail(&new_entry->lru_list, &cache->lru_list); + cache->size++; + + return 0; +} + +/* + * Empty a cache. + * + * @cache: The cache to empty. + * + * Removes all entries from the cache. 
+ */ +void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache) +{ + struct btrfs_lru_cache_entry *entry; + struct btrfs_lru_cache_entry *tmp; + + list_for_each_entry_safe(entry, tmp, &cache->lru_list, lru_list) + btrfs_lru_cache_remove(cache, entry); + + ASSERT(cache->size == 0); + ASSERT(mtree_empty(&cache->entries)); +} diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h new file mode 100644 index 000000000000..de3e18bce24a --- /dev/null +++ b/fs/btrfs/lru_cache.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_LRU_CACHE_H +#define BTRFS_LRU_CACHE_H + +#include +#include + +/* + * A cache entry. This is meant to be embedded in a structure of a user of + * this module. Similar to how struct list_head and struct rb_node are used. + * + * Note: it should be embedded as the first element in a struct (offset 0), and + * this module assumes it was allocated with kmalloc(), so it calls kfree() when + * it needs to free an entry. + */ +struct btrfs_lru_cache_entry { + struct list_head lru_list; + u64 key; + /* + * Optional generation associated to a key. Use 0 if not needed/used. + * Entries with the same key and different generations are stored in a + * linked list, so use this only for cases where there's a small number + * of different generations. + */ + u64 gen; + /* + * The maple tree uses unsigned long type for the keys, which is 32 bits + * on 32 bits systems, and 64 bits on 64 bits systems. So if we want to + * use something like inode numbers as keys, which are always a u64, we + * have to deal with this in a special way - we store the key in the + * entry itself, as a u64, and the values inserted into the maple tree + * are linked lists of entries - so in case we are on a 64 bits system, + * that list always has a single entry, while on 32 bits systems it + * may have more than one, with each entry having the same value for + * their lower 32 bits of the u64 key. + */ + struct list_head list; +}; + +struct btrfs_lru_cache { + struct list_head lru_list; + struct maple_tree entries; + /* Number of entries stored in the cache. */ + unsigned int size; + /* Maximum number of entries the cache can have. 
*/ + unsigned int max_size; +}; + +#define btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp) \ + list_for_each_entry_safe_reverse((entry), (tmp), &(cache)->lru_list, lru_list) + +static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *cache) +{ + return cache->size; +} + +static inline bool btrfs_lru_cache_is_full(const struct btrfs_lru_cache *cache) +{ + return cache->size >= cache->max_size; +} + +static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry( + struct btrfs_lru_cache *cache) +{ + return list_first_entry_or_null(&cache->lru_list, + struct btrfs_lru_cache_entry, lru_list); +} + +void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size); +struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache, + u64 key, u64 gen); +int btrfs_lru_cache_store(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *new_entry, + gfp_t gfp); +void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *entry); +void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache); + +#endif diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index d5e78cbc8fbc..71f6d8302d50 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -280,7 +280,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, } /* Check if we have reached page boundary */ - if (IS_ALIGNED(cur_in, PAGE_SIZE)) { + if (PAGE_ALIGNED(cur_in)) { put_page(page_in); page_in = NULL; } diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 625bbbbb2608..fde5aaa6e7c9 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -292,36 +292,6 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) } #endif -/* - * We only mark the transaction aborted and then set the file system read-only. - * This will prevent new transactions from starting or trying to join this - * one. - * - * This means that error recovery at the call site is limited to freeing - * any local memory allocations and passing the error code up without - * further cleanup. The transaction should complete as it normally would - * in the call path but will return -EIO. - * - * We'll complete the cleanup in btrfs_end_transaction and - * btrfs_commit_transaction. - */ -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - - WRITE_ONCE(trans->aborted, errno); - WRITE_ONCE(trans->transaction->aborted, errno); - if (first_hit && errno == -ENOSPC) - btrfs_dump_space_info_for_trans_abort(fs_info); - /* Wake up anybody who may be waiting on this transaction */ - wake_up(&fs_info->transaction_wait); - wake_up(&fs_info->transaction_blocked_wait); - __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); -} - /* * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an * alert, and either panics or BUGs, depending on mount options. diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 190af1f698d9..8c516ee58ff9 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -6,7 +6,6 @@ #include struct btrfs_fs_info; -struct btrfs_trans_handle; static inline __printf(2, 3) __cold void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) 
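/*
 * A minimal usage sketch of the btrfs_lru_cache API introduced above in
 * lru_cache.{c,h}. Illustrative only, not part of this series: struct
 * my_cached_item, its key/gen values and example() are hypothetical. It shows
 * the documented pattern of embedding struct btrfs_lru_cache_entry as the
 * first member of a kmalloc()'ed object and recovering the containing object
 * with container_of() after a lookup.
 */
#include <linux/slab.h>
#include "lru_cache.h"

struct my_cached_item {
	struct btrfs_lru_cache_entry entry;	/* must be at offset 0 */
	int payload;
};

static void example(void)
{
	struct btrfs_lru_cache cache;
	struct btrfs_lru_cache_entry *raw;
	struct my_cached_item *item;

	/* Cache at most 64 entries; the least recently used one is evicted on overflow. */
	btrfs_lru_cache_init(&cache, 64);

	/* Entries must be kmalloc()'ed, since the cache frees them with kfree(). */
	item = kmalloc(sizeof(*item), GFP_KERNEL);
	if (!item)
		return;
	item->entry.key = 257;		/* e.g. an inode number */
	item->entry.gen = 1;		/* generation, or 0 if unused */
	item->payload = 42;
	if (btrfs_lru_cache_store(&cache, &item->entry, GFP_KERNEL) < 0)
		kfree(item);		/* on failure the cache does not own the entry */

	raw = btrfs_lru_cache_lookup(&cache, 257, 1);
	if (raw)
		item = container_of(raw, struct my_cached_item, entry);

	/* Removes and kfree()s every remaining entry. */
	btrfs_lru_cache_clear(&cache);
}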
@@ -178,39 +177,6 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function const char * __attribute_const__ btrfs_decode_error(int errno); -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit); - -bool __cold abort_should_print_stack(int errno); - -/* - * Call btrfs_abort_transaction as early as possible when an error condition is - * detected, that way the exact stack trace is reported for some errors. - */ -#define btrfs_abort_transaction(trans, errno) \ -do { \ - bool first = false; \ - /* Report first abort since mount */ \ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ - &((trans)->fs_info->fs_state))) { \ - first = true; \ - if (WARN(abort_should_print_stack(errno), \ - KERN_ERR \ - "BTRFS: Transaction aborted (error %d)\n", \ - (errno))) { \ - /* Stack trace printed. */ \ - } else { \ - btrfs_err((trans)->fs_info, \ - "Transaction aborted (error %d)", \ - (errno)); \ - } \ - } \ - __btrfs_abort_transaction((trans), __func__, \ - __LINE__, (errno), first); \ -} while (0) - #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ (errno), fmt, ##args) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 57d8c72737e1..1848d0d1a9c4 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -160,14 +160,16 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, * @compress_type: Compression algorithm used for data. * * Most of these parameters correspond to &struct btrfs_file_extent_item. The - * tree is given a single reference on the ordered extent that was inserted. + * tree is given a single reference on the ordered extent that was inserted, and + * the returned pointer is given a second reference. * - * Return: 0 or -ENOMEM. + * Return: the new ordered extent or ERR_PTR(-ENOMEM). */ -int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, - u64 disk_num_bytes, u64 offset, unsigned flags, - int compress_type) +struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( + struct btrfs_inode *inode, u64 file_offset, + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, + u64 disk_num_bytes, u64 offset, unsigned long flags, + int compress_type) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -181,7 +183,7 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, /* For nocow write, we can release the qgroup rsv right now */ ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); if (ret < 0) - return ret; + return ERR_PTR(ret); ret = 0; } else { /* @@ -190,11 +192,11 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, */ ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); if (ret < 0) - return ret; + return ERR_PTR(ret); } entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); if (!entry) - return -ENOMEM; + return ERR_PTR(-ENOMEM); entry->file_offset = file_offset; entry->num_bytes = num_bytes; @@ -256,6 +258,32 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, btrfs_mod_outstanding_extents(inode, 1); spin_unlock(&inode->lock); + /* One ref for the returned entry to match semantics of lookup. 
*/ + refcount_inc(&entry->refs); + + return entry; +} + +/* + * Add a new btrfs_ordered_extent for the range, but drop the reference instead + * of returning it to the caller. + */ +int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, + u64 disk_num_bytes, u64 offset, unsigned long flags, + int compress_type) +{ + struct btrfs_ordered_extent *ordered; + + ordered = btrfs_alloc_ordered_extent(inode, file_offset, num_bytes, + ram_bytes, disk_bytenr, + disk_num_bytes, offset, flags, + compress_type); + + if (IS_ERR(ordered)) + return PTR_ERR(ordered); + btrfs_put_ordered_extent(ordered); + return 0; } @@ -616,7 +644,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; ordered = container_of(work, struct btrfs_ordered_extent, flush_work); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); complete(&ordered->completion); } @@ -716,13 +744,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, } /* - * Used to start IO or wait for a given ordered extent to finish. + * Start IO and wait for a given ordered extent to finish. * - * If wait is one, this effectively waits on page writeback for all the pages - * in the extent, and it waits on the io completion code to insert - * metadata into the btree corresponding to the extent + * Wait on page writeback for all the pages in the extent and the IO completion + * code to insert metadata into the btree corresponding to the extent. */ -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; @@ -744,12 +771,10 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) */ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); - if (wait) { - if (!freespace_inode) - btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); - wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, - &entry->flags)); - } + + if (!freespace_inode) + btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); + wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); } /* @@ -800,7 +825,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) btrfs_put_ordered_extent(ordered); break; } - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); end = ordered->file_offset; /* * If the ordered extent had an error save the error but don't @@ -1061,7 +1086,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, break; } unlock_extent(&inode->io_tree, start, end, cachedp); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 89f82b78f590..18007f9c00ad 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -157,7 +157,6 @@ struct btrfs_ordered_extent { * command in a workqueue context */ u64 physical; - struct block_device *bdev; }; static inline void @@ -179,15 +178,20 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); +struct btrfs_ordered_extent 
*btrfs_alloc_ordered_extent( + struct btrfs_inode *inode, u64 file_offset, + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, + u64 disk_num_bytes, u64 offset, unsigned long flags, + int compress_type); int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, - u64 disk_num_bytes, u64 offset, unsigned flags, + u64 disk_num_bytes, u64 offset, unsigned long flags, int compress_type); void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait); +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry); int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index abf2b7f14307..f41da7ac360d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1304,7 +1304,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) list_del("a_root->dirty_list); btrfs_tree_lock(quota_root->node); - btrfs_clean_tree_block(quota_root->node); + btrfs_clear_buffer_dirty(trans, quota_root->node); btrfs_tree_unlock(quota_root->node); btrfs_free_tree_block(trans, btrfs_root_id(quota_root), quota_root->node, 0, 1); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index ff4b1d583788..642828c1b299 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -998,7 +998,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) } /* - * Return the total numer of errors found in the vertical stripe of @sector_nr. + * Return the total number of errors found in the vertical stripe of @sector_nr. * * @faila and @failb will also be updated to the first and second stripe * number of the errors. @@ -1183,7 +1183,15 @@ static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, trace_info->stripe_nr = -1; } -/* Generate PQ for one veritical stripe. */ +static inline void bio_list_put(struct bio_list *bio_list) +{ + struct bio *bio; + + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); +} + +/* Generate PQ for one vertical stripe. */ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) { void **pointers = rbio->finish_pointers; @@ -1228,7 +1236,6 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { - struct bio *bio; /* The total sector number inside the full stripe. */ int total_sector_nr; int sectornr; @@ -1317,8 +1324,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, return 0; error: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); + bio_list_put(bio_list); return -EIO; } @@ -1357,7 +1363,7 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) } /* - * For subpage case, we can no longer set page Uptodate directly for + * For subpage case, we can no longer set page Up-to-date directly for * stripe_pages[], thus we need to locate the sector. 
*/ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, @@ -1425,10 +1431,9 @@ static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bi int total_sector_nr = get_bio_sector_nr(rbio, bio); u32 bio_size = 0; struct bio_vec *bvec; - struct bvec_iter_all iter_all; int i; - bio_for_each_segment_all(bvec, bio, iter_all) + bio_for_each_bvec_all(bvec, bio, i) bio_size += bvec->bv_len; /* @@ -1498,7 +1503,7 @@ static void raid_wait_read_end_io(struct bio *bio) wake_up(&rbio->io_wait); } -static void submit_read_bios(struct btrfs_raid_bio *rbio, +static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { struct bio *bio; @@ -1515,41 +1520,8 @@ static void submit_read_bios(struct btrfs_raid_bio *rbio, } submit_bio(bio); } -} - -static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list) -{ - struct bio *bio; - int total_sector_nr; - int ret = 0; - - ASSERT(bio_list_size(bio_list) == 0); - - /* - * Build a list of bios to read all sectors (including data and P/Q). - * - * This behaviro is to compensate the later csum verification and - * recovery. - */ - for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; - total_sector_nr++) { - struct sector_ptr *sector; - int stripe = total_sector_nr / rbio->stripe_nsectors; - int sectornr = total_sector_nr % rbio->stripe_nsectors; - - sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, bio_list, sector, - stripe, sectornr, REQ_OP_READ); - if (ret) - goto cleanup; - } - return 0; -cleanup: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); - return ret; + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); } static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) @@ -1668,12 +1640,12 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; - int ret = 0; rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { - ret = PTR_ERR(rbio); - goto fail; + bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); + bio_endio(bio); + return; } rbio->operation = BTRFS_RBIO_WRITE; rbio_add_bio(rbio, bio); @@ -1682,31 +1654,24 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) * Don't plug on full rbios, just get them out the door * as quickly as we can */ - if (rbio_is_full(rbio)) - goto queue_rbio; - - cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); - if (cb) { - plug = container_of(cb, struct btrfs_plug_cb, cb); - if (!plug->info) { - plug->info = fs_info; - INIT_LIST_HEAD(&plug->rbio_list); + if (!rbio_is_full(rbio)) { + cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); + if (cb) { + plug = container_of(cb, struct btrfs_plug_cb, cb); + if (!plug->info) { + plug->info = fs_info; + INIT_LIST_HEAD(&plug->rbio_list); + } + list_add_tail(&rbio->plug_list, &plug->rbio_list); + return; } - list_add_tail(&rbio->plug_list, &plug->rbio_list); - return; } -queue_rbio: + /* * Either we don't have any existing plug, or we're doing a full stripe, - * can queue the rmw work now. + * queue the rmw work now. 
*/ start_async_work(rbio, rmw_rbio_work); - - return; - -fail: - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); } static int verify_one_sector(struct btrfs_raid_bio *rbio, @@ -1773,7 +1738,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, &failb); /* - * No errors in the veritical stripe, skip it. Can happen for recovery + * No errors in the vertical stripe, skip it. Can happen for recovery * which only part of a stripe failed csum check. */ if (!found_errors) @@ -1949,14 +1914,25 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) return ret; } -static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list) +static void recover_rbio(struct btrfs_raid_bio *rbio) { - struct bio *bio; + struct bio_list bio_list = BIO_EMPTY_LIST; int total_sector_nr; int ret = 0; - ASSERT(bio_list_size(bio_list) == 0); + /* + * Either we're doing recover for a read failure or degraded write, + * caller should have set error bitmap correctly. + */ + ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); + + /* For recovery, we need to read all sectors including P/Q. */ + ret = alloc_rbio_pages(rbio); + if (ret < 0) + goto out; + + index_rbio_pages(rbio); + /* * Read everything that hasn't failed. However this time we will * not trust any cached sector. @@ -1987,78 +1963,32 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, } sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, sectornr, REQ_OP_READ); - if (ret < 0) - goto error; + if (ret < 0) { + bio_list_put(&bio_list); + goto out; + } } - return 0; -error: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); - - return -EIO; -} - -static int recover_rbio(struct btrfs_raid_bio *rbio) -{ - struct bio_list bio_list; - struct bio *bio; - int ret; - - /* - * Either we're doing recover for a read failure or degraded write, - * caller should have set error bitmap correctly. - */ - ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); - bio_list_init(&bio_list); - - /* For recovery, we need to read all sectors including P/Q. 
*/ - ret = alloc_rbio_pages(rbio); - if (ret < 0) - goto out; - - index_rbio_pages(rbio); - - ret = recover_assemble_read_bios(rbio, &bio_list); - if (ret < 0) - goto out; - - submit_read_bios(rbio, &bio_list); - wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + submit_read_wait_bio_list(rbio, &bio_list); ret = recover_sectors(rbio); - out: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return ret; + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void recover_rbio_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; - int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = lock_stripe_add(rbio); - if (ret == 0) { - ret = recover_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } + if (!lock_stripe_add(rbio)) + recover_rbio(rbio); } static void recover_rbio_work_locked(struct work_struct *work) { - struct btrfs_raid_bio *rbio; - int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = recover_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + recover_rbio(container_of(work, struct btrfs_raid_bio, work)); } static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) @@ -2204,11 +2134,9 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio) static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) { - struct bio_list bio_list; - struct bio *bio; - int ret; - - bio_list_init(&bio_list); + struct bio_list bio_list = BIO_EMPTY_LIST; + int total_sector_nr; + int ret = 0; /* * Fill the data csums we need for data verification. We need to fill @@ -2217,24 +2145,32 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) */ fill_data_csums(rbio); - ret = rmw_assemble_read_bios(rbio, &bio_list); - if (ret < 0) - goto out; + /* + * Build a list of bios to read all sectors (including data and P/Q). + * + * This behavior is to compensate the later csum verification and recovery. + */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; - submit_read_bios(rbio, &bio_list); - wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + sector = rbio_stripe_sector(rbio, stripe, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, + stripe, sectornr, REQ_OP_READ); + if (ret) { + bio_list_put(&bio_list); + return ret; + } + } /* * We may or may not have any corrupted sectors (including missing dev * and csum mismatch), just let recover_sectors() to handle them all. */ - ret = recover_sectors(rbio); - return ret; -out: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return ret; + submit_read_wait_bio_list(rbio, &bio_list); + return recover_sectors(rbio); } static void raid_wait_write_end_io(struct bio *bio) @@ -2290,7 +2226,7 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) return false; } -static int rmw_rbio(struct btrfs_raid_bio *rbio) +static void rmw_rbio(struct btrfs_raid_bio *rbio) { struct bio_list bio_list; int sectornr; @@ -2302,30 +2238,28 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) */ ret = alloc_rbio_parity_pages(rbio); if (ret < 0) - return ret; + goto out; /* * Either full stripe write, or we have every data sector already * cached, can go to write path immediately. 
*/ - if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio)) - goto write; - - /* - * Now we're doing sub-stripe write, also need all data stripes to do - * the full RMW. - */ - ret = alloc_rbio_data_pages(rbio); - if (ret < 0) - return ret; + if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) { + /* + * Now we're doing sub-stripe write, also need all data stripes + * to do the full RMW. + */ + ret = alloc_rbio_data_pages(rbio); + if (ret < 0) + goto out; - index_rbio_pages(rbio); + index_rbio_pages(rbio); - ret = rmw_read_wait_recover(rbio); - if (ret < 0) - return ret; + ret = rmw_read_wait_recover(rbio); + if (ret < 0) + goto out; + } -write: /* * At this stage we're not allowed to add any new bios to the * bio list any more, anyone else that wants to change this stripe @@ -2356,7 +2290,7 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) bio_list_init(&bio_list); ret = rmw_assemble_write_bios(rbio, &bio_list); if (ret < 0) - return ret; + goto out; /* We should have at least one bio assembled. */ ASSERT(bio_list_size(&bio_list)); @@ -2373,32 +2307,22 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) break; } } - return ret; +out: + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void rmw_rbio_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; - int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = lock_stripe_add(rbio); - if (ret == 0) { - ret = rmw_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } + if (lock_stripe_add(rbio) == 0) + rmw_rbio(rbio); } static void rmw_rbio_work_locked(struct work_struct *work) { - struct btrfs_raid_bio *rbio; - int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = rmw_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + rmw_rbio(container_of(work, struct btrfs_raid_bio, work)); } /* @@ -2506,7 +2430,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) struct sector_ptr p_sector = { 0 }; struct sector_ptr q_sector = { 0 }; struct bio_list bio_list; - struct bio *bio; int is_replace = 0; int ret; @@ -2637,8 +2560,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) return 0; cleanup: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); + bio_list_put(&bio_list); return ret; } @@ -2733,15 +2655,12 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) return ret; } -static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list) +static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) { - struct bio *bio; + struct bio_list bio_list = BIO_EMPTY_LIST; int total_sector_nr; int ret = 0; - ASSERT(bio_list_size(bio_list) == 0); - /* Build a list of bios to read all the missing parts. 
*/ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { @@ -2770,45 +2689,38 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, if (sector->uptodate) continue; - ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, sectornr, REQ_OP_READ); - if (ret) - goto error; + if (ret) { + bio_list_put(&bio_list); + return ret; + } } + + submit_read_wait_bio_list(rbio, &bio_list); return 0; -error: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); - return ret; } -static int scrub_rbio(struct btrfs_raid_bio *rbio) +static void scrub_rbio(struct btrfs_raid_bio *rbio) { bool need_check = false; - struct bio_list bio_list; int sector_nr; int ret; - struct bio *bio; - - bio_list_init(&bio_list); ret = alloc_rbio_essential_pages(rbio); if (ret) - goto cleanup; + goto out; bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); - ret = scrub_assemble_read_bios(rbio, &bio_list); + ret = scrub_assemble_read_bios(rbio); if (ret < 0) - goto cleanup; - - submit_read_bios(rbio, &bio_list); - wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + goto out; /* We may have some failures, recover the failed sectors first. */ ret = recover_scrub_rbio(rbio); if (ret < 0) - goto cleanup; + goto out; /* * We have every sector properly prepared. Can finish the scrub @@ -2825,23 +2737,13 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) break; } } - return ret; - -cleanup: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return ret; +out: + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void scrub_rbio_work_locked(struct work_struct *work) { - struct btrfs_raid_bio *rbio; - int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); - ret = scrub_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + scrub_rbio(container_of(work, struct btrfs_raid_bio, work)); } void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 7c73a443939e..df0e0abdeb1f 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -65,7 +65,7 @@ struct btrfs_raid_bio { /* Number of data stripes (no p/q) */ u8 nr_data; - /* Numer of all stripes (including P/Q) */ + /* Number of all stripes (including P/Q) */ u8 real_stripes; /* How many pages there are for each stripe */ @@ -132,7 +132,7 @@ struct btrfs_raid_bio { /* * Checksum buffer if the rbio is for data. The buffer should cover - * all data sectors (exlcuding P/Q sectors). + * all data sectors (excluding P/Q sectors). */ u8 *csum_buf; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 31ec4a7658ce..ef13a9d4e370 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2825,7 +2825,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( * * Here we have to manually invalidate the range (i_size, PAGE_END + 1). 
*/ - if (!IS_ALIGNED(i_size, PAGE_SIZE)) { + if (!PAGE_ALIGNED(i_size)) { struct address_space *mapping = inode->vfs_inode.i_mapping; struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a5d026041be4..69c93ae333f6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -229,7 +229,7 @@ struct full_stripe_lock { }; #ifndef CONFIG_64BIT -/* This structure is for archtectures whose (void *) is smaller than u64 */ +/* This structure is for architectures whose (void *) is smaller than u64 */ struct scrub_page_private { u64 logical; }; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d50182b6deec..e5c963bb873d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -32,6 +32,7 @@ #include "file-item.h" #include "ioctl.h" #include "verity.h" +#include "lru_cache.h" /* * Maximum number of references an extent can have in order for us to attempt to @@ -80,23 +81,23 @@ struct clone_root { bool found_ref; }; -#define SEND_CTX_MAX_NAME_CACHE_SIZE 128 -#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) +#define SEND_MAX_NAME_CACHE_SIZE 256 /* - * Limit the root_ids array of struct backref_cache_entry to 12 elements. - * This makes the size of a cache entry to be exactly 128 bytes on x86_64. + * Limit the root_ids array of struct backref_cache_entry to 17 elements. + * This makes the size of a cache entry to be exactly 192 bytes on x86_64, which + * can be satisfied from the kmalloc-192 slab, without wasting any space. * The most common case is to have a single root for cloning, which corresponds - * to the send root. Having the user specify more than 11 clone roots is not + * to the send root. Having the user specify more than 16 clone roots is not * common, and in such rare cases we simply don't use caching if the number of - * cloning roots that lead down to a leaf is more than 12. + * cloning roots that lead down to a leaf is more than 17. */ -#define SEND_MAX_BACKREF_CACHE_ROOTS 12 +#define SEND_MAX_BACKREF_CACHE_ROOTS 17 /* * Max number of entries in the cache. - * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding - * maple tree's internal nodes, is 16K. + * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding + * maple tree's internal nodes, is 24K. */ #define SEND_MAX_BACKREF_CACHE_SIZE 128 @@ -107,15 +108,31 @@ struct clone_root { * x86_64). */ struct backref_cache_entry { - /* List to link to the cache's lru list. */ - struct list_head list; - /* The key for this entry in the cache. */ - u64 key; + struct btrfs_lru_cache_entry entry; u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS]; /* Number of valid elements in the root_ids array. */ int num_roots; }; +/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ +static_assert(offsetof(struct backref_cache_entry, entry) == 0); + +/* + * Max number of entries in the cache that stores directories that were already + * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses + * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but + * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). + */ +#define SEND_MAX_DIR_CREATED_CACHE_SIZE 64 + +/* + * Max number of entries in the cache that stores directories that were already + * created. 
The cache uses raw struct btrfs_lru_cache_entry entries, so it uses + * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but + * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). + */ +#define SEND_MAX_DIR_UTIMES_CACHE_SIZE 64 + struct send_ctx { struct file *send_filp; loff_t send_off; @@ -174,9 +191,7 @@ struct send_ctx { struct list_head new_refs; struct list_head deleted_refs; - struct radix_tree_root name_cache; - struct list_head name_cache_list; - int name_cache_size; + struct btrfs_lru_cache name_cache; /* * The inode we are currently processing. It's not NULL only when we @@ -285,13 +300,11 @@ struct send_ctx { struct rb_root rbtree_new_refs; struct rb_root rbtree_deleted_refs; - struct { - u64 last_reloc_trans; - struct list_head lru_list; - struct maple_tree entries; - /* Number of entries stored in the cache. */ - int size; - } backref_cache; + struct btrfs_lru_cache backref_cache; + u64 backref_cache_last_reloc_trans; + + struct btrfs_lru_cache dir_created_cache; + struct btrfs_lru_cache dir_utimes_cache; }; struct pending_dir_move { @@ -321,21 +334,15 @@ struct orphan_dir_info { u64 ino; u64 gen; u64 last_dir_index_offset; + u64 dir_high_seq_ino; }; struct name_cache_entry { - struct list_head list; /* - * radix_tree has only 32bit entries but we need to handle 64bit inums. - * We use the lower 32bit of the 64bit inum to store it in the tree. If - * more then one inum would fall into the same entry, we use radix_list - * to store the additional entries. radix_list is also used to store - * entries where two entries have the same inum but different - * generations. + * The key in the entry is an inode number, and the generation matches + * the inode's generation. */ - struct list_head radix_list; - u64 ino; - u64 gen; + struct btrfs_lru_cache_entry entry; u64 parent_ino; u64 parent_gen; int ret; @@ -344,6 +351,9 @@ struct name_cache_entry { char name[]; }; +/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. 
*/ +static_assert(offsetof(struct name_cache_entry, entry) == 0); + #define ADVANCE 1 #define ADVANCE_ONLY_NEXT -1 @@ -956,14 +966,12 @@ static int get_inode_info(struct btrfs_root *root, u64 ino, static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) { int ret; - struct btrfs_inode_info info; + struct btrfs_inode_info info = { 0 }; - if (!gen) - return -EPERM; + ASSERT(gen); ret = get_inode_info(root, ino, &info); - if (!ret) - *gen = info.gen; + *gen = info.gen; return ret; } @@ -1388,19 +1396,6 @@ static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id, return 0; } -static void empty_backref_cache(struct send_ctx *sctx) -{ - struct backref_cache_entry *entry; - struct backref_cache_entry *tmp; - - list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list) - kfree(entry); - - INIT_LIST_HEAD(&sctx->backref_cache.lru_list); - mtree_destroy(&sctx->backref_cache.entries); - sctx->backref_cache.size = 0; -} - static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, const u64 **root_ids_ret, int *root_count_ret) { @@ -1408,9 +1403,10 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, struct send_ctx *sctx = bctx->sctx; struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; + struct btrfs_lru_cache_entry *raw_entry; struct backref_cache_entry *entry; - if (sctx->backref_cache.size == 0) + if (btrfs_lru_cache_size(&sctx->backref_cache) == 0) return false; /* @@ -1424,18 +1420,18 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, * transaction handle or holding fs_info->commit_root_sem, so no need * to take any lock here. */ - if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) { - empty_backref_cache(sctx); + if (fs_info->last_reloc_trans > sctx->backref_cache_last_reloc_trans) { + btrfs_lru_cache_clear(&sctx->backref_cache); return false; } - entry = mtree_load(&sctx->backref_cache.entries, key); - if (!entry) + raw_entry = btrfs_lru_cache_lookup(&sctx->backref_cache, key, 0); + if (!raw_entry) return false; + entry = container_of(raw_entry, struct backref_cache_entry, entry); *root_ids_ret = entry->root_ids; *root_count_ret = entry->num_roots; - list_move_tail(&entry->list, &sctx->backref_cache.lru_list); return true; } @@ -1461,7 +1457,8 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, if (!new_entry) return; - new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->entry.gen = 0; new_entry->num_roots = 0; ULIST_ITER_INIT(&uiter); while ((node = ulist_next(root_ids, &uiter)) != NULL) { @@ -1489,23 +1486,12 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, * none of the roots is part of the list of roots from which we are * allowed to clone. Cache the new entry as it's still useful to avoid * backref walking to determine which roots have a path to the leaf. + * + * Also use GFP_NOFS because we're called while holding a transaction + * handle or while holding fs_info->commit_root_sem. 
*/ - - if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) { - struct backref_cache_entry *lru_entry; - struct backref_cache_entry *mt_entry; - - lru_entry = list_first_entry(&sctx->backref_cache.lru_list, - struct backref_cache_entry, list); - mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key); - ASSERT(mt_entry == lru_entry); - list_del(&mt_entry->list); - kfree(mt_entry); - sctx->backref_cache.size--; - } - - ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key, - new_entry, GFP_NOFS); + ret = btrfs_lru_cache_store(&sctx->backref_cache, &new_entry->entry, + GFP_NOFS); ASSERT(ret == 0 || ret == -ENOMEM); if (ret) { /* Caching is optional, no worries. */ @@ -1513,17 +1499,13 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, return; } - list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list); - /* * We are called from iterate_extent_inodes() while either holding a * transaction handle or holding fs_info->commit_root_sem, so no need * to take any lock here. */ - if (sctx->backref_cache.size == 0) - sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans; - - sctx->backref_cache.size++; + if (btrfs_lru_cache_size(&sctx->backref_cache) == 1) + sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans; } static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, @@ -1886,7 +1868,8 @@ enum inode_state { inode_state_did_delete, }; -static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) +static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, + u64 *send_gen, u64 *parent_gen) { int ret; int left_ret; @@ -1900,6 +1883,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) goto out; left_ret = (info.nlink == 0) ? -ENOENT : ret; left_gen = info.gen; + if (send_gen) + *send_gen = ((left_ret == -ENOENT) ? 0 : info.gen); if (!sctx->parent_root) { right_ret = -ENOENT; @@ -1909,6 +1894,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) goto out; right_ret = (info.nlink == 0) ? -ENOENT : ret; right_gen = info.gen; + if (parent_gen) + *parent_gen = ((right_ret == -ENOENT) ? 0 : info.gen); } if (!left_ret && !right_ret) { @@ -1953,14 +1940,15 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) return ret; } -static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen) +static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen, + u64 *send_gen, u64 *parent_gen) { int ret; if (ino == BTRFS_FIRST_FREE_OBJECTID) return 1; - ret = get_cur_inode_state(sctx, ino, gen); + ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen); if (ret < 0) goto out; @@ -2121,43 +2109,36 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, const char *name, int name_len, u64 *who_ino, u64 *who_gen, u64 *who_mode) { - int ret = 0; - u64 gen; + int ret; + u64 parent_root_dir_gen; u64 other_inode = 0; struct btrfs_inode_info info; if (!sctx->parent_root) - goto out; + return 0; - ret = is_inode_existent(sctx, dir, dir_gen); + ret = is_inode_existent(sctx, dir, dir_gen, NULL, &parent_root_dir_gen); if (ret <= 0) - goto out; + return 0; /* * If we have a parent root we need to verify that the parent dir was * not deleted and then re-created, if it was then we have no overwrite * and we can just unlink this entry. + * + * @parent_root_dir_gen was set to 0 if the inode does not exist in the + * parent root. 
*/ - if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { - ret = get_inode_gen(sctx->parent_root, dir, &gen); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { - ret = 0; - goto out; - } - if (gen != dir_gen) - goto out; - } + if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID && + parent_root_dir_gen != dir_gen) + return 0; ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, &other_inode); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { - ret = 0; - goto out; - } + if (ret == -ENOENT) + return 0; + else if (ret < 0) + return ret; /* * Check if the overwritten ref was already processed. If yes, the ref @@ -2168,18 +2149,15 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, is_waiting_for_move(sctx, other_inode)) { ret = get_inode_info(sctx->parent_root, other_inode, &info); if (ret < 0) - goto out; + return ret; - ret = 1; *who_ino = other_inode; *who_gen = info.gen; *who_mode = info.mode; - } else { - ret = 0; + return 1; } -out: - return ret; + return 0; } /* @@ -2194,47 +2172,43 @@ static int did_overwrite_ref(struct send_ctx *sctx, u64 ino, u64 ino_gen, const char *name, int name_len) { - int ret = 0; - u64 gen; + int ret; u64 ow_inode; + u64 ow_gen = 0; + u64 send_root_dir_gen; if (!sctx->parent_root) - goto out; + return 0; - ret = is_inode_existent(sctx, dir, dir_gen); + ret = is_inode_existent(sctx, dir, dir_gen, &send_root_dir_gen, NULL); if (ret <= 0) - goto out; + return ret; - if (dir != BTRFS_FIRST_FREE_OBJECTID) { - ret = get_inode_gen(sctx->send_root, dir, &gen); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { - ret = 0; - goto out; - } - if (gen != dir_gen) - goto out; - } + /* + * @send_root_dir_gen was set to 0 if the inode does not exist in the + * send root. + */ + if (dir != BTRFS_FIRST_FREE_OBJECTID && send_root_dir_gen != dir_gen) + return 0; /* check if the ref was overwritten by another ref */ ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, &ow_inode); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { + if (ret == -ENOENT) { /* was never and will never be overwritten */ - ret = 0; - goto out; + return 0; + } else if (ret < 0) { + return ret; } - ret = get_inode_gen(sctx->send_root, ow_inode, &gen); - if (ret < 0) - goto out; + if (ow_inode == ino) { + ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); + if (ret < 0) + return ret; - if (ow_inode == ino && gen == ino_gen) { - ret = 0; - goto out; + /* It's the same inode, so no overwrite happened. */ + if (ow_gen == ino_gen) + return 0; } /* @@ -2243,15 +2217,20 @@ static int did_overwrite_ref(struct send_ctx *sctx, * inode 'ino' to be orphanized, therefore check if ow_inode matches * the current inode being processed. */ - if ((ow_inode < sctx->send_progress) || - (ino != sctx->cur_ino && ow_inode == sctx->cur_ino && - gen == sctx->cur_inode_gen)) - ret = 1; - else - ret = 0; + if (ow_inode < sctx->send_progress) + return 1; -out: - return ret; + if (ino != sctx->cur_ino && ow_inode == sctx->cur_ino) { + if (ow_gen == 0) { + ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); + if (ret < 0) + return ret; + } + if (ow_gen == sctx->cur_inode_gen) + return 1; + } + + return 0; } /* @@ -2285,113 +2264,16 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) return ret; } -/* - * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, - * so we need to do some special handling in case we have clashes. 
This function - * takes care of this with the help of name_cache_entry::radix_list. - * In case of error, nce is kfreed. - */ -static int name_cache_insert(struct send_ctx *sctx, - struct name_cache_entry *nce) +static inline struct name_cache_entry *name_cache_search(struct send_ctx *sctx, + u64 ino, u64 gen) { - int ret = 0; - struct list_head *nce_head; - - nce_head = radix_tree_lookup(&sctx->name_cache, - (unsigned long)nce->ino); - if (!nce_head) { - nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); - if (!nce_head) { - kfree(nce); - return -ENOMEM; - } - INIT_LIST_HEAD(nce_head); - - ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); - if (ret < 0) { - kfree(nce_head); - kfree(nce); - return ret; - } - } - list_add_tail(&nce->radix_list, nce_head); - list_add_tail(&nce->list, &sctx->name_cache_list); - sctx->name_cache_size++; - - return ret; -} + struct btrfs_lru_cache_entry *entry; -static void name_cache_delete(struct send_ctx *sctx, - struct name_cache_entry *nce) -{ - struct list_head *nce_head; - - nce_head = radix_tree_lookup(&sctx->name_cache, - (unsigned long)nce->ino); - if (!nce_head) { - btrfs_err(sctx->send_root->fs_info, - "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", - nce->ino, sctx->name_cache_size); - } - - list_del(&nce->radix_list); - list_del(&nce->list); - sctx->name_cache_size--; - - /* - * We may not get to the final release of nce_head if the lookup fails - */ - if (nce_head && list_empty(nce_head)) { - radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); - kfree(nce_head); - } -} - -static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, - u64 ino, u64 gen) -{ - struct list_head *nce_head; - struct name_cache_entry *cur; - - nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); - if (!nce_head) + entry = btrfs_lru_cache_lookup(&sctx->name_cache, ino, gen); + if (!entry) return NULL; - list_for_each_entry(cur, nce_head, radix_list) { - if (cur->ino == ino && cur->gen == gen) - return cur; - } - return NULL; -} - -/* - * Remove some entries from the beginning of name_cache_list. - */ -static void name_cache_clean_unused(struct send_ctx *sctx) -{ - struct name_cache_entry *nce; - - if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE) - return; - - while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) { - nce = list_entry(sctx->name_cache_list.next, - struct name_cache_entry, list); - name_cache_delete(sctx, nce); - kfree(nce); - } -} - -static void name_cache_free(struct send_ctx *sctx) -{ - struct name_cache_entry *nce; - - while (!list_empty(&sctx->name_cache_list)) { - nce = list_entry(sctx->name_cache_list.next, - struct name_cache_entry, list); - name_cache_delete(sctx, nce); - kfree(nce); - } + return container_of(entry, struct name_cache_entry, entry); } /* @@ -2410,7 +2292,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, { int ret; int nce_ret; - struct name_cache_entry *nce = NULL; + struct name_cache_entry *nce; /* * First check if we already did a call to this function with the same @@ -2420,17 +2302,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, nce = name_cache_search(sctx, ino, gen); if (nce) { if (ino < sctx->send_progress && nce->need_later_update) { - name_cache_delete(sctx, nce); - kfree(nce); + btrfs_lru_cache_remove(&sctx->name_cache, &nce->entry); nce = NULL; } else { - /* - * Removes the entry from the list and adds it back to - * the end. 
This marks the entry as recently used so - * that name_cache_clean_unused does not remove it. - */ - list_move_tail(&nce->list, &sctx->name_cache_list); - *parent_ino = nce->parent_ino; *parent_gen = nce->parent_gen; ret = fs_path_add(dest, nce->name, nce->name_len); @@ -2446,7 +2320,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, * This should only happen for the parent dir that we determine in * record_new_ref_if_needed(). */ - ret = is_inode_existent(sctx, ino, gen); + ret = is_inode_existent(sctx, ino, gen, NULL, NULL); if (ret < 0) goto out; @@ -2497,8 +2371,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, goto out; } - nce->ino = ino; - nce->gen = gen; + nce->entry.key = ino; + nce->entry.gen = gen; nce->parent_ino = *parent_ino; nce->parent_gen = *parent_gen; nce->name_len = fs_path_len(dest); @@ -2510,10 +2384,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, else nce->need_later_update = 1; - nce_ret = name_cache_insert(sctx, nce); - if (nce_ret < 0) + nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL); + if (nce_ret < 0) { + kfree(nce); ret = nce_ret; - name_cache_clean_unused(sctx); + } out: return ret; @@ -2883,6 +2758,63 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) return ret; } +/* + * If the cache is full, we can't remove entries from it and do a call to + * send_utimes() for each respective inode, because we might be finishing + * processing an inode that is a directory and it just got renamed, and existing + * entries in the cache may refer to inodes that have the directory in their + * full path - in which case we would generate outdated paths (pre-rename) + * for the inodes that the cache entries point to. Instead of prunning the + * cache when inserting, do it after we finish processing each inode at + * finish_inode_if_needed(). + */ +static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen) +{ + struct btrfs_lru_cache_entry *entry; + int ret; + + entry = btrfs_lru_cache_lookup(&sctx->dir_utimes_cache, dir, gen); + if (entry != NULL) + return 0; + + /* Caching is optional, don't fail if we can't allocate memory. */ + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return send_utimes(sctx, dir, gen); + + entry->key = dir; + entry->gen = gen; + + ret = btrfs_lru_cache_store(&sctx->dir_utimes_cache, entry, GFP_KERNEL); + ASSERT(ret != -EEXIST); + if (ret) { + kfree(entry); + return send_utimes(sctx, dir, gen); + } + + return 0; +} + +static int trim_dir_utimes_cache(struct send_ctx *sctx) +{ + while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) > + SEND_MAX_DIR_UTIMES_CACHE_SIZE) { + struct btrfs_lru_cache_entry *lru; + int ret; + + lru = btrfs_lru_cache_lru_entry(&sctx->dir_utimes_cache); + ASSERT(lru != NULL); + + ret = send_utimes(sctx, lru->key, lru->gen); + if (ret) + return ret; + + btrfs_lru_cache_remove(&sctx->dir_utimes_cache, lru); + } + + return 0; +} + /* * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have * a valid path yet because we did not process the refs yet. So, the inode @@ -2971,6 +2903,23 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) return ret; } +static void cache_dir_created(struct send_ctx *sctx, u64 dir) +{ + struct btrfs_lru_cache_entry *entry; + int ret; + + /* Caching is optional, ignore any failures. 
*/ + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return; + + entry->key = dir; + entry->gen = 0; + ret = btrfs_lru_cache_store(&sctx->dir_created_cache, entry, GFP_KERNEL); + if (ret < 0) + kfree(entry); +} + /* * We need some special handling for inodes that get processed before the parent * directory got created. See process_recorded_refs for details. @@ -2986,6 +2935,9 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) struct btrfs_key di_key; struct btrfs_dir_item *di; + if (btrfs_lru_cache_lookup(&sctx->dir_created_cache, dir, 0)) + return 1; + path = alloc_path_for_send(); if (!path) return -ENOMEM; @@ -3009,6 +2961,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) if (di_key.type != BTRFS_ROOT_ITEM_KEY && di_key.objectid < sctx->send_progress) { ret = 1; + cache_dir_created(sctx, dir); break; } } @@ -3038,7 +2991,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx) return 0; } - return send_create_inode(sctx, sctx->cur_ino); + ret = send_create_inode(sctx, sctx->cur_ino); + + if (ret == 0 && S_ISDIR(sctx->cur_inode_mode)) + cache_dir_created(sctx, sctx->cur_ino); + + return ret; } struct recorded_ref { @@ -3166,6 +3124,7 @@ static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx, odi->ino = dir_ino; odi->gen = dir_gen; odi->last_dir_index_offset = 0; + odi->dir_high_seq_ino = 0; rb_link_node(&odi->node, parent, p); rb_insert_color(&odi->node, &sctx->orphan_dirs); @@ -3215,8 +3174,7 @@ static void free_orphan_dir_info(struct send_ctx *sctx, * We check this by iterating all dir items and checking if the inode behind * the dir item was already processed. */ -static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, - u64 send_progress) +static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen) { int ret = 0; int iter_ret = 0; @@ -3227,6 +3185,8 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, struct btrfs_key loc; struct btrfs_dir_item *di; struct orphan_dir_info *odi = NULL; + u64 dir_high_seq_ino = 0; + u64 last_dir_index_offset = 0; /* * Don't try to rmdir the top/root subvolume dir. @@ -3234,17 +3194,62 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, if (dir == BTRFS_FIRST_FREE_OBJECTID) return 0; + odi = get_orphan_dir_info(sctx, dir, dir_gen); + if (odi && sctx->cur_ino < odi->dir_high_seq_ino) + return 0; + path = alloc_path_for_send(); if (!path) return -ENOMEM; + if (!odi) { + /* + * Find the inode number associated with the last dir index + * entry. This is very likely the inode with the highest number + * of all inodes that have an entry in the directory. We can + * then use it to avoid future calls to can_rmdir(), when + * processing inodes with a lower number, from having to search + * the parent root b+tree for dir index keys. + */ + key.objectid = dir; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + /* Can't happen, the root is never empty. */ + ASSERT(path->slots[0] > 0); + if (WARN_ON(path->slots[0] == 0)) { + ret = -EUCLEAN; + goto out; + } + path->slots[0]--; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != dir || key.type != BTRFS_DIR_INDEX_KEY) { + /* No index keys, dir can be removed. 
*/ + ret = 1; + goto out; + } + + di = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + dir_high_seq_ino = loc.objectid; + if (sctx->cur_ino < dir_high_seq_ino) { + ret = 0; + goto out; + } + + btrfs_release_path(path); + } + key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; - key.offset = 0; - - odi = get_orphan_dir_info(sctx, dir, dir_gen); - if (odi) - key.offset = odi->last_dir_index_offset; + key.offset = (odi ? odi->last_dir_index_offset : 0); btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct waiting_dir_move *dm; @@ -3257,29 +3262,18 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + dir_high_seq_ino = max(dir_high_seq_ino, loc.objectid); + last_dir_index_offset = found_key.offset; + dm = get_waiting_dir_move(sctx, loc.objectid); if (dm) { - odi = add_orphan_dir_info(sctx, dir, dir_gen); - if (IS_ERR(odi)) { - ret = PTR_ERR(odi); - goto out; - } - odi->gen = dir_gen; - odi->last_dir_index_offset = found_key.offset; dm->rmdir_ino = dir; dm->rmdir_gen = dir_gen; ret = 0; goto out; } - if (loc.objectid > send_progress) { - odi = add_orphan_dir_info(sctx, dir, dir_gen); - if (IS_ERR(odi)) { - ret = PTR_ERR(odi); - goto out; - } - odi->gen = dir_gen; - odi->last_dir_index_offset = found_key.offset; + if (loc.objectid > sctx->cur_ino) { ret = 0; goto out; } @@ -3294,7 +3288,22 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, out: btrfs_free_path(path); - return ret; + + if (ret) + return ret; + + if (!odi) { + odi = add_orphan_dir_info(sctx, dir, dir_gen); + if (IS_ERR(odi)) + return PTR_ERR(odi); + + odi->gen = dir_gen; + } + + odi->last_dir_index_offset = last_dir_index_offset; + odi->dir_high_seq_ino = max(odi->dir_high_seq_ino, dir_high_seq_ino); + + return 0; } static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) @@ -3579,7 +3588,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) } gen = odi->gen; - ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino); + ret = can_rmdir(sctx, rmdir_ino, gen); if (ret < 0) goto out; if (!ret) @@ -3599,7 +3608,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) } finish: - ret = send_utimes(sctx, pm->ino, pm->gen); + ret = cache_dir_utimes(sctx, pm->ino, pm->gen); if (ret < 0) goto out; @@ -3619,7 +3628,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) if (ret < 0) goto out; - ret = send_utimes(sctx, cur->dir, cur->dir_gen); + ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; } @@ -4242,7 +4251,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * "testdir_2". */ list_for_each_entry(cur, &sctx->new_refs, list) { - ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); if (ret < 0) goto out; if (ret == inode_state_will_create) @@ -4288,12 +4297,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * the source path when performing its rename * operation. 
*/ - if (is_waiting_for_move(sctx, ow_inode)) { - wdm = get_waiting_dir_move(sctx, - ow_inode); - ASSERT(wdm); + wdm = get_waiting_dir_move(sctx, ow_inode); + if (wdm) wdm->orphanized = true; - } /* * Make sure we clear our orphanized inode's @@ -4306,10 +4312,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * and get instead the orphan name. */ nce = name_cache_search(sctx, ow_inode, ow_gen); - if (nce) { - name_cache_delete(sctx, nce); - kfree(nce); - } + if (nce) + btrfs_lru_cache_remove(&sctx->name_cache, + &nce->entry); /* * ow_inode might currently be an ancestor of @@ -4358,7 +4363,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * parent directory out of order. But we need to check if this * did already happen before due to other refs in the same dir. */ - ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); if (ret < 0) goto out; if (ret == inode_state_will_create) { @@ -4388,6 +4393,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) ret = send_create_inode(sctx, cur->dir); if (ret < 0) goto out; + cache_dir_created(sctx, cur->dir); } } @@ -4470,8 +4476,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * later, we do this check again and rmdir it then if possible. * See the use of check_dirs for more details. */ - ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen, - sctx->cur_ino); + ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen); if (ret < 0) goto out; if (ret) { @@ -4564,20 +4569,18 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (cur->dir > sctx->cur_ino) continue; - ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); if (ret < 0) goto out; if (ret == inode_state_did_create || ret == inode_state_no_change) { - /* TODO delayed utimes */ - ret = send_utimes(sctx, cur->dir, cur->dir_gen); + ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; } else if (ret == inode_state_did_delete && cur->dir != last_dir_ino_rm) { - ret = can_rmdir(sctx, cur->dir, cur->dir_gen, - sctx->cur_ino); + ret = can_rmdir(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; if (ret) { @@ -5635,7 +5638,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * boundary in the send buffer. This means that there may be a gap * between the beginning of the command and the file data. */ - data_offset = ALIGN(sctx->send_size, PAGE_SIZE); + data_offset = PAGE_ALIGN(sctx->send_size); if (data_offset > sctx->send_max_size || sctx->send_max_size - data_offset < disk_num_bytes) { ret = -EOVERFLOW; @@ -5759,7 +5762,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, sent += size; } - if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) { + if (sctx->clean_page_cache && PAGE_ALIGNED(end)) { /* * Always operate only on ranges that are a multiple of the page * size. This is not only to prevent zeroing parts of a page in @@ -6754,12 +6757,26 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) * it's moved/renamed, therefore we don't need to do it here. 
*/ sctx->send_progress = sctx->cur_ino + 1; - ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + + /* + * If the current inode is a non-empty directory, delay issuing + * the utimes command for it, as it's very likely we have inodes + * with an higher number inside it. We want to issue the utimes + * command only after adding all dentries to it. + */ + if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > 0) + ret = cache_dir_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + else + ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + if (ret < 0) goto out; } out: + if (!ret) + ret = trim_dir_utimes_cache(sctx); + return ret; } @@ -8044,6 +8061,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) int clone_sources_to_rollback = 0; size_t alloc_size; int sort_clone_roots = 0; + struct btrfs_lru_cache_entry *entry; + struct btrfs_lru_cache_entry *tmp; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -8094,11 +8113,22 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) INIT_LIST_HEAD(&sctx->new_refs); INIT_LIST_HEAD(&sctx->deleted_refs); - INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); - INIT_LIST_HEAD(&sctx->name_cache_list); - INIT_LIST_HEAD(&sctx->backref_cache.lru_list); - mt_init(&sctx->backref_cache.entries); + btrfs_lru_cache_init(&sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE); + btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE); + btrfs_lru_cache_init(&sctx->dir_created_cache, + SEND_MAX_DIR_CREATED_CACHE_SIZE); + /* + * This cache is periodically trimmed to a fixed size elsewhere, see + * cache_dir_utimes() and trim_dir_utimes_cache(). + */ + btrfs_lru_cache_init(&sctx->dir_utimes_cache, 0); + + sctx->pending_dir_moves = RB_ROOT; + sctx->waiting_dir_moves = RB_ROOT; + sctx->orphan_dirs = RB_ROOT; + sctx->rbtree_new_refs = RB_ROOT; + sctx->rbtree_deleted_refs = RB_ROOT; sctx->flags = arg->flags; @@ -8165,12 +8195,6 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) goto out; } - sctx->pending_dir_moves = RB_ROOT; - sctx->waiting_dir_moves = RB_ROOT; - sctx->orphan_dirs = RB_ROOT; - sctx->rbtree_new_refs = RB_ROOT; - sctx->rbtree_deleted_refs = RB_ROOT; - sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), arg->clone_sources_count + 1, GFP_KERNEL); @@ -8279,6 +8303,13 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) if (ret < 0) goto out; + btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) { + ret = send_utimes(sctx, entry->key, entry->gen); + if (ret < 0) + goto out; + btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry); + } + if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) { ret = begin_cmd(sctx, BTRFS_SEND_C_END); if (ret < 0) @@ -8358,11 +8389,12 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) kvfree(sctx->send_buf); kvfree(sctx->verity_descriptor); - name_cache_free(sctx); - close_current_inode(sctx); - empty_backref_cache(sctx); + btrfs_lru_cache_clear(&sctx->name_cache); + btrfs_lru_cache_clear(&sctx->backref_cache); + btrfs_lru_cache_clear(&sctx->dir_created_cache); + btrfs_lru_cache_clear(&sctx->dir_utimes_cache); kfree(sctx); } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3f3c8f9186f9..366fb4cde145 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -58,6 +58,7 @@ #include "scrub.h" #include "verity.h" #include "super.h" +#include "extent-tree.h" #define CREATE_TRACE_POINTS #include @@ -2049,7 +2050,7 @@ static int 
btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) } /* - * Metadata in mixed block goup profiles are accounted in data + * Metadata in mixed block group profiles are accounted in data */ if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 108aa3876186..37fc58a7f27e 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "messages.h" #include "ctree.h" @@ -702,7 +703,7 @@ static void release_raid_kobj(struct kobject *kobj) kfree(to_raid_kobj(kobj)); } -static struct kobj_type btrfs_raid_ktype = { +static const struct kobj_type btrfs_raid_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = release_raid_kobj, .default_groups = raid_groups, @@ -778,6 +779,45 @@ static ssize_t btrfs_chunk_size_store(struct kobject *kobj, return len; } +static ssize_t btrfs_size_classes_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_space_info *sinfo = to_space_info(kobj); + struct btrfs_block_group *bg; + u32 none = 0; + u32 small = 0; + u32 medium = 0; + u32 large = 0; + + for (int i = 0; i < BTRFS_NR_RAID_TYPES; ++i) { + down_read(&sinfo->groups_sem); + list_for_each_entry(bg, &sinfo->block_groups[i], list) { + if (!btrfs_block_group_should_use_size_class(bg)) + continue; + switch (bg->size_class) { + case BTRFS_BG_SZ_NONE: + none++; + break; + case BTRFS_BG_SZ_SMALL: + small++; + break; + case BTRFS_BG_SZ_MEDIUM: + medium++; + break; + case BTRFS_BG_SZ_LARGE: + large++; + break; + } + } + up_read(&sinfo->groups_sem); + } + return sysfs_emit(buf, "none %u\n" + "small %u\n" + "medium %u\n" + "large %u\n", + none, small, medium, large); +} + #ifdef CONFIG_BTRFS_DEBUG /* * Request chunk allocation with current chunk size. 
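The size_classes attribute added in the sysfs hunk above reports, per space_info, how many block groups currently sit in each size class. A minimal userspace sketch for reading it follows; it assumes the attribute is published next to the existing space_info attributes (chunk_size, bg_reclaim_threshold) under /sys/fs/btrfs/<fsid>/allocation/<type>/, and <fsid> is a placeholder for the filesystem UUID, so the path and error handling are illustrative only.

#include <stdio.h>

int main(void)
{
	/* Assumed path; <fsid> stands in for the real filesystem UUID. */
	const char *path = "/sys/fs/btrfs/<fsid>/allocation/data/size_classes";
	char line[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Expected format: "none N\nsmall N\nmedium N\nlarge N\n" */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}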
@@ -835,6 +875,7 @@ SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store); +BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show); static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj, struct kobj_attribute *a, @@ -887,6 +928,7 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, disk_total), BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold), BTRFS_ATTR_PTR(space_info, chunk_size), + BTRFS_ATTR_PTR(space_info, size_classes), #ifdef CONFIG_BTRFS_DEBUG BTRFS_ATTR_PTR(space_info, force_chunk_alloc), #endif @@ -900,7 +942,7 @@ static void space_info_release(struct kobject *kobj) kfree(sinfo); } -static struct kobj_type space_info_ktype = { +static const struct kobj_type space_info_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = space_info_release, .default_groups = space_info_groups, @@ -1259,7 +1301,7 @@ static void btrfs_release_fsid_kobj(struct kobject *kobj) complete(&fs_devs->kobj_unregister); } -static struct kobj_type btrfs_ktype = { +static const struct kobj_type btrfs_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = btrfs_release_fsid_kobj, }; @@ -1789,7 +1831,7 @@ static void btrfs_release_devid_kobj(struct kobject *kobj) complete(&device->kobj_unregister); } -static struct kobj_type devid_ktype = { +static const struct kobj_type devid_ktype = { .sysfs_ops = &kobj_sysfs_ops, .default_groups = devid_groups, .release = btrfs_release_devid_kobj, @@ -2103,7 +2145,7 @@ static void qgroups_release(struct kobject *kobj) kfree(kobj); } -static struct kobj_type qgroups_ktype = { +static const struct kobj_type qgroups_ktype = { .sysfs_ops = &kobj_sysfs_ops, .default_groups = qgroups_groups, .release = qgroups_release, @@ -2173,7 +2215,7 @@ static void qgroup_release(struct kobject *kobj) memset(&qgroup->kobj, 0, sizeof(*kobj)); } -static struct kobj_type qgroup_ktype = { +static const struct kobj_type qgroup_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = qgroup_release, .default_groups = qgroup_groups, diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index c5b3a631bf4f..f2f2e11dac4c 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -509,7 +509,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, goto out_free; } - ret = btrfs_rmap_block(fs_info, em->start, NULL, btrfs_sb_offset(1), + ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1), &logical, &out_ndaddrs, &out_stripe_len); if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { test_err("didn't rmap anything but expected %d", diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 9094e2402922..b8d5b1fa9a03 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2622,6 +2622,35 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) return (ret < 0) ? 0 : 1; } +/* + * We only mark the transaction aborted and then set the file system read-only. + * This will prevent new transactions from starting or trying to join this + * one. + * + * This means that error recovery at the call site is limited to freeing + * any local memory allocations and passing the error code up without + * further cleanup. The transaction should complete as it normally would + * in the call path but will return -EIO. + * + * We'll complete the cleanup in btrfs_end_transaction and + * btrfs_commit_transaction. 
+ */ +void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + WRITE_ONCE(trans->aborted, errno); + WRITE_ONCE(trans->transaction->aborted, errno); + if (first_hit && errno == -ENOSPC) + btrfs_dump_space_info_for_trans_abort(fs_info); + /* Wake up anybody who may be waiting on this transaction */ + wake_up(&fs_info->transaction_wait); + wake_up(&fs_info->transaction_blocked_wait); + __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); +} + int __init btrfs_transaction_init(void) { btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 97f6c39f59c8..fa728ab80826 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -202,6 +202,34 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) delayed_refs->qgroup_to_skip = 0; } +bool __cold abort_should_print_stack(int errno); + +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact stack trace is reported for some errors. + */ +#define btrfs_abort_transaction(trans, errno) \ +do { \ + bool first = false; \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((trans)->fs_info->fs_state))) { \ + first = true; \ + if (WARN(abort_should_print_stack(errno), \ + KERN_ERR \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno))) { \ + /* Stack trace printed. */ \ + } else { \ + btrfs_debug((trans)->fs_info, \ + "Transaction aborted (error %d)", \ + (errno)); \ + } \ + } \ + __btrfs_abort_transaction((trans), __func__, \ + __LINE__, (errno), first); \ +} while (0) + int btrfs_end_transaction(struct btrfs_trans_handle *trans); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, unsigned int num_items); @@ -236,6 +264,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); +void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit); int __init btrfs_transaction_init(void); void __cold btrfs_transaction_exit(void); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 58599189bd18..200cea6e49e5 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -279,12 +279,6 @@ void btrfs_end_log_trans(struct btrfs_root *root) } } -static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) -{ - filemap_fdatawait_range(buf->pages[0]->mapping, - buf->start, buf->start + buf->len - 1); -} - /* * the walk control struct is used to pass state down the chain when * processing the log tree. 
The stage field tells us which part @@ -2623,11 +2617,12 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return ret; } + btrfs_tree_lock(next); + btrfs_clear_buffer_dirty(trans, next); + wait_on_extent_buffer_writeback(next); + btrfs_tree_unlock(next); + if (trans) { - btrfs_tree_lock(next); - btrfs_clean_tree_block(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); ret = btrfs_pin_reserved_extent(trans, bytenr, blocksize); if (ret) { @@ -2637,8 +2632,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_redirty_list_add( trans->transaction, next); } else { - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) - clear_extent_buffer_dirty(next); unaccount_log_buffer(fs_info, bytenr); } } @@ -2693,11 +2686,12 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[*level]; + btrfs_tree_lock(next); + btrfs_clear_buffer_dirty(trans, next); + wait_on_extent_buffer_writeback(next); + btrfs_tree_unlock(next); + if (trans) { - btrfs_tree_lock(next); - btrfs_clean_tree_block(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); ret = btrfs_pin_reserved_extent(trans, path->nodes[*level]->start, path->nodes[*level]->len); @@ -2706,9 +2700,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_redirty_list_add(trans->transaction, next); } else { - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) - clear_extent_buffer_dirty(next); - unaccount_log_buffer(fs_info, path->nodes[*level]->start); } @@ -2776,19 +2767,18 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[orig_level]; + btrfs_tree_lock(next); + btrfs_clear_buffer_dirty(trans, next); + wait_on_extent_buffer_writeback(next); + btrfs_tree_unlock(next); + if (trans) { - btrfs_tree_lock(next); - btrfs_clean_tree_block(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); ret = btrfs_pin_reserved_extent(trans, next->start, next->len); if (ret) goto out; btrfs_redirty_list_add(trans->transaction, next); } else { - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) - clear_extent_buffer_dirty(next); unaccount_log_buffer(fs_info, next->start); } } @@ -3652,11 +3642,10 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, /* * If for some unexpected reason the last item's index is not greater - * than the last index we logged, warn and return an error to fallback - * to a transaction commit. + * than the last index we logged, warn and force a transaction commit. 
*/ if (WARN_ON(last_index <= inode->last_dir_index_offset)) - ret = -EUCLEAN; + ret = BTRFS_LOG_FORCE_COMMIT; else inode->last_dir_index_offset = last_index; out: @@ -3794,7 +3783,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_root *root = inode->root; struct btrfs_root *log = root->log_root; - int err = 0; int ret; u64 last_old_dentry_offset = min_offset - 1; u64 last_offset = (u64)-1; @@ -3835,8 +3823,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, path->slots[0]); if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; - } else if (ret < 0) { - err = ret; + } else if (ret > 0) { + ret = 0; } goto done; @@ -3859,7 +3847,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; } else if (ret < 0) { - err = ret; goto done; } @@ -3881,12 +3868,15 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, */ search: ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); - if (ret > 0) + if (ret > 0) { ret = btrfs_next_item(root, path); + if (ret > 0) { + /* There are no more keys in the inode's root. */ + ret = 0; + goto done; + } + } if (ret < 0) - err = ret; - /* If ret is 1, there are no more keys in the inode's root. */ - if (ret != 0) goto done; /* @@ -3897,8 +3887,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, &last_old_dentry_offset); if (ret != 0) { - if (ret < 0) - err = ret; + if (ret > 0) + ret = 0; goto done; } path->slots[0] = btrfs_header_nritems(path->nodes[0]); @@ -3909,10 +3899,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, */ ret = btrfs_next_leaf(root, path); if (ret) { - if (ret == 1) + if (ret == 1) { last_offset = (u64)-1; - else - err = ret; + ret = 0; + } goto done; } btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); @@ -3943,7 +3933,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, btrfs_release_path(path); btrfs_release_path(dst_path); - if (err == 0) { + if (ret == 0) { *last_offset_ret = last_offset; /* * In case the leaf was changed in the current transaction but @@ -3954,15 +3944,13 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * a range, last_old_dentry_offset is == to last_offset. */ ASSERT(last_old_dentry_offset <= last_offset); - if (last_old_dentry_offset < last_offset) { + if (last_old_dentry_offset < last_offset) ret = insert_dir_log_key(trans, log, path, ino, last_old_dentry_offset + 1, last_offset); - if (ret) - err = ret; - } } - return err; + + return ret; } /* @@ -5604,10 +5592,8 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction * commits. */ - if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) { - btrfs_set_log_full_commit(trans); + if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) return BTRFS_LOG_FORCE_COMMIT; - } inode = btrfs_iget(root->fs_info->sb, ino, root); /* @@ -6466,7 +6452,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * result in losing the file after a log replay. 
*/ if (full_dir_logging && inode->last_unlink_trans >= trans->transid) { - btrfs_set_log_full_commit(trans); ret = BTRFS_LOG_FORCE_COMMIT; goto out_unlock; } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 85cd24cb0540..bdeb5216718f 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -13,8 +13,13 @@ /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 -/* We can't use the tree log for whatever reason, force a transaction commit */ -#define BTRFS_LOG_FORCE_COMMIT (1) +/* + * We can't use the tree log for whatever reason, force a transaction commit. + * We use a negative value because there are functions through the logging code + * that need to return an error (< 0 value), false (0) or true (1). Any negative + * value will do, as it will cause the log to be marked for a full sync. + */ +#define BTRFS_LOG_FORCE_COMMIT (-(MAX_ERRNO + 1)) struct btrfs_log_ctx { int log_ret; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fe652f8fc697..3195e33831b7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -728,7 +728,7 @@ static struct btrfs_fs_devices *find_fsid_reverted_metadata( /* * Handle the case where the scanned device is part of an fs whose last * metadata UUID change reverted it to the original FSID. At the same - * time * fs_devices was first created by another constitutent device + * time fs_devices was first created by another constituent device * which didn't fully observe the operation. This results in an * btrfs_fs_devices created with metadata/fsid different AND * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the @@ -6300,91 +6300,42 @@ static bool need_full_stripe(enum btrfs_map_op op) return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); } -/* - * Calculate the geometry of a particular (address, len) tuple. This - * information is used to calculate how big a particular bio can get before it - * straddles a stripe. - * - * @fs_info: the filesystem - * @em: mapping containing the logical extent - * @op: type of operation - write or read - * @logical: address that we want to figure out the geometry of - * @io_geom: pointer used to return values - * - * Returns < 0 in case a chunk for the given logical address cannot be found, - * usually shouldn't happen unless @logical is corrupted, 0 otherwise. - */ -int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, - enum btrfs_map_op op, u64 logical, - struct btrfs_io_geometry *io_geom) +static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, + u64 offset, u64 *stripe_nr, u64 *stripe_offset, + u64 *full_stripe_start) { - struct map_lookup *map; - u64 len; - u64 offset; - u64 stripe_offset; - u64 stripe_nr; - u32 stripe_len; - u64 raid56_full_stripe_start = (u64)-1; - int data_stripes; + u32 stripe_len = map->stripe_len; ASSERT(op != BTRFS_MAP_DISCARD); - map = em->map_lookup; - /* Offset of this logical address in the chunk */ - offset = logical - em->start; - /* Len of a stripe in a chunk */ - stripe_len = map->stripe_len; /* - * Stripe_nr is where this block falls in - * stripe_offset is the offset of this block in its stripe. + * Stripe_nr is the stripe where this block falls. stripe_offset is + * the offset of this block in its stripe. 
*/ - stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset); - ASSERT(stripe_offset < U32_MAX); + *stripe_nr = div64_u64_rem(offset, stripe_len, stripe_offset); + ASSERT(*stripe_offset < U32_MAX); - data_stripes = nr_data_stripes(map); + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); - /* Only stripe based profiles needs to check against stripe length. */ - if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) { - u64 max_len = stripe_len - stripe_offset; + *full_stripe_start = + div64_u64(offset, full_stripe_len) * full_stripe_len; /* - * In case of raid56, we need to know the stripe aligned start + * For writes to RAID56, allow to write a full stripe set, but + * no straddling of stripe sets. */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - unsigned long full_stripe_len = stripe_len * data_stripes; - raid56_full_stripe_start = offset; - - /* - * Allow a write of a full stripe, but make sure we - * don't allow straddling of stripes - */ - raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, - full_stripe_len); - raid56_full_stripe_start *= full_stripe_len; - - /* - * For writes to RAID[56], allow a full stripeset across - * all disks. For other RAID types and for RAID[56] - * reads, just allow a single stripe (on a single disk). - */ - if (op == BTRFS_MAP_WRITE) { - max_len = stripe_len * data_stripes - - (offset - raid56_full_stripe_start); - } - } - len = min_t(u64, em->len - offset, max_len); - } else { - len = em->len - offset; + if (op == BTRFS_MAP_WRITE) + return full_stripe_len - (offset - *full_stripe_start); } - io_geom->len = len; - io_geom->offset = offset; - io_geom->stripe_len = stripe_len; - io_geom->stripe_nr = stripe_nr; - io_geom->stripe_offset = stripe_offset; - io_geom->raid56_stripe_offset = raid56_full_stripe_start; - - return 0; + /* + * For other RAID types and for RAID56 reads, allow a single stripe (on + * a single disk). 
+ */ + if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) + return stripe_len - *stripe_offset; + return U64_MAX; } static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, @@ -6403,6 +6354,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, { struct extent_map *em; struct map_lookup *map; + u64 map_offset; u64 stripe_offset; u64 stripe_nr; u64 stripe_len; @@ -6421,7 +6373,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int patch_the_first_stripe_for_dev_replace = 0; u64 physical_to_patch_in_first_stripe = 0; u64 raid56_full_stripe_start = (u64)-1; - struct btrfs_io_geometry geom; + u64 max_len; ASSERT(bioc_ret); ASSERT(op != BTRFS_MAP_DISCARD); @@ -6429,18 +6381,14 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, em = btrfs_get_chunk_map(fs_info, logical, *length); ASSERT(!IS_ERR(em)); - ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); - if (ret < 0) - return ret; - map = em->map_lookup; - - *length = geom.len; - stripe_len = geom.stripe_len; - stripe_nr = geom.stripe_nr; - stripe_offset = geom.stripe_offset; - raid56_full_stripe_start = geom.raid56_stripe_offset; data_stripes = nr_data_stripes(map); + stripe_len = map->stripe_len; + + map_offset = logical - em->start; + max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr, + &stripe_offset, &raid56_full_stripe_start); + *length = min_t(u64, em->len - map_offset, max_len); down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6b7a05f6cf82..7e51f2238f72 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -53,21 +53,6 @@ enum btrfs_raid_types { BTRFS_NR_RAID_TYPES }; -struct btrfs_io_geometry { - /* remaining bytes before crossing a stripe */ - u64 len; - /* offset of logical address in chunk */ - u64 offset; - /* length of single IO stripe */ - u32 stripe_len; - /* offset of address in stripe */ - u32 stripe_offset; - /* number of stripe where address falls */ - u64 stripe_nr; - /* offset of raid56 stripe into the chunk */ - u64 raid56_stripe_offset; -}; - /* * Use sequence counter to get consistent device stat data on * 32-bit processors. 
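The btrfs_max_io_len() rework above replaces the old io_geometry helper with direct computation of the stripe number, the offset within that stripe, and how many bytes an I/O may cover before crossing a stripe boundary. Below is a small standalone sketch of that arithmetic for the plain striped (non-RAID56) case, using ordinary 64-bit division in place of div64_u64_rem(); the constants are made up for illustration and are not taken from the patch.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t stripe_len = 64 * 1024;	/* stripe size in bytes, illustrative */
	const uint64_t offset = 300 * 1024;	/* offset of the logical address within the chunk */

	uint64_t stripe_nr = offset / stripe_len;	/* which stripe the block falls in */
	uint64_t stripe_offset = offset % stripe_len;	/* offset of the block inside that stripe */
	uint64_t max_len = stripe_len - stripe_offset;	/* bytes left before straddling a stripe */

	printf("stripe_nr=%llu stripe_offset=%llu max_len=%llu\n",
	       (unsigned long long)stripe_nr,
	       (unsigned long long)stripe_offset,
	       (unsigned long long)max_len);
	return 0;
}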
@@ -545,9 +530,6 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, u32 *num_stripes); -int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, - enum btrfs_map_op op, u64 logical, - struct btrfs_io_geometry *io_geom); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index ce5ebba7fdd9..45d04092f2f8 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -17,6 +17,7 @@ #include "space-info.h" #include "fs.h" #include "accessors.h" +#include "bio.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 @@ -160,7 +161,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, */ static inline u32 sb_zone_number(int shift, int mirror) { - u64 zone; + u64 zone = U64_MAX; ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); switch (mirror) { @@ -220,7 +221,6 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, struct blk_zone *zones, unsigned int *nr_zones) { struct btrfs_zoned_device_info *zinfo = device->zone_info; - u32 zno; int ret; if (!*nr_zones) @@ -235,6 +235,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, /* Check cache */ if (zinfo->zone_cache) { unsigned int i; + u32 zno; ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); zno = pos >> zinfo->zone_size_shift; @@ -274,9 +275,12 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, return -EIO; /* Populate cache */ - if (zinfo->zone_cache) + if (zinfo->zone_cache) { + u32 zno = pos >> zinfo->zone_size_shift; + memcpy(zinfo->zone_cache + zno, zones, sizeof(*zinfo->zone_cache) * *nr_zones); + } return 0; } @@ -417,25 +421,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) nr_sectors = bdev_nr_sectors(bdev); zone_info->zone_size_shift = ilog2(zone_info->zone_size); zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); - /* - * We limit max_zone_append_size also by max_segments * - * PAGE_SIZE. Technically, we can have multiple pages per segment. But, - * since btrfs adds the pages one by one to a bio, and btrfs cannot - * increase the metadata reservation even if it increases the number of - * extents, it is safe to stick with the limit. - * - * With the zoned emulation, we can have non-zoned device on the zoned - * mode. In this case, we don't have a valid max zone append size. So, - * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size. 
- */ - if (bdev_is_zoned(bdev)) { - zone_info->max_zone_append_size = min_t(u64, - (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, - (u64)bdev_max_segments(bdev) << PAGE_SHIFT); - } else { - zone_info->max_zone_append_size = - (u64)bdev_max_segments(bdev) << PAGE_SHIFT; - } if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; @@ -714,9 +699,9 @@ static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info) int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) { + struct queue_limits *lim = &fs_info->limits; struct btrfs_device *device; u64 zone_size = 0; - u64 max_zone_append_size = 0; int ret; /* @@ -726,6 +711,8 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) if (!btrfs_fs_incompat(fs_info, ZONED)) return btrfs_check_for_zoned_device(fs_info); + blk_set_stacking_limits(lim); + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { struct btrfs_zoned_device_info *zone_info = device->zone_info; @@ -740,10 +727,17 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) zone_info->zone_size, zone_size); return -EINVAL; } - if (!max_zone_append_size || - (zone_info->max_zone_append_size && - zone_info->max_zone_append_size < max_zone_append_size)) - max_zone_append_size = zone_info->max_zone_append_size; + + /* + * With the zoned emulation, we can have non-zoned device on the + * zoned mode. In this case, we don't have a valid max zone + * append size. + */ + if (bdev_is_zoned(device->bdev)) { + blk_stack_limits(lim, + &bdev_get_queue(device->bdev)->limits, + 0); + } } /* @@ -764,8 +758,18 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) } fs_info->zone_size = zone_size; - fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size, - fs_info->sectorsize); + /* + * Also limit max_zone_append_size by max_segments * PAGE_SIZE. + * Technically, we can have multiple pages per segment. But, since + * we add the pages one by one to a bio, and cannot increase the + * metadata reservation even if it increases the number of extents, it + * is safe to stick with the limit. + */ + fs_info->max_zone_append_size = ALIGN_DOWN( + min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT, + (u64)lim->max_sectors << SECTOR_SHIFT, + (u64)lim->max_segments << PAGE_SHIFT), + fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; if (fs_info->max_zone_append_size < fs_info->max_extent_size) fs_info->max_extent_size = fs_info->max_zone_append_size; @@ -1632,8 +1636,10 @@ void btrfs_free_redirty_list(struct btrfs_transaction *trans) spin_unlock(&trans->releasing_ebs_lock); } -bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) +bool btrfs_use_zone_append(struct btrfs_bio *bbio) { + u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); + struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_block_group *cache; bool ret = false; @@ -1644,6 +1650,9 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) if (!is_data_inode(&inode->vfs_inode)) return false; + if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) + return false; + /* * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the * extent layout the relocation code has. 
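The zoned-mode change above drops the per-device max_zone_append_size and instead stacks the block layer queue limits of all devices, then derives one filesystem-wide limit. The final clamp is just min3() followed by ALIGN_DOWN() to the sector size; the self-contained sketch below restates that clamp with made-up limit values and plain C stand-ins for the kernel helpers.

#include <stdint.h>
#include <stdio.h>

/* Plain C stand-ins for the kernel's ALIGN_DOWN()/min3() helpers. */
#define ALIGN_DOWN_U64(x, a)	((x) - ((x) % (a)))

static uint64_t min3_u64(uint64_t a, uint64_t b, uint64_t c)
{
	uint64_t m = (a < b) ? a : b;

	return (m < c) ? m : c;
}

int main(void)
{
	/* Illustrative values only, as if read from the stacked queue limits. */
	const uint64_t sectorsize = 4096;
	const uint64_t zone_append_bytes = 192ULL * 1024;	/* max_zone_append_sectors << SECTOR_SHIFT */
	const uint64_t max_sectors_bytes = 1ULL << 20;		/* max_sectors << SECTOR_SHIFT */
	const uint64_t max_segments_bytes = 128ULL * 4096;	/* max_segments << PAGE_SHIFT */

	uint64_t limit = ALIGN_DOWN_U64(min3_u64(zone_append_bytes,
						 max_sectors_bytes,
						 max_segments_bytes),
					sectorsize);

	printf("max_zone_append_size = %llu bytes\n", (unsigned long long)limit);
	return 0;
}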
@@ -1666,22 +1675,16 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) return ret; } -void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, - struct bio *bio) +void btrfs_record_physical_zoned(struct btrfs_bio *bbio) { + const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; struct btrfs_ordered_extent *ordered; - const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; - if (bio_op(bio) != REQ_OP_ZONE_APPEND) - return; - - ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset); + ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); if (WARN_ON(!ordered)) return; ordered->physical = physical; - ordered->bdev = bio->bi_bdev; - btrfs_put_ordered_extent(ordered); } @@ -1693,43 +1696,46 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) struct extent_map *em; struct btrfs_ordered_sum *sum; u64 orig_logical = ordered->disk_bytenr; - u64 *logical = NULL; - int nr, stripe_len; + struct map_lookup *map; + u64 physical = ordered->physical; + u64 chunk_start_phys; + u64 logical; - /* Zoned devices should not have partitions. So, we can assume it is 0 */ - ASSERT(!bdev_is_partition(ordered->bdev)); - if (WARN_ON(!ordered->bdev)) + em = btrfs_get_chunk_map(fs_info, orig_logical, 1); + if (IS_ERR(em)) return; + map = em->map_lookup; + chunk_start_phys = map->stripes[0].physical; - if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev, - ordered->physical, &logical, &nr, - &stripe_len))) - goto out; - - WARN_ON(nr != 1); + if (WARN_ON_ONCE(map->num_stripes > 1) || + WARN_ON_ONCE((map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) || + WARN_ON_ONCE(physical < chunk_start_phys) || + WARN_ON_ONCE(physical > chunk_start_phys + em->orig_block_len)) { + free_extent_map(em); + return; + } + logical = em->start + (physical - map->stripes[0].physical); + free_extent_map(em); - if (orig_logical == *logical) - goto out; + if (orig_logical == logical) + return; - ordered->disk_bytenr = *logical; + ordered->disk_bytenr = logical; em_tree = &inode->extent_tree; write_lock(&em_tree->lock); em = search_extent_mapping(em_tree, ordered->file_offset, ordered->num_bytes); - em->block_start = *logical; + em->block_start = logical; free_extent_map(em); write_unlock(&em_tree->lock); list_for_each_entry(sum, &ordered->list, list) { - if (*logical < orig_logical) - sum->bytenr -= orig_logical - *logical; + if (logical < orig_logical) + sum->bytenr -= orig_logical - logical; else - sum->bytenr += *logical - orig_logical; + sum->bytenr += logical - orig_logical; } - -out: - kfree(logical); } bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, @@ -1854,26 +1860,6 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); } -struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, - u64 logical, u64 length) -{ - struct btrfs_device *device; - struct extent_map *em; - struct map_lookup *map; - - em = btrfs_get_chunk_map(fs_info, logical, length); - if (IS_ERR(em)) - return ERR_CAST(em); - - map = em->map_lookup; - /* We only support single profile for now */ - device = map->stripes[0].dev; - - free_extent_map(em); - - return device; -} - /* * Activate block group and underlying device zones * diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index f43990985d80..c0570d35fea2 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -20,7 +20,6 @@ struct btrfs_zoned_device_info { */ u64 zone_size; u8 
zone_size_shift; - u64 max_zone_append_size; u32 nr_zones; unsigned int max_active_zones; atomic_t active_zones_left; @@ -56,9 +55,8 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); void btrfs_free_redirty_list(struct btrfs_transaction *trans); -bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start); -void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, - struct bio *bio); +bool btrfs_use_zone_append(struct btrfs_bio *bbio); +void btrfs_record_physical_zoned(struct btrfs_bio *bbio); void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, @@ -68,8 +66,6 @@ void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, u64 physical_start, u64 physical_pos); -struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, - u64 logical, u64 length); bool btrfs_zone_activate(struct btrfs_block_group *block_group); int btrfs_zone_finish(struct btrfs_block_group *block_group); bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); @@ -185,13 +181,12 @@ static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb) { } static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { } -static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) +static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio) { return false; } -static inline void btrfs_record_physical_zoned(struct inode *inode, - u64 file_offset, struct bio *bio) +static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio) { } @@ -224,13 +219,6 @@ static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, return -EOPNOTSUPP; } -static inline struct btrfs_device *btrfs_zoned_get_device( - struct btrfs_fs_info *fs_info, - u64 logical, u64 length) -{ - return ERR_PTR(-EOPNOTSUPP); -} - static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group) { return true; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6479146140d2..9dec84fff826 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1387,7 +1387,7 @@ struct ext4_super_block { __le32 s_first_meta_bg; /* First metablock block group */ __le32 s_mkfs_time; /* When the filesystem was created */ __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ - /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ + /* 64bit support valid if EXT4_FEATURE_INCOMPAT_64BIT */ /*150*/ __le32 s_blocks_count_hi; /* Blocks count */ __le32 s_r_blocks_count_hi; /* Reserved blocks count */ __le32 s_free_blocks_count_hi; /* Free blocks count */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9de1c9d1a13d..3559ea6b0781 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3251,7 +3251,7 @@ static int ext4_split_extent_at(handle_t *handle, ext4_ext_mark_unwritten(ex2); err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); - if (err != -ENOSPC && err != -EDQUOT) + if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) goto out; if (EXT4_EXT_MAY_ZEROOUT & split_flag) { diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 7ac0a81bd371..6e9f198ecacf 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -202,8 +202,9 @@ 
ext4_extending_io(struct inode *inode, loff_t offset, size_t len) return false; } -/* Is IO overwriting allocated and initialized blocks? */ -static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) +/* Is IO overwriting allocated or initialized blocks? */ +static bool ext4_overwrite_io(struct inode *inode, + loff_t pos, loff_t len, bool *unwritten) { struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; @@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) blklen = map.m_len; err = ext4_map_blocks(NULL, inode, &map, 0); + if (err != blklen) + return false; /* * 'err==len' means that all of the blocks have been preallocated, - * regardless of whether they have been initialized or not. To exclude - * unwritten extents, we need to check m_flags. + * regardless of whether they have been initialized or not. We need to + * check m_flags to distinguish the unwritten extents. */ - return err == blklen && (map.m_flags & EXT4_MAP_MAPPED); + *unwritten = !(map.m_flags & EXT4_MAP_MAPPED); + return true; } static ssize_t ext4_generic_write_checks(struct kiocb *iocb, @@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = { * - For extending writes case we don't take the shared lock, since it requires * updating inode i_disksize and/or orphan handling with exclusive lock. * - * - shared locking will only be true mostly with overwrites. Otherwise we will - * switch to exclusive i_rwsem lock. + * - shared locking will only be true mostly with overwrites, including + * initialized blocks and unwritten blocks. For overwrite unwritten blocks + * we protect splitting extents by i_data_sem in ext4_inode_info, so we can + * also release exclusive i_rwsem lock. + * + * - Otherwise we will switch to exclusive i_rwsem lock. */ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, - bool *ilock_shared, bool *extend) + bool *ilock_shared, bool *extend, + bool *unwritten) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -459,7 +468,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, * in file_modified(). */ if (*ilock_shared && (!IS_NOSEC(inode) || *extend || - !ext4_overwrite_io(inode, offset, count))) { + !ext4_overwrite_io(inode, offset, count, unwritten))) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; @@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(from); const struct iomap_ops *iomap_ops = &ext4_iomap_ops; - bool extend = false, unaligned_io = false; + bool extend = false, unaligned_io = false, unwritten = false; bool ilock_shared = true; /* @@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) return ext4_buffered_write_iter(iocb, from); } - ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend); + ret = ext4_dio_write_checks(iocb, from, + &ilock_shared, &extend, &unwritten); if (ret <= 0) return ret; @@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) ext4_journal_stop(handle); } - if (ilock_shared) + if (ilock_shared && !unwritten) iomap_ops = &ext4_iomap_overwrite_ops; ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, (unaligned_io || extend) ? 
IOMAP_DIO_FORCE_WAIT : 0, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 96517785a9f8..78658e2e0a7c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -136,7 +136,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, new_size); } -static int __ext4_journalled_writepage(struct page *page, unsigned int len); static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); @@ -1005,30 +1004,6 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, return ret; } -/* - * To preserve ordering, it is essential that the hole instantiation and - * the data write be encapsulated in a single transaction. We cannot - * close off a transaction and start a new one between the ext4_get_block() - * and the commit_write(). So doing the jbd2_journal_start at the start of - * prepare_write() is the right place. - * - * Also, this function can nest inside ext4_writepage(). In that case, we - * *know* that ext4_writepage() has generated enough buffer credits to do the - * whole page. So we won't block on the journal in that case, which is good, - * because the caller may be PF_MEMALLOC. - * - * By accident, ext4 can be reentered when a transaction is open via - * quota file writes. If we were to commit the transaction while thus - * reentered, there can be a deadlock - we would be holding a quota - * lock, and the commit would never complete if another thread had a - * transaction open and was blocking on the quota lock - a ranking - * violation. - * - * So what we do is to rely on the fact that jbd2_journal_stop/journal_start - * will _not_ run commit under these circumstances because handle->h_ref - * is elevated. We'll still have enough credits for the tiny quotafile - * write. - */ int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { @@ -1149,6 +1124,13 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, } #endif +/* + * To preserve ordering, it is essential that the hole instantiation and + * the data write be encapsulated in a single transaction. We cannot + * close off a transaction and start a new one between the ext4_get_block() + * and the ext4_write_end(). So doing the jbd2_journal_start at the start of + * ext4_write_begin() is the right place. 
+ */ static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) @@ -1649,12 +1631,6 @@ static void ext4_print_free_blocks(struct inode *inode) return; } -static int ext4_bh_delay_or_unwritten(handle_t *handle, struct inode *inode, - struct buffer_head *bh) -{ - return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); -} - /* * ext4_insert_delayed_block - adds a delayed block to the extents status * tree, incrementing the reserved cluster/block @@ -1887,216 +1863,10 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return 0; } -static int __ext4_journalled_writepage(struct page *page, - unsigned int len) +static void mpage_page_done(struct mpage_da_data *mpd, struct page *page) { - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - handle_t *handle = NULL; - int ret = 0, err = 0; - int inline_data = ext4_has_inline_data(inode); - struct buffer_head *inode_bh = NULL; - loff_t size; - - ClearPageChecked(page); - - if (inline_data) { - BUG_ON(page->index != 0); - BUG_ON(len > ext4_get_max_inline_size(inode)); - inode_bh = ext4_journalled_write_inline_data(inode, len, page); - if (inode_bh == NULL) - goto out; - } - /* - * We need to release the page lock before we start the - * journal, so grab a reference so the page won't disappear - * out from under us. - */ - get_page(page); - unlock_page(page); - - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, - ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - put_page(page); - goto out_no_pagelock; - } - BUG_ON(!ext4_handle_valid(handle)); - - lock_page(page); - put_page(page); - size = i_size_read(inode); - if (page->mapping != mapping || page_offset(page) > size) { - /* The page got truncated from under us */ - ext4_journal_stop(handle); - ret = 0; - goto out; - } - - if (inline_data) { - ret = ext4_mark_inode_dirty(handle, inode); - } else { - struct buffer_head *page_bufs = page_buffers(page); - - if (page->index == size >> PAGE_SHIFT) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; - - ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, - NULL, do_journal_get_write_access); - - err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, - NULL, write_end_fn); - } - if (ret == 0) - ret = err; - err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); - if (ret == 0) - ret = err; - EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - - ext4_set_inode_state(inode, EXT4_STATE_JDATA); -out: + mpd->first_page++; unlock_page(page); -out_no_pagelock: - brelse(inode_bh); - return ret; -} - -/* - * Note that we don't need to start a transaction unless we're journaling data - * because we should have holes filled from ext4_page_mkwrite(). We even don't - * need to file the inode to the transaction's list in ordered mode because if - * we are writing back data added by write(), the inode is already there and if - * we are writing back data modified via mmap(), no one guarantees in which - * transaction the data will hit the disk. In case we are journaling data, we - * cannot start transaction directly because transaction start ranks above page - * lock so we have to do some magic. - * - * This function can get called via... 
- * - ext4_writepages after taking page lock (have journal handle) - * - journal_submit_inode_data_buffers (no journal handle) - * - shrink_page_list via the kswapd/direct reclaim (no journal handle) - * - grab_page_cache when doing write_begin (have journal handle) - * - * We don't do any block allocation in this function. If we have page with - * multiple blocks we need to write those buffer_heads that are mapped. This - * is important for mmaped based write. So if we do with blocksize 1K - * truncate(f, 1024); - * a = mmap(f, 0, 4096); - * a[0] = 'a'; - * truncate(f, 4096); - * we have in the page first buffer_head mapped via page_mkwrite call back - * but other buffer_heads would be unmapped but dirty (dirty done via the - * do_wp_page). So writepage should write the first block. If we modify - * the mmap area beyond 1024 we will again get a page_fault and the - * page_mkwrite callback will do the block allocation and mark the - * buffer_heads mapped. - * - * We redirty the page if we have any buffer_heads that is either delay or - * unwritten in the page. - * - * We can get recursively called as show below. - * - * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> - * ext4_writepage() - * - * But since we don't do any block allocation we should not deadlock. - * Page also have the dirty flag cleared so we don't get recurive page_lock. - */ -static int ext4_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct folio *folio = page_folio(page); - int ret = 0; - loff_t size; - unsigned int len; - struct buffer_head *page_bufs = NULL; - struct inode *inode = page->mapping->host; - struct ext4_io_submit io_submit; - - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { - folio_invalidate(folio, 0, folio_size(folio)); - folio_unlock(folio); - return -EIO; - } - - trace_ext4_writepage(page); - size = i_size_read(inode); - if (page->index == size >> PAGE_SHIFT && - !ext4_verity_in_progress(inode)) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; - - /* Should never happen but for bugs in other kernel subsystems */ - if (!page_has_buffers(page)) { - ext4_warning_inode(inode, - "page %lu does not have buffers attached", page->index); - ClearPageDirty(page); - unlock_page(page); - return 0; - } - - page_bufs = page_buffers(page); - /* - * We cannot do block allocation or other extent handling in this - * function. If there are buffers needing that, we have to redirty - * the page. But we may reach here when we do a journal commit via - * journal_submit_inode_data_buffers() and in that case we must write - * allocated buffers to achieve data=ordered mode guarantees. - * - * Also, if there is only one buffer per page (the fs block - * size == the page size), if one buffer needs block - * allocation or needs to modify the extent tree to clear the - * unwritten flag, we know that the page can't be written at - * all, so we might as well refuse the write immediately. - * Unfortunately if the block size != page size, we can't as - * easily detect this case using ext4_walk_page_buffers(), but - * for the extremely common case, this is an optimization that - * skips a useless round trip through ext4_bio_write_page(). - */ - if (ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL, - ext4_bh_delay_or_unwritten)) { - redirty_page_for_writepage(wbc, page); - if ((current->flags & PF_MEMALLOC) || - (inode->i_sb->s_blocksize == PAGE_SIZE)) { - /* - * For memory cleaning there's no point in writing only - * some buffers. So just bail out. 
Warn if we came here - * from direct reclaim. - */ - WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) - == PF_MEMALLOC); - unlock_page(page); - return 0; - } - } - - if (PageChecked(page) && ext4_should_journal_data(inode)) - /* - * It's mmapped pagecache. Add buffers and journal it. There - * doesn't seem much point in redirtying the page here. - */ - return __ext4_journalled_writepage(page, len); - - ext4_io_submit_init(&io_submit, wbc); - io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_submit.io_end) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return -ENOMEM; - } - ret = ext4_bio_write_page(&io_submit, page, len); - ext4_io_submit(&io_submit); - /* Drop io_end reference we got from init */ - ext4_put_io_end_defer(io_submit.io_end); - return ret; } static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) @@ -2129,7 +1899,6 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) err = ext4_bio_write_page(&mpd->io_submit, page, len); if (!err) mpd->wbc->nr_to_write--; - mpd->first_page++; return err; } @@ -2243,6 +2012,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, err = mpage_submit_page(mpd, head->b_page); if (err < 0) return err; + mpage_page_done(mpd, head->b_page); } if (lblk >= blocks) { mpd->scanned_until_end = 1; @@ -2374,6 +2144,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) err = mpage_submit_page(mpd, page); if (err < 0) goto out; + mpage_page_done(mpd, page); } folio_batch_release(&fbatch); } @@ -2572,6 +2343,50 @@ static bool ext4_page_nomap_can_writeout(struct page *page) return false; } +static int ext4_journal_page_buffers(handle_t *handle, struct page *page, + int len) +{ + struct buffer_head *page_bufs = page_buffers(page); + struct inode *inode = page->mapping->host; + int ret, err; + + ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, + NULL, do_journal_get_write_access); + err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, + NULL, write_end_fn); + if (ret == 0) + ret = err; + err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); + if (ret == 0) + ret = err; + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; + + ext4_set_inode_state(inode, EXT4_STATE_JDATA); + + return ret; +} + +static int mpage_journal_page_buffers(handle_t *handle, + struct mpage_da_data *mpd, + struct page *page) +{ + struct inode *inode = mpd->inode; + loff_t size = i_size_read(inode); + int len; + + ClearPageChecked(page); + clear_page_dirty_for_io(page); + mpd->wbc->nr_to_write--; + + if (page->index == size >> PAGE_SHIFT && + !ext4_verity_in_progress(inode)) + len = size & ~PAGE_MASK; + else + len = PAGE_SIZE; + + return ext4_journal_page_buffers(handle, page, len); +} + /* * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages * needing mapping, submit mapped pages @@ -2597,7 +2412,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) struct address_space *mapping = mpd->inode->i_mapping; struct pagevec pvec; unsigned int nr_pages; - long left = mpd->wbc->nr_to_write; pgoff_t index = mpd->first_page; pgoff_t end = mpd->last_page; xa_mark_t tag; @@ -2605,12 +2419,20 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; struct buffer_head *head; + handle_t *handle = NULL; + int bpp = ext4_journal_blocks_per_page(mpd->inode); if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = 
PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; + if (ext4_should_journal_data(mpd->inode)) { + handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, + bpp); + if (IS_ERR(handle)) + return PTR_ERR(handle); + } pagevec_init(&pvec); mpd->map.m_len = 0; mpd->next_page = index; @@ -2631,13 +2453,22 @@ * newly appeared dirty pages, but have not synced all * of the old dirty pages. */ - if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) + if (mpd->wbc->sync_mode == WB_SYNC_NONE && + mpd->wbc->nr_to_write <= + mpd->map.m_len >> (PAGE_SHIFT - blkbits)) goto out; /* If we can't merge this page, we are done. */ if (mpd->map.m_len > 0 && mpd->next_page != page->index) goto out; + if (handle) { + err = ext4_journal_ensure_credits(handle, bpp, + 0); + if (err < 0) + goto out; + } + lock_page(page); /* * If the page is no longer dirty, or its mapping no @@ -2677,18 +2508,30 @@ mpd->first_page = page->index; mpd->next_page = page->index + 1; /* - * Writeout for transaction commit where we cannot - * modify metadata is simple. Just submit the page. + * Writeout when we cannot modify metadata is simple. + * Just submit the page. For data=journal mode we + * first handle writeout of the page for checkpoint and + * only after that handle delayed page dirtying. This + * is crucial so that forcing a transaction commit and + * then calling filemap_write_and_wait() guarantees + * current state of data is in its final location. Such + * a sequence is used for example by insert/collapse + * range operations before discarding the page cache. */ if (!mpd->can_map) { if (ext4_page_nomap_can_writeout(page)) { err = mpage_submit_page(mpd, page); if (err < 0) goto out; - } else { - unlock_page(page); - mpd->first_page++; } + /* Pending dirtying of journalled data? 
*/ + if (PageChecked(page)) { + err = mpage_journal_page_buffers(handle, + mpd, page); + if (err < 0) + goto out; + } + mpage_page_done(mpd, page); } else { /* Add all dirty buffers to mpd */ lblk = ((ext4_lblk_t)page->index) << @@ -2700,24 +2543,21 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) goto out; err = 0; } - left--; } pagevec_release(&pvec); cond_resched(); } mpd->scanned_until_end = 1; + if (handle) + ext4_journal_stop(handle); return 0; out: pagevec_release(&pvec); + if (handle) + ext4_journal_stop(handle); return err; } -static int ext4_writepage_cb(struct page *page, struct writeback_control *wbc, - void *data) -{ - return ext4_writepage(page, wbc); -} - static int ext4_do_writepages(struct mpage_da_data *mpd) { struct writeback_control *wbc = mpd->wbc; @@ -2743,13 +2583,6 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) goto out_writepages; - if (ext4_should_journal_data(inode)) { - blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, ext4_writepage_cb, NULL); - blk_finish_plug(&plug); - goto out_writepages; - } - /* * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that @@ -2784,6 +2617,13 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) ext4_journal_stop(handle); } + /* + * data=journal mode does not do delalloc so we just need to writeout / + * journal already mapped buffers + */ + if (ext4_should_journal_data(inode)) + mpd->can_map = 0; + if (ext4_should_dioread_nolock(inode)) { /* * We may need to convert up to one extent per block in @@ -3160,9 +3000,8 @@ static int ext4_da_write_end(struct file *file, * i_disksize since writeback will push i_disksize upto i_size * eventually. If the end of the current write is > i_size and * inside an allocated block (ext4_da_should_update_i_disksize() - * check), we need to update i_disksize here as neither - * ext4_writepage() nor certain ext4_writepages() paths not - * allocating blocks update i_disksize. + * check), we need to update i_disksize here as certain + * ext4_writepages() paths not allocating blocks update i_disksize. * * Note that we defer inode dirtying to generic_write_end() / * ext4_da_write_inline_data_end(). @@ -3687,24 +3526,26 @@ const struct iomap_ops ext4_iomap_report_ops = { }; /* - * Whenever the folio is being dirtied, corresponding buffers should already - * be attached to the transaction (we take care of this in ext4_page_mkwrite() - * and ext4_write_begin()). However we cannot move buffers to dirty transaction - * lists here because ->dirty_folio is called under VFS locks and the folio - * is not necessarily locked. - * - * We cannot just dirty the folio and leave attached buffers clean, because the - * buffers' dirty state is "definitive". We cannot just set the buffers dirty - * or jbddirty because all the journalling code will explode. - * - * So what we do is to mark the folio "pending dirty" and next time writepage - * is called, propagate that into the buffers appropriately. + * For data=journal mode, folio should be marked dirty only when it was + * writeably mapped. When that happens, it was already attached to the + * transaction and marked as jbddirty (we take care of this in + * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings + * so we should have nothing to do here, except for the case when someone + * had the page pinned and dirtied the page through this pin (e.g. 
by doing + * direct IO to it). In that case we'd need to attach buffers here to the + * transaction but we cannot due to lock ordering. We cannot just dirty the + * folio and leave attached buffers clean, because the buffers' dirty state is + * "definitive". We cannot just set the buffers dirty or jbddirty because all + * the journalling code will explode. So what we do is to mark the folio + * "pending dirty" and next time ext4_writepages() is called, attach buffers + * to the transaction appropriately. */ static bool ext4_journalled_dirty_folio(struct address_space *mapping, struct folio *folio) { WARN_ON_ONCE(!folio_buffers(folio)); - folio_set_checked(folio); + if (folio_maybe_dma_pinned(folio)) + folio_set_checked(folio); return filemap_dirty_folio(mapping, folio); } @@ -4877,13 +4718,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, goto bad_inode; raw_inode = ext4_raw_inode(&iloc); - if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) { - ext4_error_inode(inode, function, line, 0, - "iget: root inode unallocated"); - ret = -EFSCORRUPTED; - goto bad_inode; - } - if ((flags & EXT4_IGET_HANDLE) && (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { ret = -ESTALE; @@ -4956,11 +4790,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, * NeilBrown 1999oct15 */ if (inode->i_nlink == 0) { - if ((inode->i_mode == 0 || + if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && ino != EXT4_BOOT_LOADER_INO) { - /* this inode is deleted */ - ret = -ESTALE; + /* this inode is deleted or unallocated */ + if (flags & EXT4_IGET_SPECIAL) { + ext4_error_inode(inode, function, line, 0, + "iget: special inode unallocated"); + ret = -EFSCORRUPTED; + } else + ret = -ESTALE; goto bad_inode; } /* The only unlinked inodes we let through here have @@ -5387,7 +5226,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) * If the folio is fully truncated, we don't need to wait for any commit * (and we even should not as __ext4_journalled_invalidate_folio() may * strip all buffers from the folio but keep the folio dirty which can then - * confuse e.g. concurrent ext4_writepage() seeing dirty folio without + * confuse e.g. concurrent ext4_writepages() seeing dirty folio without * buffers). Also we don't need to wait for any commit if all buffers in * the folio remain valid. This is most beneficial for the common case of * blocksize == PAGESIZE. 
@@ -5793,7 +5632,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; int idxblocks; - int ret = 0; + int ret; /* * How many index blocks need to touch to map @lblocks logical blocks @@ -6325,18 +6164,8 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) err = __block_write_begin(page, 0, len, ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; - if (ext4_walk_page_buffers(handle, inode, - page_buffers(page), 0, len, NULL, - do_journal_get_write_access)) - goto out_error; - if (ext4_walk_page_buffers(handle, inode, - page_buffers(page), 0, len, NULL, - write_end_fn)) - goto out_error; - if (ext4_jbd2_inode_add_write(handle, inode, - page_offset(page), len)) + if (ext4_journal_page_buffers(handle, page, len)) goto out_error; - ext4_set_inode_state(inode, EXT4_STATE_JDATA); } else { unlock_page(page); } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 8c2b1ff5e695..cc17205f7f49 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -155,9 +155,6 @@ static int ext4_update_backup_sb(struct super_block *sb, set_buffer_uptodate(bh); unlock_buffer(bh); - if (err) - goto out_bh; - if (handle) { err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 604ee458f31d..7cc3918e2f18 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3884,8 +3884,10 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, goto end_rename; } retval = ext4_rename_dir_prepare(handle, &old); - if (retval) + if (retval) { + inode_unlock(old.inode); goto end_rename; + } } /* * If we're renaming a file within an inline_data dir and adding or diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 1e4db96a04e6..8703fd732abb 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -502,7 +502,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, /* Nothing to submit? Just unlock the page... 
*/ if (!nr_to_submit) - goto unlock; + return 0; bh = head = page_buffers(page); @@ -550,7 +550,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, } bh = bh->b_this_page; } while (bh != head); - goto unlock; + + return ret; } } @@ -565,7 +566,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, continue; io_submit_add_bh(io, inode, page, bounce_page, bh); } while ((bh = bh->b_this_page) != head); -unlock: - unlock_page(page); - return ret; + + return 0; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e79ca9ef9831..a7495f4ef3e1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4751,7 +4751,6 @@ static int ext4_group_desc_init(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned int db_count; ext4_fsblk_t block; - int ret; int i; db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / @@ -4791,8 +4790,7 @@ static int ext4_group_desc_init(struct super_block *sb, ext4_msg(sb, KERN_ERR, "can't read group descriptor %d", i); sbi->s_gdb_count = i; - ret = PTR_ERR(bh); - goto out; + return PTR_ERR(bh); } rcu_read_lock(); rcu_dereference(sbi->s_group_desc)[i] = bh; @@ -4801,13 +4799,10 @@ static int ext4_group_desc_init(struct super_block *sb, sbi->s_gdb_count = db_count; if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); - ret = -EFSCORRUPTED; - goto out; + return -EFSCORRUPTED; } + return 0; -out: - ext4_group_desc_free(sbi); - return ret; } static int ext4_load_and_init_journal(struct super_block *sb, @@ -5234,14 +5229,14 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (ext4_geometry_check(sb, es)) goto failed_mount; - err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); - if (err) - goto failed_mount; - timer_setup(&sbi->s_err_report, print_daily_error_info, 0); spin_lock_init(&sbi->s_error_lock); INIT_WORK(&sbi->s_error_work, flush_stashed_error_work); + err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); + if (err) + goto failed_mount3; + /* Register extent status tree shrinker */ if (ext4_es_register_shrinker(sbi)) goto failed_mount3; @@ -5747,6 +5742,28 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, return journal_inode; } +static int ext4_journal_bmap(journal_t *journal, sector_t *block) +{ + struct ext4_map_blocks map; + int ret; + + if (journal->j_inode == NULL) + return 0; + + map.m_lblk = *block; + map.m_len = 1; + ret = ext4_map_blocks(NULL, journal->j_inode, &map, 0); + if (ret <= 0) { + ext4_msg(journal->j_inode->i_sb, KERN_CRIT, + "journal bmap failed: block %llu ret %d\n", + *block, ret); + jbd2_journal_abort(journal, ret ? 
ret : -EIO); + return ret; + } + *block = map.m_pblk; + return 0; +} + static journal_t *ext4_get_journal(struct super_block *sb, unsigned int journal_inum) { @@ -5767,6 +5784,7 @@ static journal_t *ext4_get_journal(struct super_block *sb, return NULL; } journal->j_private = sb; + journal->j_bmap = ext4_journal_bmap; ext4_init_journal_params(sb, journal); return journal; } @@ -5941,6 +5959,7 @@ static int ext4_load_journal(struct super_block *sb, err = jbd2_journal_wipe(journal, !really_read_only); if (!err) { char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); + if (save) memcpy(save, ((char *) es) + EXT4_S_ERR_START, EXT4_S_ERR_LEN); @@ -5949,6 +5968,14 @@ static int ext4_load_journal(struct super_block *sb, memcpy(((char *) es) + EXT4_S_ERR_START, save, EXT4_S_ERR_LEN); kfree(save); + es->s_state |= cpu_to_le16(EXT4_SB(sb)->s_mount_state & + EXT4_ERROR_FS); + /* Write out restored error information to the superblock */ + if (!bdev_read_only(sb->s_bdev)) { + int err2; + err2 = ext4_commit_super(sb); + err = err ? : err2; + } } if (err) { @@ -6178,11 +6205,13 @@ static int ext4_clear_journal_err(struct super_block *sb, errstr = ext4_decode_error(sb, j_errno, nbuf); ext4_warning(sb, "Filesystem error recorded " "from previous mount: %s", errstr); - ext4_warning(sb, "Marking fs in need of filesystem check."); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super(sb); + j_errno = ext4_commit_super(sb); + if (j_errno) + return j_errno; + ext4_warning(sb, "Marked fs in need of filesystem check."); jbd2_journal_clear_err(journal); jbd2_journal_update_sb_errno(journal); diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index e2b8b3437c58..12d6252e3e22 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -501,13 +501,13 @@ static const struct sysfs_ops ext4_attr_ops = { .store = ext4_attr_store, }; -static struct kobj_type ext4_sb_ktype = { +static const struct kobj_type ext4_sb_ktype = { .default_groups = ext4_groups, .sysfs_ops = &ext4_attr_ops, .release = ext4_sb_release, }; -static struct kobj_type ext4_feat_ktype = { +static const struct kobj_type ext4_feat_ktype = { .default_groups = ext4_feat_groups, .sysfs_ops = &ext4_attr_ops, .release = ext4_feat_release, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index f66c3fae9058..767454d74cd6 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -184,27 +184,73 @@ ext4_xattr_handler(int name_index) } static int -ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, - void *value_start) +check_xattrs(struct inode *inode, struct buffer_head *bh, + struct ext4_xattr_entry *entry, void *end, void *value_start, + const char *function, unsigned int line) { struct ext4_xattr_entry *e = entry; + int err = -EFSCORRUPTED; + char *err_str; + + if (bh) { + if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) { + err_str = "invalid header"; + goto errout; + } + if (buffer_verified(bh)) + return 0; + if (!ext4_xattr_block_csum_verify(inode, bh)) { + err = -EFSBADCRC; + err_str = "invalid checksum"; + goto errout; + } + } else { + struct ext4_xattr_ibody_header *header = value_start; + + header -= 1; + if (end - (void *)header < sizeof(*header) + sizeof(u32)) { + err_str = "in-inode xattr block too small"; + goto errout; + } + if (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { + err_str = "bad magic number in in-inode xattr"; + goto errout; + } + } /* Find the end of the names list */ while (!IS_LAST_ENTRY(e)) { struct ext4_xattr_entry *next = 
EXT4_XATTR_NEXT(e); - if ((void *)next >= end) - return -EFSCORRUPTED; - if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) - return -EFSCORRUPTED; + if ((void *)next >= end) { + err_str = "e_name out of bounds"; + goto errout; + } + if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) { + err_str = "bad e_name length"; + goto errout; + } e = next; } /* Check the values */ while (!IS_LAST_ENTRY(entry)) { u32 size = le32_to_cpu(entry->e_value_size); + unsigned long ea_ino = le32_to_cpu(entry->e_value_inum); - if (size > EXT4_XATTR_SIZE_MAX) - return -EFSCORRUPTED; + if (!ext4_has_feature_ea_inode(inode->i_sb) && ea_ino) { + err_str = "ea_inode specified without ea_inode feature enabled"; + goto errout; + } + if (ea_ino && ((ea_ino == EXT4_ROOT_INO) || + !ext4_valid_inum(inode->i_sb, ea_ino))) { + err_str = "invalid ea_ino"; + goto errout; + } + if (size > EXT4_XATTR_SIZE_MAX) { + err_str = "e_value size too large"; + goto errout; + } if (size != 0 && entry->e_value_inum == 0) { u16 offs = le16_to_cpu(entry->e_value_offs); @@ -216,66 +262,54 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, * the padded and unpadded sizes, since the size may * overflow to 0 when adding padding. */ - if (offs > end - value_start) - return -EFSCORRUPTED; + if (offs > end - value_start) { + err_str = "e_value out of bounds"; + goto errout; + } value = value_start + offs; if (value < (void *)e + sizeof(u32) || size > end - value || - EXT4_XATTR_SIZE(size) > end - value) - return -EFSCORRUPTED; + EXT4_XATTR_SIZE(size) > end - value) { + err_str = "overlapping e_value "; + goto errout; + } } entry = EXT4_XATTR_NEXT(entry); } - + if (bh) + set_buffer_verified(bh); return 0; + +errout: + if (bh) + __ext4_error_inode(inode, function, line, 0, -err, + "corrupted xattr block %llu: %s", + (unsigned long long) bh->b_blocknr, + err_str); + else + __ext4_error_inode(inode, function, line, 0, -err, + "corrupted in-inode xattr: %s", err_str); + return err; } static inline int __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, const char *function, unsigned int line) { - int error = -EFSCORRUPTED; - - if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || - BHDR(bh)->h_blocks != cpu_to_le32(1)) - goto errout; - if (buffer_verified(bh)) - return 0; - - error = -EFSBADCRC; - if (!ext4_xattr_block_csum_verify(inode, bh)) - goto errout; - error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size, - bh->b_data); -errout: - if (error) - __ext4_error_inode(inode, function, line, 0, -error, - "corrupted xattr block %llu", - (unsigned long long) bh->b_blocknr); - else - set_buffer_verified(bh); - return error; + return check_xattrs(inode, bh, BFIRST(bh), bh->b_data + bh->b_size, + bh->b_data, function, line); } #define ext4_xattr_check_block(inode, bh) \ __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) -static int +static inline int __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, void *end, const char *function, unsigned int line) { - int error = -EFSCORRUPTED; - - if (end - (void *)header < sizeof(*header) + sizeof(u32) || - (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))) - goto errout; - error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header)); -errout: - if (error) - __ext4_error_inode(inode, function, line, 0, -error, - "corrupted in-inode xattr"); - return error; + return check_xattrs(inode, NULL, IFIRST(header), end, IFIRST(header), + function, line); } #define xattr_check_inode(inode, header, end) \ diff 
--git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index e7537fd305dd..e191ecfb1fde 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -956,26 +956,40 @@ static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, goto out; } -static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos, - unsigned len) +static struct folio * +gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len) { + struct inode *inode = iter->inode; unsigned int blockmask = i_blocksize(inode) - 1; struct gfs2_sbd *sdp = GFS2_SB(inode); unsigned int blocks; + struct folio *folio; + int status; blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits; - return gfs2_trans_begin(sdp, RES_DINODE + blocks, 0); + status = gfs2_trans_begin(sdp, RES_DINODE + blocks, 0); + if (status) + return ERR_PTR(status); + + folio = iomap_get_folio(iter, pos); + if (IS_ERR(folio)) + gfs2_trans_end(sdp); + return folio; } -static void gfs2_iomap_page_done(struct inode *inode, loff_t pos, - unsigned copied, struct page *page) +static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos, + unsigned copied, struct folio *folio) { struct gfs2_trans *tr = current->journal_info; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - if (page && !gfs2_is_stuffed(ip)) - gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied); + if (!gfs2_is_stuffed(ip)) + gfs2_page_add_databufs(ip, &folio->page, offset_in_page(pos), + copied); + + folio_unlock(folio); + folio_put(folio); if (tr->tr_num_buf_new) __mark_inode_dirty(inode, I_DIRTY_DATASYNC); @@ -983,9 +997,9 @@ static void gfs2_iomap_page_done(struct inode *inode, loff_t pos, gfs2_trans_end(sdp); } -static const struct iomap_page_ops gfs2_iomap_page_ops = { - .page_prepare = gfs2_iomap_page_prepare, - .page_done = gfs2_iomap_page_done, +static const struct iomap_folio_ops gfs2_iomap_folio_ops = { + .get_folio = gfs2_iomap_get_folio, + .put_folio = gfs2_iomap_put_folio, }; static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, @@ -1061,7 +1075,7 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, } if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip)) - iomap->page_ops = &gfs2_iomap_page_ops; + iomap->folio_ops = &gfs2_iomap_folio_ops; return 0; out_trans_end: @@ -1277,7 +1291,7 @@ int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock, /* * NOTE: Never call gfs2_block_zero_range with an open transaction because it * uses iomap write to perform its actions, which begin their own transactions - * (iomap_begin, page_prepare, etc.) + * (iomap_begin, get_folio, etc.) */ static int gfs2_block_zero_range(struct inode *inode, loff_t from, unsigned int length) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 356193e44cf0..d3c300563eb8 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -457,6 +457,33 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) } EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); +/** + * iomap_get_folio - get a folio reference for writing + * @iter: iteration structure + * @pos: start offset of write + * + * Returns a locked reference to the folio at @pos, or an error pointer if the + * folio could not be obtained. 
+ */ +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos) +{ + unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; + struct folio *folio; + + if (iter->flags & IOMAP_NOWAIT) + fgp |= FGP_NOWAIT; + + folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, + fgp, mapping_gfp_mask(iter->inode->i_mapping)); + if (folio) + return folio; + + if (iter->flags & IOMAP_NOWAIT) + return ERR_PTR(-EAGAIN); + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL_GPL(iomap_get_folio); + bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags) { trace_iomap_release_folio(folio->mapping->host, folio_pos(folio), @@ -575,6 +602,30 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, return 0; } +static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos, + size_t len) +{ + const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; + + if (folio_ops && folio_ops->get_folio) + return folio_ops->get_folio(iter, pos, len); + else + return iomap_get_folio(iter, pos); +} + +static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret, + struct folio *folio) +{ + const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; + + if (folio_ops && folio_ops->put_folio) { + folio_ops->put_folio(iter->inode, pos, ret, folio); + } else { + folio_unlock(folio); + folio_put(folio); + } +} + static int iomap_write_begin_inline(const struct iomap_iter *iter, struct folio *folio) { @@ -587,15 +638,11 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter, static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, size_t len, struct folio **foliop) { - const struct iomap_page_ops *page_ops = iter->iomap.page_ops; + const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; const struct iomap *srcmap = iomap_iter_srcmap(iter); struct folio *folio; - unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; int status = 0; - if (iter->flags & IOMAP_NOWAIT) - fgp |= FGP_NOWAIT; - BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); if (srcmap != &iter->iomap) BUG_ON(pos + len > srcmap->offset + srcmap->length); @@ -606,18 +653,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, if (!mapping_large_folio_support(iter->inode->i_mapping)) len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); - if (page_ops && page_ops->page_prepare) { - status = page_ops->page_prepare(iter->inode, pos, len); - if (status) - return status; - } - - folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, - fgp, mapping_gfp_mask(iter->inode->i_mapping)); - if (!folio) { - status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM; - goto out_no_page; - } + folio = __iomap_get_folio(iter, pos, len); + if (IS_ERR(folio)) + return PTR_ERR(folio); /* * Now we have a locked folio, before we do anything with it we need to @@ -629,9 +667,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, * could do the wrong thing here (zero a page range incorrectly or fail * to zero) and corrupt data. 
*/ - if (page_ops && page_ops->iomap_valid) { - bool iomap_valid = page_ops->iomap_valid(iter->inode, - &iter->iomap); + if (folio_ops && folio_ops->iomap_valid) { + bool iomap_valid = folio_ops->iomap_valid(iter->inode, + &iter->iomap); if (!iomap_valid) { iter->iomap.flags |= IOMAP_F_STALE; status = 0; @@ -656,13 +694,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, return 0; out_unlock: - folio_unlock(folio); - folio_put(folio); + __iomap_put_folio(iter, pos, 0, folio); iomap_write_failed(iter->inode, pos, len); -out_no_page: - if (page_ops && page_ops->page_done) - page_ops->page_done(iter->inode, pos, 0, NULL); return status; } @@ -712,7 +746,6 @@ static size_t iomap_write_end_inline(const struct iomap_iter *iter, static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, size_t copied, struct folio *folio) { - const struct iomap_page_ops *page_ops = iter->iomap.page_ops; const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t old_size = iter->inode->i_size; size_t ret; @@ -735,14 +768,10 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, i_size_write(iter->inode, pos + ret); iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; } - folio_unlock(folio); + __iomap_put_folio(iter, pos, ret, folio); if (old_size < pos) pagecache_isize_extended(iter->inode, old_size, pos); - if (page_ops && page_ops->page_done) - page_ops->page_done(iter->inode, pos, ret, &folio->page); - folio_put(folio); - if (ret < len) iomap_write_failed(iter->inode, pos + ret, len - ret); return ret; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 9804714b1751..f771001574d0 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -217,16 +217,10 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, { blk_opf_t opflags = REQ_SYNC | REQ_IDLE; - if (!(dio->flags & IOMAP_DIO_WRITE)) { - WARN_ON_ONCE(iomap->flags & IOMAP_F_ZONE_APPEND); + if (!(dio->flags & IOMAP_DIO_WRITE)) return REQ_OP_READ; - } - - if (iomap->flags & IOMAP_F_ZONE_APPEND) - opflags |= REQ_OP_ZONE_APPEND; - else - opflags |= REQ_OP_WRITE; + opflags |= REQ_OP_WRITE; if (use_fua) opflags |= REQ_FUA; else diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 2696f43e7239..c84f588fdcd0 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -970,10 +970,13 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, { int err = 0; unsigned long long ret; - sector_t block = 0; + sector_t block = blocknr; - if (journal->j_inode) { - block = blocknr; + if (journal->j_bmap) { + err = journal->j_bmap(journal, &block); + if (err == 0) + *retp = block; + } else if (journal->j_inode) { ret = bmap(journal->j_inode, &block); if (ret || !block) { diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 989cf341779b..f8ff81c3de76 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2472,20 +2472,20 @@ xfs_defer_agfl_block( struct xfs_owner_info *oinfo) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_extent_free_item *new; /* new element */ + struct xfs_extent_free_item *xefi; ASSERT(xfs_extfree_item_cache != NULL); ASSERT(oinfo != NULL); - new = kmem_cache_zalloc(xfs_extfree_item_cache, + xefi = kmem_cache_zalloc(xfs_extfree_item_cache, GFP_KERNEL | __GFP_NOFAIL); - new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); - new->xefi_blockcount = 1; - new->xefi_owner = oinfo->oi_owner; + xefi->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); + xefi->xefi_blockcount = 1; + xefi->xefi_owner = oinfo->oi_owner; 
trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list); + xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); } /* @@ -2500,7 +2500,7 @@ __xfs_free_extent_later( const struct xfs_owner_info *oinfo, bool skip_discard) { - struct xfs_extent_free_item *new; /* new element */ + struct xfs_extent_free_item *xefi; #ifdef DEBUG struct xfs_mount *mp = tp->t_mountp; xfs_agnumber_t agno; @@ -2519,27 +2519,27 @@ __xfs_free_extent_later( #endif ASSERT(xfs_extfree_item_cache != NULL); - new = kmem_cache_zalloc(xfs_extfree_item_cache, + xefi = kmem_cache_zalloc(xfs_extfree_item_cache, GFP_KERNEL | __GFP_NOFAIL); - new->xefi_startblock = bno; - new->xefi_blockcount = (xfs_extlen_t)len; + xefi->xefi_startblock = bno; + xefi->xefi_blockcount = (xfs_extlen_t)len; if (skip_discard) - new->xefi_flags |= XFS_EFI_SKIP_DISCARD; + xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD; if (oinfo) { ASSERT(oinfo->oi_offset == 0); if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) - new->xefi_flags |= XFS_EFI_ATTR_FORK; + xefi->xefi_flags |= XFS_EFI_ATTR_FORK; if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) - new->xefi_flags |= XFS_EFI_BMBT_BLOCK; - new->xefi_owner = oinfo->oi_owner; + xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK; + xefi->xefi_owner = oinfo->oi_owner; } else { - new->xefi_owner = XFS_RMAP_OWN_NULL; + xefi->xefi_owner = XFS_RMAP_OWN_NULL; } trace_xfs_bmap_free_defer(tp->t_mountp, XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list); + xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); } #ifdef DEBUG diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 0d56a8d862e8..c8c65387136c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6146,39 +6146,37 @@ xfs_bmap_unmap_extent( int xfs_bmap_finish_one( struct xfs_trans *tp, - struct xfs_inode *ip, - enum xfs_bmap_intent_type type, - int whichfork, - xfs_fileoff_t startoff, - xfs_fsblock_t startblock, - xfs_filblks_t *blockcount, - xfs_exntst_t state) + struct xfs_bmap_intent *bi) { + struct xfs_bmbt_irec *bmap = &bi->bi_bmap; int error = 0; ASSERT(tp->t_firstblock == NULLFSBLOCK); trace_xfs_bmap_deferred(tp->t_mountp, - XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, - XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), - ip->i_ino, whichfork, startoff, *blockcount, state); + XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock), + bi->bi_type, + XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock), + bi->bi_owner->i_ino, bi->bi_whichfork, + bmap->br_startoff, bmap->br_blockcount, + bmap->br_state); - if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK)) + if (WARN_ON_ONCE(bi->bi_whichfork != XFS_DATA_FORK)) return -EFSCORRUPTED; if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE)) return -EIO; - switch (type) { + switch (bi->bi_type) { case XFS_BMAP_MAP: - error = xfs_bmapi_remap(tp, ip, startoff, *blockcount, - startblock, 0); - *blockcount = 0; + error = xfs_bmapi_remap(tp, bi->bi_owner, bmap->br_startoff, + bmap->br_blockcount, bmap->br_startblock, 0); + bmap->br_blockcount = 0; break; case XFS_BMAP_UNMAP: - error = __xfs_bunmapi(tp, ip, startoff, blockcount, - XFS_BMAPI_REMAP, 1); + error = __xfs_bunmapi(tp, bi->bi_owner, bmap->br_startoff, + &bmap->br_blockcount, XFS_BMAPI_REMAP, 1); break; default: ASSERT(0); diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 16db95b11589..01c2df35c3e3 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ 
b/fs/xfs/libxfs/xfs_bmap.h @@ -234,10 +234,7 @@ struct xfs_bmap_intent { struct xfs_bmbt_irec bi_bmap; }; -int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_inode *ip, - enum xfs_bmap_intent_type type, int whichfork, - xfs_fileoff_t startoff, xfs_fsblock_t startblock, - xfs_filblks_t *blockcount, xfs_exntst_t state); +int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi); void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 35f574421670..da8c769887fd 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2913,9 +2913,22 @@ xfs_btree_split_worker( } /* - * BMBT split requests often come in with little stack to work on. Push + * BMBT split requests often come in with little stack to work on so we push * them off to a worker thread so there is lots of stack to use. For the other * btree types, just call directly to avoid the context switch overhead here. + * + * Care must be taken here - the work queue rescuer thread introduces potential + * AGF <> worker queue deadlocks if the BMBT block allocation has to lock new + * AGFs to allocate blocks. A task being run by the rescuer could attempt to + * lock an AGF that is already locked by a task queued to run by the rescuer, + * resulting in an ABBA deadlock as the rescuer cannot run the lock holder to + * release it until the current thread it is running gains the lock. + * + * To avoid this issue, we only ever queue BMBT splits that don't have an AGF + * already locked to allocate from. The only place that doesn't hold an AGF + * locked is unwritten extent conversion at IO completion, but that has already + * been offloaded to a worker thread and hence has no stack consumption issues + * we have to worry about. */ STATIC int /* error */ xfs_btree_split( @@ -2929,7 +2942,8 @@ xfs_btree_split( struct xfs_btree_split_args args; DECLARE_COMPLETION_ONSTACK(done); - if (cur->bc_btnum != XFS_BTNUM_BMAP) + if (cur->bc_btnum != XFS_BTNUM_BMAP || + cur->bc_tp->t_firstblock == NULLFSBLOCK) return __xfs_btree_split(cur, level, ptrp, key, curp, stat); args.cur = cur; diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 6f7ed9288fe4..bcf46aa0d08b 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1213,37 +1213,33 @@ xfs_refcount_adjust_extents( STATIC int xfs_refcount_adjust( struct xfs_btree_cur *cur, - xfs_agblock_t agbno, - xfs_extlen_t aglen, - xfs_agblock_t *new_agbno, - xfs_extlen_t *new_aglen, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen, enum xfs_refc_adjust_op adj) { bool shape_changed; int shape_changes = 0; int error; - *new_agbno = agbno; - *new_aglen = aglen; if (adj == XFS_REFCOUNT_ADJUST_INCREASE) - trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.pag->pag_agno, - agbno, aglen); + trace_xfs_refcount_increase(cur->bc_mp, + cur->bc_ag.pag->pag_agno, *agbno, *aglen); else - trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.pag->pag_agno, - agbno, aglen); + trace_xfs_refcount_decrease(cur->bc_mp, + cur->bc_ag.pag->pag_agno, *agbno, *aglen); /* * Ensure that no rcextents cross the boundary of the adjustment range. 
*/ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, - agbno, &shape_changed); + *agbno, &shape_changed); if (error) goto out_error; if (shape_changed) shape_changes++; error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, - agbno + aglen, &shape_changed); + *agbno + *aglen, &shape_changed); if (error) goto out_error; if (shape_changed) @@ -1253,7 +1249,7 @@ xfs_refcount_adjust( * Try to merge with the left or right extents of the range. */ error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_SHARED, - new_agbno, new_aglen, adj, &shape_changed); + agbno, aglen, adj, &shape_changed); if (error) goto out_error; if (shape_changed) @@ -1262,7 +1258,7 @@ xfs_refcount_adjust( cur->bc_ag.refc.shape_changes++; /* Now that we've taken care of the ends, adjust the middle extents */ - error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, adj); + error = xfs_refcount_adjust_extents(cur, agbno, aglen, adj); if (error) goto out_error; @@ -1298,21 +1294,20 @@ xfs_refcount_finish_one_cleanup( static inline int xfs_refcount_continue_op( struct xfs_btree_cur *cur, - xfs_fsblock_t startblock, - xfs_agblock_t new_agbno, - xfs_extlen_t new_len, - xfs_fsblock_t *new_fsbno) + struct xfs_refcount_intent *ri, + xfs_agblock_t new_agbno) { struct xfs_mount *mp = cur->bc_mp; struct xfs_perag *pag = cur->bc_ag.pag; - if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, new_len))) + if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, + ri->ri_blockcount))) return -EFSCORRUPTED; - *new_fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + ri->ri_startblock = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); - ASSERT(xfs_verify_fsbext(mp, *new_fsbno, new_len)); - ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, *new_fsbno)); + ASSERT(xfs_verify_fsbext(mp, ri->ri_startblock, ri->ri_blockcount)); + ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); return 0; } @@ -1327,11 +1322,7 @@ xfs_refcount_continue_op( int xfs_refcount_finish_one( struct xfs_trans *tp, - enum xfs_refcount_intent_type type, - xfs_fsblock_t startblock, - xfs_extlen_t blockcount, - xfs_fsblock_t *new_fsb, - xfs_extlen_t *new_len, + struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur) { struct xfs_mount *mp = tp->t_mountp; @@ -1339,17 +1330,16 @@ xfs_refcount_finish_one( struct xfs_buf *agbp = NULL; int error = 0; xfs_agblock_t bno; - xfs_agblock_t new_agbno; unsigned long nr_ops = 0; int shape_changes = 0; struct xfs_perag *pag; - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock)); - bno = XFS_FSB_TO_AGBNO(mp, startblock); + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); + bno = XFS_FSB_TO_AGBNO(mp, ri->ri_startblock); - trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock), - type, XFS_FSB_TO_AGBNO(mp, startblock), - blockcount); + trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock), + ri->ri_type, XFS_FSB_TO_AGBNO(mp, ri->ri_startblock), + ri->ri_blockcount); if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) { error = -EIO; @@ -1380,42 +1370,42 @@ xfs_refcount_finish_one( } *pcur = rcur; - switch (type) { + switch (ri->ri_type) { case XFS_REFCOUNT_INCREASE: - error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, - new_len, XFS_REFCOUNT_ADJUST_INCREASE); + error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, + XFS_REFCOUNT_ADJUST_INCREASE); if (error) goto out_drop; - if (*new_len > 0) - error = xfs_refcount_continue_op(rcur, startblock, - new_agbno, *new_len, new_fsb); + if (ri->ri_blockcount > 0) + error = 
xfs_refcount_continue_op(rcur, ri, bno); break; case XFS_REFCOUNT_DECREASE: - error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, - new_len, XFS_REFCOUNT_ADJUST_DECREASE); + error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, + XFS_REFCOUNT_ADJUST_DECREASE); if (error) goto out_drop; - if (*new_len > 0) - error = xfs_refcount_continue_op(rcur, startblock, - new_agbno, *new_len, new_fsb); + if (ri->ri_blockcount > 0) + error = xfs_refcount_continue_op(rcur, ri, bno); break; case XFS_REFCOUNT_ALLOC_COW: - *new_fsb = startblock + blockcount; - *new_len = 0; - error = __xfs_refcount_cow_alloc(rcur, bno, blockcount); + error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount); + if (error) + goto out_drop; + ri->ri_blockcount = 0; break; case XFS_REFCOUNT_FREE_COW: - *new_fsb = startblock + blockcount; - *new_len = 0; - error = __xfs_refcount_cow_free(rcur, bno, blockcount); + error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount); + if (error) + goto out_drop; + ri->ri_blockcount = 0; break; default: ASSERT(0); error = -EFSCORRUPTED; } - if (!error && *new_len > 0) - trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, type, - bno, blockcount, new_agbno, *new_len); + if (!error && ri->ri_blockcount > 0) + trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, + ri->ri_type, bno, ri->ri_blockcount); out_drop: xfs_perag_put(pag); return error; diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 452f30556f5a..c633477ce3ce 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -75,9 +75,7 @@ void xfs_refcount_decrease_extent(struct xfs_trans *tp, extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, struct xfs_btree_cur *rcur, int error); extern int xfs_refcount_finish_one(struct xfs_trans *tp, - enum xfs_refcount_intent_type type, xfs_fsblock_t startblock, - xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb, - xfs_extlen_t *new_len, struct xfs_btree_cur **pcur); + struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur); extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index b56aca1e7c66..df720041cd3d 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2390,13 +2390,7 @@ xfs_rmap_finish_one_cleanup( int xfs_rmap_finish_one( struct xfs_trans *tp, - enum xfs_rmap_intent_type type, - uint64_t owner, - int whichfork, - xfs_fileoff_t startoff, - xfs_fsblock_t startblock, - xfs_filblks_t blockcount, - xfs_exntst_t state, + struct xfs_rmap_intent *ri, struct xfs_btree_cur **pcur) { struct xfs_mount *mp = tp->t_mountp; @@ -2408,11 +2402,13 @@ xfs_rmap_finish_one( xfs_agblock_t bno; bool unwritten; - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock)); - bno = XFS_FSB_TO_AGBNO(mp, startblock); + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock)); + bno = XFS_FSB_TO_AGBNO(mp, ri->ri_bmap.br_startblock); - trace_xfs_rmap_deferred(mp, pag->pag_agno, type, bno, owner, whichfork, - startoff, blockcount, state); + trace_xfs_rmap_deferred(mp, pag->pag_agno, ri->ri_type, bno, + ri->ri_owner, ri->ri_whichfork, + ri->ri_bmap.br_startoff, ri->ri_bmap.br_blockcount, + ri->ri_bmap.br_state); if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) { error = -EIO; @@ -2448,35 +2444,37 @@ xfs_rmap_finish_one( } *pcur = rcur; - xfs_rmap_ino_owner(&oinfo, owner, whichfork, startoff); - unwritten = state == XFS_EXT_UNWRITTEN; - bno = 
XFS_FSB_TO_AGBNO(rcur->bc_mp, startblock); + xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork, + ri->ri_bmap.br_startoff); + unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN; + bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, ri->ri_bmap.br_startblock); - switch (type) { + switch (ri->ri_type) { case XFS_RMAP_ALLOC: case XFS_RMAP_MAP: - error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo); + error = xfs_rmap_map(rcur, bno, ri->ri_bmap.br_blockcount, + unwritten, &oinfo); break; case XFS_RMAP_MAP_SHARED: - error = xfs_rmap_map_shared(rcur, bno, blockcount, unwritten, - &oinfo); + error = xfs_rmap_map_shared(rcur, bno, + ri->ri_bmap.br_blockcount, unwritten, &oinfo); break; case XFS_RMAP_FREE: case XFS_RMAP_UNMAP: - error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten, - &oinfo); + error = xfs_rmap_unmap(rcur, bno, ri->ri_bmap.br_blockcount, + unwritten, &oinfo); break; case XFS_RMAP_UNMAP_SHARED: - error = xfs_rmap_unmap_shared(rcur, bno, blockcount, unwritten, - &oinfo); + error = xfs_rmap_unmap_shared(rcur, bno, + ri->ri_bmap.br_blockcount, unwritten, &oinfo); break; case XFS_RMAP_CONVERT: - error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten, - &oinfo); + error = xfs_rmap_convert(rcur, bno, ri->ri_bmap.br_blockcount, + !unwritten, &oinfo); break; case XFS_RMAP_CONVERT_SHARED: - error = xfs_rmap_convert_shared(rcur, bno, blockcount, - !unwritten, &oinfo); + error = xfs_rmap_convert_shared(rcur, bno, + ri->ri_bmap.br_blockcount, !unwritten, &oinfo); break; default: ASSERT(0); diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 54741a591a17..2dac88cea28d 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -179,10 +179,8 @@ void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, struct xfs_btree_cur *rcur, int error); -int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type, - uint64_t owner, int whichfork, xfs_fileoff_t startoff, - xfs_fsblock_t startblock, xfs_filblks_t blockcount, - xfs_exntst_t state, struct xfs_btree_cur **pcur); +int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri, + struct xfs_btree_cur **pcur); int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, uint64_t owner, uint64_t offset, unsigned int flags, diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 41734202796f..2ef78aa1d3f6 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -449,15 +449,17 @@ xfs_prepare_ioend( } /* - * If the page has delalloc blocks on it, we need to punch them out before we - * invalidate the page. If we don't, we leave a stale delalloc mapping on the - * inode that can trip up a later direct I/O read operation on the same region. + * If the folio has delalloc blocks on it, the caller is asking us to punch them + * out. If we don't, we can leave a stale delalloc mapping covered by a clean + * page that needs to be dirtied again before the delalloc mapping can be + * converted. This stale delalloc mapping can trip up a later direct I/O read + * operation on the same region. * - * We prevent this by truncating away the delalloc regions on the page. Because + * We prevent this by truncating away the delalloc regions on the folio. Because * they are delalloc, we can do this without needing a transaction. 
Indeed - if * we get ENOSPC errors, we have to be able to do this truncation without a - * transaction as there is no space left for block reservation (typically why we - * see a ENOSPC in writeback). + * transaction as there is no space left for block reservation (typically why + * we see a ENOSPC in writeback). */ static void xfs_discard_folio( @@ -475,8 +477,13 @@ xfs_discard_folio( "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.", folio, ip->i_ino, pos); + /* + * The end of the punch range is always the offset of the the first + * byte of the next folio. Hence the end offset is only dependent on the + * folio itself and not the start offset that is passed in. + */ error = xfs_bmap_punch_delalloc_range(ip, pos, - round_up(pos, folio_size(folio))); + folio_pos(folio) + folio_size(folio)); if (error && !xfs_is_shutdown(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 41323da523d1..6e2f0013380a 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -246,18 +246,11 @@ static int xfs_trans_log_finish_bmap_update( struct xfs_trans *tp, struct xfs_bud_log_item *budp, - enum xfs_bmap_intent_type type, - struct xfs_inode *ip, - int whichfork, - xfs_fileoff_t startoff, - xfs_fsblock_t startblock, - xfs_filblks_t *blockcount, - xfs_exntst_t state) + struct xfs_bmap_intent *bi) { int error; - error = xfs_bmap_finish_one(tp, ip, type, whichfork, startoff, - startblock, blockcount, state); + error = xfs_bmap_finish_one(tp, bi); /* * Mark the transaction dirty, even on error. This ensures the @@ -290,24 +283,24 @@ xfs_bmap_update_diff_items( /* Set the map extent flags for this mapping. */ static void xfs_trans_set_bmap_flags( - struct xfs_map_extent *bmap, + struct xfs_map_extent *map, enum xfs_bmap_intent_type type, int whichfork, xfs_exntst_t state) { - bmap->me_flags = 0; + map->me_flags = 0; switch (type) { case XFS_BMAP_MAP: case XFS_BMAP_UNMAP: - bmap->me_flags = type; + map->me_flags = type; break; default: ASSERT(0); } if (state == XFS_EXT_UNWRITTEN) - bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; + map->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; if (whichfork == XFS_ATTR_FORK) - bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; + map->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; } /* Log bmap updates in the intent item. 
*/ @@ -315,7 +308,7 @@ STATIC void xfs_bmap_update_log_item( struct xfs_trans *tp, struct xfs_bui_log_item *buip, - struct xfs_bmap_intent *bmap) + struct xfs_bmap_intent *bi) { uint next_extent; struct xfs_map_extent *map; @@ -331,12 +324,12 @@ xfs_bmap_update_log_item( next_extent = atomic_inc_return(&buip->bui_next_extent) - 1; ASSERT(next_extent < buip->bui_format.bui_nextents); map = &buip->bui_format.bui_extents[next_extent]; - map->me_owner = bmap->bi_owner->i_ino; - map->me_startblock = bmap->bi_bmap.br_startblock; - map->me_startoff = bmap->bi_bmap.br_startoff; - map->me_len = bmap->bi_bmap.br_blockcount; - xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork, - bmap->bi_bmap.br_state); + map->me_owner = bi->bi_owner->i_ino; + map->me_startblock = bi->bi_bmap.br_startblock; + map->me_startoff = bi->bi_bmap.br_startoff; + map->me_len = bi->bi_bmap.br_blockcount; + xfs_trans_set_bmap_flags(map, bi->bi_type, bi->bi_whichfork, + bi->bi_bmap.br_state); } static struct xfs_log_item * @@ -348,15 +341,15 @@ xfs_bmap_update_create_intent( { struct xfs_mount *mp = tp->t_mountp; struct xfs_bui_log_item *buip = xfs_bui_init(mp); - struct xfs_bmap_intent *bmap; + struct xfs_bmap_intent *bi; ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); xfs_trans_add_item(tp, &buip->bui_item); if (sort) list_sort(mp, items, xfs_bmap_update_diff_items); - list_for_each_entry(bmap, items, bi_list) - xfs_bmap_update_log_item(tp, buip, bmap); + list_for_each_entry(bi, items, bi_list) + xfs_bmap_update_log_item(tp, buip, bi); return &buip->bui_item; } @@ -378,25 +371,17 @@ xfs_bmap_update_finish_item( struct list_head *item, struct xfs_btree_cur **state) { - struct xfs_bmap_intent *bmap; - xfs_filblks_t count; + struct xfs_bmap_intent *bi; int error; - bmap = container_of(item, struct xfs_bmap_intent, bi_list); - count = bmap->bi_bmap.br_blockcount; - error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), - bmap->bi_type, - bmap->bi_owner, bmap->bi_whichfork, - bmap->bi_bmap.br_startoff, - bmap->bi_bmap.br_startblock, - &count, - bmap->bi_bmap.br_state); - if (!error && count > 0) { - ASSERT(bmap->bi_type == XFS_BMAP_UNMAP); - bmap->bi_bmap.br_blockcount = count; + bi = container_of(item, struct xfs_bmap_intent, bi_list); + + error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bi); + if (!error && bi->bi_bmap.br_blockcount > 0) { + ASSERT(bi->bi_type == XFS_BMAP_UNMAP); return -EAGAIN; } - kmem_cache_free(xfs_bmap_intent_cache, bmap); + kmem_cache_free(xfs_bmap_intent_cache, bi); return error; } @@ -413,10 +398,10 @@ STATIC void xfs_bmap_update_cancel_item( struct list_head *item) { - struct xfs_bmap_intent *bmap; + struct xfs_bmap_intent *bi; - bmap = container_of(item, struct xfs_bmap_intent, bi_list); - kmem_cache_free(xfs_bmap_intent_cache, bmap); + bi = container_of(item, struct xfs_bmap_intent, bi_list); + kmem_cache_free(xfs_bmap_intent_cache, bi); } const struct xfs_defer_op_type xfs_bmap_update_defer_type = { @@ -434,18 +419,18 @@ xfs_bui_validate( struct xfs_mount *mp, struct xfs_bui_log_item *buip) { - struct xfs_map_extent *bmap; + struct xfs_map_extent *map; /* Only one mapping operation per BUI... 
*/ if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) return false; - bmap = &buip->bui_format.bui_extents[0]; + map = &buip->bui_format.bui_extents[0]; - if (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS) + if (map->me_flags & ~XFS_BMAP_EXTENT_FLAGS) return false; - switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { + switch (map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { case XFS_BMAP_MAP: case XFS_BMAP_UNMAP: break; @@ -453,13 +438,13 @@ xfs_bui_validate( return false; } - if (!xfs_verify_ino(mp, bmap->me_owner)) + if (!xfs_verify_ino(mp, map->me_owner)) return false; - if (!xfs_verify_fileext(mp, bmap->me_startoff, bmap->me_len)) + if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len)) return false; - return xfs_verify_fsbext(mp, bmap->me_startblock, bmap->me_len); + return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } /* @@ -471,17 +456,13 @@ xfs_bui_item_recover( struct xfs_log_item *lip, struct list_head *capture_list) { - struct xfs_bmbt_irec irec; + struct xfs_bmap_intent fake = { }; struct xfs_bui_log_item *buip = BUI_ITEM(lip); struct xfs_trans *tp; struct xfs_inode *ip = NULL; struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_map_extent *bmap; + struct xfs_map_extent *map; struct xfs_bud_log_item *budp; - xfs_filblks_t count; - xfs_exntst_t state; - unsigned int bui_type; - int whichfork; int iext_delta; int error = 0; @@ -491,14 +472,12 @@ xfs_bui_item_recover( return -EFSCORRUPTED; } - bmap = &buip->bui_format.bui_extents[0]; - state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? + map = &buip->bui_format.bui_extents[0]; + fake.bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; - bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + fake.bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; - error = xlog_recover_iget(mp, bmap->me_owner, &ip); + error = xlog_recover_iget(mp, map->me_owner, &ip); if (error) return error; @@ -512,34 +491,34 @@ xfs_bui_item_recover( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - if (bui_type == XFS_BMAP_MAP) + if (fake.bi_type == XFS_BMAP_MAP) iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT; else iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; - error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta); + error = xfs_iext_count_may_overflow(ip, fake.bi_whichfork, iext_delta); if (error == -EFBIG) error = xfs_iext_count_upgrade(tp, ip, iext_delta); if (error) goto err_cancel; - count = bmap->me_len; - error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip, - whichfork, bmap->me_startoff, bmap->me_startblock, - &count, state); + fake.bi_owner = ip; + fake.bi_bmap.br_startblock = map->me_startblock; + fake.bi_bmap.br_startoff = map->me_startoff; + fake.bi_bmap.br_blockcount = map->me_len; + fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? 
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + + error = xfs_trans_log_finish_bmap_update(tp, budp, &fake); if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bmap, - sizeof(*bmap)); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map, + sizeof(*map)); if (error) goto err_cancel; - if (count > 0) { - ASSERT(bui_type == XFS_BMAP_UNMAP); - irec.br_startblock = bmap->me_startblock; - irec.br_blockcount = count; - irec.br_startoff = bmap->me_startoff; - irec.br_state = state; - xfs_bmap_unmap_extent(tp, ip, &irec); + if (fake.bi_bmap.br_blockcount > 0) { + ASSERT(fake.bi_type == XFS_BMAP_UNMAP); + xfs_bmap_unmap_extent(tp, ip, &fake.bi_bmap); } /* @@ -579,18 +558,18 @@ xfs_bui_item_relog( { struct xfs_bud_log_item *budp; struct xfs_bui_log_item *buip; - struct xfs_map_extent *extp; + struct xfs_map_extent *map; unsigned int count; count = BUI_ITEM(intent)->bui_format.bui_nextents; - extp = BUI_ITEM(intent)->bui_format.bui_extents; + map = BUI_ITEM(intent)->bui_format.bui_extents; tp->t_flags |= XFS_TRANS_DIRTY; budp = xfs_trans_get_bud(tp, BUI_ITEM(intent)); set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); buip = xfs_bui_init(tp->t_mountp); - memcpy(buip->bui_format.bui_extents, extp, count * sizeof(*extp)); + memcpy(buip->bui_format.bui_extents, map, count * sizeof(*map)); atomic_set(&buip->bui_next_extent, count); xfs_trans_add_item(tp, &buip->bui_item); set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index ae082808cfed..b2cbbba3e15a 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -228,7 +228,7 @@ static struct attribute *xfs_errortag_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_errortag); -static struct kobj_type xfs_errortag_ktype = { +static const struct kobj_type xfs_errortag_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_errortag_sysfs_ops, .default_groups = xfs_errortag_groups, diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index dbe6c37dc697..0b9c5ba8a598 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -75,7 +75,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); /* * XFS panic tags -- allow a call to xfs_alert_tag() be turned into - * a panic by setting xfs_panic_mask in a sysctl. + * a panic by setting fs.xfs.panic_mask in a sysctl. 
*/ #define XFS_NO_PTAG 0u #define XFS_PTAG_IFLUSH (1u << 0) @@ -88,6 +88,16 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #define XFS_PTAG_FSBLOCK_ZERO (1u << 7) #define XFS_PTAG_VERIFIER_ERROR (1u << 8) +#define XFS_PTAG_MASK (XFS_PTAG_IFLUSH | \ + XFS_PTAG_LOGRES | \ + XFS_PTAG_AILDELETE | \ + XFS_PTAG_ERROR_REPORT | \ + XFS_PTAG_SHUTDOWN_CORRUPT | \ + XFS_PTAG_SHUTDOWN_IOERROR | \ + XFS_PTAG_SHUTDOWN_LOGERROR | \ + XFS_PTAG_FSBLOCK_ZERO | \ + XFS_PTAG_VERIFIER_ERROR) + #define XFS_PTAG_STRINGS \ { XFS_NO_PTAG, "none" }, \ { XFS_PTAG_IFLUSH, "iflush" }, \ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index d5130d1fcfae..011b50469301 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -345,23 +345,30 @@ static int xfs_trans_free_extent( struct xfs_trans *tp, struct xfs_efd_log_item *efdp, - xfs_fsblock_t start_block, - xfs_extlen_t ext_len, - const struct xfs_owner_info *oinfo, - bool skip_discard) + struct xfs_extent_free_item *xefi) { + struct xfs_owner_info oinfo = { }; struct xfs_mount *mp = tp->t_mountp; struct xfs_extent *extp; uint next_extent; - xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block); + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, + xefi->xefi_startblock); xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, - start_block); + xefi->xefi_startblock); int error; - trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); + oinfo.oi_owner = xefi->xefi_owner; + if (xefi->xefi_flags & XFS_EFI_ATTR_FORK) + oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; + if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) + oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; + + trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, + xefi->xefi_blockcount); - error = __xfs_free_extent(tp, start_block, ext_len, - oinfo, XFS_AG_RESV_NONE, skip_discard); + error = __xfs_free_extent(tp, xefi->xefi_startblock, + xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE, + xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); /* * Mark the transaction dirty, even on error. 
This ensures the * transaction is aborted, which: @@ -375,8 +382,8 @@ xfs_trans_free_extent( next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); extp = &(efdp->efd_format.efd_extents[next_extent]); - extp->ext_start = start_block; - extp->ext_len = ext_len; + extp->ext_start = xefi->xefi_startblock; + extp->ext_len = xefi->xefi_blockcount; efdp->efd_next_extent++; return error; @@ -404,7 +411,7 @@ STATIC void xfs_extent_free_log_item( struct xfs_trans *tp, struct xfs_efi_log_item *efip, - struct xfs_extent_free_item *free) + struct xfs_extent_free_item *xefi) { uint next_extent; struct xfs_extent *extp; @@ -420,8 +427,8 @@ xfs_extent_free_log_item( next_extent = atomic_inc_return(&efip->efi_next_extent) - 1; ASSERT(next_extent < efip->efi_format.efi_nextents); extp = &efip->efi_format.efi_extents[next_extent]; - extp->ext_start = free->xefi_startblock; - extp->ext_len = free->xefi_blockcount; + extp->ext_start = xefi->xefi_startblock; + extp->ext_len = xefi->xefi_blockcount; } static struct xfs_log_item * @@ -433,15 +440,15 @@ xfs_extent_free_create_intent( { struct xfs_mount *mp = tp->t_mountp; struct xfs_efi_log_item *efip = xfs_efi_init(mp, count); - struct xfs_extent_free_item *free; + struct xfs_extent_free_item *xefi; ASSERT(count > 0); xfs_trans_add_item(tp, &efip->efi_item); if (sort) list_sort(mp, items, xfs_extent_free_diff_items); - list_for_each_entry(free, items, xefi_list) - xfs_extent_free_log_item(tp, efip, free); + list_for_each_entry(xefi, items, xefi_list) + xfs_extent_free_log_item(tp, efip, xefi); return &efip->efi_item; } @@ -463,21 +470,13 @@ xfs_extent_free_finish_item( struct list_head *item, struct xfs_btree_cur **state) { - struct xfs_owner_info oinfo = { }; - struct xfs_extent_free_item *free; + struct xfs_extent_free_item *xefi; int error; - free = container_of(item, struct xfs_extent_free_item, xefi_list); - oinfo.oi_owner = free->xefi_owner; - if (free->xefi_flags & XFS_EFI_ATTR_FORK) - oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; - if (free->xefi_flags & XFS_EFI_BMBT_BLOCK) - oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; - error = xfs_trans_free_extent(tp, EFD_ITEM(done), - free->xefi_startblock, - free->xefi_blockcount, - &oinfo, free->xefi_flags & XFS_EFI_SKIP_DISCARD); - kmem_cache_free(xfs_extfree_item_cache, free); + xefi = container_of(item, struct xfs_extent_free_item, xefi_list); + + error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi); + kmem_cache_free(xfs_extfree_item_cache, xefi); return error; } @@ -494,10 +493,10 @@ STATIC void xfs_extent_free_cancel_item( struct list_head *item) { - struct xfs_extent_free_item *free; + struct xfs_extent_free_item *xefi; - free = container_of(item, struct xfs_extent_free_item, xefi_list); - kmem_cache_free(xfs_extfree_item_cache, free); + xefi = container_of(item, struct xfs_extent_free_item, xefi_list); + kmem_cache_free(xfs_extfree_item_cache, xefi); } const struct xfs_defer_op_type xfs_extent_free_defer_type = { @@ -523,7 +522,7 @@ xfs_agfl_free_finish_item( struct xfs_owner_info oinfo = { }; struct xfs_mount *mp = tp->t_mountp; struct xfs_efd_log_item *efdp = EFD_ITEM(done); - struct xfs_extent_free_item *free; + struct xfs_extent_free_item *xefi; struct xfs_extent *extp; struct xfs_buf *agbp; int error; @@ -532,13 +531,13 @@ xfs_agfl_free_finish_item( uint next_extent; struct xfs_perag *pag; - free = container_of(item, struct xfs_extent_free_item, xefi_list); - ASSERT(free->xefi_blockcount == 1); - agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock); - agbno = 
XFS_FSB_TO_AGBNO(mp, free->xefi_startblock); - oinfo.oi_owner = free->xefi_owner; + xefi = container_of(item, struct xfs_extent_free_item, xefi_list); + ASSERT(xefi->xefi_blockcount == 1); + agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); + oinfo.oi_owner = xefi->xefi_owner; - trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount); + trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, xefi->xefi_blockcount); pag = xfs_perag_get(mp, agno); error = xfs_alloc_read_agf(pag, tp, 0, &agbp); @@ -559,11 +558,11 @@ xfs_agfl_free_finish_item( next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); extp = &(efdp->efd_format.efd_extents[next_extent]); - extp->ext_start = free->xefi_startblock; - extp->ext_len = free->xefi_blockcount; + extp->ext_start = xefi->xefi_startblock; + extp->ext_len = xefi->xefi_blockcount; efdp->efd_next_extent++; - kmem_cache_free(xfs_extfree_item_cache, free); + kmem_cache_free(xfs_extfree_item_cache, xefi); return error; } @@ -599,7 +598,6 @@ xfs_efi_item_recover( struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_efd_log_item *efdp; struct xfs_trans *tp; - struct xfs_extent *extp; int i; int error = 0; @@ -624,10 +622,17 @@ xfs_efi_item_recover( efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); for (i = 0; i < efip->efi_format.efi_nextents; i++) { + struct xfs_extent_free_item fake = { + .xefi_owner = XFS_RMAP_OWN_UNKNOWN, + }; + struct xfs_extent *extp; + extp = &efip->efi_format.efi_extents[i]; - error = xfs_trans_free_extent(tp, efdp, extp->ext_start, - extp->ext_len, - &XFS_RMAP_OINFO_ANY_OWNER, false); + + fake.xefi_startblock = extp->ext_start; + fake.xefi_blockcount = extp->ext_len; + + error = xfs_trans_free_extent(tp, efdp, &fake); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, extp, sizeof(*extp)); diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 88a88506ffff..92ca2017eded 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -761,6 +761,7 @@ xfs_getfsmap_datadev_bnobt( { struct xfs_alloc_rec_incore akeys[2]; + memset(akeys, 0, sizeof(akeys)); info->missing_owner = XFS_FMR_OWN_UNKNOWN; return __xfs_getfsmap_datadev(tp, keys, info, xfs_getfsmap_datadev_bnobt_query, &akeys[0]); diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 4d0a98f920ca..9edc1f2bc939 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -4,6 +4,7 @@ * All Rights Reserved. */ #include "xfs.h" +#include "xfs_error.h" /* * Tunable XFS parameters. 
xfs_params is required even when CONFIG_SYSCTL=n, @@ -15,7 +16,7 @@ xfs_param_t xfs_params = { /* MIN DFLT MAX */ .sgid_inherit = { 0, 0, 1 }, .symlink_mode = { 0, 0, 1 }, - .panic_mask = { 0, 0, 256 }, + .panic_mask = { 0, 0, XFS_PTAG_MASK}, .error_level = { 0, 3, 11 }, .syncd_timer = { 1*100, 30*100, 7200*100}, .stats_clear = { 0, 0, 1 }, diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index fc1946f80a4a..69dbe7814128 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -83,7 +83,7 @@ xfs_iomap_valid( return true; } -static const struct iomap_page_ops xfs_iomap_page_ops = { +static const struct iomap_folio_ops xfs_iomap_folio_ops = { .iomap_valid = xfs_iomap_valid, }; @@ -133,7 +133,7 @@ xfs_bmbt_to_iomap( iomap->flags |= IOMAP_F_DIRTY; iomap->validity_cookie = sequence_cookie; - iomap->page_ops = &xfs_iomap_page_ops; + iomap->folio_ops = &xfs_iomap_folio_ops; return 0; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index e2c542f6dcd4..78ca52e55f03 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1321,15 +1321,14 @@ xfs_qm_quotacheck( error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true, NULL); - if (error) { - /* - * The inode walk may have partially populated the dquot - * caches. We must purge them before disabling quota and - * tearing down the quotainfo, or else the dquots will leak. - */ - xfs_qm_dqpurge_all(mp); - goto error_return; - } + + /* + * On error, the inode walk may have partially populated the dquot + * caches. We must purge them before disabling quota and tearing down + * the quotainfo, or else the dquots will leak. + */ + if (error) + goto error_purge; /* * We've made all the changes that we need to make incore. Flush them @@ -1363,10 +1362,8 @@ xfs_qm_quotacheck( * and turn quotaoff. The dquots won't be attached to any of the inodes * at this point (because we intentionally didn't in dqget_noattach). */ - if (error) { - xfs_qm_dqpurge_all(mp); - goto error_return; - } + if (error) + goto error_purge; /* * If one type of quotas is off, then it will lose its @@ -1376,7 +1373,7 @@ xfs_qm_quotacheck( mp->m_qflags &= ~XFS_ALL_QUOTA_CHKD; mp->m_qflags |= flags; - error_return: +error_return: xfs_buf_delwri_cancel(&buffer_list); if (error) { @@ -1395,6 +1392,21 @@ xfs_qm_quotacheck( } else xfs_notice(mp, "Quotacheck: Done."); return error; + +error_purge: + /* + * On error, we may have inodes queued for inactivation. This may try + * to attach dquots to the inode before running cleanup operations on + * the inode and this can race with the xfs_qm_destroy_quotainfo() call + * below that frees mp->m_quotainfo. To avoid this race, flush all the + * pending inodegc operations before we purge the dquots from memory, + * ensuring that background inactivation is idle whilst we turn off + * quotas. 
+ */ + xfs_inodegc_flush(mp); + xfs_qm_dqpurge_all(mp); + goto error_return; + } /* diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 858e3e9eb4a8..48d771a76add 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -252,17 +252,12 @@ static int xfs_trans_log_finish_refcount_update( struct xfs_trans *tp, struct xfs_cud_log_item *cudp, - enum xfs_refcount_intent_type type, - xfs_fsblock_t startblock, - xfs_extlen_t blockcount, - xfs_fsblock_t *new_fsb, - xfs_extlen_t *new_len, + struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur) { int error; - error = xfs_refcount_finish_one(tp, type, startblock, - blockcount, new_fsb, new_len, pcur); + error = xfs_refcount_finish_one(tp, ri, pcur); /* * Mark the transaction dirty, even on error. This ensures the @@ -297,16 +292,16 @@ xfs_refcount_update_diff_items( /* Set the phys extent flags for this reverse mapping. */ static void xfs_trans_set_refcount_flags( - struct xfs_phys_extent *refc, + struct xfs_phys_extent *pmap, enum xfs_refcount_intent_type type) { - refc->pe_flags = 0; + pmap->pe_flags = 0; switch (type) { case XFS_REFCOUNT_INCREASE: case XFS_REFCOUNT_DECREASE: case XFS_REFCOUNT_ALLOC_COW: case XFS_REFCOUNT_FREE_COW: - refc->pe_flags |= type; + pmap->pe_flags |= type; break; default: ASSERT(0); @@ -318,10 +313,10 @@ STATIC void xfs_refcount_update_log_item( struct xfs_trans *tp, struct xfs_cui_log_item *cuip, - struct xfs_refcount_intent *refc) + struct xfs_refcount_intent *ri) { uint next_extent; - struct xfs_phys_extent *ext; + struct xfs_phys_extent *pmap; tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); @@ -333,10 +328,10 @@ xfs_refcount_update_log_item( */ next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1; ASSERT(next_extent < cuip->cui_format.cui_nextents); - ext = &cuip->cui_format.cui_extents[next_extent]; - ext->pe_startblock = refc->ri_startblock; - ext->pe_len = refc->ri_blockcount; - xfs_trans_set_refcount_flags(ext, refc->ri_type); + pmap = &cuip->cui_format.cui_extents[next_extent]; + pmap->pe_startblock = ri->ri_startblock; + pmap->pe_len = ri->ri_blockcount; + xfs_trans_set_refcount_flags(pmap, ri->ri_type); } static struct xfs_log_item * @@ -348,15 +343,15 @@ xfs_refcount_update_create_intent( { struct xfs_mount *mp = tp->t_mountp; struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count); - struct xfs_refcount_intent *refc; + struct xfs_refcount_intent *ri; ASSERT(count > 0); xfs_trans_add_item(tp, &cuip->cui_item); if (sort) list_sort(mp, items, xfs_refcount_update_diff_items); - list_for_each_entry(refc, items, ri_list) - xfs_refcount_update_log_item(tp, cuip, refc); + list_for_each_entry(ri, items, ri_list) + xfs_refcount_update_log_item(tp, cuip, ri); return &cuip->cui_item; } @@ -378,25 +373,20 @@ xfs_refcount_update_finish_item( struct list_head *item, struct xfs_btree_cur **state) { - struct xfs_refcount_intent *refc; - xfs_fsblock_t new_fsb; - xfs_extlen_t new_aglen; + struct xfs_refcount_intent *ri; int error; - refc = container_of(item, struct xfs_refcount_intent, ri_list); - error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), - refc->ri_type, refc->ri_startblock, refc->ri_blockcount, - &new_fsb, &new_aglen, state); + ri = container_of(item, struct xfs_refcount_intent, ri_list); + error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), ri, + state); /* Did we run out of reservation? Requeue what we didn't finish. 
*/ - if (!error && new_aglen > 0) { - ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE || - refc->ri_type == XFS_REFCOUNT_DECREASE); - refc->ri_startblock = new_fsb; - refc->ri_blockcount = new_aglen; + if (!error && ri->ri_blockcount > 0) { + ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE || + ri->ri_type == XFS_REFCOUNT_DECREASE); return -EAGAIN; } - kmem_cache_free(xfs_refcount_intent_cache, refc); + kmem_cache_free(xfs_refcount_intent_cache, ri); return error; } @@ -413,10 +403,10 @@ STATIC void xfs_refcount_update_cancel_item( struct list_head *item) { - struct xfs_refcount_intent *refc; + struct xfs_refcount_intent *ri; - refc = container_of(item, struct xfs_refcount_intent, ri_list); - kmem_cache_free(xfs_refcount_intent_cache, refc); + ri = container_of(item, struct xfs_refcount_intent, ri_list); + kmem_cache_free(xfs_refcount_intent_cache, ri); } const struct xfs_defer_op_type xfs_refcount_update_defer_type = { @@ -433,15 +423,15 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = { static inline bool xfs_cui_validate_phys( struct xfs_mount *mp, - struct xfs_phys_extent *refc) + struct xfs_phys_extent *pmap) { if (!xfs_has_reflink(mp)) return false; - if (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS) + if (pmap->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS) return false; - switch (refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) { + switch (pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) { case XFS_REFCOUNT_INCREASE: case XFS_REFCOUNT_DECREASE: case XFS_REFCOUNT_ALLOC_COW: @@ -451,7 +441,7 @@ xfs_cui_validate_phys( return false; } - return xfs_verify_fsbext(mp, refc->pe_startblock, refc->pe_len); + return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len); } /* @@ -463,18 +453,13 @@ xfs_cui_item_recover( struct xfs_log_item *lip, struct list_head *capture_list) { - struct xfs_bmbt_irec irec; struct xfs_cui_log_item *cuip = CUI_ITEM(lip); - struct xfs_phys_extent *refc; struct xfs_cud_log_item *cudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; - xfs_fsblock_t new_fsb; - xfs_extlen_t new_len; unsigned int refc_type; bool requeue_only = false; - enum xfs_refcount_intent_type type; int i; int error = 0; @@ -513,14 +498,17 @@ xfs_cui_item_recover( cudp = xfs_trans_get_cud(tp, cuip); for (i = 0; i < cuip->cui_format.cui_nextents; i++) { - refc = &cuip->cui_format.cui_extents[i]; - refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; + struct xfs_refcount_intent fake = { }; + struct xfs_phys_extent *pmap; + + pmap = &cuip->cui_format.cui_extents[i]; + refc_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; switch (refc_type) { case XFS_REFCOUNT_INCREASE: case XFS_REFCOUNT_DECREASE: case XFS_REFCOUNT_ALLOC_COW: case XFS_REFCOUNT_FREE_COW: - type = refc_type; + fake.ri_type = refc_type; break; default: XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, @@ -529,13 +517,12 @@ xfs_cui_item_recover( error = -EFSCORRUPTED; goto abort_error; } - if (requeue_only) { - new_fsb = refc->pe_startblock; - new_len = refc->pe_len; - } else + + fake.ri_startblock = pmap->pe_startblock; + fake.ri_blockcount = pmap->pe_len; + if (!requeue_only) error = xfs_trans_log_finish_refcount_update(tp, cudp, - type, refc->pe_startblock, refc->pe_len, - &new_fsb, &new_len, &rcur); + &fake, &rcur); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &cuip->cui_format, @@ -544,10 +531,13 @@ xfs_cui_item_recover( goto abort_error; /* Requeue what we didn't finish. 
*/ - if (new_len > 0) { - irec.br_startblock = new_fsb; - irec.br_blockcount = new_len; - switch (type) { + if (fake.ri_blockcount > 0) { + struct xfs_bmbt_irec irec = { + .br_startblock = fake.ri_startblock, + .br_blockcount = fake.ri_blockcount, + }; + + switch (fake.ri_type) { case XFS_REFCOUNT_INCREASE: xfs_refcount_increase_extent(tp, &irec); break; @@ -596,18 +586,18 @@ xfs_cui_item_relog( { struct xfs_cud_log_item *cudp; struct xfs_cui_log_item *cuip; - struct xfs_phys_extent *extp; + struct xfs_phys_extent *pmap; unsigned int count; count = CUI_ITEM(intent)->cui_format.cui_nextents; - extp = CUI_ITEM(intent)->cui_format.cui_extents; + pmap = CUI_ITEM(intent)->cui_format.cui_extents; tp->t_flags |= XFS_TRANS_DIRTY; cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent)); set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); cuip = xfs_cui_init(tp->t_mountp, count); - memcpy(cuip->cui_format.cui_extents, extp, count * sizeof(*extp)); + memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap)); atomic_set(&cuip->cui_next_extent, count); xfs_trans_add_item(tp, &cuip->cui_item); set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 534504ede1a3..a1619d67015f 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -244,40 +244,40 @@ xfs_trans_get_rud( /* Set the map extent flags for this reverse mapping. */ static void xfs_trans_set_rmap_flags( - struct xfs_map_extent *rmap, + struct xfs_map_extent *map, enum xfs_rmap_intent_type type, int whichfork, xfs_exntst_t state) { - rmap->me_flags = 0; + map->me_flags = 0; if (state == XFS_EXT_UNWRITTEN) - rmap->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN; + map->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN; if (whichfork == XFS_ATTR_FORK) - rmap->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK; + map->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK; switch (type) { case XFS_RMAP_MAP: - rmap->me_flags |= XFS_RMAP_EXTENT_MAP; + map->me_flags |= XFS_RMAP_EXTENT_MAP; break; case XFS_RMAP_MAP_SHARED: - rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; + map->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; break; case XFS_RMAP_UNMAP: - rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP; + map->me_flags |= XFS_RMAP_EXTENT_UNMAP; break; case XFS_RMAP_UNMAP_SHARED: - rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; + map->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; break; case XFS_RMAP_CONVERT: - rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT; + map->me_flags |= XFS_RMAP_EXTENT_CONVERT; break; case XFS_RMAP_CONVERT_SHARED: - rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; + map->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; break; case XFS_RMAP_ALLOC: - rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC; + map->me_flags |= XFS_RMAP_EXTENT_ALLOC; break; case XFS_RMAP_FREE: - rmap->me_flags |= XFS_RMAP_EXTENT_FREE; + map->me_flags |= XFS_RMAP_EXTENT_FREE; break; default: ASSERT(0); @@ -293,19 +293,12 @@ static int xfs_trans_log_finish_rmap_update( struct xfs_trans *tp, struct xfs_rud_log_item *rudp, - enum xfs_rmap_intent_type type, - uint64_t owner, - int whichfork, - xfs_fileoff_t startoff, - xfs_fsblock_t startblock, - xfs_filblks_t blockcount, - xfs_exntst_t state, + struct xfs_rmap_intent *ri, struct xfs_btree_cur **pcur) { int error; - error = xfs_rmap_finish_one(tp, type, owner, whichfork, startoff, - startblock, blockcount, state, pcur); + error = xfs_rmap_finish_one(tp, ri, pcur); /* * Mark the transaction dirty, even on error. 
This ensures the @@ -342,7 +335,7 @@ STATIC void xfs_rmap_update_log_item( struct xfs_trans *tp, struct xfs_rui_log_item *ruip, - struct xfs_rmap_intent *rmap) + struct xfs_rmap_intent *ri) { uint next_extent; struct xfs_map_extent *map; @@ -358,12 +351,12 @@ xfs_rmap_update_log_item( next_extent = atomic_inc_return(&ruip->rui_next_extent) - 1; ASSERT(next_extent < ruip->rui_format.rui_nextents); map = &ruip->rui_format.rui_extents[next_extent]; - map->me_owner = rmap->ri_owner; - map->me_startblock = rmap->ri_bmap.br_startblock; - map->me_startoff = rmap->ri_bmap.br_startoff; - map->me_len = rmap->ri_bmap.br_blockcount; - xfs_trans_set_rmap_flags(map, rmap->ri_type, rmap->ri_whichfork, - rmap->ri_bmap.br_state); + map->me_owner = ri->ri_owner; + map->me_startblock = ri->ri_bmap.br_startblock; + map->me_startoff = ri->ri_bmap.br_startoff; + map->me_len = ri->ri_bmap.br_blockcount; + xfs_trans_set_rmap_flags(map, ri->ri_type, ri->ri_whichfork, + ri->ri_bmap.br_state); } static struct xfs_log_item * @@ -375,15 +368,15 @@ xfs_rmap_update_create_intent( { struct xfs_mount *mp = tp->t_mountp; struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count); - struct xfs_rmap_intent *rmap; + struct xfs_rmap_intent *ri; ASSERT(count > 0); xfs_trans_add_item(tp, &ruip->rui_item); if (sort) list_sort(mp, items, xfs_rmap_update_diff_items); - list_for_each_entry(rmap, items, ri_list) - xfs_rmap_update_log_item(tp, ruip, rmap); + list_for_each_entry(ri, items, ri_list) + xfs_rmap_update_log_item(tp, ruip, ri); return &ruip->rui_item; } @@ -405,16 +398,14 @@ xfs_rmap_update_finish_item( struct list_head *item, struct xfs_btree_cur **state) { - struct xfs_rmap_intent *rmap; + struct xfs_rmap_intent *ri; int error; - rmap = container_of(item, struct xfs_rmap_intent, ri_list); - error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), - rmap->ri_type, rmap->ri_owner, rmap->ri_whichfork, - rmap->ri_bmap.br_startoff, rmap->ri_bmap.br_startblock, - rmap->ri_bmap.br_blockcount, rmap->ri_bmap.br_state, + ri = container_of(item, struct xfs_rmap_intent, ri_list); + + error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri, state); - kmem_cache_free(xfs_rmap_intent_cache, rmap); + kmem_cache_free(xfs_rmap_intent_cache, ri); return error; } @@ -431,10 +422,10 @@ STATIC void xfs_rmap_update_cancel_item( struct list_head *item) { - struct xfs_rmap_intent *rmap; + struct xfs_rmap_intent *ri; - rmap = container_of(item, struct xfs_rmap_intent, ri_list); - kmem_cache_free(xfs_rmap_intent_cache, rmap); + ri = container_of(item, struct xfs_rmap_intent, ri_list); + kmem_cache_free(xfs_rmap_intent_cache, ri); } const struct xfs_defer_op_type xfs_rmap_update_defer_type = { @@ -451,15 +442,15 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = { static inline bool xfs_rui_validate_map( struct xfs_mount *mp, - struct xfs_map_extent *rmap) + struct xfs_map_extent *map) { if (!xfs_has_rmapbt(mp)) return false; - if (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS) + if (map->me_flags & ~XFS_RMAP_EXTENT_FLAGS) return false; - switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { + switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { case XFS_RMAP_EXTENT_MAP: case XFS_RMAP_EXTENT_MAP_SHARED: case XFS_RMAP_EXTENT_UNMAP: @@ -473,14 +464,14 @@ xfs_rui_validate_map( return false; } - if (!XFS_RMAP_NON_INODE_OWNER(rmap->me_owner) && - !xfs_verify_ino(mp, rmap->me_owner)) + if (!XFS_RMAP_NON_INODE_OWNER(map->me_owner) && + !xfs_verify_ino(mp, map->me_owner)) return false; - if (!xfs_verify_fileext(mp, rmap->me_startoff, 
rmap->me_len)) + if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len)) return false; - return xfs_verify_fsbext(mp, rmap->me_startblock, rmap->me_len); + return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } /* @@ -493,15 +484,11 @@ xfs_rui_item_recover( struct list_head *capture_list) { struct xfs_rui_log_item *ruip = RUI_ITEM(lip); - struct xfs_map_extent *rmap; struct xfs_rud_log_item *rudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; - enum xfs_rmap_intent_type type; - xfs_exntst_t state; int i; - int whichfork; int error = 0; /* @@ -526,35 +513,34 @@ xfs_rui_item_recover( rudp = xfs_trans_get_rud(tp, ruip); for (i = 0; i < ruip->rui_format.rui_nextents; i++) { - rmap = &ruip->rui_format.rui_extents[i]; - state = (rmap->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - whichfork = (rmap->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; - switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { + struct xfs_rmap_intent fake = { }; + struct xfs_map_extent *map; + + map = &ruip->rui_format.rui_extents[i]; + switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { case XFS_RMAP_EXTENT_MAP: - type = XFS_RMAP_MAP; + fake.ri_type = XFS_RMAP_MAP; break; case XFS_RMAP_EXTENT_MAP_SHARED: - type = XFS_RMAP_MAP_SHARED; + fake.ri_type = XFS_RMAP_MAP_SHARED; break; case XFS_RMAP_EXTENT_UNMAP: - type = XFS_RMAP_UNMAP; + fake.ri_type = XFS_RMAP_UNMAP; break; case XFS_RMAP_EXTENT_UNMAP_SHARED: - type = XFS_RMAP_UNMAP_SHARED; + fake.ri_type = XFS_RMAP_UNMAP_SHARED; break; case XFS_RMAP_EXTENT_CONVERT: - type = XFS_RMAP_CONVERT; + fake.ri_type = XFS_RMAP_CONVERT; break; case XFS_RMAP_EXTENT_CONVERT_SHARED: - type = XFS_RMAP_CONVERT_SHARED; + fake.ri_type = XFS_RMAP_CONVERT_SHARED; break; case XFS_RMAP_EXTENT_ALLOC: - type = XFS_RMAP_ALLOC; + fake.ri_type = XFS_RMAP_ALLOC; break; case XFS_RMAP_EXTENT_FREE: - type = XFS_RMAP_FREE; + fake.ri_type = XFS_RMAP_FREE; break; default: XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, @@ -563,13 +549,21 @@ xfs_rui_item_recover( error = -EFSCORRUPTED; goto abort_error; } - error = xfs_trans_log_finish_rmap_update(tp, rudp, type, - rmap->me_owner, whichfork, - rmap->me_startoff, rmap->me_startblock, - rmap->me_len, state, &rcur); + + fake.ri_owner = map->me_owner; + fake.ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + fake.ri_bmap.br_startblock = map->me_startblock; + fake.ri_bmap.br_startoff = map->me_startoff; + fake.ri_bmap.br_blockcount = map->me_len; + fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? 
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + + error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake, + &rcur); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - rmap, sizeof(*rmap)); + map, sizeof(*map)); if (error) goto abort_error; @@ -600,18 +594,18 @@ xfs_rui_item_relog( { struct xfs_rud_log_item *rudp; struct xfs_rui_log_item *ruip; - struct xfs_map_extent *extp; + struct xfs_map_extent *map; unsigned int count; count = RUI_ITEM(intent)->rui_format.rui_nextents; - extp = RUI_ITEM(intent)->rui_format.rui_extents; + map = RUI_ITEM(intent)->rui_format.rui_extents; tp->t_flags |= XFS_TRANS_DIRTY; rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent)); set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); ruip = xfs_rui_init(tp->t_mountp, count); - memcpy(ruip->rui_format.rui_extents, extp, count * sizeof(*extp)); + memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map)); atomic_set(&ruip->rui_next_extent, count); xfs_trans_add_item(tp, &ruip->rui_item); set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index f7faf6e70d7f..a3c6b1548723 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -69,7 +69,7 @@ static struct attribute *xfs_mp_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_mp); -struct kobj_type xfs_mp_ktype = { +const struct kobj_type xfs_mp_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_mp_groups, @@ -266,7 +266,7 @@ static struct attribute *xfs_dbg_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_dbg); -struct kobj_type xfs_dbg_ktype = { +const struct kobj_type xfs_dbg_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_dbg_groups, @@ -324,7 +324,7 @@ static struct attribute *xfs_stats_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_stats); -struct kobj_type xfs_stats_ktype = { +const struct kobj_type xfs_stats_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_stats_groups, @@ -410,7 +410,7 @@ static struct attribute *xfs_log_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_log); -struct kobj_type xfs_log_ktype = { +const struct kobj_type xfs_log_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_log_groups, @@ -564,13 +564,13 @@ static struct attribute *xfs_error_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_error); -static struct kobj_type xfs_error_cfg_ktype = { +static const struct kobj_type xfs_error_cfg_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_error_groups, }; -static struct kobj_type xfs_error_ktype = { +static const struct kobj_type xfs_error_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, }; diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index 513095e353a5..148893ebfdef 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -7,10 +7,10 @@ #ifndef __XFS_SYSFS_H__ #define __XFS_SYSFS_H__ -extern struct kobj_type xfs_mp_ktype; /* xfs_mount */ -extern struct kobj_type xfs_dbg_ktype; /* debug */ -extern struct kobj_type xfs_log_ktype; /* xlog */ -extern struct kobj_type xfs_stats_ktype; /* stats */ +extern const struct kobj_type xfs_mp_ktype; /* xfs_mount */ +extern const struct kobj_type xfs_dbg_ktype; /* debug */ +extern const struct kobj_type xfs_log_ktype; /* xlog */ +extern const struct kobj_type xfs_stats_ktype; /* stats */ static inline struct xfs_kobj * to_kobj(struct kobject *kobject) @@ -28,7 +28,7 @@ xfs_sysfs_release(struct kobject *kobject) static inline int xfs_sysfs_init( struct xfs_kobj *kobj, 
- struct kobj_type *ktype, + const struct kobj_type *ktype, struct xfs_kobj *parent_kobj, const char *name) { diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 421d1e504ac4..6b0e9ae7c513 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3207,17 +3207,14 @@ DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred); TRACE_EVENT(xfs_refcount_finish_one_leftover, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - int type, xfs_agblock_t agbno, xfs_extlen_t len, - xfs_agblock_t new_agbno, xfs_extlen_t new_len), - TP_ARGS(mp, agno, type, agbno, len, new_agbno, new_len), + int type, xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, type, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(int, type) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(xfs_agblock_t, new_agbno) - __field(xfs_extlen_t, new_len) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; @@ -3225,17 +3222,13 @@ TRACE_EVENT(xfs_refcount_finish_one_leftover, __entry->type = type; __entry->agbno = agbno; __entry->len = len; - __entry->new_agbno = new_agbno; - __entry->new_len = new_len; ), - TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x new_agbno 0x%x new_fsbcount 0x%x", + TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->agno, __entry->agbno, - __entry->len, - __entry->new_agbno, - __entry->new_len) + __entry->len) ); /* simple inode-based error/%ip tracepoint class */ diff --git a/include/linux/bio.h b/include/linux/bio.h index c1da63f6c808..d766be7152e1 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -12,6 +12,8 @@ #define BIO_MAX_VECS 256U +struct queue_limits; + static inline unsigned int bio_max_segs(unsigned int nr_segs) { return min(nr_segs, BIO_MAX_VECS); @@ -375,6 +377,8 @@ static inline void bip_set_seed(struct bio_integrity_payload *bip, void bio_trim(struct bio *bio, sector_t offset, sector_t size); extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); +struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, struct bio_set *bs, unsigned max_bytes); /** * bio_next_split - get next @sectors from a bio, splitting if necessary diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 0983dfc9a203..0f8123504e5e 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -13,6 +13,7 @@ struct address_space; struct fiemap_extent_info; struct inode; +struct iomap_iter; struct iomap_dio; struct iomap_writepage_ctx; struct iov_iter; @@ -58,8 +59,7 @@ struct vm_fault; #define IOMAP_F_SHARED (1U << 2) #define IOMAP_F_MERGED (1U << 3) #define IOMAP_F_BUFFER_HEAD (1U << 4) -#define IOMAP_F_ZONE_APPEND (1U << 5) -#define IOMAP_F_XATTR (1U << 6) +#define IOMAP_F_XATTR (1U << 5) /* * Flags set by the core iomap code during operations: @@ -85,7 +85,7 @@ struct vm_fault; */ #define IOMAP_NULL_ADDR -1ULL /* addr is not valid */ -struct iomap_page_ops; +struct iomap_folio_ops; struct iomap { u64 addr; /* disk offset of mapping, bytes */ @@ -97,7 +97,7 @@ struct iomap { struct dax_device *dax_dev; /* dax_dev for dax operations */ void *inline_data; void *private; /* filesystem private */ - const struct iomap_page_ops *page_ops; + const struct iomap_folio_ops *folio_ops; u64 validity_cookie; /* used with .iomap_valid() */ }; @@ -125,19 +125,20 @@ static inline bool iomap_inline_data_valid(const struct iomap *iomap) } /* - * When a filesystem sets page_ops in 
an iomap mapping it returns, page_prepare - * and page_done will be called for each page written to. This only applies to - * buffered writes as unbuffered writes will not typically have pages + * When a filesystem sets folio_ops in an iomap mapping it returns, get_folio + * and put_folio will be called for each folio written to. This only applies + * to buffered writes as unbuffered writes will not typically have folios * associated with them. * - * When page_prepare succeeds, page_done will always be called to do any - * cleanup work necessary. In that page_done call, @page will be NULL if the - * associated page could not be obtained. + * When get_folio succeeds, put_folio will always be called to do any + * cleanup work necessary. put_folio is responsible for unlocking and putting + * @folio. */ -struct iomap_page_ops { - int (*page_prepare)(struct inode *inode, loff_t pos, unsigned len); - void (*page_done)(struct inode *inode, loff_t pos, unsigned copied, - struct page *page); +struct iomap_folio_ops { + struct folio *(*get_folio)(struct iomap_iter *iter, loff_t pos, + unsigned len); + void (*put_folio)(struct inode *inode, loff_t pos, unsigned copied, + struct folio *folio); /* * Check that the cached iomap still maps correctly to the filesystem's @@ -260,6 +261,7 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode, int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos); bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags); void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index e8fa79f5bb34..c4c55e841547 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1308,6 +1308,14 @@ struct journal_s struct buffer_head *bh, enum passtype pass, int off, tid_t expected_commit_id); + + /** + * @j_bmap: + * + * Bmap function that should be used instead of the generic + * VFS bmap function. 
+ */ + int (*j_bmap)(struct journal_s *journal, sector_t *block); }; #define jbd2_might_wait_for_commit(j) \ diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 6548b5b5aa60..75d7d22c3a27 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -32,6 +32,7 @@ struct prelim_ref; struct btrfs_space_info; struct btrfs_raid_bio; struct raid56_bio_trace_info; +struct find_free_extent_ctl; #define show_ref_type(type) \ __print_symbolic(type, \ @@ -1241,76 +1242,156 @@ DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, TRACE_EVENT(find_free_extent, - TP_PROTO(const struct btrfs_root *root, u64 num_bytes, - u64 empty_size, u64 data), + TP_PROTO(const struct btrfs_root *root, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(root, num_bytes, empty_size, data), + TP_ARGS(root, ffe_ctl), TP_STRUCT__entry_btrfs( __field( u64, root_objectid ) __field( u64, num_bytes ) __field( u64, empty_size ) - __field( u64, data ) + __field( u64, flags ) ), TP_fast_assign_btrfs(root->fs_info, __entry->root_objectid = root->root_key.objectid; - __entry->num_bytes = num_bytes; - __entry->empty_size = empty_size; - __entry->data = data; + __entry->num_bytes = ffe_ctl->num_bytes; + __entry->empty_size = ffe_ctl->empty_size; + __entry->flags = ffe_ctl->flags; ), TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s)", show_root_type(__entry->root_objectid), - __entry->num_bytes, __entry->empty_size, __entry->data, - __print_flags((unsigned long)__entry->data, "|", + __entry->num_bytes, __entry->empty_size, __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", + BTRFS_GROUP_FLAGS)) +); + +TRACE_EVENT(find_free_extent_search_loop, + + TP_PROTO(const struct btrfs_root *root, + const struct find_free_extent_ctl *ffe_ctl), + + TP_ARGS(root, ffe_ctl), + + TP_STRUCT__entry_btrfs( + __field( u64, root_objectid ) + __field( u64, num_bytes ) + __field( u64, empty_size ) + __field( u64, flags ) + __field( u64, loop ) + ), + + TP_fast_assign_btrfs(root->fs_info, + __entry->root_objectid = root->root_key.objectid; + __entry->num_bytes = ffe_ctl->num_bytes; + __entry->empty_size = ffe_ctl->empty_size; + __entry->flags = ffe_ctl->flags; + __entry->loop = ffe_ctl->loop; + ), + + TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu", + show_root_type(__entry->root_objectid), + __entry->num_bytes, __entry->empty_size, __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), + __entry->loop) +); + +TRACE_EVENT(find_free_extent_have_block_group, + + TP_PROTO(const struct btrfs_root *root, + const struct find_free_extent_ctl *ffe_ctl, + const struct btrfs_block_group *block_group), + + TP_ARGS(root, ffe_ctl, block_group), + + TP_STRUCT__entry_btrfs( + __field( u64, root_objectid ) + __field( u64, num_bytes ) + __field( u64, empty_size ) + __field( u64, flags ) + __field( u64, loop ) + __field( bool, hinted ) + __field( u64, bg_start ) + __field( u64, bg_flags ) + ), + + TP_fast_assign_btrfs(root->fs_info, + __entry->root_objectid = root->root_key.objectid; + __entry->num_bytes = ffe_ctl->num_bytes; + __entry->empty_size = ffe_ctl->empty_size; + __entry->flags = ffe_ctl->flags; + __entry->loop = ffe_ctl->loop; + __entry->hinted = ffe_ctl->hinted; + __entry->bg_start = block_group->start; + __entry->bg_flags = block_group->flags; + ), + + TP_printk_btrfs( +"root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu hinted=%d block_group=%llu bg_flags=%llu(%s)", + 
show_root_type(__entry->root_objectid), + __entry->num_bytes, __entry->empty_size, __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), + __entry->loop, __entry->hinted, + __entry->bg_start, __entry->bg_flags, + __print_flags((unsigned long)__entry->bg_flags, "|", BTRFS_GROUP_FLAGS)) ); DECLARE_EVENT_CLASS(btrfs__reserve_extent, - TP_PROTO(const struct btrfs_block_group *block_group, u64 start, - u64 len), + TP_PROTO(const struct btrfs_block_group *block_group, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(block_group, start, len), + TP_ARGS(block_group, ffe_ctl), TP_STRUCT__entry_btrfs( __field( u64, bg_objectid ) __field( u64, flags ) + __field( int, bg_size_class ) __field( u64, start ) __field( u64, len ) + __field( u64, loop ) + __field( bool, hinted ) + __field( int, size_class ) ), TP_fast_assign_btrfs(block_group->fs_info, __entry->bg_objectid = block_group->start; __entry->flags = block_group->flags; - __entry->start = start; - __entry->len = len; + __entry->bg_size_class = block_group->size_class; + __entry->start = ffe_ctl->search_start; + __entry->len = ffe_ctl->num_bytes; + __entry->loop = ffe_ctl->loop; + __entry->hinted = ffe_ctl->hinted; + __entry->size_class = ffe_ctl->size_class; ), - TP_printk_btrfs("root=%llu(%s) block_group=%llu flags=%llu(%s) " - "start=%llu len=%llu", + TP_printk_btrfs( +"root=%llu(%s) block_group=%llu flags=%llu(%s) bg_size_class=%d start=%llu len=%llu loop=%llu hinted=%d size_class=%d", show_root_type(BTRFS_EXTENT_TREE_OBJECTID), __entry->bg_objectid, __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), - __entry->start, __entry->len) + __entry->bg_size_class, __entry->start, __entry->len, + __entry->loop, __entry->hinted, __entry->size_class) ); DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, - TP_PROTO(const struct btrfs_block_group *block_group, u64 start, - u64 len), + TP_PROTO(const struct btrfs_block_group *block_group, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(block_group, start, len) + TP_ARGS(block_group, ffe_ctl) ); DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster, - TP_PROTO(const struct btrfs_block_group *block_group, u64 start, - u64 len), + TP_PROTO(const struct btrfs_block_group *block_group, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(block_group, start, len) + TP_ARGS(block_group, ffe_ctl) ); TRACE_EVENT(btrfs_find_cluster, diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 77b426ae0064..ebccf6a6aa1b 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -584,13 +584,6 @@ DECLARE_EVENT_CLASS(ext4__page_op, (unsigned long) __entry->index) ); -DEFINE_EVENT(ext4__page_op, ext4_writepage, - - TP_PROTO(struct page *page), - - TP_ARGS(page) -); - DEFINE_EVENT(ext4__page_op, ext4_readpage, TP_PROTO(struct page *page), diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index b4f0f9531119..ada0a489bf2b 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -245,7 +245,17 @@ struct btrfs_ioctl_dev_info_args { __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ __u64 bytes_used; /* out */ __u64 total_bytes; /* out */ - __u64 unused[379]; /* pad to 4k */ + /* + * Optional, out. + * + * Showing the fsid of the device, allowing user space to check if this + * device is a seeding one. + * + * Introduced in v6.3, thus user space still needs to check if kernel + * changed this value. Older kernel will not touch the values here. 
+ */ + __u8 fsid[BTRFS_UUID_SIZE]; + __u64 unused[377]; /* pad to 4k */ __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */ }; -- 2.40.0 From 498e79383c15c846059b24391fce696d83ee9f83 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Fri, 10 Mar 2023 18:05:48 +0100 Subject: [PATCH 08/15] Implement amd-pstate-epp and amd-pstate-guided driver Signed-off-by: Peter Jung --- .../admin-guide/kernel-parameters.txt | 33 +- Documentation/admin-guide/pm/amd-pstate.rst | 95 ++- drivers/acpi/cppc_acpi.c | 188 ++++- drivers/cpufreq/amd-pstate.c | 794 +++++++++++++++++- drivers/cpufreq/brcmstb-avs-cpufreq.c | 5 +- drivers/cpufreq/cpufreq.c | 8 +- drivers/cpufreq/mediatek-cpufreq-hw.c | 4 +- drivers/cpufreq/omap-cpufreq.c | 4 +- drivers/cpufreq/qcom-cpufreq-hw.c | 4 +- include/acpi/cppc_acpi.h | 23 + include/linux/amd-pstate.h | 34 + include/linux/cpufreq.h | 2 +- 12 files changed, 1136 insertions(+), 58 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 54239257209c..c8cf7e37713d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -339,6 +339,29 @@ This mode requires kvm-amd.avic=1. (Default when IOMMU HW support is present.) + amd_pstate= [X86] + disable + Do not enable amd_pstate as the default + scaling driver for the supported processors + passive + Use amd_pstate with passive mode as a scaling driver. + In this mode autonomous selection is disabled. + Driver requests a desired performance level and platform + tries to match the same performance level if it is + satisfied by guaranteed performance level. + active + Use amd_pstate_epp driver instance as the scaling driver, + driver provides a hint to the hardware if software wants + to bias toward performance (0x0) or energy efficiency (0xff) + to the CPPC firmware. then CPPC power algorithm will + calculate the runtime workload and adjust the realtime cores + frequency. + guided + Activate guided autonomous mode. Driver requests minimum and + maximum performance level and the platform autonomously + selects a performance level in this range and appropriate + to the current workload. + amijoy.map= [HW,JOY] Amiga joystick support Map of devices attached to JOY0DAT and JOY1DAT Format: , @@ -7019,13 +7042,3 @@ xmon commands. off xmon is disabled. - amd_pstate= [X86] - disable - Do not enable amd_pstate as the default - scaling driver for the supported processors - passive - Use amd_pstate as a scaling driver, driver requests a - desired performance on this abstract scale and the power - management firmware translates the requests into actual - hardware states (core frequency, data fabric and memory - clocks etc.) diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index 5376d53faaa8..f24a90007e98 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -230,8 +230,8 @@ with :c:macro:`MSR_AMD_CPPC_ENABLE` or ``cppc_set_enable``, it will respond to the request from AMD P-States. -User Space Interface in ``sysfs`` -================================== +User Space Interface in ``sysfs`` - Per-policy control +====================================================== ``amd-pstate`` exposes several global attributes (files) in ``sysfs`` to control its functionality at the system level. They are located in the @@ -262,6 +262,25 @@ lowest non-linear performance in `AMD CPPC Performance Capability `_.) 
This attribute is read-only. +``energy_performance_available_preferences`` + +A list of all the supported EPP preferences that could be used for +``energy_performance_preference`` on this system. +These profiles represent different hints that are provided +to the low-level firmware about the user's desired energy vs efficiency +tradeoff. ``default`` represents the epp value is set by platform +firmware. This attribute is read-only. + +``energy_performance_preference`` + +The current energy performance preference can be read from this attribute. +and user can change current preference according to energy or performance needs +Please get all support profiles list from +``energy_performance_available_preferences`` attribute, all the profiles are +integer values defined between 0 to 255 when EPP feature is enabled by platform +firmware, if EPP feature is disabled, driver will ignore the written value +This attribute is read-write. + Other performance and frequency values can be read back from ``/sys/devices/system/cpu/cpuX/acpi_cppc/``, see :ref:`cppc_sysfs`. @@ -280,8 +299,35 @@ module which supports the new AMD P-States mechanism on most of the future AMD platforms. The AMD P-States mechanism is the more performance and energy efficiency frequency management method on AMD processors. -Kernel Module Options for ``amd-pstate`` -========================================= + +AMD Pstate Driver Operation Modes +================================= + +``amd_pstate`` CPPC has 3 operation modes: autonomous (active) mode, +non-autonomous (passive) mode and guided autonomous (guided) mode. +Active/passive/guided mode can be chosen by different kernel parameters. + +- In autonomous mode, platform ignores the desired performance level request + and takes into account only the values set to the minimum, maximum and energy + performance preference registers. +- In non-autonomous mode, platform gets desired performance level + from OS directly through Desired Performance Register. +- In guided-autonomous mode, platform sets operating performance level + autonomously according to the current workload and within the limits set by + OS through min and max performance registers. + +Active Mode +------------ + +``amd_pstate=active`` + +This is the low-level firmware control mode which is implemented by ``amd_pstate_epp`` +driver with ``amd_pstate=active`` passed to the kernel in the command line. +In this mode, ``amd_pstate_epp`` driver provides a hint to the hardware if software +wants to bias toward performance (0x0) or energy efficiency (0xff) to the CPPC firmware. +then CPPC power algorithm will calculate the runtime workload and adjust the realtime +cores frequency according to the power supply and thermal, core voltage and some other +hardware conditions. Passive Mode ------------ @@ -297,6 +343,47 @@ to the Performance Reduction Tolerance register. Above the nominal performance l processor must provide at least nominal performance requested and go higher if current operating conditions allow. +Guided Mode +----------- + +``amd_pstate=guided`` + +If ``amd_pstate=guided`` is passed to kernel command line option then this mode +is activated. In this mode, driver requests minimum and maximum performance +level and the platform autonomously selects a performance level in this range +and appropriate to the current workload. 
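(Illustration only, not part of the diff.) To make the per-policy EPP interface documented above concrete, here is a minimal userspace sketch that lists the strings advertised by ``energy_performance_available_preferences`` and writes one of them back to ``energy_performance_preference``. It assumes ``amd_pstate=active``, CPU 0 online, and the standard per-policy cpufreq sysfs layout under ``/sys/devices/system/cpu/cpu0/cpufreq/``; error handling is kept minimal.

/*
 * Minimal sketch, assuming amd_pstate=active and the per-policy sysfs
 * attributes described above; not part of this patch.
 */
#include <stdio.h>

#define EPP_DIR "/sys/devices/system/cpu/cpu0/cpufreq/"

int main(void)
{
	char avail[256] = "";
	FILE *f;

	f = fopen(EPP_DIR "energy_performance_available_preferences", "r");
	if (!f) {
		perror("available_preferences");	/* driver not in active mode? */
		return 1;
	}
	if (fgets(avail, sizeof(avail), f))
		printf("available: %s", avail);		/* e.g. "default performance ... power" */
	fclose(f);

	f = fopen(EPP_DIR "energy_performance_preference", "w");
	if (!f) {
		perror("energy_performance_preference");
		return 1;
	}
	fputs("balance_power\n", f);	/* must be one of the strings listed above */
	fclose(f);

	return 0;
}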
+ +User Space Interface in ``sysfs`` - General +=========================================== + +Global Attributes +----------------- + +``amd-pstate`` exposes several global attributes (files) in ``sysfs`` to +control its functionality at the system level. They are located in the +``/sys/devices/system/cpu/amd-pstate/`` directory and affect all CPUs. + +``status`` + Operation mode of the driver: "active", "passive" or "disable". + + "active" + The driver is functional and in the ``active mode`` + + "passive" + The driver is functional and in the ``passive mode`` + + "guided" + The driver is functional and in the ``guided mode`` + + "disable" + The driver is unregistered and not functional now. + + This attribute can be written to in order to change the driver's + operation mode or to unregister it. The string written to it must be + one of the possible values of it and, if successful, writing one of + these values to the sysfs file will cause the driver to switch over + to the operation mode represented by that string - or to be + unregistered in the "disable" case. ``cpupower`` tool support for ``amd-pstate`` =============================================== diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 0f17b1c32718..0efdbeed6ada 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1153,6 +1153,19 @@ int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) return cppc_get_perf(cpunum, NOMINAL_PERF, nominal_perf); } +/** + * cppc_get_epp_perf - Get the epp register value. + * @cpunum: CPU from which to get epp preference value. + * @epp_perf: Return address. + * + * Return: 0 for success, -EIO otherwise. + */ +int cppc_get_epp_perf(int cpunum, u64 *epp_perf) +{ + return cppc_get_perf(cpunum, ENERGY_PERF, epp_perf); +} +EXPORT_SYMBOL_GPL(cppc_get_epp_perf); + /** * cppc_get_perf_caps - Get a CPU's performance capabilities. * @cpunum: CPU from which to get capabilities info. 
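(Illustration only, not part of the diff.) A sketch of how a caller might consume the ``cppc_get_epp_perf()`` helper added in the hunk above; the ``read_epp_for_cpu()`` name below is hypothetical, but it mirrors what the amd-pstate changes later in this patch do in ``amd_pstate_get_epp()`` on shared-memory systems.

/*
 * Sketch only: read the EPP hint for a CPU via the new helper.
 * read_epp_for_cpu() is a hypothetical wrapper, not part of this patch.
 */
#include <acpi/cppc_acpi.h>

static int read_epp_for_cpu(int cpu)
{
	u64 epp;
	int ret;

	ret = cppc_get_epp_perf(cpu, &epp);
	if (ret)
		return ret;		/* CPPC layer error, e.g. -ENODEV or -EIO */

	return (int)(epp & 0xff);	/* mask to the low 8 bits, as amd_pstate_get_epp() does */
}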
@@ -1365,6 +1378,157 @@ int cppc_get_perf_ctrs(int cpunum, struct cppc_perf_fb_ctrs *perf_fb_ctrs) } EXPORT_SYMBOL_GPL(cppc_get_perf_ctrs); +/* + * Set Energy Performance Preference Register value through + * Performance Controls Interface + */ +int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) +{ + int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); + struct cpc_register_resource *epp_set_reg; + struct cpc_register_resource *auto_sel_reg; + struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); + struct cppc_pcc_data *pcc_ss_data = NULL; + int ret; + + if (!cpc_desc) { + pr_debug("No CPC descriptor for CPU:%d\n", cpu); + return -ENODEV; + } + + auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE]; + epp_set_reg = &cpc_desc->cpc_regs[ENERGY_PERF]; + + if (CPC_IN_PCC(epp_set_reg) || CPC_IN_PCC(auto_sel_reg)) { + if (pcc_ss_id < 0) { + pr_debug("Invalid pcc_ss_id for CPU:%d\n", cpu); + return -ENODEV; + } + + if (CPC_SUPPORTED(auto_sel_reg)) { + ret = cpc_write(cpu, auto_sel_reg, enable); + if (ret) + return ret; + } + + if (CPC_SUPPORTED(epp_set_reg)) { + ret = cpc_write(cpu, epp_set_reg, perf_ctrls->energy_perf); + if (ret) + return ret; + } + + pcc_ss_data = pcc_data[pcc_ss_id]; + + down_write(&pcc_ss_data->pcc_lock); + /* after writing CPC, transfer the ownership of PCC to platform */ + ret = send_pcc_cmd(pcc_ss_id, CMD_WRITE); + up_write(&pcc_ss_data->pcc_lock); + } else { + ret = -ENOTSUPP; + pr_debug("_CPC in PCC is not supported\n"); + } + + return ret; +} +EXPORT_SYMBOL_GPL(cppc_set_epp_perf); + +/* + * cppc_get_auto_sel_caps - Read autonomous selection register. + * @cpunum : CPU from which to read register. + * @perf_caps : struct where autonomous selection register value is updated. + */ +int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps) +{ + struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpunum); + struct cpc_register_resource *auto_sel_reg; + u64 auto_sel; + + if (!cpc_desc) { + pr_debug("No CPC descriptor for CPU:%d\n", cpunum); + return -ENODEV; + } + + auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE]; + + if (!CPC_SUPPORTED(auto_sel_reg)) + pr_warn_once("Autonomous mode is not unsupported!\n"); + + if (CPC_IN_PCC(auto_sel_reg)) { + int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpunum); + struct cppc_pcc_data *pcc_ss_data = NULL; + int ret = 0; + + if (pcc_ss_id < 0) + return -ENODEV; + + pcc_ss_data = pcc_data[pcc_ss_id]; + + down_write(&pcc_ss_data->pcc_lock); + + if (send_pcc_cmd(pcc_ss_id, CMD_READ) >= 0) { + cpc_read(cpunum, auto_sel_reg, &auto_sel); + perf_caps->auto_sel = (bool)auto_sel; + } else { + ret = -EIO; + } + + up_write(&pcc_ss_data->pcc_lock); + + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(cppc_get_auto_sel_caps); + +/* + * cppc_set_auto_sel - Write autonomous selection register. + * @cpu : CPU to which to write register. + * @enable : the desired value of autonomous selection resiter to be updated. 
+ */ +int cppc_set_auto_sel(int cpu, bool enable) +{ + int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); + struct cpc_register_resource *auto_sel_reg; + struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); + struct cppc_pcc_data *pcc_ss_data = NULL; + int ret = -EINVAL; + + if (!cpc_desc) { + pr_debug("No CPC descriptor for CPU:%d\n", cpu); + return -ENODEV; + } + + auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE]; + + if (CPC_IN_PCC(auto_sel_reg)) { + if (pcc_ss_id < 0) { + pr_debug("Invalid pcc_ss_id\n"); + return -ENODEV; + } + + if (CPC_SUPPORTED(auto_sel_reg)) { + ret = cpc_write(cpu, auto_sel_reg, enable); + if (ret) + return ret; + } + + pcc_ss_data = pcc_data[pcc_ss_id]; + + down_write(&pcc_ss_data->pcc_lock); + /* after writing CPC, transfer the ownership of PCC to platform */ + ret = send_pcc_cmd(pcc_ss_id, CMD_WRITE); + up_write(&pcc_ss_data->pcc_lock); + } else { + ret = -ENOTSUPP; + pr_debug("_CPC in PCC is not supported\n"); + } + + return ret; +} +EXPORT_SYMBOL_GPL(cppc_set_auto_sel); + + /** * cppc_set_enable - Set to enable CPPC on the processor by writing the * Continuous Performance Control package EnableRegister field. @@ -1420,7 +1584,7 @@ EXPORT_SYMBOL_GPL(cppc_set_enable); int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) { struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); - struct cpc_register_resource *desired_reg; + struct cpc_register_resource *desired_reg, *min_perf_reg, *max_perf_reg; int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); struct cppc_pcc_data *pcc_ss_data = NULL; int ret = 0; @@ -1431,6 +1595,8 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) } desired_reg = &cpc_desc->cpc_regs[DESIRED_PERF]; + min_perf_reg = &cpc_desc->cpc_regs[MIN_PERF]; + max_perf_reg = &cpc_desc->cpc_regs[MAX_PERF]; /* * This is Phase-I where we want to write to CPC registers @@ -1439,7 +1605,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) * Since read_lock can be acquired by multiple CPUs simultaneously we * achieve that goal here */ - if (CPC_IN_PCC(desired_reg)) { + if (CPC_IN_PCC(desired_reg) || CPC_IN_PCC(min_perf_reg) || CPC_IN_PCC(max_perf_reg)) { if (pcc_ss_id < 0) { pr_debug("Invalid pcc_ss_id\n"); return -ENODEV; @@ -1462,13 +1628,19 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) cpc_desc->write_cmd_status = 0; } - /* - * Skip writing MIN/MAX until Linux knows how to come up with - * useful values. - */ cpc_write(cpu, desired_reg, perf_ctrls->desired_perf); - if (CPC_IN_PCC(desired_reg)) + /** + * Only write if min_perf and max_perf not zero. Some drivers pass zero + * value to min and max perf, but they don't mean to set the zero value, + * they just don't want to write to those registers. 
+ */ + if (perf_ctrls->min_perf) + cpc_write(cpu, min_perf_reg, perf_ctrls->min_perf); + if (perf_ctrls->max_perf) + cpc_write(cpu, max_perf_reg, perf_ctrls->max_perf); + + if (CPC_IN_PCC(desired_reg) || CPC_IN_PCC(min_perf_reg) || CPC_IN_PCC(max_perf_reg)) up_read(&pcc_ss_data->pcc_lock); /* END Phase-I */ /* * This is Phase-II where we transfer the ownership of PCC to Platform @@ -1516,7 +1688,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) * case during a CMD_READ and if there are pending writes it delivers * the write command before servicing the read command */ - if (CPC_IN_PCC(desired_reg)) { + if (CPC_IN_PCC(desired_reg) || CPC_IN_PCC(min_perf_reg) || CPC_IN_PCC(max_perf_reg)) { if (down_write_trylock(&pcc_ss_data->pcc_lock)) {/* BEGIN Phase-II */ /* Update only if there are pending write commands */ if (pcc_ss_data->pending_pcc_write_cmd) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index c17bd845f5fc..f4f96baae500 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -59,8 +59,173 @@ * we disable it by default to go acpi-cpufreq on these processors and add a * module parameter to be able to enable it manually for debugging. */ +static struct cpufreq_driver *current_pstate_driver; static struct cpufreq_driver amd_pstate_driver; -static int cppc_load __initdata; +static struct cpufreq_driver amd_pstate_epp_driver; +static int cppc_state = AMD_PSTATE_DISABLE; +struct kobject *amd_pstate_kobj; + +/* + * AMD Energy Preference Performance (EPP) + * The EPP is used in the CCLK DPM controller to drive + * the frequency that a core is going to operate during + * short periods of activity. EPP values will be utilized for + * different OS profiles (balanced, performance, power savings) + * display strings corresponding to EPP index in the + * energy_perf_strings[] + * index String + *------------------------------------- + * 0 default + * 1 performance + * 2 balance_performance + * 3 balance_power + * 4 power + */ +enum energy_perf_value_index { + EPP_INDEX_DEFAULT = 0, + EPP_INDEX_PERFORMANCE, + EPP_INDEX_BALANCE_PERFORMANCE, + EPP_INDEX_BALANCE_POWERSAVE, + EPP_INDEX_POWERSAVE, +}; + +static const char * const energy_perf_strings[] = { + [EPP_INDEX_DEFAULT] = "default", + [EPP_INDEX_PERFORMANCE] = "performance", + [EPP_INDEX_BALANCE_PERFORMANCE] = "balance_performance", + [EPP_INDEX_BALANCE_POWERSAVE] = "balance_power", + [EPP_INDEX_POWERSAVE] = "power", + NULL +}; + +static unsigned int epp_values[] = { + [EPP_INDEX_DEFAULT] = 0, + [EPP_INDEX_PERFORMANCE] = AMD_CPPC_EPP_PERFORMANCE, + [EPP_INDEX_BALANCE_PERFORMANCE] = AMD_CPPC_EPP_BALANCE_PERFORMANCE, + [EPP_INDEX_BALANCE_POWERSAVE] = AMD_CPPC_EPP_BALANCE_POWERSAVE, + [EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE, + }; + +typedef int (*cppc_mode_transition_fn)(int); + +static inline int get_mode_idx_from_str(const char *str, size_t size) +{ + int i; + + for (i=0; i < AMD_PSTATE_MAX; i++) { + if (!strncmp(str, amd_pstate_mode_string[i], size)) + return i; + } + return -EINVAL; +} + +static DEFINE_MUTEX(amd_pstate_limits_lock); +static DEFINE_MUTEX(amd_pstate_driver_lock); + +static s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached) +{ + u64 epp; + int ret; + + if (boot_cpu_has(X86_FEATURE_CPPC)) { + if (!cppc_req_cached) { + epp = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, + &cppc_req_cached); + if (epp) + return epp; + } + epp = (cppc_req_cached >> 24) & 0xFF; + } else { + ret = cppc_get_epp_perf(cpudata->cpu, &epp); + if (ret < 0) { + 
pr_debug("Could not retrieve energy perf value (%d)\n", ret); + return -EIO; + } + } + + return (s16)(epp & 0xff); +} + +static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata) +{ + s16 epp; + int index = -EINVAL; + + epp = amd_pstate_get_epp(cpudata, 0); + if (epp < 0) + return epp; + + switch (epp) { + case AMD_CPPC_EPP_PERFORMANCE: + index = EPP_INDEX_PERFORMANCE; + break; + case AMD_CPPC_EPP_BALANCE_PERFORMANCE: + index = EPP_INDEX_BALANCE_PERFORMANCE; + break; + case AMD_CPPC_EPP_BALANCE_POWERSAVE: + index = EPP_INDEX_BALANCE_POWERSAVE; + break; + case AMD_CPPC_EPP_POWERSAVE: + index = EPP_INDEX_POWERSAVE; + break; + default: + break; + } + + return index; +} + +static int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp) +{ + int ret; + struct cppc_perf_ctrls perf_ctrls; + + if (boot_cpu_has(X86_FEATURE_CPPC)) { + u64 value = READ_ONCE(cpudata->cppc_req_cached); + + value &= ~GENMASK_ULL(31, 24); + value |= (u64)epp << 24; + WRITE_ONCE(cpudata->cppc_req_cached, value); + + ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); + if (!ret) + cpudata->epp_cached = epp; + } else { + perf_ctrls.energy_perf = epp; + ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); + if (ret) { + pr_debug("failed to set energy perf value (%d)\n", ret); + return ret; + } + cpudata->epp_cached = epp; + } + + return ret; +} + +static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, + int pref_index) +{ + int epp = -EINVAL; + int ret; + + if (!pref_index) { + pr_debug("EPP pref_index is invalid\n"); + return -EINVAL; + } + + if (epp == -EINVAL) + epp = epp_values[pref_index]; + + if (epp > 0 && cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) { + pr_debug("EPP cannot be set under performance policy\n"); + return -EBUSY; + } + + ret = amd_pstate_set_epp(cpudata, epp); + + return ret; +} static inline int pstate_enable(bool enable) { @@ -70,11 +235,21 @@ static inline int pstate_enable(bool enable) static int cppc_enable(bool enable) { int cpu, ret = 0; + struct cppc_perf_ctrls perf_ctrls; for_each_present_cpu(cpu) { ret = cppc_set_enable(cpu, enable); if (ret) return ret; + + /* Enable autonomous mode for EPP */ + if (cppc_state == AMD_PSTATE_ACTIVE) { + /* Set desired perf as zero to allow EPP firmware control */ + perf_ctrls.desired_perf = 0; + ret = cppc_set_perf(cpu, &perf_ctrls); + if (ret) + return ret; + } } return ret; @@ -135,7 +310,22 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) cppc_perf.lowest_nonlinear_perf); WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf); - return 0; + if (cppc_state == AMD_PSTATE_ACTIVE) + return 0; + + ret = cppc_get_auto_sel_caps(cpudata->cpu, &cppc_perf); + if (ret) { + pr_warn("failed to get auto_sel, ret: %d\n", ret); + return 0; + } + + ret = cppc_set_auto_sel(cpudata->cpu, + (cppc_state == AMD_PSTATE_PASSIVE) ? 
0 : 1); + + if (ret) + pr_warn("failed to set auto_sel, ret: %d\n", ret); + + return ret; } DEFINE_STATIC_CALL(amd_pstate_init_perf, pstate_init_perf); @@ -212,12 +402,18 @@ static inline bool amd_pstate_sample(struct amd_cpudata *cpudata) } static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, - u32 des_perf, u32 max_perf, bool fast_switch) + u32 des_perf, u32 max_perf, bool fast_switch, int gov_flags) { u64 prev = READ_ONCE(cpudata->cppc_req_cached); u64 value = prev; des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf); + + if ((cppc_state == AMD_PSTATE_GUIDED) && (gov_flags & CPUFREQ_GOV_DYNAMIC_SWITCHING)) { + min_perf = des_perf; + des_perf = 0; + } + value &= ~AMD_CPPC_MIN_PERF(~0L); value |= AMD_CPPC_MIN_PERF(min_perf); @@ -272,7 +468,7 @@ static int amd_pstate_target(struct cpufreq_policy *policy, cpufreq_freq_transition_begin(policy, &freqs); amd_pstate_update(cpudata, min_perf, des_perf, - max_perf, false); + max_perf, false, policy->governor->flags); cpufreq_freq_transition_end(policy, &freqs, false); return 0; @@ -306,7 +502,8 @@ static void amd_pstate_adjust_perf(unsigned int cpu, if (max_perf < min_perf) max_perf = min_perf; - amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true); + amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true, + policy->governor->flags); cpufreq_cpu_put(policy); } @@ -418,7 +615,7 @@ static void amd_pstate_boost_init(struct amd_cpudata *cpudata) return; cpudata->boost_supported = true; - amd_pstate_driver.boost_enabled = true; + current_pstate_driver->boost_enabled = true; } static void amd_perf_ctl_reset(unsigned int cpu) @@ -501,6 +698,8 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) policy->driver_data = cpudata; amd_pstate_boost_init(cpudata); + if (!current_pstate_driver->adjust_perf) + current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; return 0; @@ -561,7 +760,7 @@ static ssize_t show_amd_pstate_max_freq(struct cpufreq_policy *policy, if (max_freq < 0) return max_freq; - return sprintf(&buf[0], "%u\n", max_freq); + return sysfs_emit(buf, "%u\n", max_freq); } static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *policy, @@ -574,7 +773,7 @@ static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *poli if (freq < 0) return freq; - return sprintf(&buf[0], "%u\n", freq); + return sysfs_emit(buf, "%u\n", freq); } /* @@ -589,13 +788,208 @@ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, perf = READ_ONCE(cpudata->highest_perf); - return sprintf(&buf[0], "%u\n", perf); + return sysfs_emit(buf, "%u\n", perf); +} + +static ssize_t show_energy_performance_available_preferences( + struct cpufreq_policy *policy, char *buf) +{ + int i = 0; + int offset = 0; + + while (energy_perf_strings[i] != NULL) + offset += sysfs_emit_at(buf, offset, "%s ", energy_perf_strings[i++]); + + sysfs_emit_at(buf, offset, "\n"); + + return offset; +} + +static ssize_t store_energy_performance_preference( + struct cpufreq_policy *policy, const char *buf, size_t count) +{ + struct amd_cpudata *cpudata = policy->driver_data; + char str_preference[21]; + ssize_t ret; + + ret = sscanf(buf, "%20s", str_preference); + if (ret != 1) + return -EINVAL; + + ret = match_string(energy_perf_strings, -1, str_preference); + if (ret < 0) + return -EINVAL; + + mutex_lock(&amd_pstate_limits_lock); + ret = amd_pstate_set_energy_pref_index(cpudata, ret); + mutex_unlock(&amd_pstate_limits_lock); + + return ret ?: count; +} + +static ssize_t 
show_energy_performance_preference( + struct cpufreq_policy *policy, char *buf) +{ + struct amd_cpudata *cpudata = policy->driver_data; + int preference; + + preference = amd_pstate_get_energy_pref_index(cpudata); + if (preference < 0) + return preference; + + return sysfs_emit(buf, "%s\n", energy_perf_strings[preference]); +} + +static void amd_pstate_driver_cleanup(void) +{ + amd_pstate_enable(false); + cppc_state = AMD_PSTATE_DISABLE; + current_pstate_driver = NULL; +} + +static int amd_pstate_register_driver(int mode) +{ + int ret; + + if (mode == AMD_PSTATE_PASSIVE || mode == AMD_PSTATE_GUIDED) + current_pstate_driver = &amd_pstate_driver; + else if (mode == AMD_PSTATE_ACTIVE) + current_pstate_driver = &amd_pstate_epp_driver; + else + return -EINVAL; + + cppc_state = mode; + ret = cpufreq_register_driver(current_pstate_driver); + if (ret) { + amd_pstate_driver_cleanup(); + return ret; + } + return 0; +} + +static int amd_pstate_unregister_driver(int dummy) +{ + cpufreq_unregister_driver(current_pstate_driver); + amd_pstate_driver_cleanup(); + return 0; +} + +static int amd_pstate_change_mode_without_dvr_change(int mode) +{ + int cpu = 0; + + cppc_state = mode; + + if (boot_cpu_has(X86_FEATURE_CPPC) || cppc_state == AMD_PSTATE_ACTIVE) + return 0; + + for_each_present_cpu(cpu) { + cppc_set_auto_sel(cpu, (cppc_state == AMD_PSTATE_PASSIVE) ? 0 : 1); + } + + return 0; +} + +static int amd_pstate_change_driver_mode(int mode) +{ + int ret; + + ret = amd_pstate_unregister_driver(0); + if (ret) + return ret; + + ret = amd_pstate_register_driver(mode); + if (ret) + return ret; + + return 0; +} + +cppc_mode_transition_fn mode_state_machine[AMD_PSTATE_MAX][AMD_PSTATE_MAX] = { + [AMD_PSTATE_DISABLE] = { + [AMD_PSTATE_DISABLE] = NULL, + [AMD_PSTATE_PASSIVE] = amd_pstate_register_driver, + [AMD_PSTATE_ACTIVE] = amd_pstate_register_driver, + [AMD_PSTATE_GUIDED] = amd_pstate_register_driver, + }, + [AMD_PSTATE_PASSIVE] = { + [AMD_PSTATE_DISABLE] = amd_pstate_unregister_driver, + [AMD_PSTATE_PASSIVE] = NULL, + [AMD_PSTATE_ACTIVE] = amd_pstate_change_driver_mode, + [AMD_PSTATE_GUIDED] = amd_pstate_change_mode_without_dvr_change, + }, + [AMD_PSTATE_ACTIVE] = { + [AMD_PSTATE_DISABLE] = amd_pstate_unregister_driver, + [AMD_PSTATE_PASSIVE] = amd_pstate_change_driver_mode, + [AMD_PSTATE_ACTIVE] = NULL, + [AMD_PSTATE_GUIDED] = amd_pstate_change_driver_mode, + }, + [AMD_PSTATE_GUIDED] = { + [AMD_PSTATE_DISABLE] = amd_pstate_unregister_driver, + [AMD_PSTATE_PASSIVE] = amd_pstate_change_mode_without_dvr_change, + [AMD_PSTATE_ACTIVE] = amd_pstate_change_driver_mode, + [AMD_PSTATE_GUIDED] = NULL, + }, +}; + +static ssize_t amd_pstate_show_status(char *buf) +{ + if (!current_pstate_driver) + return sysfs_emit(buf, "disable\n"); + + return sysfs_emit(buf, "%s\n", amd_pstate_mode_string[cppc_state]); +} + +static int amd_pstate_update_status(const char *buf, size_t size) +{ + int mode_idx; + + if (size > strlen("passive") || size < strlen("active")) + return -EINVAL; + + mode_idx = get_mode_idx_from_str(buf, size); + + if (mode_idx < 0 || mode_idx >= AMD_PSTATE_MAX) + return -EINVAL; + + if (mode_state_machine[cppc_state][mode_idx]) + return mode_state_machine[cppc_state][mode_idx](mode_idx); + + return 0; +} + +static ssize_t show_status(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + ssize_t ret; + + mutex_lock(&amd_pstate_driver_lock); + ret = amd_pstate_show_status(buf); + mutex_unlock(&amd_pstate_driver_lock); + + return ret; +} + +static ssize_t store_status(struct kobject *a, struct 
kobj_attribute *b, + const char *buf, size_t count) +{ + char *p = memchr(buf, '\n', count); + int ret; + + mutex_lock(&amd_pstate_driver_lock); + ret = amd_pstate_update_status(buf, p ? p - buf : count); + mutex_unlock(&amd_pstate_driver_lock); + + return ret < 0 ? ret : count; } cpufreq_freq_attr_ro(amd_pstate_max_freq); cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); cpufreq_freq_attr_ro(amd_pstate_highest_perf); +cpufreq_freq_attr_rw(energy_performance_preference); +cpufreq_freq_attr_ro(energy_performance_available_preferences); +define_one_global_rw(status); static struct freq_attr *amd_pstate_attr[] = { &amd_pstate_max_freq, @@ -604,6 +998,313 @@ static struct freq_attr *amd_pstate_attr[] = { NULL, }; +static struct freq_attr *amd_pstate_epp_attr[] = { + &amd_pstate_max_freq, + &amd_pstate_lowest_nonlinear_freq, + &amd_pstate_highest_perf, + &energy_performance_preference, + &energy_performance_available_preferences, + NULL, +}; + +static struct attribute *pstate_global_attributes[] = { + &status.attr, + NULL +}; + +static const struct attribute_group amd_pstate_global_attr_group = { + .attrs = pstate_global_attributes, +}; + +static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) +{ + int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; + struct amd_cpudata *cpudata; + struct device *dev; + u64 value; + + /* + * Resetting PERF_CTL_MSR will put the CPU in P0 frequency, + * which is ideal for initialization process. + */ + amd_perf_ctl_reset(policy->cpu); + dev = get_cpu_device(policy->cpu); + if (!dev) + return -ENODEV; + + cpudata = kzalloc(sizeof(*cpudata), GFP_KERNEL); + if (!cpudata) + return -ENOMEM; + + cpudata->cpu = policy->cpu; + cpudata->epp_policy = 0; + + ret = amd_pstate_init_perf(cpudata); + if (ret) + goto free_cpudata1; + + min_freq = amd_get_min_freq(cpudata); + max_freq = amd_get_max_freq(cpudata); + nominal_freq = amd_get_nominal_freq(cpudata); + lowest_nonlinear_freq = amd_get_lowest_nonlinear_freq(cpudata); + if (min_freq < 0 || max_freq < 0 || min_freq > max_freq) { + dev_err(dev, "min_freq(%d) or max_freq(%d) value is incorrect\n", + min_freq, max_freq); + ret = -EINVAL; + goto free_cpudata1; + } + + policy->cpuinfo.min_freq = min_freq; + policy->cpuinfo.max_freq = max_freq; + /* It will be updated by governor */ + policy->cur = policy->cpuinfo.min_freq; + + /* Initial processor data capability frequencies */ + cpudata->max_freq = max_freq; + cpudata->min_freq = min_freq; + cpudata->nominal_freq = nominal_freq; + cpudata->lowest_nonlinear_freq = lowest_nonlinear_freq; + + policy->driver_data = cpudata; + + cpudata->epp_cached = amd_pstate_get_epp(cpudata, 0); + + policy->min = policy->cpuinfo.min_freq; + policy->max = policy->cpuinfo.max_freq; + + /* + * Set the policy to powersave to provide a valid fallback value in case + * the default cpufreq governor is neither powersave nor performance. 
+ */ + policy->policy = CPUFREQ_POLICY_POWERSAVE; + + if (boot_cpu_has(X86_FEATURE_CPPC)) { + policy->fast_switch_possible = true; + ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value); + if (ret) + return ret; + WRITE_ONCE(cpudata->cppc_req_cached, value); + + ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, &value); + if (ret) + return ret; + WRITE_ONCE(cpudata->cppc_cap1_cached, value); + } + amd_pstate_boost_init(cpudata); + + return 0; + +free_cpudata1: + kfree(cpudata); + return ret; +} + +static int amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) +{ + pr_debug("CPU %d exiting\n", policy->cpu); + policy->fast_switch_possible = false; + return 0; +} + +static void amd_pstate_epp_init(unsigned int cpu) +{ + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct amd_cpudata *cpudata = policy->driver_data; + u32 max_perf, min_perf; + u64 value; + s16 epp; + + max_perf = READ_ONCE(cpudata->highest_perf); + min_perf = READ_ONCE(cpudata->lowest_perf); + + value = READ_ONCE(cpudata->cppc_req_cached); + + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) + min_perf = max_perf; + + /* Initial min/max values for CPPC Performance Controls Register */ + value &= ~AMD_CPPC_MIN_PERF(~0L); + value |= AMD_CPPC_MIN_PERF(min_perf); + + value &= ~AMD_CPPC_MAX_PERF(~0L); + value |= AMD_CPPC_MAX_PERF(max_perf); + + /* CPPC EPP feature require to set zero to the desire perf bit */ + value &= ~AMD_CPPC_DES_PERF(~0L); + value |= AMD_CPPC_DES_PERF(0); + + if (cpudata->epp_policy == cpudata->policy) + goto skip_epp; + + cpudata->epp_policy = cpudata->policy; + + /* Get BIOS pre-defined epp value */ + epp = amd_pstate_get_epp(cpudata, value); + if (epp < 0) { + /** + * This return value can only be negative for shared_memory + * systems where EPP register read/write not supported. 
+ */ + goto skip_epp; + } + + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) + epp = 0; + + /* Set initial EPP value */ + if (boot_cpu_has(X86_FEATURE_CPPC)) { + value &= ~GENMASK_ULL(31, 24); + value |= (u64)epp << 24; + } + + WRITE_ONCE(cpudata->cppc_req_cached, value); + amd_pstate_set_epp(cpudata, epp); +skip_epp: + cpufreq_cpu_put(policy); +} + +static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + + if (!policy->cpuinfo.max_freq) + return -ENODEV; + + pr_debug("set_policy: cpuinfo.max %u policy->max %u\n", + policy->cpuinfo.max_freq, policy->max); + + cpudata->policy = policy->policy; + + amd_pstate_epp_init(policy->cpu); + + return 0; +} + +static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) +{ + struct cppc_perf_ctrls perf_ctrls; + u64 value, max_perf; + int ret; + + ret = amd_pstate_enable(true); + if (ret) + pr_err("failed to enable amd pstate during resume, return %d\n", ret); + + value = READ_ONCE(cpudata->cppc_req_cached); + max_perf = READ_ONCE(cpudata->highest_perf); + + if (boot_cpu_has(X86_FEATURE_CPPC)) { + wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); + } else { + perf_ctrls.max_perf = max_perf; + perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(cpudata->epp_cached); + cppc_set_perf(cpudata->cpu, &perf_ctrls); + } +} + +static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + + pr_debug("AMD CPU Core %d going online\n", cpudata->cpu); + + if (cppc_state == AMD_PSTATE_ACTIVE) { + amd_pstate_epp_reenable(cpudata); + cpudata->suspended = false; + } + + return 0; +} + +static void amd_pstate_epp_offline(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + struct cppc_perf_ctrls perf_ctrls; + int min_perf; + u64 value; + + min_perf = READ_ONCE(cpudata->lowest_perf); + value = READ_ONCE(cpudata->cppc_req_cached); + + mutex_lock(&amd_pstate_limits_lock); + if (boot_cpu_has(X86_FEATURE_CPPC)) { + cpudata->epp_policy = CPUFREQ_POLICY_UNKNOWN; + + /* Set max perf same as min perf */ + value &= ~AMD_CPPC_MAX_PERF(~0L); + value |= AMD_CPPC_MAX_PERF(min_perf); + value &= ~AMD_CPPC_MIN_PERF(~0L); + value |= AMD_CPPC_MIN_PERF(min_perf); + wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); + } else { + perf_ctrls.desired_perf = 0; + perf_ctrls.max_perf = min_perf; + perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(HWP_EPP_BALANCE_POWERSAVE); + cppc_set_perf(cpudata->cpu, &perf_ctrls); + } + mutex_unlock(&amd_pstate_limits_lock); +} + +static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + + pr_debug("AMD CPU Core %d going offline\n", cpudata->cpu); + + if (cpudata->suspended) + return 0; + + if (cppc_state == AMD_PSTATE_ACTIVE) + amd_pstate_epp_offline(policy); + + return 0; +} + +static int amd_pstate_epp_verify_policy(struct cpufreq_policy_data *policy) +{ + cpufreq_verify_within_cpu_limits(policy); + pr_debug("policy_max =%d, policy_min=%d\n", policy->max, policy->min); + return 0; +} + +static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + int ret; + + /* avoid suspending when EPP is not enabled */ + if (cppc_state != AMD_PSTATE_ACTIVE) + return 0; + + /* set this flag to avoid setting core offline*/ + cpudata->suspended = true; + + /* disable CPPC in lowlevel firmware */ + ret = amd_pstate_enable(false); + if (ret) + pr_err("failed to 
suspend, return %d\n", ret); + + return 0; +} + +static int amd_pstate_epp_resume(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + + if (cpudata->suspended) { + mutex_lock(&amd_pstate_limits_lock); + + /* enable amd pstate from suspend state*/ + amd_pstate_epp_reenable(cpudata); + + mutex_unlock(&amd_pstate_limits_lock); + + cpudata->suspended = false; + } + + return 0; +} + static struct cpufreq_driver amd_pstate_driver = { .flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS, .verify = amd_pstate_verify, @@ -617,6 +1318,20 @@ static struct cpufreq_driver amd_pstate_driver = { .attr = amd_pstate_attr, }; +static struct cpufreq_driver amd_pstate_epp_driver = { + .flags = CPUFREQ_CONST_LOOPS, + .verify = amd_pstate_epp_verify_policy, + .setpolicy = amd_pstate_epp_set_policy, + .init = amd_pstate_epp_cpu_init, + .exit = amd_pstate_epp_cpu_exit, + .offline = amd_pstate_epp_cpu_offline, + .online = amd_pstate_epp_cpu_online, + .suspend = amd_pstate_epp_suspend, + .resume = amd_pstate_epp_resume, + .name = "amd_pstate_epp", + .attr = amd_pstate_epp_attr, +}; + static int __init amd_pstate_init(void) { int ret; @@ -626,10 +1341,10 @@ static int __init amd_pstate_init(void) /* * by default the pstate driver is disabled to load * enable the amd_pstate passive mode driver explicitly - * with amd_pstate=passive in kernel command line + * with amd_pstate=passive or other modes in kernel command line */ - if (!cppc_load) { - pr_debug("driver load is disabled, boot with amd_pstate=passive to enable this\n"); + if (cppc_state == AMD_PSTATE_DISABLE) { + pr_info("driver load is disabled, boot with specific mode to enable this\n"); return -ENODEV; } @@ -645,7 +1360,8 @@ static int __init amd_pstate_init(void) /* capability check */ if (boot_cpu_has(X86_FEATURE_CPPC)) { pr_debug("AMD CPPC MSR based functionality is supported\n"); - amd_pstate_driver.adjust_perf = amd_pstate_adjust_perf; + if (cppc_state != AMD_PSTATE_ACTIVE) + current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; } else { pr_debug("AMD CPPC shared memory based functionality is supported\n"); static_call_update(amd_pstate_enable, cppc_enable); @@ -656,31 +1372,63 @@ static int __init amd_pstate_init(void) /* enable amd pstate feature */ ret = amd_pstate_enable(true); if (ret) { - pr_err("failed to enable amd-pstate with return %d\n", ret); + pr_err("failed to enable with return %d\n", ret); return ret; } - ret = cpufreq_register_driver(&amd_pstate_driver); + ret = cpufreq_register_driver(current_pstate_driver); if (ret) - pr_err("failed to register amd_pstate_driver with return %d\n", - ret); + pr_err("failed to register with return %d\n", ret); + + amd_pstate_kobj = kobject_create_and_add("amd_pstate", &cpu_subsys.dev_root->kobj); + if (!amd_pstate_kobj) { + ret = -EINVAL; + pr_err("global sysfs registration failed.\n"); + goto kobject_free; + } + ret = sysfs_create_group(amd_pstate_kobj, &amd_pstate_global_attr_group); + if (ret) { + pr_err("sysfs attribute export failed with error %d.\n", ret); + goto global_attr_free; + } + + return ret; + +global_attr_free: + kobject_put(amd_pstate_kobj); +kobject_free: + cpufreq_unregister_driver(current_pstate_driver); return ret; } device_initcall(amd_pstate_init); static int __init amd_pstate_param(char *str) { + size_t size; + int mode_idx; + if (!str) return -EINVAL; - if (!strcmp(str, "disable")) { - cppc_load = 0; - pr_info("driver is explicitly disabled\n"); - } else if (!strcmp(str, "passive")) - cppc_load = 1; + size = strlen(str); + 
mode_idx = get_mode_idx_from_str(str, size); - return 0; + if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) { + cppc_state = mode_idx; + if (cppc_state == AMD_PSTATE_DISABLE) + pr_info("driver is explicitly disabled\n"); + + if (cppc_state == AMD_PSTATE_ACTIVE) + current_pstate_driver = &amd_pstate_epp_driver; + + if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED) + current_pstate_driver = &amd_pstate_driver; + + return 0; + } + + return -EINVAL; } early_param("amd_pstate", amd_pstate_param); diff --git a/drivers/cpufreq/brcmstb-avs-cpufreq.c b/drivers/cpufreq/brcmstb-avs-cpufreq.c index 4153150e20db..ffea6402189d 100644 --- a/drivers/cpufreq/brcmstb-avs-cpufreq.c +++ b/drivers/cpufreq/brcmstb-avs-cpufreq.c @@ -751,10 +751,7 @@ static int brcm_avs_cpufreq_probe(struct platform_device *pdev) static int brcm_avs_cpufreq_remove(struct platform_device *pdev) { - int ret; - - ret = cpufreq_unregister_driver(&brcm_avs_driver); - WARN_ON(ret); + cpufreq_unregister_driver(&brcm_avs_driver); brcm_avs_prepare_uninit(pdev); diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 7e56a42750ea..85a0bea2dbf1 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2904,12 +2904,12 @@ EXPORT_SYMBOL_GPL(cpufreq_register_driver); * Returns zero if successful, and -EINVAL if the cpufreq_driver is * currently not initialised. */ -int cpufreq_unregister_driver(struct cpufreq_driver *driver) +void cpufreq_unregister_driver(struct cpufreq_driver *driver) { unsigned long flags; - if (!cpufreq_driver || (driver != cpufreq_driver)) - return -EINVAL; + if (WARN_ON(!cpufreq_driver || (driver != cpufreq_driver))) + return; pr_debug("unregistering driver %s\n", driver->name); @@ -2926,8 +2926,6 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver) write_unlock_irqrestore(&cpufreq_driver_lock, flags); cpus_read_unlock(); - - return 0; } EXPORT_SYMBOL_GPL(cpufreq_unregister_driver); diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c index f80339779084..f21a9e3df53d 100644 --- a/drivers/cpufreq/mediatek-cpufreq-hw.c +++ b/drivers/cpufreq/mediatek-cpufreq-hw.c @@ -317,7 +317,9 @@ static int mtk_cpufreq_hw_driver_probe(struct platform_device *pdev) static int mtk_cpufreq_hw_driver_remove(struct platform_device *pdev) { - return cpufreq_unregister_driver(&cpufreq_mtk_hw_driver); + cpufreq_unregister_driver(&cpufreq_mtk_hw_driver); + + return 0; } static const struct of_device_id mtk_cpufreq_hw_match[] = { diff --git a/drivers/cpufreq/omap-cpufreq.c b/drivers/cpufreq/omap-cpufreq.c index 1b50df06c6bc..81649a1969b6 100644 --- a/drivers/cpufreq/omap-cpufreq.c +++ b/drivers/cpufreq/omap-cpufreq.c @@ -184,7 +184,9 @@ static int omap_cpufreq_probe(struct platform_device *pdev) static int omap_cpufreq_remove(struct platform_device *pdev) { - return cpufreq_unregister_driver(&omap_driver); + cpufreq_unregister_driver(&omap_driver); + + return 0; } static struct platform_driver omap_cpufreq_platdrv = { diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index d3f55ca06ed3..2f581d2d617d 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -770,7 +770,9 @@ static int qcom_cpufreq_hw_driver_probe(struct platform_device *pdev) static int qcom_cpufreq_hw_driver_remove(struct platform_device *pdev) { - return cpufreq_unregister_driver(&cpufreq_qcom_hw_driver); + cpufreq_unregister_driver(&cpufreq_qcom_hw_driver); + + return 0; } static struct 
platform_driver qcom_cpufreq_hw_driver = { diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index c5614444031f..6126c977ece0 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -108,12 +108,15 @@ struct cppc_perf_caps { u32 lowest_nonlinear_perf; u32 lowest_freq; u32 nominal_freq; + u32 energy_perf; + bool auto_sel; }; struct cppc_perf_ctrls { u32 max_perf; u32 min_perf; u32 desired_perf; + u32 energy_perf; }; struct cppc_perf_fb_ctrs { @@ -149,6 +152,10 @@ extern bool cpc_ffh_supported(void); extern bool cpc_supported_by_cpu(void); extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val); extern int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val); +extern int cppc_get_epp_perf(int cpunum, u64 *epp_perf); +extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable); +extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps); +extern int cppc_set_auto_sel(int cpu, bool enable); #else /* !CONFIG_ACPI_CPPC_LIB */ static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf) { @@ -202,6 +209,22 @@ static inline int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) { return -ENOTSUPP; } +static inline int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) +{ + return -ENOTSUPP; +} +static inline int cppc_get_epp_perf(int cpunum, u64 *epp_perf) +{ + return -ENOTSUPP; +} +static inline int cppc_set_auto_sel(int cpu, bool enable) +{ + return -ENOTSUPP; +} +static inline int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps) +{ + return -ENOTSUPP; +} #endif /* !CONFIG_ACPI_CPPC_LIB */ #endif /* _CPPC_ACPI_H*/ diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h index 1c4b8659f171..c10ebf8c42e6 100644 --- a/include/linux/amd-pstate.h +++ b/include/linux/amd-pstate.h @@ -12,6 +12,11 @@ #include +#define AMD_CPPC_EPP_PERFORMANCE 0x00 +#define AMD_CPPC_EPP_BALANCE_PERFORMANCE 0x80 +#define AMD_CPPC_EPP_BALANCE_POWERSAVE 0xBF +#define AMD_CPPC_EPP_POWERSAVE 0xFF + /********************************************************************* * AMD P-state INTERFACE * *********************************************************************/ @@ -47,6 +52,10 @@ struct amd_aperf_mperf { * @prev: Last Aperf/Mperf/tsc count value read from register * @freq: current cpu frequency value * @boost_supported: check whether the Processor or SBIOS supports boost mode + * @epp_policy: Last saved policy used to set energy-performance preference + * @epp_cached: Cached CPPC energy-performance preference value + * @policy: Cpufreq policy value + * @cppc_cap1_cached Cached MSR_AMD_CPPC_CAP1 register value * * The amd_cpudata is key private data for each CPU thread in AMD P-State, and * represents all the attributes and goals that AMD P-State requests at runtime. 
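(Illustration only, not part of the diff.) The named sysfs preferences map onto fixed points of the 8-bit EPP scale defined just above, with 0x00 biased fully toward performance and 0xFF fully toward energy saving; the hypothetical table below simply restates the mapping that the driver's ``epp_values[]`` array earlier in this patch encodes.

/* Sketch: sysfs preference string -> EPP register value (see epp_values[]). */
#include <linux/amd-pstate.h>

static const unsigned int example_epp_map[] = {
	/* "performance"         */ AMD_CPPC_EPP_PERFORMANCE,		/* 0x00 */
	/* "balance_performance" */ AMD_CPPC_EPP_BALANCE_PERFORMANCE,	/* 0x80 */
	/* "balance_power"       */ AMD_CPPC_EPP_BALANCE_POWERSAVE,	/* 0xBF */
	/* "power"               */ AMD_CPPC_EPP_POWERSAVE,		/* 0xFF */
};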
@@ -72,6 +81,31 @@ struct amd_cpudata { u64 freq; bool boost_supported; + + /* EPP feature related attributes*/ + s16 epp_policy; + s16 epp_cached; + u32 policy; + u64 cppc_cap1_cached; + bool suspended; }; +/* + * enum amd_pstate_mode - driver working mode of amd pstate + */ +enum amd_pstate_mode { + AMD_PSTATE_DISABLE = 0, + AMD_PSTATE_PASSIVE, + AMD_PSTATE_ACTIVE, + AMD_PSTATE_GUIDED, + AMD_PSTATE_MAX, +}; + +static const char * const amd_pstate_mode_string[] = { + [AMD_PSTATE_DISABLE] = "disable", + [AMD_PSTATE_PASSIVE] = "passive", + [AMD_PSTATE_ACTIVE] = "active", + [AMD_PSTATE_GUIDED] = "guided", + NULL, +}; #endif /* _LINUX_AMD_PSTATE_H */ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 6a94a6eaad27..65623233ab2f 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -448,7 +448,7 @@ struct cpufreq_driver { #define CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING BIT(6) int cpufreq_register_driver(struct cpufreq_driver *driver_data); -int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); +void cpufreq_unregister_driver(struct cpufreq_driver *driver_data); bool cpufreq_driver_test_flags(u16 flags); const char *cpufreq_get_current_driver(void); -- 2.40.0 From 0cc7a04fbadb3ea4948f19e025e158f486f84663 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Fri, 17 Mar 2023 17:40:46 +0100 Subject: [PATCH 09/15] ksm Signed-off-by: Peter Jung --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 + arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + include/linux/ksm.h | 4 + include/linux/syscalls.h | 1 + include/uapi/asm-generic/unistd.h | 5 +- kernel/sys_ni.c | 1 + mm/ksm.c | 88 +++++++++------ mm/madvise.c | 113 ++++++++++++++++++++ 24 files changed, 198 insertions(+), 34 deletions(-) diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 8ebacf37a8cf..c9d25f85d86d 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -490,3 +490,4 @@ 558 common process_mrelease sys_process_mrelease 559 common futex_waitv sys_futex_waitv 560 common set_mempolicy_home_node sys_ni_syscall +561 common pmadv_ksm sys_pmadv_ksm diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index ac964612d8b0..90933eabe115 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -464,3 +464,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 037feba03a51..64a514f90131 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -39,7 +39,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END 
(__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 451 +#define __NR_compat_syscalls 452 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 604a2053d006..91f2bb7199af 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -907,6 +907,8 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease) __SYSCALL(__NR_futex_waitv, sys_futex_waitv) #define __NR_set_mempolicy_home_node 450 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) +#define __NR_pmadv_ksm 451 +__SYSCALL(__NR_pmadv_ksm, sys_pmadv_ksm) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 72c929d9902b..0d5b1d14b2b5 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -371,3 +371,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index b1f3940bc298..5ccf925567da 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -450,3 +450,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 820145e47350..6b76208597f3 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -456,3 +456,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 253ff994ed2e..e4aeedb17c38 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -389,3 +389,4 @@ 448 n32 process_mrelease sys_process_mrelease 449 n32 futex_waitv sys_futex_waitv 450 n32 set_mempolicy_home_node sys_set_mempolicy_home_node +451 n32 pmadv_ksm sys_pmadv_ksm diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 3f1886ad9d80..fe88db51efa0 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -365,3 +365,4 @@ 448 n64 process_mrelease sys_process_mrelease 449 n64 futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 n64 pmadv_ksm sys_pmadv_ksm diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 8f243e35a7b2..674cb940bd15 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -438,3 +438,4 @@ 448 o32 process_mrelease sys_process_mrelease 449 o32 futex_waitv sys_futex_waitv 450 o32 set_mempolicy_home_node sys_set_mempolicy_home_node +451 o32 pmadv_ksm sys_pmadv_ksm diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 0e42fceb2d5e..5914aa460255 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -448,3 +448,4 @@ 448 
common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index a0be127475b1..347894da4eb6 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -537,3 +537,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 799147658dee..1cd523748bd2 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -453,3 +453,4 @@ 448 common process_mrelease sys_process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm sys_pmadv_ksm diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 2de85c977f54..cfc75fa43eae 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -453,3 +453,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 4398cc6fb68d..d2c0a6426f6b 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -496,3 +496,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 320480a8db4f..331aaf1a782f 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -455,3 +455,4 @@ 448 i386 process_mrelease sys_process_mrelease 449 i386 futex_waitv sys_futex_waitv 450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node +451 i386 pmadv_ksm sys_pmadv_ksm diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index c84d12608cd2..14902db4c01f 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -372,6 +372,7 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 52c94ab5c205..1518e261d882 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -421,3 +421,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 7e232ba59b86..632a1a792ebb 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -16,6 +16,10 @@ #include #ifdef CONFIG_KSM +int 
ksm_madvise_merge(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long *vm_flags); +int ksm_madvise_unmerge(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long *vm_flags); int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); int __ksm_enter(struct mm_struct *mm); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 33a0ee3bcb2e..62f14e800839 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -919,6 +919,7 @@ asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, size_t vlen, int behavior, unsigned int flags); asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); +asmlinkage long sys_pmadv_ksm(int pidfd, int behavior, unsigned int flags); asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 45fa180cc56a..40f7e6d04af0 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -886,8 +886,11 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv) #define __NR_set_mempolicy_home_node 450 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) +#define __NR_pmadv_ksm 451 +__SYSCALL(__NR_pmadv_ksm, sys_pmadv_ksm) + #undef __NR_syscalls -#define __NR_syscalls 451 +#define __NR_syscalls 452 /* * 32 bit systems traditionally used different diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 860b2dcf3ac4..810e1fcaff94 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -292,6 +292,7 @@ COND_SYSCALL(mincore); COND_SYSCALL(madvise); COND_SYSCALL(process_madvise); COND_SYSCALL(process_mrelease); +COND_SYSCALL(pmadv_ksm); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); COND_SYSCALL(get_mempolicy); diff --git a/mm/ksm.c b/mm/ksm.c index ee60890cf9b1..bc920121bce9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2582,54 +2582,78 @@ static int ksm_scan_thread(void *nothing) return 0; } -int ksm_madvise(struct vm_area_struct *vma, unsigned long start, - unsigned long end, int advice, unsigned long *vm_flags) +int ksm_madvise_merge(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long *vm_flags) { - struct mm_struct *mm = vma->vm_mm; int err; - switch (advice) { - case MADV_MERGEABLE: - /* - * Be somewhat over-protective for now! - */ - if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | - VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_HUGETLB | VM_MIXEDMAP)) - return 0; /* just ignore the advice */ + /* + * Be somewhat over-protective for now! 
+ */ + if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_HUGETLB | VM_MIXEDMAP)) + return 0; /* just ignore the advice */ - if (vma_is_dax(vma)) - return 0; + if (vma_is_dax(vma)) + return 0; #ifdef VM_SAO if (*vm_flags & VM_SAO) return 0; #endif #ifdef VM_SPARC_ADI - if (*vm_flags & VM_SPARC_ADI) - return 0; + if (*vm_flags & VM_SPARC_ADI) + return 0; #endif - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { - err = __ksm_enter(mm); - if (err) - return err; - } + if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + err = __ksm_enter(mm); + if (err) + return err; + } - *vm_flags |= VM_MERGEABLE; - break; + *vm_flags |= VM_MERGEABLE; - case MADV_UNMERGEABLE: - if (!(*vm_flags & VM_MERGEABLE)) - return 0; /* just ignore the advice */ + return 0; +} - if (vma->anon_vma) { - err = unmerge_ksm_pages(vma, start, end); - if (err) - return err; - } +int ksm_madvise_unmerge(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long *vm_flags) +{ + int err; + + if (!(*vm_flags & VM_MERGEABLE)) + return 0; /* just ignore the advice */ + + if (vma->anon_vma) { + err = unmerge_ksm_pages(vma, start, end); + if (err) + return err; + } - *vm_flags &= ~VM_MERGEABLE; + *vm_flags &= ~VM_MERGEABLE; + + return 0; +} + +int ksm_madvise(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int advice, unsigned long *vm_flags) +{ + struct mm_struct *mm = vma->vm_mm; + int err; + + switch (advice) { + case MADV_MERGEABLE: + err = ksm_madvise_merge(mm, vma, vm_flags); + if (err) + return err; + break; + + case MADV_UNMERGEABLE: + err = ksm_madvise_unmerge(vma, start, end, vm_flags); + if (err) + return err; break; } diff --git a/mm/madvise.c b/mm/madvise.c index b6ea204d4e23..0064dcafb812 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1527,3 +1527,116 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, out: return ret; } + +SYSCALL_DEFINE3(pmadv_ksm, int, pidfd, int, behaviour, unsigned int, flags) +{ +#ifdef CONFIG_KSM + ssize_t ret; + struct pid *pid; + struct task_struct *task; + struct mm_struct *mm; + unsigned int f_flags; + struct vm_area_struct *vma; + struct vma_iterator vmi; + + if (flags != 0) { + ret = -EINVAL; + goto out; + } + + switch (behaviour) { + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: + break; + default: + ret = -EINVAL; + goto out; + break; + } + + pid = pidfd_get_pid(pidfd, &f_flags); + if (IS_ERR(pid)) { + ret = PTR_ERR(pid); + goto out; + } + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) { + ret = -ESRCH; + goto put_pid; + } + + /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); + if (IS_ERR_OR_NULL(mm)) { + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; + goto release_task; + } + + /* Require CAP_SYS_NICE for influencing process performance. 
*/ + if (!capable(CAP_SYS_NICE)) { + ret = -EPERM; + goto release_mm; + } + + if (mmap_write_lock_killable(mm)) { + ret = -EINTR; + goto release_mm; + } + + vma_iter_init(&vmi, mm, 0); + for_each_vma(vmi, vma) { + switch (behaviour) { + case MADV_MERGEABLE: + ret = ksm_madvise_merge(vma->vm_mm, vma, &vma->vm_flags); + break; + case MADV_UNMERGEABLE: + ret = ksm_madvise_unmerge(vma, vma->vm_start, vma->vm_end, &vma->vm_flags); + break; + default: + /* look, ma, no brain */ + break; + } + if (ret) + break; + } + + mmap_write_unlock(mm); + +release_mm: + mmput(mm); +release_task: + put_task_struct(task); +put_pid: + put_pid(pid); +out: + return ret; +#else /* CONFIG_KSM */ + return -ENOSYS; +#endif /* CONFIG_KSM */ +} + +#ifdef CONFIG_KSM +static ssize_t ksm_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", __NR_pmadv_ksm); +} +static struct kobj_attribute pmadv_ksm_attr = __ATTR_RO(ksm); + +static struct attribute *pmadv_sysfs_attrs[] = { + &pmadv_ksm_attr.attr, + NULL, +}; + +static const struct attribute_group pmadv_sysfs_attr_group = { + .attrs = pmadv_sysfs_attrs, + .name = "pmadv", +}; + +static int __init pmadv_sysfs_init(void) +{ + return sysfs_create_group(kernel_kobj, &pmadv_sysfs_attr_group); +} +subsys_initcall(pmadv_sysfs_init); +#endif /* CONFIG_KSM */ -- 2.40.0 From 330db0b8fd28b6036a1dc14bfbad31f64999d8b5 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Thu, 20 Apr 2023 19:38:17 +0200 Subject: [PATCH 10/15] maple-lru Signed-off-by: Peter Jung --- Documentation/mm/multigen_lru.rst | 128 ++- arch/arm64/include/asm/kvm_host.h | 7 + arch/arm64/include/asm/kvm_pgtable.h | 8 + arch/arm64/include/asm/stage2_pgtable.h | 43 + arch/arm64/kvm/arm.c | 1 + arch/arm64/kvm/hyp/pgtable.c | 51 +- arch/arm64/kvm/mmu.c | 77 +- arch/powerpc/include/asm/kvm_host.h | 18 + arch/powerpc/include/asm/kvm_ppc.h | 14 +- arch/powerpc/kvm/book3s.c | 7 + arch/powerpc/kvm/book3s.h | 2 + arch/powerpc/kvm/book3s_64_mmu_radix.c | 78 +- arch/powerpc/kvm/book3s_hv.c | 10 +- arch/s390/mm/hugetlbpage.c | 2 +- arch/s390/mm/mmap.c | 2 +- arch/x86/include/asm/kvm_host.h | 27 + arch/x86/kvm/mmu/spte.h | 12 - arch/x86/kvm/mmu/tdp_mmu.c | 41 + fs/hugetlbfs/inode.c | 2 +- include/linux/fs.h | 2 + include/linux/kvm_host.h | 29 + include/linux/maple_tree.h | 8 +- include/linux/memcontrol.h | 10 + include/linux/mm_inline.h | 19 +- include/linux/mmu_notifier.h | 40 + include/linux/mmzone.h | 138 ++- lib/maple_tree.c | 191 ++-- mm/fadvise.c | 5 +- mm/memcontrol.c | 12 + mm/memory.c | 7 +- mm/mmap.c | 73 +- mm/mmu_notifier.c | 26 + mm/nommu.c | 8 +- mm/page_alloc.c | 1 + mm/rmap.c | 48 +- mm/vmscan.c | 1316 +++++++++++++++-------- mm/workingset.c | 4 +- tools/testing/radix-tree/maple.c | 56 +- virt/kvm/kvm_main.c | 58 + 39 files changed, 1805 insertions(+), 776 deletions(-) diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst index d7062c6a8946..52ed5092022f 100644 --- a/Documentation/mm/multigen_lru.rst +++ b/Documentation/mm/multigen_lru.rst @@ -89,21 +89,22 @@ variables are monotonically increasing. Generation numbers are truncated into ``order_base_2(MAX_NR_GENS+1)`` bits in order to fit into the gen counter in ``folio->flags``. Each -truncated generation number is an index to ``lrugen->lists[]``. The +truncated generation number is an index to ``lrugen->folios[]``. The sliding window technique is used to track at least ``MIN_NR_GENS`` and at most ``MAX_NR_GENS`` generations. 
The gen counter stores a value within ``[1, MAX_NR_GENS]`` while a page is on one of -``lrugen->lists[]``; otherwise it stores zero. +``lrugen->folios[]``; otherwise it stores zero. Each generation is divided into multiple tiers. A page accessed ``N`` times through file descriptors is in tier ``order_base_2(N)``. Unlike -generations, tiers do not have dedicated ``lrugen->lists[]``. In +generations, tiers do not have dedicated ``lrugen->folios[]``. In contrast to moving across generations, which requires the LRU lock, moving across tiers only involves atomic operations on ``folio->flags`` and therefore has a negligible cost. A feedback loop modeled after the PID controller monitors refaults over all the tiers from anon and file types and decides which tiers from which types to -evict or protect. +evict or protect. The desired effect is to balance refault percentages +between anon and file types proportional to the swappiness level. There are two conceptually independent procedures: the aging and the eviction. They form a closed-loop system, i.e., the page reclaim. @@ -127,7 +128,7 @@ page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``. Eviction -------- The eviction consumes old generations. Given an ``lruvec``, it -increments ``min_seq`` when ``lrugen->lists[]`` indexed by +increments ``min_seq`` when ``lrugen->folios[]`` indexed by ``min_seq%MAX_NR_GENS`` becomes empty. To select a type and a tier to evict from, it first compares ``min_seq[]`` to select the older type. If both types are equally old, it selects the one whose first tier has @@ -141,15 +142,124 @@ loop has detected outlying refaults from the tier this page is in. To this end, the feedback loop uses the first tier as the baseline, for the reason stated earlier. +Working set protection +---------------------- +Each generation is timestamped at birth. If ``lru_gen_min_ttl`` is +set, an ``lruvec`` is protected from the eviction when its oldest +generation was born within ``lru_gen_min_ttl`` milliseconds. In other +words, it prevents the working set of ``lru_gen_min_ttl`` milliseconds +from getting evicted. The OOM killer is triggered if this working set +cannot be kept in memory. + +This time-based approach has the following advantages: + +1. It is easier to configure because it is agnostic to applications + and memory sizes. +2. It is more reliable because it is directly wired to the OOM killer. + +``mm_struct`` list +------------------ +An ``mm_struct`` list is maintained for each memcg, and an +``mm_struct`` follows its owner task to the new memcg when this task +is migrated. + +A page table walker iterates ``lruvec_memcg()->mm_list`` and calls +``walk_page_range()`` with each ``mm_struct`` on this list to scan +PTEs. When multiple page table walkers iterate the same list, each of +them gets a unique ``mm_struct``, and therefore they can run in +parallel. + +Page table walkers ignore any misplaced pages, e.g., if an +``mm_struct`` was migrated, pages left in the previous memcg will be +ignored when the current memcg is under reclaim. Similarly, page table +walkers will ignore pages from nodes other than the one under reclaim. + +This infrastructure also tracks the usage of ``mm_struct`` between +context switches so that page table walkers can skip processes that +have been sleeping since the last iteration. 
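Editor's illustrative aside: the generation bookkeeping described earlier in this documentation (monotonically increasing ``max_seq``/``min_seq`` folded into a ``MAX_NR_GENS``-sized index for ``lrugen->folios[]``) comes down to simple modular indexing. The stand-alone sketch below only illustrates that arithmetic under assumed values; ``MAX_NR_GENS``/``MIN_NR_GENS`` are set to 4 and 2 here, and ``sketch_lrugen``/``gen_index`` are made-up names, not the kernel's structures.

/*
 * Minimal stand-alone sketch (not code from this patch): how a
 * monotonically increasing generation counter folds into an index of a
 * fixed-size array, as described above for lrugen->folios[].
 */
#include <assert.h>
#include <stdio.h>

#define MAX_NR_GENS 4UL	/* assumed value, for the sketch only */
#define MIN_NR_GENS 2UL

struct sketch_lrugen {
	unsigned long max_seq;			/* incremented by the aging */
	unsigned long min_seq;			/* incremented by the eviction */
	const char *folios[MAX_NR_GENS];	/* stands in for the per-gen lists */
};

/* truncated generation number used as an array index */
static unsigned long gen_index(unsigned long seq)
{
	return seq % MAX_NR_GENS;
}

int main(void)
{
	struct sketch_lrugen lrugen = { .max_seq = 9, .min_seq = 7 };

	/* the sliding window holds max_seq - min_seq + 1 live generations */
	unsigned long nr_gens = lrugen.max_seq - lrugen.min_seq + 1;

	assert(nr_gens >= MIN_NR_GENS && nr_gens <= MAX_NR_GENS);

	lrugen.folios[gen_index(lrugen.max_seq)] = "youngest generation";
	lrugen.folios[gen_index(lrugen.min_seq)] = "oldest generation";

	printf("seq %lu -> index %lu, seq %lu -> index %lu\n",
	       lrugen.max_seq, gen_index(lrugen.max_seq),
	       lrugen.min_seq, gen_index(lrugen.min_seq));
	return 0;
}

The sequence numbers themselves only grow; it is their folded indices that reuse the same ``MAX_NR_GENS`` slots as the window slides.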
+ +Rmap/PT walk feedback +--------------------- +Searching the rmap for PTEs mapping each page on an LRU list (to test +and clear the accessed bit) can be expensive because pages from +different VMAs (PA space) are not cache friendly to the rmap (VA +space). For workloads mostly using mapped pages, searching the rmap +can incur the highest CPU cost in the reclaim path. + +``lru_gen_look_around()`` exploits spatial locality to reduce the +trips into the rmap. It scans the adjacent PTEs of a young PTE and +promotes hot pages. If the scan was done cacheline efficiently, it +adds the PMD entry pointing to the PTE table to the Bloom filter. This +forms a feedback loop between the eviction and the aging. + +Bloom filters +------------- +Bloom filters are a space and memory efficient data structure for set +membership test, i.e., test if an element is not in the set or may be +in the set. + +In the eviction path, specifically, in ``lru_gen_look_around()``, if a +PMD has a sufficient number of hot pages, its address is placed in the +filter. In the aging path, set membership means that the PTE range +will be scanned for young pages. + +Note that Bloom filters are probabilistic on set membership. If a test +is false positive, the cost is an additional scan of a range of PTEs, +which may yield hot pages anyway. Parameters of the filter itself can +control the false positive rate in the limit. + +PID controller +-------------- +A feedback loop modeled after the Proportional-Integral-Derivative +(PID) controller monitors refaults over anon and file types and +decides which type to evict when both types are available from the +same generation. + +The PID controller uses generations rather than the wall clock as the +time domain because a CPU can scan pages at different rates under +varying memory pressure. It calculates a moving average for each new +generation to avoid being permanently locked in a suboptimal state. + +Memcg LRU +--------- +An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs, +since each node and memcg combination has an LRU of folios (see +``mem_cgroup_lruvec()``). Its goal is to improve the scalability of +global reclaim, which is critical to system-wide memory overcommit in +data centers. Note that memcg LRU only applies to global reclaim. + +The basic structure of an memcg LRU can be understood by an analogy to +the active/inactive LRU (of folios): + +1. It has the young and the old (generations), i.e., the counterparts + to the active and the inactive; +2. The increment of ``max_seq`` triggers promotion, i.e., the + counterpart to activation; +3. Other events trigger similar operations, e.g., offlining an memcg + triggers demotion, i.e., the counterpart to deactivation. + +In terms of global reclaim, it has two distinct features: + +1. Sharding, which allows each thread to start at a random memcg (in + the old generation) and improves parallelism; +2. Eventual fairness, which allows direct reclaim to bail out at will + and reduces latency without affecting fairness over some time. + +In terms of traversing memcgs during global reclaim, it improves the +best-case complexity from O(n) to O(1) and does not affect the +worst-case complexity O(n). Therefore, on average, it has a sublinear +complexity. 
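Editor's illustrative aside: a minimal sketch of the Bloom-filter idea relied on above may help before the summary. It is not the kernel's implementation (which keeps two filters and flips them each iteration, per the structures later in this patch); the size, hash mixers, and names below are arbitrary choices for illustration only.

/*
 * Minimal stand-alone Bloom filter sketch (not this patch's code). It only
 * demonstrates the property the text depends on: a negative answer is
 * definite, a positive answer may be a false positive.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BLOOM_BITS 1024	/* assumed filter size for the sketch */
static unsigned long bloom[BLOOM_BITS / (8 * sizeof(unsigned long))];

static unsigned int hash1(uintptr_t key)
{
	return (key * 0x9E3779B97F4A7C15ULL) >> 54;	/* 10-bit index */
}

static unsigned int hash2(uintptr_t key)
{
	return (key * 0xC2B2AE3D27D4EB4FULL) >> 54;
}

static void bloom_set(unsigned int bit)
{
	bloom[bit / (8 * sizeof(unsigned long))] |=
		1UL << (bit % (8 * sizeof(unsigned long)));
}

static bool bloom_test(unsigned int bit)
{
	return bloom[bit / (8 * sizeof(unsigned long))] &
	       (1UL << (bit % (8 * sizeof(unsigned long))));
}

/* remember a PMD address whose PTE table covered enough hot pages */
static void bloom_add(uintptr_t pmd_addr)
{
	bloom_set(hash1(pmd_addr));
	bloom_set(hash2(pmd_addr));
}

/* false means "definitely never added"; true means "maybe added" */
static bool bloom_maybe_contains(uintptr_t pmd_addr)
{
	return bloom_test(hash1(pmd_addr)) && bloom_test(hash2(pmd_addr));
}

int main(void)
{
	bloom_add(0xffff888012345000ULL);
	printf("recorded addr: %d, other addr: %d\n",
	       bloom_maybe_contains(0xffff888012345000ULL),
	       bloom_maybe_contains(0xffff888098765000ULL));
	return 0;
}

A clear bit at either hashed position proves the PMD was never recorded, so the aging can skip that PTE table; a false positive merely costs one extra scan of a PTE range, which may still turn up hot pages.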
+ Summary ------- -The multi-gen LRU can be disassembled into the following parts: +The multi-gen LRU (of folios) can be disassembled into the following +parts: * Generations * Rmap walks -* Page table walks -* Bloom filters -* PID controller +* Page table walks via ``mm_struct`` list +* Bloom filters for rmap/PT walk feedback +* PID controller for refault feedback The aging and the eviction form a producer-consumer model; specifically, the latter drives the former by the sliding window over diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 35a159d131b5..572bcd321586 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1031,4 +1031,11 @@ static inline void kvm_hyp_reserve(void) { } void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu); bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu); +/* see the comments on the generic kvm_arch_has_test_clear_young() */ +#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young +static inline bool kvm_arch_has_test_clear_young(void) +{ + return IS_ENABLED(CONFIG_KVM) && cpu_has_hw_af() && !is_protected_kvm_enabled(); +} + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 63f81b27a4e3..8c9a04388c88 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -105,6 +105,7 @@ static inline bool kvm_level_supports_block_mapping(u32 level) * @put_page: Decrement the refcount on a page. When the * refcount reaches 0 the page is automatically * freed. + * @put_page_rcu: RCU variant of put_page(). * @page_count: Return the refcount of a page. * @phys_to_virt: Convert a physical address into a virtual * address mapped in the current context. @@ -122,6 +123,7 @@ struct kvm_pgtable_mm_ops { void (*free_removed_table)(void *addr, u32 level); void (*get_page)(void *addr); void (*put_page)(void *addr); + void (*put_page_rcu)(void *addr); int (*page_count)(void *addr); void* (*phys_to_virt)(phys_addr_t phys); phys_addr_t (*virt_to_phys)(void *addr); @@ -188,6 +190,12 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, * children. * @KVM_PGTABLE_WALK_SHARED: Indicates the page-tables may be shared * with other software walkers. + * + * kvm_arch_test_clear_young() is a special case. It relies on two + * techniques, RCU and cmpxchg, to safely test and clear the accessed + * bit without taking the MMU lock. The former protects KVM page tables + * from being freed while the latter clears the accessed bit atomically + * against both the hardware and other software page table walkers. 
*/ enum kvm_pgtable_walk_flags { KVM_PGTABLE_WALK_LEAF = BIT(0), diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index c8dca8ae359c..350437661d4b 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -30,4 +30,47 @@ */ #define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1) +#define KVM_PTE_TYPE BIT(1) +#define KVM_PTE_TYPE_BLOCK 0 +#define KVM_PTE_TYPE_PAGE 1 +#define KVM_PTE_TYPE_TABLE 1 + +#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) + +#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3 +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1 +#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) +#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 +#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) + +#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) +#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) +#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 +#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) + +#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) + +#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) + +#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) + +#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) + +#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ + KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ + KVM_PTE_LEAF_ATTR_HI_S2_XN) + +#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) +#define KVM_MAX_OWNER_ID 1 + +/* + * Used to indicate a pte for which a 'break-before-make' sequence is in + * progress. + */ +#define KVM_INVALID_PTE_LOCKED BIT(10) + #endif /* __ARM64_S2_PGTABLE_H_ */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e57f8ae09387..0b71117ffc7e 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -191,6 +191,7 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) */ void kvm_arch_destroy_vm(struct kvm *kvm) { + kvm_free_stage2_pgd(&kvm->arch.mmu); bitmap_free(kvm->arch.pmu_filter); free_cpumask_var(kvm->arch.supported_cpus); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index b11cf2c618a6..8d65ee4767f1 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -12,49 +12,6 @@ #include -#define KVM_PTE_TYPE BIT(1) -#define KVM_PTE_TYPE_BLOCK 0 -#define KVM_PTE_TYPE_PAGE 1 -#define KVM_PTE_TYPE_TABLE 1 - -#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) - -#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3 -#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1 -#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) -#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 -#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) - -#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) -#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) -#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) -#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) -#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 -#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) - -#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) - -#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) - -#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) - -#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) - -#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ - KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ - KVM_PTE_LEAF_ATTR_HI_S2_XN) - -#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) -#define KVM_MAX_OWNER_ID 1 - 
-/* - * Used to indicate a pte for which a 'break-before-make' sequence is in - * progress. - */ -#define KVM_INVALID_PTE_LOCKED BIT(10) - struct kvm_pgtable_walk_data { struct kvm_pgtable_walker *walker; @@ -994,8 +951,12 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), kvm_granule_size(ctx->level)); - if (childp) - mm_ops->put_page(childp); + if (childp) { + if (mm_ops->put_page_rcu) + mm_ops->put_page_rcu(childp); + else + mm_ops->put_page(childp); + } return 0; } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 3c24178bd493..f5b05287daf4 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -171,6 +171,21 @@ static int kvm_host_page_count(void *addr) return page_count(virt_to_page(addr)); } +static void kvm_s2_rcu_put_page(struct rcu_head *head) +{ + put_page(container_of(head, struct page, rcu_head)); +} + +static void kvm_s2_put_page_rcu(void *addr) +{ + struct page *page = virt_to_page(addr); + + if (kvm_host_page_count(addr) == 1) + kvm_account_pgtable_pages(addr, -1); + + call_rcu(&page->rcu_head, kvm_s2_rcu_put_page); +} + static phys_addr_t kvm_host_pa(void *addr) { return __pa(addr); @@ -703,6 +718,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .free_removed_table = stage2_free_removed_table, .get_page = kvm_host_get_page, .put_page = kvm_s2_put_page, + .put_page_rcu = kvm_s2_put_page_rcu, .page_count = kvm_host_page_count, .phys_to_virt = kvm_host_va, .virt_to_phys = kvm_host_pa, @@ -1653,6 +1669,66 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) return pte_valid(pte) && pte_young(pte); } +struct test_clear_young_arg { + struct kvm_gfn_range *range; + gfn_t lsb_gfn; + unsigned long *bitmap; +}; + +static int stage2_test_clear_young(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags flags) +{ + struct test_clear_young_arg *arg = ctx->arg; + gfn_t gfn = ctx->addr / PAGE_SIZE; + kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF; + + VM_WARN_ON_ONCE(!page_count(virt_to_page(ctx->ptep))); + VM_WARN_ON_ONCE(gfn < arg->range->start || gfn >= arg->range->end); + + if (!kvm_pte_valid(new)) + return 0; + + if (new == ctx->old) + return 0; + + /* see the comments on the generic kvm_arch_has_test_clear_young() */ + if (__test_and_change_bit(arg->lsb_gfn - gfn, arg->bitmap)) + cmpxchg64(ctx->ptep, ctx->old, new); + + return 0; +} + +bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, + gfn_t lsb_gfn, unsigned long *bitmap) +{ + u64 start = range->start * PAGE_SIZE; + u64 end = range->end * PAGE_SIZE; + struct test_clear_young_arg arg = { + .range = range, + .lsb_gfn = lsb_gfn, + .bitmap = bitmap, + }; + struct kvm_pgtable_walker walker = { + .cb = stage2_test_clear_young, + .arg = &arg, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + BUILD_BUG_ON(is_hyp_code()); + + if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young())) + return false; + + /* see the comments on kvm_pgtable_walk_flags */ + rcu_read_lock(); + + kvm_pgtable_walk(kvm->arch.mmu.pgt, start, end - start, &walker); + + rcu_read_unlock(); + + return true; +} + bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { if (!kvm->arch.mmu.pgt) @@ -1877,7 +1953,6 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) void kvm_arch_flush_shadow_all(struct kvm *kvm) { - kvm_free_stage2_pgd(&kvm->arch.mmu); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h index caea15dcb91d..996850029ce0 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -886,4 +886,22 @@ static inline void kvm_arch_exit(void) {} static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} +static inline int kvmppc_radix_possible(void) +{ + return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled(); +} + +static inline bool kvmhv_on_pseries(void) +{ + return IS_ENABLED(CONFIG_PPC_PSERIES) && !cpu_has_feature(CPU_FTR_HVMODE); +} + +/* see the comments on the generic kvm_arch_has_test_clear_young() */ +#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young +static inline bool kvm_arch_has_test_clear_young(void) +{ + return IS_ENABLED(CONFIG_KVM) && IS_ENABLED(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && + kvmppc_radix_possible() && !kvmhv_on_pseries(); +} + #endif /* __POWERPC_KVM_HOST_H__ */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index eae9619b6190..0bb772fc12b1 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -277,6 +277,8 @@ struct kvmppc_ops { bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range); bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); + bool (*test_clear_young)(struct kvm *kvm, struct kvm_gfn_range *range, + gfn_t lsb_gfn, unsigned long *bitmap); bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); void (*free_memslot)(struct kvm_memory_slot *slot); int (*init_vm)(struct kvm *kvm); @@ -580,18 +582,6 @@ static inline bool kvm_hv_mode_active(void) { return false; } #endif -#ifdef CONFIG_PPC_PSERIES -static inline bool kvmhv_on_pseries(void) -{ - return !cpu_has_feature(CPU_FTR_HVMODE); -} -#else -static inline bool kvmhv_on_pseries(void) -{ - return false; -} -#endif - #ifdef CONFIG_KVM_XICS static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 6d525285dbe8..f4cf330e3e81 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -877,6 +877,13 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) return kvm->arch.kvm_ops->test_age_gfn(kvm, range); } +bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, + gfn_t lsb_gfn, unsigned long *bitmap) +{ + return kvm->arch.kvm_ops->test_clear_young && + kvm->arch.kvm_ops->test_clear_young(kvm, range, lsb_gfn, bitmap); +} + bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { return kvm->arch.kvm_ops->set_spte_gfn(kvm, range); diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h index 58391b4b32ed..fe9cac423817 100644 --- a/arch/powerpc/kvm/book3s.h +++ b/arch/powerpc/kvm/book3s.h @@ -12,6 +12,8 @@ extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range); extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); extern bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); +extern bool kvmhv_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, + gfn_t lsb_gfn, unsigned long *bitmap); extern bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c 
b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 9d3743ca16d5..8476646c554c 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -1083,6 +1083,78 @@ bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, return ref; } +bool kvmhv_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, + gfn_t lsb_gfn, unsigned long *bitmap) +{ + bool success; + gfn_t gfn = range->start; + + if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young())) + return false; + + /* + * This function relies on two techniques, RCU and cmpxchg, to safely + * test and clear the accessed bit without taking the MMU lock. The + * former protects KVM page tables from being freed while the latter + * clears the accessed bit atomically against both the hardware and + * other software page table walkers. + */ + rcu_read_lock(); + + success = kvm_is_radix(kvm); + if (!success) + goto unlock; + + /* + * case 1: this function kvmppc_switch_mmu_to_hpt() + * + * rcu_read_lock() + * test kvm_is_radix() kvm->arch.radix = 0 + * use kvm->arch.pgtable + * rcu_read_unlock() + * synchronize_rcu() + * kvmppc_free_radix() + * + * + * case 2: this function kvmppc_switch_mmu_to_radix() + * + * kvmppc_init_vm_radix() + * smp_wmb() + * test kvm_is_radix() kvm->arch.radix = 1 + * smp_rmb() + * use kvm->arch.pgtable + */ + smp_rmb(); + + while (gfn < range->end) { + pte_t *ptep; + pte_t old, new; + unsigned int shift; + + ptep = find_kvm_secondary_pte_unlocked(kvm, gfn * PAGE_SIZE, &shift); + if (!ptep) + goto next; + + VM_WARN_ON_ONCE(!page_count(virt_to_page(ptep))); + + old = READ_ONCE(*ptep); + if (!pte_present(old) || !pte_young(old)) + goto next; + + new = pte_mkold(old); + + /* see the comments on the generic kvm_arch_has_test_clear_young() */ + if (__test_and_change_bit(lsb_gfn - gfn, bitmap)) + pte_xchg(ptep, old, new); +next: + gfn += shift ? BIT(shift - PAGE_SHIFT) : 1; + } +unlock: + rcu_read_unlock(); + + return success; +} + /* Returns the number of PAGE_SIZE pages that are dirty */ static int kvm_radix_test_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot, int pagenum) @@ -1464,13 +1536,15 @@ int kvmppc_radix_init(void) { unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE; - kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor); + kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, + SLAB_TYPESAFE_BY_RCU, pte_ctor); if (!kvm_pte_cache) return -ENOMEM; size = sizeof(void *) << RADIX_PMD_INDEX_SIZE; - kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor); + kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, + SLAB_TYPESAFE_BY_RCU, pmd_ctor); if (!kvm_pmd_cache) { kmem_cache_destroy(kvm_pte_cache); return -ENOMEM; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 6ba68dd6190b..17b415661282 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -5242,6 +5242,8 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) spin_lock(&kvm->mmu_lock); kvm->arch.radix = 0; spin_unlock(&kvm->mmu_lock); + /* see the comments in kvmhv_test_clear_young() */ + synchronize_rcu(); kvmppc_free_radix(kvm); lpcr = LPCR_VPM1; @@ -5266,6 +5268,8 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm) if (err) return err; kvmppc_rmap_reset(kvm); + /* see the comments in kvmhv_test_clear_young() */ + smp_wmb(); /* Mutual exclusion with kvm_unmap_gfn_range etc. 
*/ spin_lock(&kvm->mmu_lock); kvm->arch.radix = 1; @@ -6165,6 +6169,7 @@ static struct kvmppc_ops kvm_ops_hv = { .unmap_gfn_range = kvm_unmap_gfn_range_hv, .age_gfn = kvm_age_gfn_hv, .test_age_gfn = kvm_test_age_gfn_hv, + .test_clear_young = kvmhv_test_clear_young, .set_spte_gfn = kvm_set_spte_gfn_hv, .free_memslot = kvmppc_core_free_memslot_hv, .init_vm = kvmppc_core_init_vm_hv, @@ -6225,11 +6230,6 @@ static int kvm_init_subcore_bitmap(void) return 0; } -static int kvmppc_radix_possible(void) -{ - return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled(); -} - static int kvmppc_book3s_init_hv(void) { int r; diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index c299a18273ff..c718f2a0de94 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -273,7 +273,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.low_limit = PAGE_SIZE; info.high_limit = current->mm->mmap_base; info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 3327c47bc181..fc9a7dc26c5e 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -136,7 +136,7 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long ad info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.low_limit = PAGE_SIZE; info.high_limit = mm->mmap_base; if (filp || (flags & MAP_SHARED)) info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 24480b4f1c57..a076e337b3db 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1367,6 +1367,12 @@ struct kvm_arch { * the MMU lock in read mode + the tdp_mmu_pages_lock or * the MMU lock in write mode * + * kvm_arch_test_clear_young() is a special case. It relies on two + * techniques, RCU and cmpxchg, to safely test and clear the accessed + * bit without taking the MMU lock. The former protects KVM page tables + * from being freed while the latter clears the accessed bit atomically + * against both the hardware and other software page table walkers. + * * Roots will remain in the list until their tdp_mmu_root_count * drops to zero, at which point the thread that decremented the * count to zero should removed the root from the list and clean @@ -2174,4 +2180,25 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) +extern u64 __read_mostly shadow_accessed_mask; + +/* + * Returns true if A/D bits are supported in hardware and are enabled by KVM. + * When enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can + * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the + * scenario where KVM is using A/D bits for L1, but not L2. 
+ */ +static inline bool kvm_ad_enabled(void) +{ + return shadow_accessed_mask; +} + +/* see the comments on the generic kvm_arch_has_test_clear_young() */ +#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young +static inline bool kvm_arch_has_test_clear_young(void) +{ + return IS_ENABLED(CONFIG_KVM) && IS_ENABLED(CONFIG_X86_64) && + (!IS_REACHABLE(CONFIG_KVM) || (kvm_ad_enabled() && tdp_enabled)); +} + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 6f54dc9409c9..0dc7fed1f3fd 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -153,7 +153,6 @@ extern u64 __read_mostly shadow_mmu_writable_mask; extern u64 __read_mostly shadow_nx_mask; extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ extern u64 __read_mostly shadow_user_mask; -extern u64 __read_mostly shadow_accessed_mask; extern u64 __read_mostly shadow_dirty_mask; extern u64 __read_mostly shadow_mmio_value; extern u64 __read_mostly shadow_mmio_mask; @@ -247,17 +246,6 @@ static inline bool is_shadow_present_pte(u64 pte) return !!(pte & SPTE_MMU_PRESENT_MASK); } -/* - * Returns true if A/D bits are supported in hardware and are enabled by KVM. - * When enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can - * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the - * scenario where KVM is using A/D bits for L1, but not L2. - */ -static inline bool kvm_ad_enabled(void) -{ - return !!shadow_accessed_mask; -} - static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) { return sp->role.ad_disabled; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index d6df38d371a0..9028e09f1aab 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1309,6 +1309,47 @@ bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); } +bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, + gfn_t lsb_gfn, unsigned long *bitmap) +{ + struct kvm_mmu_page *root; + + if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young())) + return false; + + if (kvm_memslots_have_rmaps(kvm)) + return false; + + /* see the comments on kvm_arch->tdp_mmu_roots */ + rcu_read_lock(); + + list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) { + struct tdp_iter iter; + + if (kvm_mmu_page_as_id(root) != range->slot->as_id) + continue; + + tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) { + u64 *sptep = rcu_dereference(iter.sptep); + u64 new_spte = iter.old_spte & ~shadow_accessed_mask; + + VM_WARN_ON_ONCE(!page_count(virt_to_page(sptep))); + VM_WARN_ON_ONCE(iter.gfn < range->start || iter.gfn >= range->end); + + if (new_spte == iter.old_spte) + continue; + + /* see the comments on the generic kvm_arch_has_test_clear_young() */ + if (__test_and_change_bit(lsb_gfn - iter.gfn, bitmap)) + cmpxchg64(sptep, iter.old_spte, new_spte); + } + } + + rcu_read_unlock(); + + return true; +} + static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, struct kvm_gfn_range *range) { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 790d2727141a..07297fac7de8 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -208,7 +208,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.low_limit = PAGE_SIZE; info.high_limit = arch_get_mmap_base(addr, 
current->mm->mmap_base); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index c1769a2c5d70..d353c262d669 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -166,6 +166,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File supports DIRECT IO */ #define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000) +#define FMODE_NOREUSE ((__force fmode_t)0x800000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4f26b244f6d0..df46fc815c8b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2281,4 +2281,33 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr) /* Max number of entries allowed for each kvm dirty ring */ #define KVM_DIRTY_RING_MAX_ENTRIES 65536 +/* + * Architectures that implement kvm_arch_test_clear_young() should override + * kvm_arch_has_test_clear_young(). + * + * kvm_arch_has_test_clear_young() is allowed to return false positive. It can + * return true if kvm_arch_test_clear_young() is supported but disabled due to + * some runtime constraint. In this case, kvm_arch_test_clear_young() should + * return false. + * + * The last parameter to kvm_arch_test_clear_young() is a bitmap with the + * following specifications: + * 1. The offset of each bit is relative to the second to the last parameter + * lsb_gfn. E.g., the offset corresponding to gfn is lsb_gfn-gfn. This is to + * better suit batching while forward looping. + * 2. For each KVM PTE with the accessed bit set, the implementation should flip + * the corresponding bit in the bitmap. It should only clear the accessed bit + * if the old value is 1. This allows the caller to test or test and clear + * the accessed bit. + */ +#ifndef kvm_arch_has_test_clear_young +static inline bool kvm_arch_has_test_clear_young(void) +{ + return false; +} +#endif + +bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, + gfn_t lsb_gfn, unsigned long *bitmap); + #endif diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index e594db58a0f1..a7bf58fd7cc6 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -12,7 +12,6 @@ #include #include /* #define CONFIG_MAPLE_RCU_DISABLED */ -/* #define CONFIG_DEBUG_MAPLE_TREE_VERBOSE */ /* * Allocated nodes are mutable until they have been inserted into the tree, @@ -456,7 +455,7 @@ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); void mas_store_prealloc(struct ma_state *mas, void *entry); void *mas_find(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); -int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); +int mas_preallocate(struct ma_state *mas, gfp_t gfp); bool mas_is_err(struct ma_state *mas); bool mas_nomem(struct ma_state *mas, gfp_t gfp); @@ -483,9 +482,6 @@ static inline bool mas_is_paused(struct ma_state *mas) return mas->node == MAS_PAUSE; } -void mas_dup_tree(struct ma_state *oldmas, struct ma_state *mas); -void mas_dup_store(struct ma_state *mas, void *entry); - /* * This finds an empty area from the highest address to the lowest. * AKA "Topdown" version, @@ -517,7 +513,6 @@ static inline void mas_reset(struct ma_state *mas) * entry. * * Note: may return the zero entry. 
- * */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) @@ -639,7 +634,6 @@ static inline void mt_set_in_rcu(struct maple_tree *mt) } static inline unsigned int mt_height(const struct maple_tree *mt) - { return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; } diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 85dc9b88ea37..8e0be0680005 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -794,6 +794,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg) percpu_ref_put(&objcg->refcnt); } +static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) +{ + return !memcg || css_tryget(&memcg->css); +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { if (memcg) @@ -1301,6 +1306,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg) { } +static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) +{ + return true; +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index ff3f3f23f649..de1e622dd366 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -178,7 +178,7 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); enum lru_list lru = type * LRU_INACTIVE_FILE; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); @@ -224,7 +224,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); @@ -256,9 +256,9 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ if (reclaiming) - list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]); + list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]); else - list_add(&folio->lru, &lrugen->lists[gen][type][zone]); + list_add(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } @@ -577,4 +577,15 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, #endif } +static inline bool vma_has_recency(struct vm_area_struct *vma) +{ + if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) + return false; + + if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE)) + return false; + + return true; +} + #endif diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index d6c06e140277..521f71ad0467 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -122,6 +122,11 @@ struct mmu_notifier_ops { struct mm_struct *mm, unsigned long address); + /* see the comments on mmu_notifier_test_clear_young() */ + bool (*test_clear_young)(struct mmu_notifier *mn, struct mm_struct *mm, + unsigned long start, unsigned long end, + unsigned long *bitmap); + /* * change_pte is called in cases that pte mapping to page is changed: * for example, when ksm remaps pte to point to a new shared page. 
@@ -391,6 +396,9 @@ extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, extern int __mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end); +extern int __mmu_notifier_test_clear_young(struct mm_struct *mm, + unsigned long start, unsigned long end, + bool fallback, unsigned long *bitmap); extern int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address); extern void __mmu_notifier_change_pte(struct mm_struct *mm, @@ -433,6 +441,31 @@ static inline int mmu_notifier_clear_young(struct mm_struct *mm, return 0; } +/* + * This function always returns 0 if fallback is not allowed. If fallback + * happens, its return value is similar to that of mmu_notifier_clear_young(). + * + * The bitmap has the following specifications: + * 1. The number of bits should be at least (end-start)/PAGE_SIZE. + * 2. The offset of each bit is relative to the end. E.g., the offset + * corresponding to addr is (end-addr)/PAGE_SIZE-1. This is to better suit + * batching while forward looping. + * 3. For each KVM PTE with the accessed bit set (young), this function flips + * the corresponding bit in the bitmap. It only clears the accessed bit if + * the old value is 1. A caller can test or test and clear the accessed bit + * by setting the corresponding bit in the bitmap to 0 or 1, and the new + * value will be 1 or 0 for a young KVM PTE. + */ +static inline int mmu_notifier_test_clear_young(struct mm_struct *mm, + unsigned long start, unsigned long end, + bool fallback, unsigned long *bitmap) +{ + if (mm_has_notifiers(mm)) + return __mmu_notifier_test_clear_young(mm, start, end, fallback, bitmap); + + return 0; +} + static inline int mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { @@ -687,6 +720,13 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, return 0; } +static inline int mmu_notifier_test_clear_young(struct mm_struct *mm, + unsigned long start, unsigned long end, + bool fallback, unsigned long *bitmap) +{ + return 0; +} + static inline int mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cd28a100d9e4..5eeca358d043 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -312,7 +313,7 @@ enum lruvec_flags { * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the * corresponding generation. The gen counter in folio->flags stores gen+1 while - * a page is on one of lrugen->lists[]. Otherwise it stores 0. + * a page is on one of lrugen->folios[]. Otherwise it stores 0. * * A page is added to the youngest generation on faulting. The aging needs to * check the accessed bit at least twice before handing this page over to the @@ -324,8 +325,8 @@ enum lruvec_flags { * rest of generations, if they exist, are considered inactive. See * lru_gen_is_active(). * - * PG_active is always cleared while a page is on one of lrugen->lists[] so that - * the aging needs not to worry about it. And it's set again when a page + * PG_active is always cleared while a page is on one of lrugen->folios[] so + * that the aging needs not to worry about it. And it's set again when a page * considered active is isolated for non-reclaiming purposes, e.g., migration. * See lru_gen_add_folio() and lru_gen_del_folio(). 
* @@ -378,6 +379,7 @@ enum { LRU_GEN_CORE, LRU_GEN_MM_WALK, LRU_GEN_NONLEAF_YOUNG, + LRU_GEN_SPTE_WALK, NR_LRU_GEN_CAPS }; @@ -404,7 +406,7 @@ enum { * The number of pages in each generation is eventually consistent and therefore * can be transiently negative when reset_batch_size() is pending. */ -struct lru_gen_struct { +struct lru_gen_folio { /* the aging increments the youngest generation number */ unsigned long max_seq; /* the eviction increments the oldest generation numbers */ @@ -412,7 +414,7 @@ struct lru_gen_struct { /* the birth time of each generation in jiffies */ unsigned long timestamps[MAX_NR_GENS]; /* the multi-gen LRU lists, lazily sorted on eviction */ - struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the multi-gen LRU sizes, eventually consistent */ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the exponential moving average of refaulted */ @@ -426,6 +428,14 @@ struct lru_gen_struct { atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* whether the multi-gen LRU is enabled */ bool enabled; +#ifdef CONFIG_MEMCG + /* the memcg generation this lru_gen_folio belongs to */ + u8 gen; + /* the list segment this lru_gen_folio belongs to */ + u8 seg; + /* per-node lru_gen_folio list for global reclaim */ + struct hlist_nulls_node list; +#endif }; enum { @@ -444,24 +454,20 @@ enum { struct lru_gen_mm_state { /* set to max_seq after each iteration */ unsigned long seq; - /* where the current iteration continues (inclusive) */ + /* where the current iteration continues after */ struct list_head *head; - /* where the last iteration ended (exclusive) */ + /* where the last iteration ended before */ struct list_head *tail; - /* to wait for the last page table walker to finish */ - struct wait_queue_head wait; /* Bloom filters flip after each iteration */ unsigned long *filters[NR_BLOOM_FILTERS]; /* the mm stats for debugging */ unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; - /* the number of concurrent page table walkers */ - int nr_walkers; }; struct lru_gen_mm_walk { /* the lruvec under reclaim */ struct lruvec *lruvec; - /* unstable max_seq from lru_gen_struct */ + /* unstable max_seq from lru_gen_folio */ unsigned long max_seq; /* the next address within an mm to scan */ unsigned long next_addr; @@ -476,24 +482,101 @@ struct lru_gen_mm_walk { }; void lru_gen_init_lruvec(struct lruvec *lruvec); -void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); +bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw); #ifdef CONFIG_MEMCG + +/* + * For each node, memcgs are divided into two generations: the old and the + * young. For each generation, memcgs are randomly sharded into multiple bins + * to improve scalability. For each bin, the hlist_nulls is virtually divided + * into three segments: the head, the tail and the default. + * + * An onlining memcg is added to the tail of a random bin in the old generation. + * The eviction starts at the head of a random bin in the old generation. The + * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes + * the old generation, is incremented when all its bins become empty. + * + * There are four operations: + * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its + * current generation (old or young) and updates its "seg" to "head"; + * 2. 
MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its + * current generation (old or young) and updates its "seg" to "tail"; + * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old + * generation, updates its "gen" to "old" and resets its "seg" to "default"; + * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the + * young generation, updates its "gen" to "young" and resets its "seg" to + * "default". + * + * The events that trigger the above operations are: + * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; + * 2. The first attempt to reclaim an memcg below low, which triggers + * MEMCG_LRU_TAIL; + * 3. The first attempt to reclaim an memcg below reclaimable size threshold, + * which triggers MEMCG_LRU_TAIL; + * 4. The second attempt to reclaim an memcg below reclaimable size threshold, + * which triggers MEMCG_LRU_YOUNG; + * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG; + * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; + * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD. + * + * Note that memcg LRU only applies to global reclaim, and the round-robin + * incrementing of their max_seq counters ensures the eventual fairness to all + * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). + */ +#define MEMCG_NR_GENS 2 +#define MEMCG_NR_BINS 8 + +struct lru_gen_memcg { + /* the per-node memcg generation counter */ + unsigned long seq; + /* each memcg has one lru_gen_folio per node */ + unsigned long nr_memcgs[MEMCG_NR_GENS]; + /* per-node lru_gen_folio list for global reclaim */ + struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS]; + /* protects the above */ + spinlock_t lock; +}; + +void lru_gen_init_pgdat(struct pglist_data *pgdat); + void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); -#endif +void lru_gen_online_memcg(struct mem_cgroup *memcg); +void lru_gen_offline_memcg(struct mem_cgroup *memcg); +void lru_gen_release_memcg(struct mem_cgroup *memcg); +void lru_gen_soft_reclaim(struct lruvec *lruvec); + +#else /* !CONFIG_MEMCG */ + +#define MEMCG_NR_GENS 1 + +struct lru_gen_memcg { +}; + +static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) +{ +} + +#endif /* CONFIG_MEMCG */ #else /* !CONFIG_LRU_GEN */ +static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) +{ +} + static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } -static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { + return false; } #ifdef CONFIG_MEMCG + static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) { } @@ -501,7 +584,24 @@ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) { } -#endif + +static inline void lru_gen_online_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_soft_reclaim(struct lruvec *lruvec) +{ +} + +#endif /* CONFIG_MEMCG */ #endif /* CONFIG_LRU_GEN */ @@ -524,7 +624,7 @@ struct lruvec { unsigned long flags; #ifdef CONFIG_LRU_GEN /* evictable pages divided into generations */ - struct lru_gen_struct lrugen; + struct lru_gen_folio lrugen; /* to concurrently iterate lru_gen_mm_list */ struct lru_gen_mm_state 
mm_state; #endif @@ -1242,7 +1342,9 @@ typedef struct pglist_data { #ifdef CONFIG_LRU_GEN /* kswap mm walk data */ - struct lru_gen_mm_walk mm_walk; + struct lru_gen_mm_walk mm_walk; + /* lru_gen_folio list */ + struct lru_gen_memcg memcg_lru; #endif CACHELINE_PADDING(_pad2_); diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 022573f49957..110a36479dce 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -146,6 +146,13 @@ struct maple_subtree_state { struct maple_big_node *bn; }; +#ifdef CONFIG_KASAN_STACK +/* Prevent mas_wr_bnode() from exceeding the stack frame limit */ +#define noinline_for_kasan noinline_for_stack +#else +#define noinline_for_kasan inline +#endif + /* Functions */ static inline struct maple_node *mt_alloc_one(gfp_t gfp) { @@ -182,7 +189,6 @@ static void ma_free_rcu(struct maple_node *node) call_rcu(&node->rcu, mt_free_rcu); } - static void mas_set_height(struct ma_state *mas) { unsigned int new_flags = mas->tree->ma_flags; @@ -467,7 +473,7 @@ static inline void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, unsigned char slot) { - unsigned long val = (unsigned long) parent; + unsigned long val = (unsigned long)parent; unsigned long shift; unsigned long type; enum maple_type p_type = mte_node_type(parent); @@ -501,10 +507,9 @@ void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, */ static inline unsigned int mte_parent_slot(const struct maple_enode *enode) { - unsigned long val = (unsigned long) mte_to_node(enode)->parent; + unsigned long val = (unsigned long)mte_to_node(enode)->parent; - /* Root. */ - if (val & 1) + if (val & MA_ROOT_PARENT) return 0; /* @@ -1298,26 +1303,21 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) node = mas->alloc; node->request_count = 0; while (requested) { - max_req = MAPLE_ALLOC_SLOTS; - if (node->node_count) { - unsigned int offset = node->node_count; - - slots = (void **)&node->slot[offset]; - max_req -= offset; - } else { - slots = (void **)&node->slot; - } - + max_req = MAPLE_ALLOC_SLOTS - node->node_count; + slots = (void **)&node->slot[node->node_count]; max_req = min(requested, max_req); count = mt_alloc_bulk(gfp, max_req, slots); if (!count) goto nomem_bulk; + if (node->node_count == 0) { + node->slot[0]->node_count = 0; + node->slot[0]->request_count = 0; + } + node->node_count += count; allocated += count; node = node->slot[0]; - node->node_count = 0; - node->request_count = 0; requested -= count; } mas->alloc->total = allocated; @@ -1331,8 +1331,6 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) if (mas->alloc && !(((unsigned long)mas->alloc & 0x1))) mas->alloc->total = allocated; mas_set_err(mas, -ENOMEM); - return; - } /* @@ -1952,10 +1950,9 @@ static inline int mab_calc_split(struct ma_state *mas, /* Avoid ending a node on a NULL entry */ split = mab_no_null_split(bn, split, slot_count); - if (!(*mid_split)) - return split; - *mid_split = mab_no_null_split(bn, *mid_split, slot_count); + if (unlikely(*mid_split)) + *mid_split = mab_no_null_split(bn, *mid_split, slot_count); return split; } @@ -2178,7 +2175,7 @@ static inline void mas_bulk_rebalance(struct ma_state *mas, unsigned char end, * * Return: The actual end of the data stored in @b_node */ -static inline void mas_store_b_node(struct ma_wr_state *wr_mas, +static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node, unsigned char offset_end) { unsigned char slot; @@ -2315,9 +2312,7 @@ static inline struct maple_enode 
*mte_node_or_none(struct maple_enode *enode) static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; - unsigned char count; - unsigned char offset; - unsigned long index, min, max; + unsigned char count, offset; if (unlikely(ma_is_dense(wr_mas->type))) { wr_mas->r_max = wr_mas->r_min = mas->index; @@ -2330,34 +2325,12 @@ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) count = wr_mas->node_end = ma_data_end(wr_mas->node, wr_mas->type, wr_mas->pivots, mas->max); offset = mas->offset; - min = mas_safe_min(mas, wr_mas->pivots, offset); - if (unlikely(offset == count)) - goto max; - - max = wr_mas->pivots[offset]; - index = mas->index; - if (unlikely(index <= max)) - goto done; - - if (unlikely(!max && offset)) - goto max; - - min = max + 1; - while (++offset < count) { - max = wr_mas->pivots[offset]; - if (index <= max) - goto done; - else if (unlikely(!max)) - break; - min = max + 1; - } + while (offset < count && mas->index > wr_mas->pivots[offset]) + offset++; -max: - max = mas->max; -done: - wr_mas->r_max = max; - wr_mas->r_min = min; + wr_mas->r_max = offset < count ? wr_mas->pivots[offset] : mas->max; + wr_mas->r_min = mas_safe_min(mas, wr_mas->pivots, offset); wr_mas->offset_end = mas->offset = offset; } @@ -3012,7 +2985,7 @@ static inline void *mtree_range_walk(struct ma_state *mas) mas->min = prev_min; mas->max = prev_max; mas->node = last; - return (void *) next; + return (void *)next; dead_node: mas_reset(mas); @@ -3285,7 +3258,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end if (tmp < max_p) memset(pivs + tmp, 0, - sizeof(unsigned long *) * (max_p - tmp)); + sizeof(unsigned long) * (max_p - tmp)); if (tmp < mt_slots[mt]) memset(slots + tmp, 0, sizeof(void *) * (max_s - tmp)); @@ -3532,7 +3505,6 @@ static inline bool mas_push_data(struct ma_state *mas, int height, */ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) { - struct maple_subtree_state mast; int height = 0; unsigned char mid_split, split = 0; @@ -3651,7 +3623,7 @@ static inline bool mas_reuse_node(struct ma_wr_state *wr_mas, * @b_node: The maple big node * @end: The end of the data. */ -static inline int mas_commit_b_node(struct ma_wr_state *wr_mas, +static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node, unsigned char end) { struct maple_node *node; @@ -3952,7 +3924,7 @@ static inline void *mtree_lookup_walk(struct ma_state *mas) goto dead_node; } while (!ma_is_leaf(type)); - return (void *) next; + return (void *)next; dead_node: mas_reset(mas); @@ -4788,15 +4760,11 @@ static inline void *mas_next_nentry(struct ma_state *mas, static inline void mas_rewalk(struct ma_state *mas, unsigned long index) { - retry: mas_set(mas, index); mas_state_walk(mas); if (mas_is_start(mas)) goto retry; - - return; - } /* @@ -4973,7 +4941,8 @@ static inline void *mas_prev_entry(struct ma_state *mas, unsigned long min) * Return: True if found in a leaf, false otherwise. 
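
The rewritten mas_wr_node_walk() above replaces the goto-based scan with a single linear walk over the pivot array. The following stand-alone sketch shows that walk with simplified stand-in types; demo_node_walk() is not a kernel function, only an illustration of the r_min/r_max derivation.

	/*
	 * Illustrative sketch: scan sorted pivots for the first pivot >= index
	 * and report the range [r_min, r_max] covered by the matching slot.
	 */
	#include <stdio.h>

	static void demo_node_walk(const unsigned long *pivots, unsigned char count,
				   unsigned long node_min, unsigned long node_max,
				   unsigned long index,
				   unsigned char *offset_out,
				   unsigned long *r_min, unsigned long *r_max)
	{
		unsigned char offset = 0;

		while (offset < count && index > pivots[offset])
			offset++;

		/* Last slot is bounded by the node maximum, not a pivot. */
		*r_max = offset < count ? pivots[offset] : node_max;
		/* Slot 0 starts at the node minimum; later slots at prev pivot + 1. */
		*r_min = offset ? pivots[offset - 1] + 1 : node_min;
		*offset_out = offset;
	}

	int main(void)
	{
		const unsigned long pivots[] = { 9, 19, 49 };
		unsigned char offset;
		unsigned long r_min, r_max;

		demo_node_walk(pivots, 3, 0, 99, 25, &offset, &r_min, &r_max);
		printf("index 25 -> slot %u covering [%lu, %lu]\n",
		       offset, r_min, r_max);	/* slot 2 covering [20, 49] */
		return 0;
	}
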
* */ -static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) +static bool mas_rev_awalk(struct ma_state *mas, unsigned long size, + unsigned long *gap_min, unsigned long *gap_max) { enum maple_type type = mte_node_type(mas->node); struct maple_node *node = mas_mn(mas); @@ -5038,8 +5007,8 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) if (unlikely(ma_is_leaf(type))) { mas->offset = offset; - mas->min = min; - mas->max = min + gap - 1; + *gap_min = min; + *gap_max = min + gap - 1; return true; } @@ -5063,10 +5032,10 @@ static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) { enum maple_type type = mte_node_type(mas->node); unsigned long pivot, min, gap = 0; - unsigned char offset; - unsigned long *gaps; - unsigned long *pivots = ma_pivots(mas_mn(mas), type); - void __rcu **slots = ma_slots(mas_mn(mas), type); + unsigned char offset, data_end; + unsigned long *gaps, *pivots; + void __rcu **slots; + struct maple_node *node; bool found = false; if (ma_is_dense(type)) { @@ -5074,13 +5043,15 @@ static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) return true; } - gaps = ma_gaps(mte_to_node(mas->node), type); + node = mas_mn(mas); + pivots = ma_pivots(node, type); + slots = ma_slots(node, type); + gaps = ma_gaps(node, type); offset = mas->offset; min = mas_safe_min(mas, pivots, offset); - for (; offset < mt_slots[type]
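
The mas_rev_awalk() change above returns the located gap through the new gap_min/gap_max out-parameters instead of overwriting the caller's mas->min/mas->max search window. A minimal model of that calling convention follows; the demo_* names and the flat gap arrays are invented for illustration and are not the kernel data structures.

	/*
	 * Illustrative sketch: reverse search reporting the gap via
	 * out-parameters so the caller's search window stays intact.
	 */
	#include <stdbool.h>
	#include <stdio.h>

	struct demo_state {
		unsigned long min;	/* caller's search window, left untouched */
		unsigned long max;
	};

	/* Find, from the top down, a gap of at least @size starting in the window. */
	static bool demo_rev_gap_search(const struct demo_state *st,
					const unsigned long gap_start[],
					const unsigned long gap_len[], int ngaps,
					unsigned long size,
					unsigned long *gap_min, unsigned long *gap_max)
	{
		for (int i = ngaps - 1; i >= 0; i--) {
			if (gap_start[i] < st->min || gap_start[i] > st->max)
				continue;
			if (gap_len[i] >= size) {
				*gap_min = gap_start[i];
				*gap_max = gap_start[i] + gap_len[i] - 1;
				return true;
			}
		}
		return false;
	}

	int main(void)
	{
		struct demo_state st = { .min = 0, .max = 1000 };
		const unsigned long start[] = { 100, 400, 800 };
		const unsigned long len[]   = {  50,  10,  20 };
		unsigned long lo, hi;

		if (demo_rev_gap_search(&st, start, len, 3, 16, &lo, &hi))
			printf("gap [%lu, %lu]; window still [%lu, %lu]\n",
			       lo, hi, st.min, st.max);
		return 0;
	}
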