From 2dc555f46d44509aa9ac70dad7cc5c4de6067219 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Sun, 15 Jan 2023 16:50:23 +0100 Subject: [PATCH 01/17] bbr2 Signed-off-by: Peter Jung --- include/linux/tcp.h | 3 +- include/net/inet_connection_sock.h | 3 +- include/net/tcp.h | 41 +- include/uapi/linux/inet_diag.h | 33 + net/ipv4/Kconfig | 22 + net/ipv4/Makefile | 1 + net/ipv4/tcp.c | 1 + net/ipv4/tcp_bbr.c | 38 +- net/ipv4/tcp_bbr2.c | 2674 ++++++++++++++++++++++++++++ net/ipv4/tcp_cong.c | 1 + net/ipv4/tcp_input.c | 27 +- net/ipv4/tcp_output.c | 26 +- net/ipv4/tcp_rate.c | 30 +- net/ipv4/tcp_timer.c | 1 + 14 files changed, 2867 insertions(+), 34 deletions(-) create mode 100644 net/ipv4/tcp_bbr2.c diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 41b1da621a45..d8f94ef1a297 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -255,7 +255,8 @@ struct tcp_sock { u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ - unused:5; + fast_ack_mode:2, /* which fast ack mode ? */ + unused:3; u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c2b15f7e5516..d85858efa571 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -135,7 +135,8 @@ struct inet_connection_sock { u32 icsk_probes_tstamp; u32 icsk_user_timeout; - u64 icsk_ca_priv[104 / sizeof(u64)]; +/* XXX inflated by temporary internal debugging info */ + u64 icsk_ca_priv[216 / sizeof(u64)]; #define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) }; diff --git a/include/net/tcp.h b/include/net/tcp.h index 5b70b241ce71..4ae0f55cf0e1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -375,6 +375,7 @@ static inline void tcp_dec_quickack_mode(struct sock *sk, #define TCP_ECN_QUEUE_CWR 2 #define TCP_ECN_DEMAND_CWR 4 #define TCP_ECN_SEEN 8 +#define TCP_ECN_ECT_PERMANENT 16 enum tcp_tw_status { TCP_TW_SUCCESS = 0, @@ -823,6 +824,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) return max_t(s64, t1 - t0, 0); } +static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) +{ + return max_t(s32, t1 - t0, 0); +} + static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) { return tcp_ns_to_ts(skb->skb_mstamp_ns); @@ -898,9 +904,14 @@ struct tcp_skb_cb { /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ - u64 first_tx_mstamp; + u32 first_tx_mstamp; /* when we reached the "delivered" count */ - u64 delivered_mstamp; + u32 delivered_mstamp; +#define TCPCB_IN_FLIGHT_BITS 20 +#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) + u32 in_flight:20, /* packets in flight at transmit */ + unused2:12; + u32 lost; /* packets lost so far upon tx of skb */ } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; @@ -1026,7 +1037,11 @@ enum tcp_ca_ack_event_flags { #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 -#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) +/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). 
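+ * As an illustrative sketch (not part of the flag definition itself): a
+ * congestion control module that wants per-ACK CE notifications without
+ * requiring ECT on every packet could declare
+ *	.flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS,
+ * in its struct tcp_congestion_ops, which is how the bbr2 module
+ * registers itself later in this patch.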
*/ +#define TCP_CONG_WANTS_CE_EVENTS 0x4 +#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ + TCP_CONG_NEEDS_ECN | \ + TCP_CONG_WANTS_CE_EVENTS) union tcp_cc_info; @@ -1046,8 +1061,11 @@ struct ack_sample { */ struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ + u32 prior_lost; /* tp->lost at "prior_mstamp" */ u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ + u32 tx_in_flight; /* packets in flight at starting timestamp */ + s32 lost; /* number of packets lost over interval */ s32 delivered; /* number of packets delivered over interval */ s32 delivered_ce; /* number of packets delivered w/ CE marks*/ long interval_us; /* time for tp->delivered to incr "delivered" */ @@ -1061,6 +1079,7 @@ struct rate_sample { bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ bool is_ack_delayed; /* is this (likely) a delayed ACK? */ + bool is_ece; /* did this ACK have ECN marked? */ }; struct tcp_congestion_ops { @@ -1084,8 +1103,11 @@ struct tcp_congestion_ops { /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); - /* override sysctl_tcp_min_tso_segs */ - u32 (*min_tso_segs)(struct sock *sk); + /* pick target number of segments per TSO/GSO skb (optional): */ + u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); + + /* react to a specific lost skb (optional) */ + void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) @@ -1148,6 +1170,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) } #endif +static inline bool tcp_ca_wants_ce_events(const struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | + TCP_CONG_WANTS_CE_EVENTS); +} + static inline bool tcp_ca_needs_ecn(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1167,6 +1197,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) void tcp_set_ca_state(struct sock *sk, const u8 ca_state); /* From tcp_rate.c */ +void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 50655de04c9b..0e24f11627d5 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -231,9 +231,42 @@ struct tcp_bbr_info { __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ }; +/* Phase as reported in netlink/ss stats. 
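+ * The phase is derived from the sender's bbr mode and, within PROBE_BW,
+ * from the current pacing-gain cycle index: e.g. PROBE_BW while applying
+ * the UP gain is exported as BBR2_PHASE_PROBE_BW_UP.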
*/ +enum tcp_bbr2_phase { + BBR2_PHASE_INVALID = 0, + BBR2_PHASE_STARTUP = 1, + BBR2_PHASE_DRAIN = 2, + BBR2_PHASE_PROBE_RTT = 3, + BBR2_PHASE_PROBE_BW_UP = 4, + BBR2_PHASE_PROBE_BW_DOWN = 5, + BBR2_PHASE_PROBE_BW_CRUISE = 6, + BBR2_PHASE_PROBE_BW_REFILL = 7 +}; + +struct tcp_bbr2_info { + /* u64 bw: bandwidth (app throughput) estimate in Byte per sec: */ + __u32 bbr_bw_lsb; /* lower 32 bits of bw */ + __u32 bbr_bw_msb; /* upper 32 bits of bw */ + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ + __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ + __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ + __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ + __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ + __u8 bbr_mode; /* current bbr_mode in state machine */ + __u8 bbr_phase; /* current state machine phase */ + __u8 unused1; /* alignment padding; not used yet */ + __u8 bbr_version; /* MUST be at this offset in struct */ + __u32 bbr_inflight_lo; /* lower/short-term data volume bound */ + __u32 bbr_inflight_hi; /* higher/long-term data volume bound */ + __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ +}; + union tcp_cc_info { struct tcpvegas_info vegas; struct tcp_dctcp_info dctcp; struct tcp_bbr_info bbr; + struct tcp_bbr2_info bbr2; }; #endif /* _UAPI_INET_DIAG_H_ */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 2dfb12230f08..b6bec331a82e 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -678,6 +678,24 @@ config TCP_CONG_BBR AQM schemes that do not provide a delay signal. It requires the fq ("Fair Queue") pacing packet scheduler. +config TCP_CONG_BBR2 + tristate "BBR2 TCP" + default n + help + + BBR2 TCP congestion control is a model-based congestion control + algorithm that aims to maximize network utilization, keep queues and + retransmit rates low, and to be able to coexist with Reno/CUBIC in + common scenarios. It builds an explicit model of the network path. It + tolerates a targeted degree of random packet loss and delay that are + unrelated to congestion. It can operate over LAN, WAN, cellular, wifi, + or cable modem links, and can use DCTCP-L4S-style ECN signals. It can + coexist with flows that use loss-based congestion control, and can + operate with shallow buffers, deep buffers, bufferbloat, policers, or + AQM schemes that do not provide a delay signal. It requires pacing, + using either TCP internal pacing or the fq ("Fair Queue") pacing packet + scheduler. 
+ choice prompt "Default TCP congestion control" default DEFAULT_CUBIC @@ -715,6 +733,9 @@ choice config DEFAULT_BBR bool "BBR" if TCP_CONG_BBR=y + config DEFAULT_BBR2 + bool "BBR2" if TCP_CONG_BBR2=y + config DEFAULT_RENO bool "Reno" endchoice @@ -739,6 +760,7 @@ config DEFAULT_TCP_CONG default "dctcp" if DEFAULT_DCTCP default "cdg" if DEFAULT_CDG default "bbr" if DEFAULT_BBR + default "bbr2" if DEFAULT_BBR2 default "cubic" config TCP_MD5SIG diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index bbdd9c44f14e..8dee1547d820 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o +obj-$(CONFIG_TCP_CONG_BBR2) += tcp_bbr2.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6667c3538f2a..34207ec358d8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3190,6 +3190,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; tp->rcv_ooopack = 0; + tp->fast_ack_mode = 0; /* Clean up fastopen related fields */ diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 54eec33c6e1c..bfbf158c71f4 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -294,26 +294,40 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) sk->sk_pacing_rate = rate; } -/* override sysctl_tcp_min_tso_segs */ static u32 bbr_min_tso_segs(struct sock *sk) { return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; } +/* Return the number of segments BBR would like in a TSO/GSO skb, given + * a particular max gso size as a constraint. + */ +static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, + u32 gso_max_size) +{ + u32 segs; + u64 bytes; + + /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ + bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; + + bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); + segs = max_t(u32, div_u64(bytes, mss_now), bbr_min_tso_segs(sk)); + return segs; +} + +/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ +static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) +{ + return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); +} + +/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ static u32 bbr_tso_segs_goal(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - u32 segs, bytes; - - /* Sort of tcp_tso_autosize() but ignoring - * driver provided sk_gso_max_size. 
- */ - bytes = min_t(unsigned long, - sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), - GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); - segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); - return min(segs, 0x7FU); + return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); } /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ @@ -1149,7 +1163,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { .undo_cwnd = bbr_undo_cwnd, .cwnd_event = bbr_cwnd_event, .ssthresh = bbr_ssthresh, - .min_tso_segs = bbr_min_tso_segs, + .tso_segs = bbr_tso_segs, .get_info = bbr_get_info, .set_state = bbr_set_state, }; diff --git a/net/ipv4/tcp_bbr2.c b/net/ipv4/tcp_bbr2.c new file mode 100644 index 000000000000..85f8052144d1 --- /dev/null +++ b/net/ipv4/tcp_bbr2.c @@ -0,0 +1,2674 @@ +/* BBR (Bottleneck Bandwidth and RTT) congestion control, v2 + * + * BBRv2 is a model-based congestion control algorithm that aims for low + * queues, low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model + * of the network path, it uses measurements of bandwidth and RTT, as well as + * (if they occur) packet loss and/or DCTCP/L4S-style ECN signals. Note that + * although it can use ECN or loss signals explicitly, it does not require + * either; it can bound its in-flight data based on its estimate of the BDP. + * + * The model has both higher and lower bounds for the operating range: + * lo: bw_lo, inflight_lo: conservative short-term lower bound + * hi: bw_hi, inflight_hi: robust long-term upper bound + * The bandwidth-probing time scale is (a) extended dynamically based on + * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by + * an interactive wall-clock time-scale to be more scalable and responsive + * than Reno and CUBIC. + * + * Here is a state transition diagram for BBR: + * + * | + * V + * +---> STARTUP ----+ + * | | | + * | V | + * | DRAIN ----+ + * | | | + * | V | + * +---> PROBE_BW ----+ + * | ^ | | + * | | | | + * | +----+ | + * | | + * +---- PROBE_RTT <--+ + * + * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. + * When it estimates the pipe is full, it enters DRAIN to drain the queue. + * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. + * A long-lived BBR flow spends the vast majority of its time remaining + * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth + * in a fair manner, with a small, bounded queue. *If* a flow has been + * continuously sending for the entire min_rtt window, and hasn't seen an RTT + * sample that matches or decreases its min_rtt estimate for 10 seconds, then + * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe + * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if + * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; + * otherwise we enter STARTUP to try to fill the pipe. + * + * BBR is described in detail in: + * "BBR: Congestion-Based Congestion Control", + * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, + * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. + * + * There is a public e-mail list for discussing BBR development and testing: + * https://groups.google.com/forum/#!forum/bbr-dev + * + * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled, + * otherwise TCP stack falls back to an internal pacing using one high + * resolution timer per TCP socket and may use more resources. 
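+ *
+ * A minimal example of attaching fq pacing (illustrative only; the device
+ * name and any fq options depend on the local setup):
+ *
+ *	tc qdisc replace dev eth0 root fq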
+ */ +#include +#include +#include +#include +#include + +#include "tcp_dctcp.h" + +/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. + * Since the minimum window is >=4 packets, the lower bound isn't + * an issue. The upper bound isn't an issue with existing technologies. + */ +#define BW_SCALE 24 +#define BW_UNIT (1 << BW_SCALE) + +#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */ +#define BBR_UNIT (1 << BBR_SCALE) + +#define FLAG_DEBUG_VERBOSE 0x1 /* Verbose debugging messages */ +#define FLAG_DEBUG_LOOPBACK 0x2 /* Do NOT skip loopback addr */ + +#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ + +/* BBR has the following modes for deciding how fast to send: */ +enum bbr_mode { + BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ + BBR_DRAIN, /* drain any queue created during startup */ + BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ +}; + +/* How does the incoming ACK stream relate to our bandwidth probing? */ +enum bbr_ack_phase { + BBR_ACKS_INIT, /* not probing; not getting probe feedback */ + BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ + BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ + BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ + BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ +}; + +/* BBR congestion control block */ +struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ + u32 probe_rtt_min_us; /* min RTT in bbr_probe_rtt_win_ms window */ + u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ + u64 cycle_mstamp; /* time of this cycle phase start */ + u32 mode:3, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ + packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ + ce_state:1, /* If most recent data has CE bit set */ + bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ + try_fast_path:1, /* can we take fast path? */ + unused2:11, + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ + cycle_idx:3, /* current index in pacing_gain cycle array */ + has_seen_rtt:1; /* have we seen an RTT sample yet? */ + u32 pacing_gain:11, /* current gain for setting pacing rate */ + cwnd_gain:11, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? 
+			      */
+		full_bw_cnt:2,	/* number of rounds without large bw gains */
+		init_cwnd:7;	/* initial cwnd */
+	u32	prior_cwnd;	/* prior cwnd upon entering loss recovery */
+	u32	full_bw;	/* recent bw, to estimate if pipe is full */
+
+	/* For tracking ACK aggregation: */
+	u64	ack_epoch_mstamp;	/* start of ACK sampling epoch */
+	u16	extra_acked[2];		/* max excess data ACKed in epoch */
+	u32	ack_epoch_acked:20,	/* packets (S)ACKed in sampling epoch */
+		extra_acked_win_rtts:5,	/* age of extra_acked, in round trips */
+		extra_acked_win_idx:1,	/* current index in extra_acked array */
+		/* BBR v2 state: */
+		unused1:2,
+		startup_ecn_rounds:2,	/* consecutive hi ECN STARTUP rounds */
+		loss_in_cycle:1,	/* packet loss in this cycle? */
+		ecn_in_cycle:1;		/* ECN in this cycle? */
+	u32	loss_round_delivered;	/* scb->tx.delivered ending loss round */
+	u32	undo_bw_lo;	     /* bw_lo before latest losses */
+	u32	undo_inflight_lo;    /* inflight_lo before latest losses */
+	u32	undo_inflight_hi;    /* inflight_hi before latest losses */
+	u32	bw_latest;	 /* max delivered bw in last round trip */
+	u32	bw_lo;		 /* lower bound on sending bandwidth */
+	u32	bw_hi[2];	 /* upper bound of sending bandwidth range */
+	u32	inflight_latest; /* max delivered data in last round trip */
+	u32	inflight_lo;	 /* lower bound of inflight data range */
+	u32	inflight_hi;	 /* upper bound of inflight data range */
+	u32	bw_probe_up_cnt; /* packets delivered per inflight_hi incr */
+	u32	bw_probe_up_acks;  /* packets (S)ACKed since inflight_hi incr */
+	u32	probe_wait_us;	 /* PROBE_DOWN until next clock-driven probe */
+	u32	ecn_eligible:1,	/* sender can use ECN (RTT, handshake)? */
+		ecn_alpha:9,	/* EWMA delivered_ce/delivered; 0..256 */
+		bw_probe_samples:1,    /* rate samples reflect bw probing? */
+		prev_probe_too_high:1, /* did last PROBE_UP go too high? */
+		stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */
+		rounds_since_probe:8,  /* packet-timed rounds since probed bw */
+		loss_round_start:1,    /* loss_round_delivered round trip? */
+		loss_in_round:1,       /* loss marked in this round trip? */
+		ecn_in_round:1,	       /* ECN marked in this round trip? */
+		ack_phase:3,	       /* bbr_ack_phase: meaning of ACKs */
+		loss_events_in_round:4,/* losses in STARTUP round */
+		initialized:1;	       /* has bbr_init() been called? */
+	u32	alpha_last_delivered;	 /* tp->delivered at alpha update */
+	u32	alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */
+
+	/* Params configurable using setsockopt. Refer to corresponding
+	 * module param for detailed description of params.
+ */ + struct bbr_params { + u32 high_gain:11, /* max allowed value: 2047 */ + drain_gain:10, /* max allowed value: 1023 */ + cwnd_gain:11; /* max allowed value: 2047 */ + u32 cwnd_min_target:4, /* max allowed value: 15 */ + min_rtt_win_sec:5, /* max allowed value: 31 */ + probe_rtt_mode_ms:9, /* max allowed value: 511 */ + full_bw_cnt:3, /* max allowed value: 7 */ + cwnd_tso_budget:1, /* allowed values: {0, 1} */ + unused3:6, + drain_to_target:1, /* boolean */ + precise_ece_ack:1, /* boolean */ + extra_acked_in_startup:1, /* allowed values: {0, 1} */ + fast_path:1; /* boolean */ + u32 full_bw_thresh:10, /* max allowed value: 1023 */ + startup_cwnd_gain:11, /* max allowed value: 2047 */ + bw_probe_pif_gain:9, /* max allowed value: 511 */ + usage_based_cwnd:1, /* boolean */ + unused2:1; + u16 probe_rtt_win_ms:14, /* max allowed value: 16383 */ + refill_add_inc:2; /* max allowed value: 3 */ + u16 extra_acked_gain:11, /* max allowed value: 2047 */ + extra_acked_win_rtts:5; /* max allowed value: 31*/ + u16 pacing_gain[CYCLE_LEN]; /* max allowed value: 1023 */ + /* Mostly BBR v2 parameters below here: */ + u32 ecn_alpha_gain:8, /* max allowed value: 255 */ + ecn_factor:8, /* max allowed value: 255 */ + ecn_thresh:8, /* max allowed value: 255 */ + beta:8; /* max allowed value: 255 */ + u32 ecn_max_rtt_us:19, /* max allowed value: 524287 */ + bw_probe_reno_gain:9, /* max allowed value: 511 */ + full_loss_cnt:4; /* max allowed value: 15 */ + u32 probe_rtt_cwnd_gain:8, /* max allowed value: 255 */ + inflight_headroom:8, /* max allowed value: 255 */ + loss_thresh:8, /* max allowed value: 255 */ + bw_probe_max_rounds:8; /* max allowed value: 255 */ + u32 bw_probe_rand_rounds:4, /* max allowed value: 15 */ + bw_probe_base_us:26, /* usecs: 0..2^26-1 (67 secs) */ + full_ecn_cnt:2; /* max allowed value: 3 */ + u32 bw_probe_rand_us:26, /* usecs: 0..2^26-1 (67 secs) */ + undo:1, /* boolean */ + tso_rtt_shift:4, /* max allowed value: 15 */ + unused5:1; + u32 ecn_reprobe_gain:9, /* max allowed value: 511 */ + unused1:14, + ecn_alpha_init:9; /* max allowed value: 256 */ + } params; + + struct { + u32 snd_isn; /* Initial sequence number */ + u32 rs_bw; /* last valid rate sample bw */ + u32 target_cwnd; /* target cwnd, based on BDP */ + u8 undo:1, /* Undo even happened but not yet logged */ + unused:7; + char event; /* single-letter event debug codes */ + u16 unused2; + } debug; +}; + +struct bbr_context { + u32 sample_bw; + u32 target_cwnd; + u32 log:1; +}; + +/* Window length of min_rtt filter (in sec). Max allowed value is 31 (0x1F) */ +static u32 bbr_min_rtt_win_sec = 10; +/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode. + * Max allowed value is 511 (0x1FF). + */ +static u32 bbr_probe_rtt_mode_ms = 200; +/* Window length of probe_rtt_min_us filter (in ms), and consequently the + * typical interval between PROBE_RTT mode entries. + * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC + */ +static u32 bbr_probe_rtt_win_ms = 5000; +/* Skip TSO below the following bandwidth (bits/sec): */ +static int bbr_min_tso_rate = 1200000; + +/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting + * in bigger TSO bursts. By default we cut the RTT-based allowance in half + * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance + * is below 1500 bytes after 6 * ~500 usec = 3ms. 
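+ * As a worked example (illustrative numbers; the allowance starts at
+ * 64 KBytes): with tso_rtt_shift = 9 and min_rtt = 2048 usec, we get
+ * r = 2048 >> 9 = 4, so the min_rtt-based allowance is 64 KB >> 4 = 4 KB,
+ * i.e. roughly two extra full-sized segments on top of the rate-based
+ * budget.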
+ */ +static u32 bbr_tso_rtt_shift = 9; /* halve allowance per 2^9 usecs, 512us */ + +/* Select cwnd TSO budget approach: + * 0: padding + * 1: flooring + */ +static uint bbr_cwnd_tso_budget = 1; + +/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. + * In order to help drive the network toward lower queues and low latency while + * maintaining high utilization, the average pacing rate aims to be slightly + * lower than the estimated bandwidth. This is an important aspect of the + * design. + */ +static const int bbr_pacing_margin_percent = 1; + +/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would. Max allowed value is 2047 (0x7FF). + */ +static int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +/* The gain for deriving startup cwnd. Max allowed value is 2047 (0x7FF). */ +static int bbr_startup_cwnd_gain = BBR_UNIT * 2885 / 1000 + 1; +/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round. Max allowed value + * is 1023 (0x3FF). + */ +static int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs. + * Max allowed value is 2047 (0x7FF). + */ +static int bbr_cwnd_gain = BBR_UNIT * 2; +/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw. + * Max allowed value for each element is 1023 (0x3FF). + */ +enum bbr_pacing_gain_phase { + BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ + BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ + BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ + BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ +}; +static int bbr_pacing_gain[] = { + BBR_UNIT * 5 / 4, /* probe for more available bw */ + BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ + BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ + BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ +}; + +/* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet + * needs at least 4 packets in flight. Max allowed value is 15 (0xF). + */ +static u32 bbr_cwnd_min_target = 4; + +/* Cwnd to BDP proportion in PROBE_RTT mode scaled by BBR_UNIT. Default: 50%. + * Use 0 to disable. Max allowed value is 255. + */ +static u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; + +/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ +/* If bw has increased significantly (1.25x), there may be more bw available. + * Max allowed value is 1023 (0x3FF). + */ +static u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; +/* But after 3 rounds w/o significant bw growth, estimate pipe is full. + * Max allowed value is 7 (0x7). + */ +static u32 bbr_full_bw_cnt = 3; + +static u32 bbr_flags; /* Debugging related stuff */ + +/* Whether to debug using printk. + */ +static bool bbr_debug_with_printk; + +/* Whether to debug using ftrace event tcp:tcp_bbr_event. + * Ignored when bbr_debug_with_printk is set. + */ +static bool bbr_debug_ftrace; + +/* Experiment: each cycle, try to hold sub-unity gain until inflight <= BDP. 
+ */
+static bool bbr_drain_to_target = true;		/* default: enabled */
+
+/* Experiment: Flags to control BBR with ECN behavior.
+ */
+static bool bbr_precise_ece_ack = true;		/* default: enabled */
+
+/* The max rwin scaling shift factor is 14 (RFC 1323), so the max sane rwin is
+ * (2^(16+14) B)/(1024 B/packet) = 1M packets.
+ */
+static u32 bbr_cwnd_warn_val	= 1U << 20;
+
+static u16 bbr_debug_port_mask;
+
+/* BBR module parameters. These are module parameters only in Google prod.
+ * Upstream these are intentionally not module parameters.
+ */
+static int bbr_pacing_gain_size = CYCLE_LEN;
+
+/* Gain factor for adding extra_acked to target cwnd: */
+static int bbr_extra_acked_gain = 256;
+
+/* Window length of extra_acked window. Max allowed val is 31. */
+static u32 bbr_extra_acked_win_rtts = 5;
+
+/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */
+static u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20;
+
+/* Time period for clamping cwnd increment due to ack aggregation */
+static u32 bbr_extra_acked_max_us = 100 * 1000;
+
+/* Use extra acked in startup?
+ * 0: disabled
+ * 1: use latest extra_acked value from 1-2 rtt in startup
+ */
+static int bbr_extra_acked_in_startup = 1;	/* default: enabled */
+
+/* Experiment: don't grow cwnd beyond twice what we just probed. */
+static bool bbr_usage_based_cwnd;		/* default: disabled */
+
+/* For lab testing, researchers can enable BBRv2 ECN support with this flag,
+ * when they know that any ECN marks that the connections experience will be
+ * DCTCP/L4S-style ECN marks, rather than RFC3168 ECN marks.
+ * TODO(ncardwell): Production use of the BBRv2 ECN functionality depends on
+ * negotiation or configuration that is outside the scope of the BBRv2
+ * alpha release.
+ */
+static bool bbr_ecn_enable = false;
+
+module_param_named(min_tso_rate,      bbr_min_tso_rate,      int,    0644);
+module_param_named(tso_rtt_shift,     bbr_tso_rtt_shift,     int,    0644);
+module_param_named(high_gain,         bbr_high_gain,         int,    0644);
+module_param_named(drain_gain,        bbr_drain_gain,        int,    0644);
+module_param_named(startup_cwnd_gain, bbr_startup_cwnd_gain, int,    0644);
+module_param_named(cwnd_gain,         bbr_cwnd_gain,         int,    0644);
+module_param_array_named(pacing_gain, bbr_pacing_gain, int,
+			 &bbr_pacing_gain_size, 0644);
+module_param_named(cwnd_min_target,   bbr_cwnd_min_target,   uint,   0644);
+module_param_named(probe_rtt_cwnd_gain,
+		   bbr_probe_rtt_cwnd_gain, uint, 0664);
+module_param_named(cwnd_warn_val,     bbr_cwnd_warn_val,     uint,   0664);
+module_param_named(debug_port_mask,   bbr_debug_port_mask,   ushort, 0644);
+module_param_named(flags,             bbr_flags,             uint,   0644);
+module_param_named(debug_ftrace,      bbr_debug_ftrace,      bool,   0644);
+module_param_named(debug_with_printk, bbr_debug_with_printk, bool,   0644);
+module_param_named(min_rtt_win_sec,   bbr_min_rtt_win_sec,   uint,   0644);
+module_param_named(probe_rtt_mode_ms, bbr_probe_rtt_mode_ms, uint,   0644);
+module_param_named(probe_rtt_win_ms,  bbr_probe_rtt_win_ms,  uint,   0644);
+module_param_named(full_bw_thresh,    bbr_full_bw_thresh,    uint,   0644);
+module_param_named(full_bw_cnt,       bbr_full_bw_cnt,       uint,   0644);
+module_param_named(cwnd_tso_budget,   bbr_cwnd_tso_budget,   uint,   0664);
+module_param_named(extra_acked_gain,  bbr_extra_acked_gain,  int,    0664);
+module_param_named(extra_acked_win_rtts,
+		   bbr_extra_acked_win_rtts, uint, 0664);
+module_param_named(extra_acked_max_us,
+		   bbr_extra_acked_max_us, uint, 0664);
+module_param_named(ack_epoch_acked_reset_thresh,
+		   bbr_ack_epoch_acked_reset_thresh, uint, 0664);
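+/* (Usage sketch, assuming this file is built as the tcp_bbr2 module: the
+ * writable parameters above and below appear under
+ * /sys/module/tcp_bbr2/parameters/, so for example
+ *	echo 0 > /sys/module/tcp_bbr2/parameters/drain_to_target
+ * disables drain-to-target behavior at run time.)
+ */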
+module_param_named(drain_to_target, bbr_drain_to_target, bool, 0664); +module_param_named(precise_ece_ack, bbr_precise_ece_ack, bool, 0664); +module_param_named(extra_acked_in_startup, + bbr_extra_acked_in_startup, int, 0664); +module_param_named(usage_based_cwnd, bbr_usage_based_cwnd, bool, 0664); +module_param_named(ecn_enable, bbr_ecn_enable, bool, 0664); + +static void bbr2_exit_probe_rtt(struct sock *sk); +static void bbr2_reset_congestion_signals(struct sock *sk); + +static void bbr_check_probe_rtt_done(struct sock *sk); + +/* Do we estimate that STARTUP filled the pipe? */ +static bool bbr_full_bw_reached(const struct sock *sk) +{ + const struct bbr *bbr = inet_csk_ca(sk); + + return bbr->full_bw_reached; +} + +/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ +static u32 bbr_max_bw(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return max(bbr->bw_hi[0], bbr->bw_hi[1]); +} + +/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ +static u32 bbr_bw(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return min(bbr_max_bw(sk), bbr->bw_lo); +} + +/* Return maximum extra acked in past k-2k round trips, + * where k = bbr_extra_acked_win_rtts. + */ +static u16 bbr_extra_acked(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return max(bbr->extra_acked[0], bbr->extra_acked[1]); +} + +/* Return rate in bytes per second, optionally with a gain. + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. + */ +static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, + int margin) +{ + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; + rate *= USEC_PER_SEC / 100 * (100 - margin); + rate >>= BW_SCALE; + rate = max(rate, 1ULL); + return rate; +} + +static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) +{ + return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); +} + +static u64 bbr_rate_kbps(struct sock *sk, u64 rate) +{ + rate = bbr_bw_bytes_per_sec(sk, rate); + rate *= 8; + do_div(rate, 1000); + return rate; +} + +static u32 bbr_tso_segs_goal(struct sock *sk); +static void bbr_debug(struct sock *sk, u32 acked, + const struct rate_sample *rs, struct bbr_context *ctx) +{ + static const char ca_states[] = { + [TCP_CA_Open] = 'O', + [TCP_CA_Disorder] = 'D', + [TCP_CA_CWR] = 'C', + [TCP_CA_Recovery] = 'R', + [TCP_CA_Loss] = 'L', + }; + static const char mode[] = { + 'G', /* Growing - BBR_STARTUP */ + 'D', /* Drain - BBR_DRAIN */ + 'W', /* Window - BBR_PROBE_BW */ + 'M', /* Min RTT - BBR_PROBE_RTT */ + }; + static const char ack_phase[] = { /* bbr_ack_phase strings */ + 'I', /* BBR_ACKS_INIT - 'Init' */ + 'R', /* BBR_ACKS_REFILLING - 'Refilling' */ + 'B', /* BBR_ACKS_PROBE_STARTING - 'Before' */ + 'F', /* BBR_ACKS_PROBE_FEEDBACK - 'Feedback' */ + 'A', /* BBR_ACKS_PROBE_STOPPING - 'After' */ + }; + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + const u32 una = tp->snd_una - bbr->debug.snd_isn; + const u32 fack = tcp_highest_sack_seq(tp); + const u16 dport = ntohs(inet_sk(sk)->inet_dport); + bool is_port_match = (bbr_debug_port_mask && + ((dport & bbr_debug_port_mask) == 0)); + char debugmsg[320]; + + if (sk->sk_state == TCP_SYN_SENT) + return; /* no bbr_init() yet if SYN retransmit -> CA_Loss */ + + if (!tp->snd_cwnd || tp->snd_cwnd > bbr_cwnd_warn_val) { + char addr[INET6_ADDRSTRLEN + 10] = { 0 }; + + if (sk->sk_family == 
AF_INET) + snprintf(addr, sizeof(addr), "%pI4:%u", + &inet_sk(sk)->inet_daddr, dport); + else if (sk->sk_family == AF_INET6) + snprintf(addr, sizeof(addr), "%pI6:%u", + &sk->sk_v6_daddr, dport); + + WARN_ONCE(1, + "BBR %s cwnd alert: %u " + "snd_una: %u ca: %d pacing_gain: %u cwnd_gain: %u " + "bw: %u rtt: %u min_rtt: %u " + "acked: %u tso_segs: %u " + "bw: %d %ld %d pif: %u\n", + addr, tp->snd_cwnd, + una, inet_csk(sk)->icsk_ca_state, + bbr->pacing_gain, bbr->cwnd_gain, + bbr_max_bw(sk), (tp->srtt_us >> 3), bbr->min_rtt_us, + acked, bbr_tso_segs_goal(sk), + rs->delivered, rs->interval_us, rs->is_retrans, + tcp_packets_in_flight(tp)); + } + + if (likely(!bbr_debug_with_printk && !bbr_debug_ftrace)) + return; + + if (!sock_flag(sk, SOCK_DBG) && !is_port_match) + return; + + if (!ctx->log && !tp->app_limited && !(bbr_flags & FLAG_DEBUG_VERBOSE)) + return; + + if (ipv4_is_loopback(inet_sk(sk)->inet_daddr) && + !(bbr_flags & FLAG_DEBUG_LOOPBACK)) + return; + + snprintf(debugmsg, sizeof(debugmsg) - 1, + "BBR %pI4:%-5u %5u,%03u:%-7u %c " + "%c %2u br %2u cr %2d rtt %5ld d %2d i %5ld mrtt %d %cbw %llu " + "bw %llu lb %llu ib %llu qb %llu " + "a %u if %2u %c %c dl %u l %u al %u # %u t %u %c %c " + "lr %d er %d ea %d bwl %lld il %d ih %d c %d " + "v %d %c %u %c %s\n", + &inet_sk(sk)->inet_daddr, dport, + una / 1000, una % 1000, fack - tp->snd_una, + ca_states[inet_csk(sk)->icsk_ca_state], + bbr->debug.undo ? '@' : mode[bbr->mode], + tp->snd_cwnd, + bbr_extra_acked(sk), /* br (legacy): extra_acked */ + rs->tx_in_flight, /* cr (legacy): tx_inflight */ + rs->rtt_us, + rs->delivered, + rs->interval_us, + bbr->min_rtt_us, + rs->is_app_limited ? '_' : 'l', + bbr_rate_kbps(sk, ctx->sample_bw), /* lbw: latest sample bw */ + bbr_rate_kbps(sk, bbr_max_bw(sk)), /* bw: max bw */ + 0ULL, /* lb: [obsolete] */ + 0ULL, /* ib: [obsolete] */ + div_u64((u64)sk->sk_pacing_rate * 8, 1000), + acked, + tcp_packets_in_flight(tp), + rs->is_ack_delayed ? 'd' : '.', + bbr->round_start ? '*' : '.', + tp->delivered, tp->lost, + tp->app_limited, + 0, /* #: [obsolete] */ + ctx->target_cwnd, + tp->reord_seen ? 'r' : '.', /* r: reordering seen? */ + ca_states[bbr->prev_ca_state], + (rs->lost + rs->delivered) > 0 ? + (1000 * rs->lost / + (rs->lost + rs->delivered)) : 0, /* lr: loss rate x1000 */ + (rs->delivered) > 0 ? + (1000 * rs->delivered_ce / + (rs->delivered)) : 0, /* er: ECN rate x1000 */ + 1000 * bbr->ecn_alpha >> BBR_SCALE, /* ea: ECN alpha x1000 */ + bbr->bw_lo == ~0U ? + -1 : (s64)bbr_rate_kbps(sk, bbr->bw_lo), /* bwl */ + bbr->inflight_lo, /* il */ + bbr->inflight_hi, /* ih */ + bbr->bw_probe_up_cnt, /* c */ + 2, /* v: version */ + bbr->debug.event, + bbr->cycle_idx, + ack_phase[bbr->ack_phase], + bbr->bw_probe_samples ? "Y" : "N"); + debugmsg[sizeof(debugmsg) - 1] = 0; + + /* printk takes a higher precedence. */ + if (bbr_debug_with_printk) + printk(KERN_DEBUG "%s", debugmsg); + + if (unlikely(bbr->debug.undo)) + bbr->debug.undo = 0; +} + +/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) +{ + u64 rate = bw; + + rate = bbr_rate_bytes_per_sec(sk, rate, gain, + bbr_pacing_margin_percent); + rate = min_t(u64, rate, sk->sk_max_pacing_rate); + return rate; +} + +/* Initialize pacing rate to: high_gain * init_cwnd / RTT. 
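+ * For example (illustrative numbers): with init_cwnd = 10 segments of
+ * 1448 bytes, srtt = 10 ms, and high_gain ~= 2.89, the initial pacing
+ * rate is about 2.89 * 10 * 1448 B / 10 ms ~= 4.2 MBytes/sec.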
*/ +static void bbr_init_pacing_rate_from_rtt(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw; + u32 rtt_us; + + if (tp->srtt_us) { /* any RTT sample yet? */ + rtt_us = max(tp->srtt_us >> 3, 1U); + bbr->has_seen_rtt = 1; + } else { /* no RTT sample yet */ + rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ + } + bw = (u64)tp->snd_cwnd * BW_UNIT; + do_div(bw, rtt_us); + sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr->params.high_gain); +} + +/* Pace using current bw estimate and a gain factor. */ +static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain); + + if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) + bbr_init_pacing_rate_from_rtt(sk); + if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) + sk->sk_pacing_rate = rate; +} + +static u32 bbr_min_tso_segs(struct sock *sk) +{ + return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; +} + +/* Return the number of segments BBR would like in a TSO/GSO skb, given + * a particular max gso size as a constraint. + */ +static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, + u32 gso_max_size) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 segs, r; + u64 bytes; + + /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ + bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; + + /* Budget a TSO/GSO burst size allowance based on min_rtt. For every + * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. + * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) + */ + if (bbr->params.tso_rtt_shift) { + r = bbr->min_rtt_us >> bbr->params.tso_rtt_shift; + if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ + bytes += GSO_MAX_SIZE >> r; + } + + bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); + segs = max_t(u32, div_u64(bytes, mss_now), bbr_min_tso_segs(sk)); + return segs; +} + +/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ +static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) +{ + return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); +} + +/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ +static u32 bbr_tso_segs_goal(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); +} + +/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +static void bbr_save_cwnd(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) + bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ + else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ + bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); +} + +static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (event == CA_EVENT_TX_START && tp->app_limited) { + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; + /* Avoid pointless buffer overflows: pace at est. bw if we don't + * need more speed (we're restarting from idle and app-limited). 
+ */ + if (bbr->mode == BBR_PROBE_BW) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); + } else if ((event == CA_EVENT_ECN_IS_CE || + event == CA_EVENT_ECN_NO_CE) && + bbr_ecn_enable && + bbr->params.precise_ece_ack) { + u32 state = bbr->ce_state; + dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); + bbr->ce_state = state; + if (tp->fast_ack_mode == 2 && event == CA_EVENT_ECN_IS_CE) + tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); + } +} + +/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: + * + * bdp = ceil(bw * min_rtt * gain) + * + * The key factor, gain, controls the amount of queue. While a small gain + * builds a smaller queue, it becomes more vulnerable to noise in RTT + * measurements (e.g., delayed ACKs or other ACK compression effects). This + * noise may cause BBR to under-estimate the rate. + */ +static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 bdp; + u64 w; + + /* If we've never had a valid RTT sample, cap cwnd at the initial + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which + * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ + return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + + /* Apply a gain to the given value, remove the BW_SCALE shift, and + * round the value up to avoid a negative feedback loop. + */ + bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; + + return bdp; +} + +/* To achieve full performance in high-speed paths, we budget enough cwnd to + * fit full-sized skbs in-flight on both end hosts to fully utilize the path: + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine + * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because + * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ +static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 tso_segs_goal; + + tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + + /* Allow enough full-sized skbs in flight to utilize end systems. */ + if (bbr->params.cwnd_tso_budget == 1) { + cwnd = max_t(u32, cwnd, tso_segs_goal); + cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); + } else { + cwnd += tso_segs_goal; + cwnd = (cwnd + 1) & ~1U; + } + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ + if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +} + +/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */ +static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) +{ + u32 inflight; + + inflight = bbr_bdp(sk, bw, gain); + inflight = bbr_quantization_budget(sk, inflight); + + return inflight; +} + +/* With pacing at lower layers, there's often less data "in the network" than + * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq), + * we often have several skbs queued in the pacing layer with a pre-scheduled + * earliest departure time (EDT). 
BBR adapts its pacing rate based on the + * inflight level that it estimates has already been "baked in" by previous + * departure time decisions. We calculate a rough estimate of the number of our + * packets that might be in the network at the earliest departure time for the + * next skb scheduled: + * in_network_at_edt = inflight_at_edt - (EDT - now) * bw + * If we're increasing inflight, then we want to know if the transmit of the + * EDT skb will push inflight above the target, so inflight_at_edt includes + * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight, + * then estimate if inflight will sink too low just before the EDT transmit. + */ +static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 now_ns, edt_ns, interval_us; + u32 interval_delivered, inflight_at_edt; + + now_ns = tp->tcp_clock_cache; + edt_ns = max(tp->tcp_wstamp_ns, now_ns); + interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC); + interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE; + inflight_at_edt = inflight_now; + if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */ + inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */ + if (interval_delivered >= inflight_at_edt) + return 0; + return inflight_at_edt - interval_delivered; +} + +/* Find the cwnd increment based on estimate of ack aggregation */ +static u32 bbr_ack_aggregation_cwnd(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 max_aggr_cwnd, aggr_cwnd = 0; + + if (bbr->params.extra_acked_gain && + (bbr_full_bw_reached(sk) || bbr->params.extra_acked_in_startup)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; + aggr_cwnd = (bbr->params.extra_acked_gain * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } + + return aggr_cwnd; +} + +/* Returns the cwnd for PROBE_RTT mode. */ +static u32 bbr_probe_rtt_cwnd(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->params.probe_rtt_cwnd_gain == 0) + return bbr->params.cwnd_min_target; + return max_t(u32, bbr->params.cwnd_min_target, + bbr_bdp(sk, bbr_bw(sk), bbr->params.probe_rtt_cwnd_gain)); +} + +/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. + */ +static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + u32 acked, u32 bw, int gain, u32 cwnd, + struct bbr_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 target_cwnd = 0, prev_cwnd = tp->snd_cwnd, max_probe; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems + * due to aggregation (of data and/or ACKs) visible in the ACK stream. + */ + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + + /* If we're below target cwnd, slow start cwnd toward target cwnd. */ + bbr->debug.target_cwnd = target_cwnd; + + /* Update cwnd and enable fast path if cwnd reaches target_cwnd. 
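+	 * Note the asymmetry below: until the pipe is estimated to be
+	 * full, cwnd is only ever grown (while below target_cwnd or twice
+	 * the initial cwnd), never cut down to target_cwnd; only once
+	 * full_bw_reached is set do we also snap cwnd down to target_cwnd.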
*/ + bbr->try_fast_path = 0; + if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ + cwnd += acked; + if (cwnd >= target_cwnd) { + cwnd = target_cwnd; + bbr->try_fast_path = 1; + } + } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { + cwnd += acked; + } else { + bbr->try_fast_path = 1; + } + + /* When growing cwnd, don't grow beyond twice what we just probed. */ + if (bbr->params.usage_based_cwnd) { + max_probe = max(2 * tp->max_packets_out, tp->snd_cwnd); + cwnd = min(cwnd, max_probe); + } + + cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); +done: + tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ + tp->snd_cwnd = min_t(u32, tp->snd_cwnd, bbr_probe_rtt_cwnd(sk)); + + ctx->target_cwnd = target_cwnd; + ctx->log = (tp->snd_cwnd != prev_cwnd); +} + +/* See if we have reached next round trip */ +static void bbr_update_round_start(struct sock *sk, + const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->round_start = 0; + + /* See if we've reached the next RTT */ + if (rs->interval_us > 0 && + !before(rs->prior_delivered, bbr->next_rtt_delivered)) { + bbr->next_rtt_delivered = tp->delivered; + bbr->round_start = 1; + } +} + +/* Calculate the bandwidth based on how fast packets are delivered */ +static void bbr_calculate_bw_sample(struct sock *sk, + const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. + * Round up to allow growth at low rates, even with integer division. + */ + if (rs->interval_us > 0) { + if (WARN_ONCE(rs->delivered < 0, + "negative delivered: %d interval_us: %ld\n", + rs->delivered, rs->interval_us)) + return; + + bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } + + ctx->sample_bw = bw; + bbr->debug.rs_bw = bw; +} + +/* Estimates the windowed max degree of ack aggregation. + * This is used to provision extra in-flight data to keep sending during + * inter-ACK silences. + * + * Degree of ack aggregation is estimated as extra data acked beyond expected. + * + * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval" + * cwnd += max_extra_acked + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round + * trips for non-startup phase, and 1-2 round trips for startup. + */ +static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +{ + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 extra_acked_win_rtts_thresh = bbr->params.extra_acked_win_rtts; + + if (!bbr->params.extra_acked_gain || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); + if (bbr->params.extra_acked_in_startup && + !bbr_full_bw_reached(sk)) + extra_acked_win_rtts_thresh = 1; + if (bbr->extra_acked_win_rtts >= + extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? 
+ 0 : 1; + bbr->extra_acked[bbr->extra_acked_win_idx] = 0; + } + } + + /* Compute how many packets we expected to be delivered over epoch. */ + epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp, + bbr->ack_epoch_mstamp); + expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT; + + /* Reset the aggregation epoch if ACK rate is below expected rate or + * significantly large no. of ack received since epoch (potentially + * quite old epoch). + */ + if (bbr->ack_epoch_acked <= expected_acked || + (bbr->ack_epoch_acked + rs->acked_sacked >= + bbr_ack_epoch_acked_reset_thresh)) { + bbr->ack_epoch_acked = 0; + bbr->ack_epoch_mstamp = tp->delivered_mstamp; + expected_acked = 0; + } + + /* Compute excess data delivered, beyond what was expected. */ + bbr->ack_epoch_acked = min_t(u32, 0xFFFFF, + bbr->ack_epoch_acked + rs->acked_sacked); + extra_acked = bbr->ack_epoch_acked - expected_acked; + extra_acked = min(extra_acked, tp->snd_cwnd); + if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx]) + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; +} + +/* Estimate when the pipe is full, using the change in delivery rate: BBR + * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by + * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited + * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the + * higher rwin, 3: we get higher delivery rate samples. Or transient + * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar + * design goal, but uses delay and inter-ACK spacing instead of bandwidth. + */ +static void bbr_check_full_bw_reached(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 bw_thresh; + + if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) + return; + + bw_thresh = (u64)bbr->full_bw * bbr->params.full_bw_thresh >> BBR_SCALE; + if (bbr_max_bw(sk) >= bw_thresh) { + bbr->full_bw = bbr_max_bw(sk); + bbr->full_bw_cnt = 0; + return; + } + ++bbr->full_bw_cnt; + bbr->full_bw_reached = bbr->full_bw_cnt >= bbr->params.full_bw_cnt; +} + +/* If pipe is probably full, drain the queue and then enter steady-state. */ +static bool bbr_check_drain(struct sock *sk, const struct rate_sample *rs, + struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { + bbr->mode = BBR_DRAIN; /* drain queue we created */ + tcp_sk(sk)->snd_ssthresh = + bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); + bbr2_reset_congestion_signals(sk); + } /* fall through to check if in-flight is already small: */ + if (bbr->mode == BBR_DRAIN && + bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= + bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) + return true; /* exiting DRAIN now */ + return false; +} + +static void bbr_check_probe_rtt_done(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (!(bbr->probe_rtt_done_stamp && + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + + bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); + bbr2_exit_probe_rtt(sk); +} + +/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and + * periodically drain the bottleneck queue, to converge to measure the true + * min_rtt (unloaded propagation delay). 
This allows the flows to keep queues + * small (reducing queuing delay and packet loss) and achieve fairness among + * BBR flows. + * + * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, + * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. + * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed + * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and + * re-enter the previous mode. BBR uses 200ms to approximately bound the + * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). + * + * Note that flows need only pay 2% if they are busy sending over the last 10 + * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have + * natural silences or low-rate periods within 10 seconds where the rate is low + * enough for long enough to drain its queue in the bottleneck. We pick up + * these min RTT measurements opportunistically with our min_rtt filter. :-) + */ +static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + bool probe_rtt_expired, min_rtt_expired; + u32 expire; + + /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ + expire = bbr->probe_rtt_min_stamp + + msecs_to_jiffies(bbr->params.probe_rtt_win_ms); + probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && + (rs->rtt_us <= bbr->probe_rtt_min_us || + (probe_rtt_expired && !rs->is_ack_delayed))) { + bbr->probe_rtt_min_us = rs->rtt_us; + bbr->probe_rtt_min_stamp = tcp_jiffies32; + } + /* Track min RTT seen in the min_rtt_win_sec filter window: */ + expire = bbr->min_rtt_stamp + bbr->params.min_rtt_win_sec * HZ; + min_rtt_expired = after(tcp_jiffies32, expire); + if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || + min_rtt_expired) { + bbr->min_rtt_us = bbr->probe_rtt_min_us; + bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + + if (bbr->params.probe_rtt_mode_ms > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; + bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; + bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { + /* Ignore low rate samples during this mode. */ + tp->app_limited = + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
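+		 * Concretely: once packets in flight have fallen to
+		 * bbr_probe_rtt_cwnd(sk), arm a probe_rtt_mode_ms (200 ms)
+		 * timer, and additionally require one full packet-timed
+		 * round at that reduced flight size before
+		 * bbr_check_probe_rtt_done() may end PROBE_RTT.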
*/ + if (!bbr->probe_rtt_done_stamp && + tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + + msecs_to_jiffies(bbr->params.probe_rtt_mode_ms); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { + if (bbr->round_start) + bbr->probe_rtt_round_done = 1; + if (bbr->probe_rtt_round_done) + bbr_check_probe_rtt_done(sk); + } + } + /* Restart after idle ends only once we process a new S/ACK for data */ + if (rs->delivered > 0) + bbr->idle_restart = 0; +} + +static void bbr_update_gains(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + switch (bbr->mode) { + case BBR_STARTUP: + bbr->pacing_gain = bbr->params.high_gain; + bbr->cwnd_gain = bbr->params.startup_cwnd_gain; + break; + case BBR_DRAIN: + bbr->pacing_gain = bbr->params.drain_gain; /* slow, to drain */ + bbr->cwnd_gain = bbr->params.startup_cwnd_gain; /* keep cwnd */ + break; + case BBR_PROBE_BW: + bbr->pacing_gain = bbr->params.pacing_gain[bbr->cycle_idx]; + bbr->cwnd_gain = bbr->params.cwnd_gain; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; + bbr->cwnd_gain = BBR_UNIT; + break; + default: + WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode); + break; + } +} + +static void bbr_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + int i; + + WARN_ON_ONCE(tp->snd_cwnd >= bbr_cwnd_warn_val); + + bbr->initialized = 1; + bbr->params.high_gain = min(0x7FF, bbr_high_gain); + bbr->params.drain_gain = min(0x3FF, bbr_drain_gain); + bbr->params.startup_cwnd_gain = min(0x7FF, bbr_startup_cwnd_gain); + bbr->params.cwnd_gain = min(0x7FF, bbr_cwnd_gain); + bbr->params.cwnd_tso_budget = min(0x1U, bbr_cwnd_tso_budget); + bbr->params.cwnd_min_target = min(0xFU, bbr_cwnd_min_target); + bbr->params.min_rtt_win_sec = min(0x1FU, bbr_min_rtt_win_sec); + bbr->params.probe_rtt_mode_ms = min(0x1FFU, bbr_probe_rtt_mode_ms); + bbr->params.full_bw_cnt = min(0x7U, bbr_full_bw_cnt); + bbr->params.full_bw_thresh = min(0x3FFU, bbr_full_bw_thresh); + bbr->params.extra_acked_gain = min(0x7FF, bbr_extra_acked_gain); + bbr->params.extra_acked_win_rtts = min(0x1FU, bbr_extra_acked_win_rtts); + bbr->params.drain_to_target = bbr_drain_to_target ? 1 : 0; + bbr->params.precise_ece_ack = bbr_precise_ece_ack ? 1 : 0; + bbr->params.extra_acked_in_startup = bbr_extra_acked_in_startup ? 1 : 0; + bbr->params.probe_rtt_cwnd_gain = min(0xFFU, bbr_probe_rtt_cwnd_gain); + bbr->params.probe_rtt_win_ms = + min(0x3FFFU, + min_t(u32, bbr_probe_rtt_win_ms, + bbr->params.min_rtt_win_sec * MSEC_PER_SEC)); + for (i = 0; i < CYCLE_LEN; i++) + bbr->params.pacing_gain[i] = min(0x3FF, bbr_pacing_gain[i]); + bbr->params.usage_based_cwnd = bbr_usage_based_cwnd ? 
1 : 0; + bbr->params.tso_rtt_shift = min(0xFU, bbr_tso_rtt_shift); + + bbr->debug.snd_isn = tp->snd_una; + bbr->debug.target_cwnd = 0; + bbr->debug.undo = 0; + + bbr->init_cwnd = min(0x7FU, tp->snd_cwnd); + bbr->prior_cwnd = tp->prior_cwnd; + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + bbr->next_rtt_delivered = 0; + bbr->prev_ca_state = TCP_CA_Open; + bbr->packet_conservation = 0; + + bbr->probe_rtt_done_stamp = 0; + bbr->probe_rtt_round_done = 0; + bbr->probe_rtt_min_us = tcp_min_rtt(tp); + bbr->probe_rtt_min_stamp = tcp_jiffies32; + bbr->min_rtt_us = tcp_min_rtt(tp); + bbr->min_rtt_stamp = tcp_jiffies32; + + bbr->has_seen_rtt = 0; + bbr_init_pacing_rate_from_rtt(sk); + + bbr->round_start = 0; + bbr->idle_restart = 0; + bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; + bbr->cycle_mstamp = 0; + bbr->cycle_idx = 0; + bbr->mode = BBR_STARTUP; + bbr->debug.rs_bw = 0; + + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = 0; + bbr->extra_acked[0] = 0; + bbr->extra_acked[1] = 0; + + bbr->ce_state = 0; + bbr->prior_rcv_nxt = tp->rcv_nxt; + bbr->try_fast_path = 0; + + cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); +} + +static u32 bbr_sndbuf_expand(struct sock *sk) +{ + /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ + return 3; +} + +/* __________________________________________________________________________ + * + * Functions new to BBR v2 ("bbr") congestion control are below here. + * __________________________________________________________________________ + */ + +/* Incorporate a new bw sample into the current window of our max filter. */ +static void bbr2_take_bw_hi_sample(struct sock *sk, u32 bw) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); +} + +/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ +static void bbr2_advance_bw_hi_filter(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (!bbr->bw_hi[1]) + return; /* no samples in this window; remember old window */ + bbr->bw_hi[0] = bbr->bw_hi[1]; + bbr->bw_hi[1] = 0; +} + +/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ +static u32 bbr2_target_inflight(struct sock *sk) +{ + u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + + return min(bdp, tcp_sk(sk)->snd_cwnd); +} + +static bool bbr2_is_probing_bandwidth(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return (bbr->mode == BBR_STARTUP) || + (bbr->mode == BBR_PROBE_BW && + (bbr->cycle_idx == BBR_BW_PROBE_REFILL || + bbr->cycle_idx == BBR_BW_PROBE_UP)); +} + +/* Has the given amount of time elapsed since we marked the phase start? */ +static bool bbr2_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct bbr *bbr = inet_csk_ca(sk); + + return tcp_stamp_us_delta(tp->tcp_mstamp, + bbr->cycle_mstamp + interval_us) > 0; +} + +static void bbr2_handle_queue_too_high_in_startup(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->full_bw_reached = 1; + bbr->inflight_hi = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +} + +/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. 
+ */
+static void bbr2_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible ||
+	    !bbr->params.full_ecn_cnt || !bbr->params.ecn_thresh)
+		return;
+
+	if (ce_ratio >= bbr->params.ecn_thresh)
+		bbr->startup_ecn_rounds++;
+	else
+		bbr->startup_ecn_rounds = 0;
+
+	if (bbr->startup_ecn_rounds >= bbr->params.full_ecn_cnt) {
+		bbr->debug.event = 'E';  /* ECN caused STARTUP exit */
+		bbr2_handle_queue_too_high_in_startup(sk);
+		return;
+	}
+}
+
+static void bbr2_update_ecn_alpha(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	s32 delivered, delivered_ce;
+	u64 alpha, ce_ratio;
+	u32 gain;
+
+	if (bbr->params.ecn_factor == 0)
+		return;
+
+	delivered = tp->delivered - bbr->alpha_last_delivered;
+	delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce;
+
+	if (delivered == 0 ||		/* avoid divide by zero */
+	    WARN_ON_ONCE(delivered < 0 || delivered_ce < 0))  /* backwards? */
+		return;
+
+	/* See if we should use ECN sender logic for this connection. */
+	if (!bbr->ecn_eligible && bbr_ecn_enable &&
+	    (bbr->min_rtt_us <= bbr->params.ecn_max_rtt_us ||
+	     !bbr->params.ecn_max_rtt_us))
+		bbr->ecn_eligible = 1;
+
+	ce_ratio = (u64)delivered_ce << BBR_SCALE;
+	do_div(ce_ratio, delivered);
+	gain = bbr->params.ecn_alpha_gain;
+	alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE;
+	alpha += (gain * ce_ratio) >> BBR_SCALE;
+	bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT);
+
+	bbr->alpha_last_delivered = tp->delivered;
+	bbr->alpha_last_delivered_ce = tp->delivered_ce;
+
+	bbr2_check_ecn_too_high_in_startup(sk, ce_ratio);
+}
+
+/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */
+static void bbr2_raise_inflight_hi_slope(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 growth_this_round, cnt;
+
+	/* Calculate "slope": packets S/Acked per inflight_hi increment. */
+	growth_this_round = 1 << bbr->bw_probe_up_rounds;
+	bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30);
+	cnt = tp->snd_cwnd / growth_this_round;
+	cnt = max(cnt, 1U);
+	bbr->bw_probe_up_cnt = cnt;
+	bbr->debug.event = 'G';  /* Grow inflight_hi slope */
+}
+
+/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */
+static void bbr2_probe_inflight_hi_upward(struct sock *sk,
+					  const struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 delta;
+
+	if (!tp->is_cwnd_limited || tp->snd_cwnd < bbr->inflight_hi) {
+		bbr->bw_probe_up_acks = 0;  /* don't accumulate unused credits */
+		return;  /* not fully using inflight_hi, so don't grow it */
+	}
+
+	/* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */
+	bbr->bw_probe_up_acks += rs->acked_sacked;
+	if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) {
+		delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt;
+		bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt;
+		bbr->inflight_hi += delta;
+		bbr->debug.event = 'I';  /* Increment inflight_hi */
+	}
+
+	if (bbr->round_start)
+		bbr2_raise_inflight_hi_slope(sk);
+}
+
+/* Does loss/ECN rate for this sample say inflight is "too high"?
+ * This is used by both the bbr2_check_loss_too_high_in_startup() function,
+ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which
+ * uses it to notice when loss/ECN rates suggest inflight is too high.
+ */
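
To make the fixed-point checks in bbr2_is_inflight_too_high() below concrete, here is a standalone arithmetic sketch (not part of the patch) with invented sample numbers; BBR_SCALE/BBR_UNIT mirror the kernel convention, and the 2% loss and 50% CE thresholds are the defaults declared near the end of this file:

/* Standalone sketch of the bbr2_is_inflight_too_high() threshold math.
 * The sample values are invented for illustration only.
 */
#include <stdint.h>
#include <stdio.h>

#define BBR_SCALE 8
#define BBR_UNIT (1u << BBR_SCALE)

int main(void)
{
	uint32_t loss_thresh = BBR_UNIT * 2 / 100;	/* 2% -> 5/256 */
	uint32_t ecn_thresh = BBR_UNIT * 1 / 2;		/* 50% -> 128/256 */
	uint32_t tx_in_flight = 100, lost = 3;		/* invented sample */
	uint32_t delivered = 40, delivered_ce = 25;	/* invented sample */

	/* Loss check: is lost > tx_in_flight * loss_thresh >> BBR_SCALE? */
	uint32_t loss_budget = (uint64_t)tx_in_flight * loss_thresh >> BBR_SCALE;
	printf("loss budget for %u in flight: %u; too high: %s\n",
	       tx_in_flight, loss_budget, lost > loss_budget ? "yes" : "no");

	/* ECN check: is delivered_ce >= delivered * ecn_thresh >> BBR_SCALE? */
	uint32_t ce_budget = (uint64_t)delivered * ecn_thresh >> BBR_SCALE;
	printf("CE budget for %u delivered: %u; too high: %s\n",
	       delivered, ce_budget, delivered_ce >= ce_budget ? "yes" : "no");
	return 0;
}
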
+static bool bbr2_is_inflight_too_high(const struct sock *sk,
+				      const struct rate_sample *rs)
+{
+	const struct bbr *bbr = inet_csk_ca(sk);
+	u32 loss_thresh, ecn_thresh;
+
+	if (rs->lost > 0 && rs->tx_in_flight) {
+		loss_thresh = (u64)rs->tx_in_flight * bbr->params.loss_thresh >>
+				BBR_SCALE;
+		if (rs->lost > loss_thresh)
+			return true;
+	}
+
+	if (rs->delivered_ce > 0 && rs->delivered > 0 &&
+	    bbr->ecn_eligible && bbr->params.ecn_thresh) {
+		ecn_thresh = (u64)rs->delivered * bbr->params.ecn_thresh >>
+				BBR_SCALE;
+		if (rs->delivered_ce >= ecn_thresh)
+			return true;
+	}
+
+	return false;
+}
+
+/* Calculate the tx_in_flight level that corresponded to excessive loss.
+ * We find "lost_prefix" segs of the skb where loss rate went too high,
+ * by solving for "lost_prefix" in the following equation:
+ *    lost                     /  inflight                     >= loss_thresh
+ *   (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh
+ * Then we take that equation, convert it to fixed point, and
+ * round up to the nearest packet.
+ */
+static u32 bbr2_inflight_hi_from_lost_skb(const struct sock *sk,
+					  const struct rate_sample *rs,
+					  const struct sk_buff *skb)
+{
+	const struct bbr *bbr = inet_csk_ca(sk);
+	u32 loss_thresh = bbr->params.loss_thresh;
+	u32 pcount, divisor, inflight_hi;
+	s32 inflight_prev, lost_prev;
+	u64 loss_budget, lost_prefix;
+
+	pcount = tcp_skb_pcount(skb);
+
+	/* How much data was in flight before this skb? */
+	inflight_prev = rs->tx_in_flight - pcount;
+	if (WARN_ONCE(inflight_prev < 0,
+		      "tx_in_flight: %u pcount: %u reneg: %u",
+		      rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg))
+		return ~0U;
+
+	/* How much inflight data was marked lost before this skb? */
+	lost_prev = rs->lost - pcount;
+	if (WARN_ON_ONCE(lost_prev < 0))
+		return ~0U;
+
+	/* At what prefix of this lost skb did loss rate exceed loss_thresh? */
+	loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1;
+	loss_budget >>= BBR_SCALE;
+	if (lost_prev >= loss_budget) {
+		lost_prefix = 0;   /* previous losses crossed loss_thresh */
+	} else {
+		lost_prefix = loss_budget - lost_prev;
+		lost_prefix <<= BBR_SCALE;
+		divisor = BBR_UNIT - loss_thresh;
+		if (WARN_ON_ONCE(!divisor))  /* loss_thresh is 8 bits */
+			return ~0U;
+		do_div(lost_prefix, divisor);
+	}
+
+	inflight_hi = inflight_prev + lost_prefix;
+	return inflight_hi;
+}
+
+/* If loss/ECN rates during probing indicated we may have overfilled a
+ * buffer, return an operating point that tries to leave unutilized headroom in
+ * the path for other flows, for fairness convergence and lower RTTs and loss.
+ */
+static u32 bbr2_inflight_with_headroom(const struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 headroom, headroom_fraction;
+
+	if (bbr->inflight_hi == ~0U)
+		return ~0U;
+
+	headroom_fraction = bbr->params.inflight_headroom;
+	headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE;
+	headroom = max(headroom, 1U);
+	return max_t(s32, bbr->inflight_hi - headroom,
+		     bbr->params.cwnd_min_target);
+}
+
+/* Bound cwnd to a sensible level, based on our current probing state
+ * machine phase and model of a good inflight level (inflight_lo, inflight_hi).
+ */
+static void bbr2_bound_cwnd_for_inflight_model(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 cap;
+
+	/* tcp_rcv_synsent_state_process() currently calls tcp_ack()
+	 * and thus cong_control() without first initializing us(!).
+ */ + if (!bbr->initialized) + return; + + cap = ~0U; + if (bbr->mode == BBR_PROBE_BW && + bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { + /* Probe to see if more packets fit in the path. */ + cap = bbr->inflight_hi; + } else { + if (bbr->mode == BBR_PROBE_RTT || + (bbr->mode == BBR_PROBE_BW && + bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) + cap = bbr2_inflight_with_headroom(sk); + } + /* Adapt to any loss/ECN since our last bw probe. */ + cap = min(cap, bbr->inflight_lo); + + cap = max_t(u32, cap, bbr->params.cwnd_min_target); + tp->snd_cwnd = min(cap, tp->snd_cwnd); +} + +/* Estimate a short-term lower bound on the capacity available now, based + * on measurements of the current delivery process and recent history. When we + * are seeing loss/ECN at times when we are not probing bw, then conservatively + * move toward flow balance by multiplicatively cutting our short-term + * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a + * multiplicative decrease in order to converge to a lower capacity in time + * logarithmic in the magnitude of the decrease. + * + * However, we do not cut our short-term estimates lower than the current rate + * and volume of delivered data from this round trip, since from the current + * delivery process we can estimate the measured capacity available now. + * + * Anything faster than that approach would knowingly risk high loss, which can + * cause low bw for Reno/CUBIC and high loss recovery latency for + * request/response flows using any congestion control. + */ +static void bbr2_adapt_lower_bounds(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 ecn_cut, ecn_inflight_lo, beta; + + /* We only use lower-bound estimates when not probing bw. + * When probing we need to push inflight higher to probe bw. + */ + if (bbr2_is_probing_bandwidth(sk)) + return; + + /* ECN response. */ + if (bbr->ecn_in_round && bbr->ecn_eligible && bbr->params.ecn_factor) { + /* Reduce inflight to (1 - alpha*ecn_factor). */ + ecn_cut = (BBR_UNIT - + ((bbr->ecn_alpha * bbr->params.ecn_factor) >> + BBR_SCALE)); + if (bbr->inflight_lo == ~0U) + bbr->inflight_lo = tp->snd_cwnd; + ecn_inflight_lo = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; + } else { + ecn_inflight_lo = ~0U; + } + + /* Loss response. */ + if (bbr->loss_in_round) { + /* Reduce bw and inflight to (1 - beta). */ + if (bbr->bw_lo == ~0U) + bbr->bw_lo = bbr_max_bw(sk); + if (bbr->inflight_lo == ~0U) + bbr->inflight_lo = tp->snd_cwnd; + beta = bbr->params.beta; + bbr->bw_lo = + max_t(u32, bbr->bw_latest, + (u64)bbr->bw_lo * + (BBR_UNIT - beta) >> BBR_SCALE); + bbr->inflight_lo = + max_t(u32, bbr->inflight_latest, + (u64)bbr->inflight_lo * + (BBR_UNIT - beta) >> BBR_SCALE); + } + + /* Adjust to the lower of the levels implied by loss or ECN. */ + bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); +} + +/* Reset any short-term lower-bound adaptation to congestion, so that we can + * push our inflight up. + */ +static void bbr2_reset_lower_bounds(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->bw_lo = ~0U; + bbr->inflight_lo = ~0U; +} + +/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state + * machine phase where we adapt our lower bound based on congestion signals. 
+ */ +static void bbr2_reset_congestion_signals(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->loss_in_round = 0; + bbr->ecn_in_round = 0; + bbr->loss_in_cycle = 0; + bbr->ecn_in_cycle = 0; + bbr->bw_latest = 0; + bbr->inflight_latest = 0; +} + +/* Update (most of) our congestion signals: track the recent rate and volume of + * delivered data, presence of loss, and EWMA degree of ECN marking. + */ +static void bbr2_update_congestion_signals( + struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw; + + bbr->loss_round_start = 0; + if (rs->interval_us <= 0 || !rs->acked_sacked) + return; /* Not a valid observation */ + bw = ctx->sample_bw; + + if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) + bbr2_take_bw_hi_sample(sk, bw); + + bbr->loss_in_round |= (rs->losses > 0); + + /* Update rate and volume of delivered data from latest round trip: */ + bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); + bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); + + if (before(rs->prior_delivered, bbr->loss_round_delivered)) + return; /* skip the per-round-trip updates */ + /* Now do per-round-trip updates. */ + bbr->loss_round_delivered = tp->delivered; /* mark round trip */ + bbr->loss_round_start = 1; + bbr2_adapt_lower_bounds(sk); + + /* Update windowed "latest" (single-round-trip) filters. */ + bbr->loss_in_round = 0; + bbr->ecn_in_round = 0; + bbr->bw_latest = ctx->sample_bw; + bbr->inflight_latest = rs->delivered; +} + +/* Bandwidth probing can cause loss. To help coexistence with loss-based + * congestion control we spread out our probing in a Reno-conscious way. Due to + * the shape of the Reno sawtooth, the time required between loss epochs for an + * idealized Reno flow is a number of round trips that is the BDP of that + * flow. We count packet-timed round trips directly, since measured RTT can + * vary widely, and Reno is driven by packet-timed round trips. + */ +static bool bbr2_is_reno_coexistence_probe_time(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 inflight, rounds, reno_gain, reno_rounds; + + /* Random loss can shave some small percentage off of our inflight + * in each round. To survive this, flows need robust periodic probes. + */ + rounds = bbr->params.bw_probe_max_rounds; + + reno_gain = bbr->params.bw_probe_reno_gain; + if (reno_gain) { + inflight = bbr2_target_inflight(sk); + reno_rounds = ((u64)inflight * reno_gain) >> BBR_SCALE; + rounds = min(rounds, reno_rounds); + } + return bbr->rounds_since_probe >= rounds; +} + +/* How long do we want to wait before probing for bandwidth (and risking + * loss)? We randomize the wait, for better mixing and fairness convergence. + * + * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. + * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, + * (eg 4K video to a broadband user): + * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets + * + * We bound the BBR-native inter-bw-probe wall clock time to be: + * (a) higher than 2 sec: to try to avoid causing loss for a long enough time + * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must + * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs + * (b) lower than 3 sec: to ensure flows can start probing in a reasonable + * amount of time to discover unutilized bw on human-scale interactive + * time-scales (e.g. 
perhaps traffic from a web page download that we + * were competing with is now complete). + */ +static void bbr2_pick_probe_wait(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + /* Decide the random round-trip bound for wait until probe: */ + bbr->rounds_since_probe = + get_random_u32_below(bbr->params.bw_probe_rand_rounds); + /* Decide the random wall clock bound for wait until probe: */ + bbr->probe_wait_us = bbr->params.bw_probe_base_us + + get_random_u32_below(bbr->params.bw_probe_rand_us); +} + +static void bbr2_set_cycle_idx(struct sock *sk, int cycle_idx) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->cycle_idx = cycle_idx; + /* New phase, so need to update cwnd and pacing rate. */ + bbr->try_fast_path = 0; +} + +/* Send at estimated bw to fill the pipe, but not queue. We need this phase + * before PROBE_UP, because as soon as we send faster than the available bw + * we will start building a queue, and if the buffer is shallow we can cause + * loss. If we do not fill the pipe before we cause this loss, our bw_hi and + * inflight_hi estimates will underestimate. + */ +static void bbr2_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr2_reset_lower_bounds(sk); + if (bbr->inflight_hi != ~0U) + bbr->inflight_hi += bbr->params.refill_add_inc; + bbr->bw_probe_up_rounds = bw_probe_up_rounds; + bbr->bw_probe_up_acks = 0; + bbr->stopped_risky_probe = 0; + bbr->ack_phase = BBR_ACKS_REFILLING; + bbr->next_rtt_delivered = tp->delivered; + bbr2_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); +} + +/* Now probe max deliverable data rate and volume. */ +static void bbr2_start_bw_probe_up(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->ack_phase = BBR_ACKS_PROBE_STARTING; + bbr->next_rtt_delivered = tp->delivered; + bbr->cycle_mstamp = tp->tcp_mstamp; + bbr2_set_cycle_idx(sk, BBR_BW_PROBE_UP); + bbr2_raise_inflight_hi_slope(sk); +} + +/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall + * clock time at which to probe beyond an inflight that we think to be + * safe. This will knowingly risk packet loss, so we want to do this rarely, to + * keep packet loss rates low. Also start a round-trip counter, to probe faster + * if we estimate a Reno flow at our BDP would probe faster. + */ +static void bbr2_start_bw_probe_down(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr2_reset_congestion_signals(sk); + bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ + bbr2_pick_probe_wait(sk); + bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ + bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; + bbr->next_rtt_delivered = tp->delivered; + bbr2_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); +} + +/* Cruise: maintain what we estimate to be a neutral, conservative + * operating point, without attempting to probe up for bandwidth or down for + * RTT, and only reducing inflight in response to loss/ECN signals. + */ +static void bbr2_start_bw_probe_cruise(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->inflight_lo != ~0U) + bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); + + bbr2_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); +} + +/* Loss and/or ECN rate is too high while probing. + * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. 
+ */ +static void bbr2_handle_inflight_too_high(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + const u32 beta = bbr->params.beta; + + bbr->prev_probe_too_high = 1; + bbr->bw_probe_samples = 0; /* only react once per probe */ + bbr->debug.event = 'L'; /* Loss/ECN too high */ + /* If we are app-limited then we are not robustly + * probing the max volume of inflight data we think + * might be safe (analogous to how app-limited bw + * samples are not known to be robustly probing bw). + */ + if (!rs->is_app_limited) + bbr->inflight_hi = max_t(u32, rs->tx_in_flight, + (u64)bbr2_target_inflight(sk) * + (BBR_UNIT - beta) >> BBR_SCALE); + if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + bbr2_start_bw_probe_down(sk); +} + +/* If we're seeing bw and loss samples reflecting our bw probing, adapt + * using the signals we see. If loss or ECN mark rate gets too high, then adapt + * inflight_hi downward. If we're able to push inflight higher without such + * signals, push higher: adapt inflight_hi upward. + */ +static bool bbr2_adapt_upper_bounds(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + + /* Track when we'll see bw/loss samples resulting from our bw probes. */ + if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) + bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; + if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { + /* End of samples from bw probing phase. */ + bbr->bw_probe_samples = 0; + bbr->ack_phase = BBR_ACKS_INIT; + /* At this point in the cycle, our current bw sample is also + * our best recent chance at finding the highest available bw + * for this flow. So now is the best time to forget the bw + * samples from the previous cycle, by advancing the window. + */ + if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) + bbr2_advance_bw_hi_filter(sk); + /* If we had an inflight_hi, then probed and pushed inflight all + * the way up to hit that inflight_hi without seeing any + * high loss/ECN in all the resulting ACKs from that probing, + * then probe up again, this time letting inflight persist at + * inflight_hi for a round trip, then accelerating beyond. + */ + if (bbr->mode == BBR_PROBE_BW && + bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { + bbr->debug.event = 'R'; /* reprobe */ + bbr2_start_bw_probe_refill(sk, 0); + return true; /* yes, decided state transition */ + } + } + + if (bbr2_is_inflight_too_high(sk, rs)) { + if (bbr->bw_probe_samples) /* sample is from bw probing? */ + bbr2_handle_inflight_too_high(sk, rs); + } else { + /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ + if (bbr->inflight_hi == ~0U) /* no excess queue signals yet? */ + return false; + + /* To be resilient to random loss, we must raise inflight_hi + * if we observe in any phase that a higher level is safe. + */ + if (rs->tx_in_flight > bbr->inflight_hi) { + bbr->inflight_hi = rs->tx_in_flight; + bbr->debug.event = 'U'; /* raise up inflight_hi */ + } + + if (bbr->mode == BBR_PROBE_BW && + bbr->cycle_idx == BBR_BW_PROBE_UP) + bbr2_probe_inflight_hi_upward(sk, rs); + } + + return false; +} + +/* Check if it's time to probe for bandwidth now, and if so, kick it off. 
*/ +static bool bbr2_check_time_to_probe_bw(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 n; + + /* If we seem to be at an operating point where we are not seeing loss + * but we are seeing ECN marks, then when the ECN marks cease we reprobe + * quickly (in case a burst of cross-traffic has ceased and freed up bw, + * or in case we are sharing with multiplicatively probing traffic). + */ + if (bbr->params.ecn_reprobe_gain && bbr->ecn_eligible && + bbr->ecn_in_cycle && !bbr->loss_in_cycle && + inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { + bbr->debug.event = 'A'; /* *A*ll clear to probe *A*gain */ + /* Calculate n so that when bbr2_raise_inflight_hi_slope() + * computes growth_this_round as 2^n it will be roughly the + * desired volume of data (inflight_hi*ecn_reprobe_gain). + */ + n = ilog2((((u64)bbr->inflight_hi * + bbr->params.ecn_reprobe_gain) >> BBR_SCALE)); + bbr2_start_bw_probe_refill(sk, n); + return true; + } + + if (bbr2_has_elapsed_in_phase(sk, bbr->probe_wait_us) || + bbr2_is_reno_coexistence_probe_time(sk)) { + bbr2_start_bw_probe_refill(sk, 0); + return true; + } + return false; +} + +/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ +static bool bbr2_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) +{ + struct bbr *bbr = inet_csk_ca(sk); + bool is_under_bdp, is_long_enough; + + /* Always need to pull inflight down to leave headroom in queue. */ + if (inflight > bbr2_inflight_with_headroom(sk)) + return false; + + is_under_bdp = inflight <= bbr_inflight(sk, bw, BBR_UNIT); + if (bbr->params.drain_to_target) + return is_under_bdp; + + is_long_enough = bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us); + return is_under_bdp || is_long_enough; +} + +/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ +static void bbr2_update_cycle_phase(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + bool is_risky = false, is_queuing = false; + u32 inflight, bw; + + if (!bbr_full_bw_reached(sk)) + return; + + /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ + if (bbr2_adapt_upper_bounds(sk, rs)) + return; /* already decided state transition */ + + if (bbr->mode != BBR_PROBE_BW) + return; + + inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); + bw = bbr_max_bw(sk); + + switch (bbr->cycle_idx) { + /* First we spend most of our time cruising with a pacing_gain of 1.0, + * which paces at the estimated bw, to try to fully use the pipe + * without building queue. If we encounter loss/ECN marks, we adapt + * by slowing down. + */ + case BBR_BW_PROBE_CRUISE: + if (bbr2_check_time_to_probe_bw(sk)) + return; /* already decided state transition */ + break; + + /* After cruising, when it's time to probe, we first "refill": we send + * at the estimated bw to fill the pipe, before probing higher and + * knowingly risking overflowing the bottleneck buffer (causing loss). + */ + case BBR_BW_PROBE_REFILL: + if (bbr->round_start) { + /* After one full round trip of sending in REFILL, we + * start to see bw samples reflecting our REFILL, which + * may be putting too much data in flight. + */ + bbr->bw_probe_samples = 1; + bbr2_start_bw_probe_up(sk); + } + break; + + /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to + * probe for bw. If we have not seen loss/ECN, we try to raise inflight + * to at least pacing_gain*BDP; note that this may take more than + * min_rtt if min_rtt is small (e.g. on a LAN). 
+	 *
+	 * We terminate PROBE_UP bandwidth probing upon any of the following:
+	 *
+	 * (1) We've pushed inflight up to hit the inflight_hi target set in the
+	 *     most recent previous bw probe phase. Thus we want to start
+	 *     draining the queue immediately because it's very likely the most
+	 *     recently sent packets will fill the queue and cause drops.
+	 *     (checked here)
+	 * (2) We have probed for at least 1*min_rtt_us, and the
+	 *     estimated queue is high enough (inflight > 1.25 * estimated_bdp).
+	 *     (checked here)
+	 * (3) Loss filter says loss rate is "too high".
+	 *     (checked in bbr2_is_inflight_too_high())
+	 * (4) ECN filter says ECN mark rate is "too high".
+	 *     (checked in bbr2_is_inflight_too_high())
+	 */
+	case BBR_BW_PROBE_UP:
+		if (bbr->prev_probe_too_high &&
+		    inflight >= bbr->inflight_hi) {
+			bbr->stopped_risky_probe = 1;
+			is_risky = true;
+			bbr->debug.event = 'D';   /* D for danger */
+		} else if (bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us) &&
+			   inflight >=
+			   bbr_inflight(sk, bw,
+					bbr->params.bw_probe_pif_gain)) {
+			is_queuing = true;
+			bbr->debug.event = 'Q'; /* building Queue */
+		}
+		if (is_risky || is_queuing) {
+			bbr->prev_probe_too_high = 0;  /* no loss/ECN (yet) */
+			bbr2_start_bw_probe_down(sk);  /* restart w/ down */
+		}
+		break;
+
+	/* After probing in PROBE_UP, we have usually accumulated some data in
+	 * the bottleneck buffer (if bw probing didn't find more bw). We next
+	 * enter PROBE_DOWN to try to drain any excess data from the queue. To
+	 * do this, we use a pacing_gain < 1.0. We hold this pacing gain until
+	 * our inflight is less than the target cruising point, which is the
+	 * minimum of (a) the amount needed to leave headroom, and (b) the
+	 * estimated BDP. Once inflight falls to match the target, we estimate
+	 * the queue is drained; persisting would underutilize the pipe.
+	 */
+	case BBR_BW_PROBE_DOWN:
+		if (bbr2_check_time_to_probe_bw(sk))
+			return;		/* already decided state transition */
+		if (bbr2_check_time_to_cruise(sk, inflight, bw))
+			bbr2_start_bw_probe_cruise(sk);
+		break;
+
+	default:
+		WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx);
+	}
+}
+
+/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */
+static void bbr2_exit_probe_rtt(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr2_reset_lower_bounds(sk);
+	if (bbr_full_bw_reached(sk)) {
+		bbr->mode = BBR_PROBE_BW;
+		/* Raising inflight after PROBE_RTT may cause loss, so reset
+		 * the PROBE_BW clock and schedule the next bandwidth probe for
+		 * a friendly and randomized future point in time.
+		 */
+		bbr2_start_bw_probe_down(sk);
+		/* Since we are exiting PROBE_RTT, we know inflight is
+		 * below our estimated BDP, so it is reasonable to cruise.
+		 */
+		bbr2_start_bw_probe_cruise(sk);
+	} else {
+		bbr->mode = BBR_STARTUP;
+	}
+}
+
+/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until
+ * the end of the round in recovery to get a good estimate of how many packets
+ * have been lost, and how many we need to drain with a low pacing rate.
+ */
+static void bbr2_check_loss_too_high_in_startup(struct sock *sk,
+						const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr_full_bw_reached(sk))
+		return;
+
+	/* For STARTUP exit, check the loss rate at the end of each round trip
+	 * of Recovery episodes in STARTUP. We check the loss rate at the end
+	 * of the round trip to filter out noisy/low loss and have a better
+	 * sense of inflight (extent of loss), so we can drain more accurately.
+ */ + if (rs->losses && bbr->loss_events_in_round < 0xf) + bbr->loss_events_in_round++; /* update saturating counter */ + if (bbr->params.full_loss_cnt && bbr->loss_round_start && + inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && + bbr->loss_events_in_round >= bbr->params.full_loss_cnt && + bbr2_is_inflight_too_high(sk, rs)) { + bbr->debug.event = 'P'; /* Packet loss caused STARTUP exit */ + bbr2_handle_queue_too_high_in_startup(sk); + return; + } + if (bbr->loss_round_start) + bbr->loss_events_in_round = 0; +} + +/* If we are done draining, advance into steady state operation in PROBE_BW. */ +static void bbr2_check_drain(struct sock *sk, const struct rate_sample *rs, + struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr_check_drain(sk, rs, ctx)) { + bbr->mode = BBR_PROBE_BW; + bbr2_start_bw_probe_down(sk); + } +} + +static void bbr2_update_model(struct sock *sk, const struct rate_sample *rs, + struct bbr_context *ctx) +{ + bbr2_update_congestion_signals(sk, rs, ctx); + bbr_update_ack_aggregation(sk, rs); + bbr2_check_loss_too_high_in_startup(sk, rs); + bbr_check_full_bw_reached(sk, rs); + bbr2_check_drain(sk, rs, ctx); + bbr2_update_cycle_phase(sk, rs); + bbr_update_min_rtt(sk, rs); +} + +/* Fast path for app-limited case. + * + * On each ack, we execute bbr state machine, which primarily consists of: + * 1) update model based on new rate sample, and + * 2) update control based on updated model or state change. + * + * There are certain workload/scenarios, e.g. app-limited case, where + * either we can skip updating model or we can skip update of both model + * as well as control. This provides signifcant softirq cpu savings for + * processing incoming acks. + * + * In case of app-limited, if there is no congestion (loss/ecn) and + * if observed bw sample is less than current estimated bw, then we can + * skip some of the computation in bbr state processing: + * + * - if there is no rtt/mode/phase change: In this case, since all the + * parameters of the network model are constant, we can skip model + * as well control update. + * + * - else we can skip rest of the model update. But we still need to + * update the control to account for the new rtt/mode/phase. + * + * Returns whether we can take fast path or not. 
+ */ +static bool bbr2_fast_path(struct sock *sk, bool *update_model, + const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 prev_min_rtt_us, prev_mode; + + if (bbr->params.fast_path && bbr->try_fast_path && + rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && + !bbr->loss_in_round && !bbr->ecn_in_round) { + prev_mode = bbr->mode; + prev_min_rtt_us = bbr->min_rtt_us; + bbr2_check_drain(sk, rs, ctx); + bbr2_update_cycle_phase(sk, rs); + bbr_update_min_rtt(sk, rs); + + if (bbr->mode == prev_mode && + bbr->min_rtt_us == prev_min_rtt_us && + bbr->try_fast_path) + return true; + + /* Skip model update, but control still needs to be updated */ + *update_model = false; + } + return false; +} + +static void bbr2_main(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + struct bbr_context ctx = { 0 }; + bool update_model = true; + u32 bw; + + bbr->debug.event = '.'; /* init to default NOP (no event yet) */ + + bbr_update_round_start(sk, rs, &ctx); + if (bbr->round_start) { + bbr->rounds_since_probe = + min_t(s32, bbr->rounds_since_probe + 1, 0xFF); + bbr2_update_ecn_alpha(sk); + } + + bbr->ecn_in_round |= rs->is_ece; + bbr_calculate_bw_sample(sk, rs, &ctx); + + if (bbr2_fast_path(sk, &update_model, rs, &ctx)) + goto out; + + if (update_model) + bbr2_update_model(sk, rs, &ctx); + + bbr_update_gains(sk); + bw = bbr_bw(sk); + bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); + bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, + tp->snd_cwnd, &ctx); + bbr2_bound_cwnd_for_inflight_model(sk); + +out: + bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; + bbr->loss_in_cycle |= rs->lost > 0; + bbr->ecn_in_cycle |= rs->delivered_ce > 0; + + bbr_debug(sk, rs->acked_sacked, rs, &ctx); +} + +/* Module parameters that are settable by TCP_CONGESTION_PARAMS are declared + * down here, so that the algorithm functions that use the parameters must use + * the per-socket parameters; if they accidentally use the global version + * then there will be a compile error. + * TODO(ncardwell): move all per-socket parameters down to this section. + */ + +/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. + * No loss response when 0. Max allwed value is 255. + */ +static u32 bbr_beta = BBR_UNIT * 30 / 100; + +/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE. + * Max allowed value is 255. + */ +static u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; /* 1/16 = 6.25% */ + +/* The initial value for the ecn_alpha state variable. Default and max + * BBR_UNIT (256), representing 1.0. This allows a flow to respond quickly + * to congestion if the bottleneck is congested when the flow starts up. + */ +static u32 bbr_ecn_alpha_init = BBR_UNIT; /* 1.0, to respond quickly */ + +/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. + * No ECN based bounding when 0. Max allwed value is 255. + */ +static u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ + +/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. + * Scaled by BBR_SCALE. Disabled when 0. Max allowed is 255. + */ +static u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ + +/* Max RTT (in usec) at which to use sender-side ECN logic. + * Disabled when 0 (ECN allowed at any RTT). + * Max allowed for the parameter is 524287 (0x7ffff) us, ~524 ms. 
+ */ +static u32 bbr_ecn_max_rtt_us = 5000; + +/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN + * clears then use a multiplicative increase to quickly reprobe bw by + * starting inflight probing at the given multiple of inflight_hi. + * Default for this experimental knob is 0 (disabled). + * Planned value for experiments: BBR_UNIT * 1 / 2 = 128, representing 0.5. + */ +static u32 bbr_ecn_reprobe_gain; + +/* Estimate bw probing has gone too far if loss rate exceeds this level. */ +static u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ + +/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, + * and loss rate is higher than bbr_loss_thresh. + * Disabled if 0. Max allowed value is 15 (0xF). + */ +static u32 bbr_full_loss_cnt = 8; + +/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh + * meets this count. Max allowed value is 3. + */ +static u32 bbr_full_ecn_cnt = 2; + +/* Fraction of unutilized headroom to try to leave in path upon high loss. */ +static u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; + +/* Multiplier to get target inflight (as multiple of BDP) for PROBE_UP phase. + * Default is 1.25x, as in BBR v1. Max allowed is 511. + */ +static u32 bbr_bw_probe_pif_gain = BBR_UNIT * 5 / 4; + +/* Multiplier to get Reno-style probe epoch duration as: k * BDP round trips. + * If zero, disables this BBR v2 Reno-style BDP-scaled coexistence mechanism. + * Max allowed is 511. + */ +static u32 bbr_bw_probe_reno_gain = BBR_UNIT; + +/* Max number of packet-timed rounds to wait before probing for bandwidth. If + * we want to tolerate 1% random loss per round, and not have this cut our + * inflight too much, we must probe for bw periodically on roughly this scale. + * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. + * We aim to be fair with Reno/CUBIC up to a BDP of at least: + * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets + */ +static u32 bbr_bw_probe_max_rounds = 63; + +/* Max amount of randomness to inject in round counting for Reno-coexistence. + * Max value is 15. + */ +static u32 bbr_bw_probe_rand_rounds = 2; + +/* Use BBR-native probe time scale starting at this many usec. + * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: + * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs + */ +static u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ + +/* Use BBR-native probes spread over this many usec: */ +static u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ + +/* Undo the model changes made in loss recovery if recovery was spurious? */ +static bool bbr_undo = true; + +/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ +static bool bbr_fast_path = true; /* default: enabled */ + +/* Use fast ack mode ? */ +static int bbr_fast_ack_mode = 1; /* default: rwnd check off */ + +/* How much to additively increase inflight_hi when entering REFILL? 
+
+module_param_named(beta, bbr_beta, uint, 0644);
+module_param_named(ecn_alpha_gain, bbr_ecn_alpha_gain, uint, 0644);
+module_param_named(ecn_alpha_init, bbr_ecn_alpha_init, uint, 0644);
+module_param_named(ecn_factor, bbr_ecn_factor, uint, 0644);
+module_param_named(ecn_thresh, bbr_ecn_thresh, uint, 0644);
+module_param_named(ecn_max_rtt_us, bbr_ecn_max_rtt_us, uint, 0644);
+module_param_named(ecn_reprobe_gain, bbr_ecn_reprobe_gain, uint, 0644);
+module_param_named(loss_thresh, bbr_loss_thresh, uint, 0644);
+module_param_named(full_loss_cnt, bbr_full_loss_cnt, uint, 0644);
+module_param_named(full_ecn_cnt, bbr_full_ecn_cnt, uint, 0644);
+module_param_named(inflight_headroom, bbr_inflight_headroom, uint, 0644);
+module_param_named(bw_probe_pif_gain, bbr_bw_probe_pif_gain, uint, 0644);
+module_param_named(bw_probe_reno_gain, bbr_bw_probe_reno_gain, uint, 0644);
+module_param_named(bw_probe_max_rounds, bbr_bw_probe_max_rounds, uint, 0644);
+module_param_named(bw_probe_rand_rounds, bbr_bw_probe_rand_rounds, uint, 0644);
+module_param_named(bw_probe_base_us, bbr_bw_probe_base_us, uint, 0644);
+module_param_named(bw_probe_rand_us, bbr_bw_probe_rand_us, uint, 0644);
+module_param_named(undo, bbr_undo, bool, 0644);
+module_param_named(fast_path, bbr_fast_path, bool, 0644);
+module_param_named(fast_ack_mode, bbr_fast_ack_mode, uint, 0644);
+module_param_named(refill_add_inc, bbr_refill_add_inc, uint, 0644);
+
+static void bbr2_init(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr_init(sk);	/* run shared init code for v1 and v2 */
+
+	/* BBR v2 parameters: */
+	bbr->params.beta = min_t(u32, 0xFFU, bbr_beta);
+	bbr->params.ecn_alpha_gain = min_t(u32, 0xFFU, bbr_ecn_alpha_gain);
+	bbr->params.ecn_alpha_init = min_t(u32, BBR_UNIT, bbr_ecn_alpha_init);
+	bbr->params.ecn_factor = min_t(u32, 0xFFU, bbr_ecn_factor);
+	bbr->params.ecn_thresh = min_t(u32, 0xFFU, bbr_ecn_thresh);
+	bbr->params.ecn_max_rtt_us = min_t(u32, 0x7ffffU, bbr_ecn_max_rtt_us);
+	bbr->params.ecn_reprobe_gain = min_t(u32, 0x1FF, bbr_ecn_reprobe_gain);
+	bbr->params.loss_thresh = min_t(u32, 0xFFU, bbr_loss_thresh);
+	bbr->params.full_loss_cnt = min_t(u32, 0xFU, bbr_full_loss_cnt);
+	bbr->params.full_ecn_cnt = min_t(u32, 0x3U, bbr_full_ecn_cnt);
+	bbr->params.inflight_headroom =
+		min_t(u32, 0xFFU, bbr_inflight_headroom);
+	bbr->params.bw_probe_pif_gain =
+		min_t(u32, 0x1FFU, bbr_bw_probe_pif_gain);
+	bbr->params.bw_probe_reno_gain =
+		min_t(u32, 0x1FFU, bbr_bw_probe_reno_gain);
+	bbr->params.bw_probe_max_rounds =
+		min_t(u32, 0xFFU, bbr_bw_probe_max_rounds);
+	bbr->params.bw_probe_rand_rounds =
+		min_t(u32, 0xFU, bbr_bw_probe_rand_rounds);
+	bbr->params.bw_probe_base_us =
+		min_t(u32, (1 << 26) - 1, bbr_bw_probe_base_us);
+	bbr->params.bw_probe_rand_us =
+		min_t(u32, (1 << 26) - 1, bbr_bw_probe_rand_us);
+	bbr->params.undo = bbr_undo;
+	bbr->params.fast_path = bbr_fast_path ?
1 : 0; + bbr->params.refill_add_inc = min_t(u32, 0x3U, bbr_refill_add_inc); + + /* BBR v2 state: */ + bbr->initialized = 1; + /* Start sampling ECN mark rate after first full flight is ACKed: */ + bbr->loss_round_delivered = tp->delivered + 1; + bbr->loss_round_start = 0; + bbr->undo_bw_lo = 0; + bbr->undo_inflight_lo = 0; + bbr->undo_inflight_hi = 0; + bbr->loss_events_in_round = 0; + bbr->startup_ecn_rounds = 0; + bbr2_reset_congestion_signals(sk); + bbr->bw_lo = ~0U; + bbr->bw_hi[0] = 0; + bbr->bw_hi[1] = 0; + bbr->inflight_lo = ~0U; + bbr->inflight_hi = ~0U; + bbr->bw_probe_up_cnt = ~0U; + bbr->bw_probe_up_acks = 0; + bbr->bw_probe_up_rounds = 0; + bbr->probe_wait_us = 0; + bbr->stopped_risky_probe = 0; + bbr->ack_phase = BBR_ACKS_INIT; + bbr->rounds_since_probe = 0; + bbr->bw_probe_samples = 0; + bbr->prev_probe_too_high = 0; + bbr->ecn_eligible = 0; + bbr->ecn_alpha = bbr->params.ecn_alpha_init; + bbr->alpha_last_delivered = 0; + bbr->alpha_last_delivered_ce = 0; + + tp->fast_ack_mode = min_t(u32, 0x2U, bbr_fast_ack_mode); + + if ((tp->ecn_flags & TCP_ECN_OK) && bbr_ecn_enable) + tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; +} + +/* Core TCP stack informs us that the given skb was just marked lost. */ +static void bbr2_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + struct rate_sample rs; + + /* Capture "current" data over the full round trip of loss, + * to have a better chance to see the full capacity of the path. + */ + if (!bbr->loss_in_round) /* first loss in this round trip? */ + bbr->loss_round_delivered = tp->delivered; /* set round trip */ + bbr->loss_in_round = 1; + bbr->loss_in_cycle = 1; + + if (!bbr->bw_probe_samples) + return; /* not an skb sent while probing for bandwidth */ + if (unlikely(!scb->tx.delivered_mstamp)) + return; /* skb was SACKed, reneged, marked lost; ignore it */ + /* We are probing for bandwidth. Construct a rate sample that + * estimates what happened in the flight leading up to this lost skb, + * then see if the loss rate went too high, and if so at which packet. + */ + memset(&rs, 0, sizeof(rs)); + rs.tx_in_flight = scb->tx.in_flight; + rs.lost = tp->lost - scb->tx.lost; + rs.is_app_limited = scb->tx.is_app_limited; + if (bbr2_is_inflight_too_high(sk, &rs)) { + rs.tx_in_flight = bbr2_inflight_hi_from_lost_skb(sk, &rs, skb); + bbr2_handle_inflight_too_high(sk, &rs); + } +} + +/* Revert short-term model if current loss recovery event was spurious. */ +static u32 bbr2_undo_cwnd(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->debug.undo = 1; + bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ + bbr->full_bw_cnt = 0; + bbr->loss_in_round = 0; + + if (!bbr->params.undo) + return tp->snd_cwnd; + + /* Revert to cwnd and other state saved before loss episode. */ + bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); + bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); + bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); + return bbr->prior_cwnd; +} + +/* Entering loss recovery, so save state for when we undo recovery. */ +static u32 bbr2_ssthresh(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr_save_cwnd(sk); + /* For undo, save state that adapts based on loss signal. 
*/ + bbr->undo_bw_lo = bbr->bw_lo; + bbr->undo_inflight_lo = bbr->inflight_lo; + bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; +} + +static enum tcp_bbr2_phase bbr2_get_phase(struct bbr *bbr) +{ + switch (bbr->mode) { + case BBR_STARTUP: + return BBR2_PHASE_STARTUP; + case BBR_DRAIN: + return BBR2_PHASE_DRAIN; + case BBR_PROBE_BW: + break; + case BBR_PROBE_RTT: + return BBR2_PHASE_PROBE_RTT; + default: + return BBR2_PHASE_INVALID; + } + switch (bbr->cycle_idx) { + case BBR_BW_PROBE_UP: + return BBR2_PHASE_PROBE_BW_UP; + case BBR_BW_PROBE_DOWN: + return BBR2_PHASE_PROBE_BW_DOWN; + case BBR_BW_PROBE_CRUISE: + return BBR2_PHASE_PROBE_BW_CRUISE; + case BBR_BW_PROBE_REFILL: + return BBR2_PHASE_PROBE_BW_REFILL; + default: + return BBR2_PHASE_INVALID; + } +} + +static size_t bbr2_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) +{ + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct bbr *bbr = inet_csk_ca(sk); + u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); + u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); + u64 bw_lo = bbr->bw_lo == ~0U ? + ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); + + memset(&info->bbr2, 0, sizeof(info->bbr2)); + info->bbr2.bbr_bw_lsb = (u32)bw; + info->bbr2.bbr_bw_msb = (u32)(bw >> 32); + info->bbr2.bbr_min_rtt = bbr->min_rtt_us; + info->bbr2.bbr_pacing_gain = bbr->pacing_gain; + info->bbr2.bbr_cwnd_gain = bbr->cwnd_gain; + info->bbr2.bbr_bw_hi_lsb = (u32)bw_hi; + info->bbr2.bbr_bw_hi_msb = (u32)(bw_hi >> 32); + info->bbr2.bbr_bw_lo_lsb = (u32)bw_lo; + info->bbr2.bbr_bw_lo_msb = (u32)(bw_lo >> 32); + info->bbr2.bbr_mode = bbr->mode; + info->bbr2.bbr_phase = (__u8)bbr2_get_phase(bbr); + info->bbr2.bbr_version = (__u8)2; + info->bbr2.bbr_inflight_lo = bbr->inflight_lo; + info->bbr2.bbr_inflight_hi = bbr->inflight_hi; + info->bbr2.bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; + return sizeof(info->bbr2); + } + return 0; +} + +static void bbr2_set_state(struct sock *sk, u8 new_state) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { + struct rate_sample rs = { .losses = 1 }; + struct bbr_context ctx = { 0 }; + + bbr->prev_ca_state = TCP_CA_Loss; + bbr->full_bw = 0; + if (!bbr2_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { + /* bbr_adapt_lower_bounds() needs cwnd before + * we suffered an RTO, to update inflight_lo: + */ + bbr->inflight_lo = + max(tp->snd_cwnd, bbr->prior_cwnd); + } + bbr_debug(sk, 0, &rs, &ctx); + } else if (bbr->prev_ca_state == TCP_CA_Loss && + new_state != TCP_CA_Loss) { + tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); + bbr->try_fast_path = 0; /* bound cwnd using latest model */ + } +} + +static struct tcp_congestion_ops tcp_bbr2_cong_ops __read_mostly = { + .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr2", + .owner = THIS_MODULE, + .init = bbr2_init, + .cong_control = bbr2_main, + .sndbuf_expand = bbr_sndbuf_expand, + .skb_marked_lost = bbr2_skb_marked_lost, + .undo_cwnd = bbr2_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr2_ssthresh, + .tso_segs = bbr_tso_segs, + .get_info = bbr2_get_info, + .set_state = bbr2_set_state, +}; + +static int __init bbr_register(void) +{ + BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_bbr2_cong_ops); +} + +static void __exit bbr_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_bbr2_cong_ops); +} + 
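
For context (not part of the patch): once this module is loaded and registered, an application can opt a single socket into it with the standard Linux TCP_CONGESTION socket option, without changing the system-wide default. A minimal userspace sketch:

/* Userspace sketch: select bbr2 for one socket via TCP_CONGESTION. */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

int main(void)
{
	char name[16] = "bbr2";
	socklen_t len = sizeof(name);
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0 || setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
				 name, strlen(name)) < 0) {
		perror("TCP_CONGESTION");	/* e.g. module not loaded */
		return 1;
	}
	if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, &len) == 0)
		printf("congestion control: %s\n", name);
	return 0;
}
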
+module_init(bbr_register); +module_exit(bbr_unregister); + +MODULE_AUTHOR("Van Jacobson "); +MODULE_AUTHOR("Neal Cardwell "); +MODULE_AUTHOR("Yuchung Cheng "); +MODULE_AUTHOR("Soheil Hassas Yeganeh "); +MODULE_AUTHOR("Priyaranjan Jha "); +MODULE_AUTHOR("Yousuk Seung "); +MODULE_AUTHOR("Kevin Yang "); +MODULE_AUTHOR("Arjun Roy "); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index d3cae40749e8..0f268f2ff2e9 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -189,6 +189,7 @@ void tcp_init_congestion_control(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); tcp_sk(sk)->prior_ssthresh = 0; + tcp_sk(sk)->fast_ack_mode = 0; if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0640453fce54..8a455eb0c552 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -349,7 +349,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) tcp_enter_quickack_mode(sk, 2); break; case INET_ECN_CE: - if (tcp_ca_needs_ecn(sk)) + if (tcp_ca_wants_ce_events(sk)) tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { @@ -360,7 +360,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) tp->ecn_flags |= TCP_ECN_SEEN; break; default: - if (tcp_ca_needs_ecn(sk)) + if (tcp_ca_wants_ce_events(sk)) tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); tp->ecn_flags |= TCP_ECN_SEEN; break; @@ -1079,7 +1079,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) */ static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) { + struct sock *sk = (struct sock *)tp; + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + tp->lost += tcp_skb_pcount(skb); + if (ca_ops->skb_marked_lost) + ca_ops->skb_marked_lost(sk, skb); } void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) @@ -1460,6 +1465,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); tcp_skb_pcount_add(skb, -pcount); + /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ + if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, + "prev in_flight: %u skb in_flight: %u pcount: %u", + TCP_SKB_CB(prev)->tx.in_flight, + TCP_SKB_CB(skb)->tx.in_flight, + pcount)) + TCP_SKB_CB(skb)->tx.in_flight = 0; + else + TCP_SKB_CB(skb)->tx.in_flight -= pcount; + TCP_SKB_CB(prev)->tx.in_flight += pcount; + /* When we're adding to gso_segs == 1, gso_size will be zero, * in theory this shouldn't be necessary but as long as DSACK * code can come after this skb later on it's better to keep @@ -3812,6 +3828,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; rs.prior_in_flight = tcp_packets_in_flight(tp); + tcp_rate_check_app_limited(sk); /* ts_recent update must be made after we are sure that the packet * is in window. 
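
The next hunk exports the ACK's ECE bit to rate samples as rs.is_ece; together with delivered_ce, this is what drives the ecn_alpha EWMA in bbr2_update_ecn_alpha() earlier in this patch. A standalone sketch of that fixed-point EWMA, with invented per-round delivery counts:

/* Standalone sketch of the ecn_alpha EWMA:
 *   alpha = (1 - gain) * alpha + gain * ce_ratio, in BBR_UNIT fixed point.
 * The delivery counts below are invented for illustration.
 */
#include <stdint.h>
#include <stdio.h>

#define BBR_SCALE 8
#define BBR_UNIT (1u << BBR_SCALE)

int main(void)
{
	uint32_t gain = BBR_UNIT / 16;	/* default ecn_alpha_gain */
	uint32_t alpha = BBR_UNIT;	/* default ecn_alpha_init = 1.0 */
	uint32_t delivered[] =    { 100, 100, 100, 100 };
	uint32_t delivered_ce[] = {   0,  50,  50,   0 };

	for (int i = 0; i < 4; i++) {
		uint32_t ce_ratio = ((uint64_t)delivered_ce[i] << BBR_SCALE) /
				    delivered[i];

		alpha = ((BBR_UNIT - gain) * alpha >> BBR_SCALE) +
			((gain * ce_ratio) >> BBR_SCALE);
		if (alpha > BBR_UNIT)
			alpha = BBR_UNIT;
		printf("round %d: ce_ratio = %3u/256, alpha = %3u/256\n",
		       i, ce_ratio, alpha);
	}
	return 0;
}
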
@@ -3910,6 +3927,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tcp_newly_delivered(sk, delivered, flag); lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); + rs.is_ece = !!(flag & FLAG_ECE); tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); @@ -5509,13 +5527,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && + (tp->fast_ack_mode == 1 || /* ... and right edge of window advances far enough. * (tcp_recvmsg() will send ACK otherwise). * If application uses SO_RCVLOWAT, we want send ack now if * we have not received enough bytes to satisfy the condition. */ - (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || - __tcp_select_window(sk) >= tp->rcv_wnd)) || + (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || + __tcp_select_window(sk) >= tp->rcv_wnd))) || /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c69f4d966024..a9ceec2702b2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -375,7 +375,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, th->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; } - } else if (!tcp_ca_needs_ecn(sk)) { + } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && + !tcp_ca_needs_ecn(sk)) { /* ACK or retransmitted segment: clear ECT|CE */ INET_ECN_dontxmit(sk); } @@ -1533,7 +1534,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; - int nsize, old_factor; + int nsize, old_factor, inflight_prev; long limit; int nlen; u8 flags; @@ -1610,6 +1611,15 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, if (diff) tcp_adjust_pcount(sk, skb, diff); + + /* Set buff tx.in_flight as if buff were sent by itself. */ + inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; + if (WARN_ONCE(inflight_prev < 0, + "inconsistent: tx.in_flight: %u old_factor: %d", + TCP_SKB_CB(skb)->tx.in_flight, old_factor)) + inflight_prev = 0; + TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + + tcp_skb_pcount(buff); } /* Link BUFF into the send queue. */ @@ -1993,13 +2003,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; - u32 min_tso, tso_segs; - - min_tso = ca_ops->min_tso_segs ? - ca_ops->min_tso_segs(sk) : - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + u32 tso_segs; - tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); + tso_segs = ca_ops->tso_segs ? 
+ ca_ops->tso_segs(sk, mss_now) : + tcp_tso_autosize(sk, mss_now, + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs)); return min_t(u32, tso_segs, sk->sk_gso_max_segs); } @@ -2635,6 +2644,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); + tcp_set_tx_in_flight(sk, skb); goto repair; /* Skip network transmission */ } diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index a8f6d9d06f2e..a8b4c9504570 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -34,6 +34,24 @@ * ready to send in the write queue. */ +void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 in_flight; + + /* Check, sanitize, and record packets in flight after skb was sent. */ + in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); + if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, + "insane in_flight %u cc %s mss %u " + "cwnd %u pif %u %u %u %u\n", + in_flight, inet_csk(sk)->icsk_ca_ops->name, + tp->mss_cache, tp->snd_cwnd, + tp->packets_out, tp->retrans_out, + tp->sacked_out, tp->lost_out)) + in_flight = TCPCB_IN_FLIGHT_MAX; + TCP_SKB_CB(skb)->tx.in_flight = in_flight; +} + /* Snapshot the current delivery information in the skb, to generate * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). */ @@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; TCP_SKB_CB(skb)->tx.delivered = tp->delivered; TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; + TCP_SKB_CB(skb)->tx.lost = tp->lost; TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; + tcp_set_tx_in_flight(sk, skb); } /* When an skb is sacked or acked, we fill in the rate sample with the (prior) @@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, if (!rs->prior_delivered || tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, scb->end_seq, rs->last_end_seq)) { + rs->prior_lost = scb->tx.lost; rs->prior_delivered_ce = scb->tx.delivered_ce; rs->prior_delivered = scb->tx.delivered; rs->prior_mstamp = scb->tx.delivered_mstamp; rs->is_app_limited = scb->tx.is_app_limited; rs->is_retrans = scb->sacked & TCPCB_RETRANS; rs->last_end_seq = scb->end_seq; + rs->tx_in_flight = scb->tx.in_flight; /* Record send time of most recently ACKed packet: */ tp->first_tx_mstamp = tx_tstamp; /* Find the duration of the "send phase" of this window: */ - rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, - scb->tx.first_tx_mstamp); + rs->interval_us = tcp_stamp32_us_delta( + tp->first_tx_mstamp, + scb->tx.first_tx_mstamp); } /* Mark off the skb delivered once it's sacked to avoid being @@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, return; } rs->delivered = tp->delivered - rs->prior_delivered; + rs->lost = tp->lost - rs->prior_lost; rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; /* delivered_ce occupies less than 32 bits in the skb control block */ @@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, * longer phase. 
*/ snd_us = rs->interval_us; /* send phase */ - ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, + ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, rs->prior_mstamp); /* ack phase */ rs->interval_us = max(snd_us, ack_us); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index cb79127f45c3..70e4de876a7f 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -605,6 +605,7 @@ void tcp_write_timer_handler(struct sock *sk) return; } + tcp_rate_check_app_limited(sk); tcp_mstamp_refresh(tcp_sk(sk)); event = icsk->icsk_pending; -- 2.40.0 From 6475d8e548032ddeea601b7f35e28baa253c3c31 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Thu, 9 Feb 2023 11:52:21 +0100 Subject: [PATCH 02/17] bfq Signed-off-by: Peter Jung --- block/bfq-cgroup.c | 121 ++++---- block/bfq-iosched.c | 722 ++++++++++++++++++++++++++------------------ block/bfq-iosched.h | 182 ++++++++--- block/bfq-wf2q.c | 159 +++++----- 4 files changed, 713 insertions(+), 471 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 60b4299bec8e..49aa18201011 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -224,7 +224,7 @@ void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, { blkg_rwstat_add(&bfqg->stats.queued, opf, 1); bfqg_stats_end_empty_time(&bfqg->stats); - if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) + if (!(bfqq == bfqg->bfqd->in_service_queue)) bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); } @@ -316,14 +316,12 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq) static void bfqg_get(struct bfq_group *bfqg) { - bfqg->ref++; + refcount_inc(&bfqg->ref); } static void bfqg_put(struct bfq_group *bfqg) { - bfqg->ref--; - - if (bfqg->ref == 0) + if (refcount_dec_and_test(&bfqg->ref)) kfree(bfqg); } @@ -530,7 +528,7 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, struct request_queue *q, } /* see comments in bfq_bic_update_cgroup for why refcounting */ - bfqg_get(bfqg); + refcount_set(&bfqg->ref, 1); return &bfqg->pd; } @@ -552,6 +550,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd) */ bfqg->bfqd = bfqd; bfqg->active_entities = 0; + bfqg->num_queues_with_pending_reqs = 0; bfqg->online = true; bfqg->rq_pos_tree = RB_ROOT; } @@ -645,6 +644,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, { struct bfq_entity *entity = &bfqq->entity; struct bfq_group *old_parent = bfqq_group(bfqq); + bool has_pending_reqs = false; /* * No point to move bfqq to the same group, which can happen when @@ -665,6 +665,11 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, */ bfqq->ref++; + if (entity->in_groups_with_pending_reqs) { + has_pending_reqs = true; + bfq_del_bfqq_in_groups_with_pending_reqs(bfqq); + } + /* If bfqq is empty, then bfq_bfqq_expire also invokes * bfq_del_bfqq_busy, thereby removing bfqq and its entity * from data structures related to current group. 
Otherwise we @@ -692,18 +697,61 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, /* pin down bfqg and its associated blkg */ bfqg_and_blkg_get(bfqg); + if (has_pending_reqs) + bfq_add_bfqq_in_groups_with_pending_reqs(bfqq); + if (bfq_bfqq_busy(bfqq)) { if (unlikely(!bfqd->nonrot_with_queueing)) bfq_pos_tree_add_move(bfqd, bfqq); bfq_activate_bfqq(bfqd, bfqq); } - if (!bfqd->in_service_queue && !bfqd->rq_in_driver) + if (!bfqd->in_service_queue && !bfqd->tot_rq_in_driver) bfq_schedule_dispatch(bfqd); /* release extra ref taken above, bfqq may happen to be freed now */ bfq_put_queue(bfqq); } +static void bfq_sync_bfqq_move(struct bfq_data *bfqd, + struct bfq_queue *sync_bfqq, + struct bfq_io_cq *bic, + struct bfq_group *bfqg, + unsigned int act_idx) +{ + struct bfq_queue *bfqq; + + if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { + /* We are the only user of this bfqq, just move it */ + if (sync_bfqq->entity.sched_data != &bfqg->sched_data) + bfq_bfqq_move(bfqd, sync_bfqq, bfqg); + return; + } + + /* + * The queue was merged to a different queue. Check + * that the merge chain still belongs to the same + * cgroup. + */ + for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) + if (bfqq->entity.sched_data != &bfqg->sched_data) + break; + if (bfqq) { + /* + * Some queue changed cgroup so the merge is not valid + * anymore. We cannot easily just cancel the merge (by + * clearing new_bfqq) as there may be other processes + * using this queue and holding refs to all queues + * below sync_bfqq->new_bfqq. Similarly if the merge + * already happened, we need to detach from bfqq now + * so that we cannot merge bio to a request from the + * old cgroup. + */ + bfq_put_cooperator(sync_bfqq); + bic_set_bfqq(bic, NULL, true, act_idx); + bfq_release_process_ref(bfqd, sync_bfqq); + } +} + /** * __bfq_bic_change_cgroup - move @bic to @bfqg. * @bfqd: the queue descriptor. @@ -714,60 +762,25 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, * sure that the reference to cgroup is valid across the call (see * comments in bfq_bic_update_cgroup on this issue) */ -static void *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - struct bfq_io_cq *bic, - struct bfq_group *bfqg) +static void __bfq_bic_change_cgroup(struct bfq_data *bfqd, + struct bfq_io_cq *bic, + struct bfq_group *bfqg) { - struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false); - struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true); - struct bfq_entity *entity; + unsigned int act_idx; - if (async_bfqq) { - entity = &async_bfqq->entity; + for (act_idx = 0; act_idx < bfqd->num_actuators; act_idx++) { + struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false, act_idx); + struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true, act_idx); - if (entity->sched_data != &bfqg->sched_data) { - bic_set_bfqq(bic, NULL, false); + if (async_bfqq && + async_bfqq->entity.sched_data != &bfqg->sched_data) { + bic_set_bfqq(bic, NULL, false, act_idx); bfq_release_process_ref(bfqd, async_bfqq); } - } - if (sync_bfqq) { - if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { - /* We are the only user of this bfqq, just move it */ - if (sync_bfqq->entity.sched_data != &bfqg->sched_data) - bfq_bfqq_move(bfqd, sync_bfqq, bfqg); - } else { - struct bfq_queue *bfqq; - - /* - * The queue was merged to a different queue. Check - * that the merge chain still belongs to the same - * cgroup. 
- */ - for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) - if (bfqq->entity.sched_data != - &bfqg->sched_data) - break; - if (bfqq) { - /* - * Some queue changed cgroup so the merge is - * not valid anymore. We cannot easily just - * cancel the merge (by clearing new_bfqq) as - * there may be other processes using this - * queue and holding refs to all queues below - * sync_bfqq->new_bfqq. Similarly if the merge - * already happened, we need to detach from - * bfqq now so that we cannot merge bio to a - * request from the old cgroup. - */ - bfq_put_cooperator(sync_bfqq); - bic_set_bfqq(bic, NULL, true); - bfq_release_process_ref(bfqd, sync_bfqq); - } - } + if (sync_bfqq) + bfq_sync_bfqq_move(bfqd, sync_bfqq, bic, bfqg, act_idx); } - - return bfqg; } void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index ff9d23889415..6cae412a33a0 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -377,20 +377,23 @@ static const unsigned long bfq_late_stable_merging = 600; #define RQ_BIC(rq) ((struct bfq_io_cq *)((rq)->elv.priv[0])) #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) -struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) +struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync, + unsigned int actuator_idx) { - return bic->bfqq[is_sync]; + if (is_sync) + return bic->bfqq[1][actuator_idx]; + + return bic->bfqq[0][actuator_idx]; } static void bfq_put_stable_ref(struct bfq_queue *bfqq); -void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) +void bic_set_bfqq(struct bfq_io_cq *bic, + struct bfq_queue *bfqq, + bool is_sync, + unsigned int actuator_idx) { - struct bfq_queue *old_bfqq = bic->bfqq[is_sync]; - - /* Clear bic pointer if bfqq is detached from this bic */ - if (old_bfqq && old_bfqq->bic == bic) - old_bfqq->bic = NULL; + struct bfq_queue *old_bfqq = bic->bfqq[is_sync][actuator_idx]; /* * If bfqq != NULL, then a non-stable queue merge between @@ -405,9 +408,18 @@ void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) * we cancel the stable merge if * bic->stable_merge_bfqq == bfqq. */ - bic->bfqq[is_sync] = bfqq; + struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[actuator_idx]; + + /* Clear bic pointer if bfqq is detached from this bic */ + if (old_bfqq && old_bfqq->bic == bic) + old_bfqq->bic = NULL; + + if (is_sync) + bic->bfqq[1][actuator_idx] = bfqq; + else + bic->bfqq[0][actuator_idx] = bfqq; - if (bfqq && bic->stable_merge_bfqq == bfqq) { + if (bfqq && bfqq_data->stable_merge_bfqq == bfqq) { /* * Actually, these same instructions are executed also * in bfq_setup_cooperator, in case of abort or actual @@ -416,9 +428,9 @@ void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) * did so, we would nest even more complexity in this * function. */ - bfq_put_stable_ref(bic->stable_merge_bfqq); + bfq_put_stable_ref(bfqq_data->stable_merge_bfqq); - bic->stable_merge_bfqq = NULL; + bfqq_data->stable_merge_bfqq = NULL; } } @@ -678,9 +690,9 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { struct bfq_data *bfqd = data->q->elevator->elevator_data; struct bfq_io_cq *bic = bfq_bic_lookup(data->q); - struct bfq_queue *bfqq = bic ? 
bic_to_bfqq(bic, op_is_sync(opf)) : NULL; int depth; unsigned limit = data->q->nr_requests; + unsigned int act_idx; /* Sync reads have full depth available */ if (op_is_sync(opf) && !op_is_write(opf)) { @@ -690,14 +702,21 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) limit = (limit * depth) >> bfqd->full_depth_shift; } - /* - * Does queue (or any parent entity) exceed number of requests that - * should be available to it? Heavily limit depth so that it cannot - * consume more available requests and thus starve other entities. - */ - if (bfqq && bfqq_request_over_limit(bfqq, limit)) - depth = 1; + for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) { + struct bfq_queue *bfqq = + bic_to_bfqq(bic, op_is_sync(opf), act_idx); + /* + * Does queue (or any parent entity) exceed number of + * requests that should be available to it? Heavily + * limit depth so that it cannot consume more + * available requests and thus starve other entities. + */ + if (bfqq && bfqq_request_over_limit(bfqq, limit)) { + depth = 1; + break; + } + } bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", __func__, bfqd->wr_busy_queues, op_is_sync(opf), depth); if (depth) @@ -826,7 +845,7 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) * much easier to maintain the needed state: * 1) all active queues have the same weight, * 2) all active queues belong to the same I/O-priority class, - * 3) there are no active groups. + * 3) there is at most one active group. * In particular, the last condition is always true if hierarchical * support or the cgroups interface are not enabled, thus no state * needs to be maintained in this case. @@ -858,7 +877,7 @@ static bool bfq_asymmetric_scenario(struct bfq_data *bfqd, return varied_queue_weights || multiple_classes_busy #ifdef CONFIG_BFQ_GROUP_IOSCHED - || bfqd->num_groups_with_pending_reqs > 0 + || bfqd->num_groups_with_pending_reqs > 1 #endif ; } @@ -876,9 +895,9 @@ static bool bfq_asymmetric_scenario(struct bfq_data *bfqd, * In most scenarios, the rate at which nodes are created/destroyed * should be low too. */ -void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct rb_root_cached *root) +void bfq_weights_tree_add(struct bfq_queue *bfqq) { + struct rb_root_cached *root = &bfqq->bfqd->queue_weights_tree; struct bfq_entity *entity = &bfqq->entity; struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL; bool leftmost = true; @@ -950,13 +969,14 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq, * See the comments to the function bfq_weights_tree_add() for considerations * about overhead. */ -void __bfq_weights_tree_remove(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct rb_root_cached *root) +void bfq_weights_tree_remove(struct bfq_queue *bfqq) { + struct rb_root_cached *root; + if (!bfqq->weight_counter) return; + root = &bfqq->bfqd->queue_weights_tree; bfqq->weight_counter->num_active--; if (bfqq->weight_counter->num_active > 0) goto reset_entity_pointer; @@ -969,59 +989,6 @@ void __bfq_weights_tree_remove(struct bfq_data *bfqd, bfq_put_queue(bfqq); } -/* - * Invoke __bfq_weights_tree_remove on bfqq and decrement the number - * of active groups for each queue's inactive parent entity. 
- */ -void bfq_weights_tree_remove(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = bfqq->entity.parent; - - for_each_entity(entity) { - struct bfq_sched_data *sd = entity->my_sched_data; - - if (sd->next_in_service || sd->in_service_entity) { - /* - * entity is still active, because either - * next_in_service or in_service_entity is not - * NULL (see the comments on the definition of - * next_in_service for details on why - * in_service_entity must be checked too). - * - * As a consequence, its parent entities are - * active as well, and thus this loop must - * stop here. - */ - break; - } - - /* - * The decrement of num_groups_with_pending_reqs is - * not performed immediately upon the deactivation of - * entity, but it is delayed to when it also happens - * that the first leaf descendant bfqq of entity gets - * all its pending requests completed. The following - * instructions perform this delayed decrement, if - * needed. See the comments on - * num_groups_with_pending_reqs for details. - */ - if (entity->in_groups_with_pending_reqs) { - entity->in_groups_with_pending_reqs = false; - bfqd->num_groups_with_pending_reqs--; - } - } - - /* - * Next function is invoked last, because it causes bfqq to be - * freed if the following holds: bfqq is not in service and - * has no dispatched request. DO NOT use bfqq after the next - * function invocation. - */ - __bfq_weights_tree_remove(bfqd, bfqq, - &bfqd->queue_weights_tree); -} - /* * Return expired entry, or NULL to just start from scratch in rbtree. */ @@ -1126,9 +1093,6 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) { u64 dur; - if (bfqd->bfq_wr_max_time > 0) - return bfqd->bfq_wr_max_time; - dur = bfqd->rate_dur_prod; do_div(dur, bfqd->peak_rate); @@ -1170,36 +1134,39 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, { unsigned int old_wr_coeff = 1; bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); + unsigned int a_idx = bfqq->actuator_idx; + struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; - if (bic->saved_has_short_ttime) + if (bfqq_data->saved_has_short_ttime) bfq_mark_bfqq_has_short_ttime(bfqq); else bfq_clear_bfqq_has_short_ttime(bfqq); - if (bic->saved_IO_bound) + if (bfqq_data->saved_IO_bound) bfq_mark_bfqq_IO_bound(bfqq); else bfq_clear_bfqq_IO_bound(bfqq); - bfqq->last_serv_time_ns = bic->saved_last_serv_time_ns; - bfqq->inject_limit = bic->saved_inject_limit; - bfqq->decrease_time_jif = bic->saved_decrease_time_jif; + bfqq->last_serv_time_ns = bfqq_data->saved_last_serv_time_ns; + bfqq->inject_limit = bfqq_data->saved_inject_limit; + bfqq->decrease_time_jif = bfqq_data->saved_decrease_time_jif; - bfqq->entity.new_weight = bic->saved_weight; - bfqq->ttime = bic->saved_ttime; - bfqq->io_start_time = bic->saved_io_start_time; - bfqq->tot_idle_time = bic->saved_tot_idle_time; + bfqq->entity.new_weight = bfqq_data->saved_weight; + bfqq->ttime = bfqq_data->saved_ttime; + bfqq->io_start_time = bfqq_data->saved_io_start_time; + bfqq->tot_idle_time = bfqq_data->saved_tot_idle_time; /* * Restore weight coefficient only if low_latency is on */ if (bfqd->low_latency) { old_wr_coeff = bfqq->wr_coeff; - bfqq->wr_coeff = bic->saved_wr_coeff; + bfqq->wr_coeff = bfqq_data->saved_wr_coeff; } - bfqq->service_from_wr = bic->saved_service_from_wr; - bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; - bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; - bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; + bfqq->service_from_wr = 
bfqq_data->saved_service_from_wr; + bfqq->wr_start_at_switch_to_srt = + bfqq_data->saved_wr_start_at_switch_to_srt; + bfqq->last_wr_start_finish = bfqq_data->saved_last_wr_start_finish; + bfqq->wr_cur_max_time = bfqq_data->saved_wr_cur_max_time; if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || time_is_before_jiffies(bfqq->last_wr_start_finish + @@ -1818,6 +1785,33 @@ static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq, return bfqq_weight > in_serv_weight; } +/* + * Get the index of the actuator that will serve bio. + */ +static unsigned int bfq_actuator_index(struct bfq_data *bfqd, struct bio *bio) +{ + unsigned int i; + sector_t end; + + /* no search needed if one or zero ranges present */ + if (bfqd->num_actuators == 1) + return 0; + + /* bio_end_sector(bio) gives the sector after the last one */ + end = bio_end_sector(bio) - 1; + + for (i = 0; i < bfqd->num_actuators; i++) { + if (end >= bfqd->sector[i] && + end < bfqd->sector[i] + bfqd->nr_sectors[i]) + return i; + } + + WARN_ONCE(true, + "bfq_actuator_index: bio sector out of ranges: end=%llu\n", + end); + return 0; +} + static bool bfq_better_to_idle(struct bfq_queue *bfqq); static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, @@ -1837,7 +1831,9 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, arrived_in_time = ktime_get_ns() <= bfqq->ttime.last_end_request + bfqd->bfq_slice_idle * 3; - + unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio); + bool bfqq_non_merged_or_stably_merged = + bfqq->bic || RQ_BIC(rq)->bfqq_data[act_idx].stably_merged; /* * bfqq deserves to be weight-raised if: @@ -1871,9 +1867,8 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, */ wr_or_deserves_wr = bfqd->low_latency && (bfqq->wr_coeff > 1 || - (bfq_bfqq_sync(bfqq) && - (bfqq->bic || RQ_BIC(rq)->stably_merged) && - (*interactive || soft_rt))); + (bfq_bfqq_sync(bfqq) && bfqq_non_merged_or_stably_merged && + (*interactive || soft_rt))); /* * Using the last flag, update budget and check whether bfqq @@ -2141,14 +2136,16 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (!bfqd->last_completed_rq_bfqq || bfqd->last_completed_rq_bfqq == bfqq || bfq_bfqq_has_short_ttime(bfqq) || - now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC) + now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC || + bfqd->last_completed_rq_bfqq == &bfqd->oom_bfqq || + bfqq == &bfqd->oom_bfqq) return; /* * We reset waker detection logic also if too much time has passed * since the first detection. If wakeups are rare, pointless idling * doesn't hurt throughput that much. The condition below makes sure - * we do not uselessly idle blocking waker in more than 1/64 cases. + * we do not uselessly idle blocking waker in more than 1/64 cases. */ if (bfqd->last_completed_rq_bfqq != bfqq->tentative_waker_bfqq || @@ -2259,9 +2256,9 @@ static void bfq_add_request(struct request *rq) * elapsed. */ if (bfqq == bfqd->in_service_queue && - (bfqd->rq_in_driver == 0 || + (bfqd->tot_rq_in_driver == 0 || (bfqq->last_serv_time_ns > 0 && - bfqd->rqs_injected && bfqd->rq_in_driver > 0)) && + bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) && time_is_before_eq_jiffies(bfqq->decrease_time_jif + msecs_to_jiffies(10))) { bfqd->last_empty_occupied_ns = ktime_get_ns(); @@ -2285,7 +2282,7 @@ static void bfq_add_request(struct request *rq) * will be set in case injection is performed * on bfqq before rq is completed). 
*/ - if (bfqd->rq_in_driver == 0) + if (bfqd->tot_rq_in_driver == 0) bfqd->rqs_injected = false; } } @@ -2379,22 +2376,6 @@ static sector_t get_sdist(sector_t last_pos, struct request *rq) return 0; } -#if 0 /* Still not clear if we can do without next two functions */ -static void bfq_activate_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - - bfqd->rq_in_driver++; -} - -static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - - bfqd->rq_in_driver--; -} -#endif - static void bfq_remove_request(struct request_queue *q, struct request *rq) { @@ -2484,7 +2465,8 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, */ bfq_bic_update_cgroup(bic, bio); - bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); + bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf), + bfq_actuator_index(bfqd, bio)); } else { bfqd->bio_bfqq = NULL; } @@ -2650,24 +2632,29 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) { - int i, j; + int i, j, k; - for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_NR_LEVELS; j++) - if (bfqg->async_bfqq[i][j]) - bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); - if (bfqg->async_idle_bfqq) - bfq_bfqq_end_wr(bfqg->async_idle_bfqq); + for (k = 0; k < bfqd->num_actuators; k++) { + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_NR_LEVELS; j++) + if (bfqg->async_bfqq[i][j][k]) + bfq_bfqq_end_wr(bfqg->async_bfqq[i][j][k]); + if (bfqg->async_idle_bfqq[k]) + bfq_bfqq_end_wr(bfqg->async_idle_bfqq[k]); + } } static void bfq_end_wr(struct bfq_data *bfqd) { struct bfq_queue *bfqq; + int i; spin_lock_irq(&bfqd->lock); - list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) - bfq_bfqq_end_wr(bfqq); + for (i = 0; i < bfqd->num_actuators; i++) { + list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list) + bfq_bfqq_end_wr(bfqq); + } list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) bfq_bfqq_end_wr(bfqq); bfq_end_wr_async(bfqd); @@ -2860,6 +2847,35 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, struct bfq_queue *bfqq); +static struct bfq_queue * +bfq_setup_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_queue *stable_merge_bfqq, + struct bfq_iocq_bfqq_data *bfqq_data) +{ + int proc_ref = min(bfqq_process_refs(bfqq), + bfqq_process_refs(stable_merge_bfqq)); + struct bfq_queue *new_bfqq; + + if (idling_boosts_thr_without_issues(bfqd, bfqq) || + proc_ref == 0) + return NULL; + + /* next function will take at least one ref */ + new_bfqq = bfq_setup_merge(bfqq, stable_merge_bfqq); + + if (new_bfqq) { + bfqq_data->stably_merged = true; + if (new_bfqq->bic) { + unsigned int new_a_idx = new_bfqq->actuator_idx; + struct bfq_iocq_bfqq_data *new_bfqq_data = + &new_bfqq->bic->bfqq_data[new_a_idx]; + + new_bfqq_data->stably_merged = true; + } + } + return new_bfqq; +} + /* * Attempt to schedule a merge of bfqq with the currently in-service * queue or with a close queue among the scheduled queues. 
Return @@ -2885,6 +2901,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, void *io_struct, bool request, struct bfq_io_cq *bic) { struct bfq_queue *in_service_bfqq, *new_bfqq; + unsigned int a_idx = bfqq->actuator_idx; + struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; /* if a merge has already been setup, then proceed with that first */ if (bfqq->new_bfqq) @@ -2906,37 +2924,23 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, * stable merging) also if bic is associated with a * sync queue, but this bfqq is async */ - if (bfq_bfqq_sync(bfqq) && bic->stable_merge_bfqq && + if (bfq_bfqq_sync(bfqq) && bfqq_data->stable_merge_bfqq && !bfq_bfqq_just_created(bfqq) && time_is_before_jiffies(bfqq->split_time + msecs_to_jiffies(bfq_late_stable_merging)) && time_is_before_jiffies(bfqq->creation_time + msecs_to_jiffies(bfq_late_stable_merging))) { struct bfq_queue *stable_merge_bfqq = - bic->stable_merge_bfqq; - int proc_ref = min(bfqq_process_refs(bfqq), - bfqq_process_refs(stable_merge_bfqq)); + bfqq_data->stable_merge_bfqq; /* deschedule stable merge, because done or aborted here */ bfq_put_stable_ref(stable_merge_bfqq); - bic->stable_merge_bfqq = NULL; - - if (!idling_boosts_thr_without_issues(bfqd, bfqq) && - proc_ref > 0) { - /* next function will take at least one ref */ - struct bfq_queue *new_bfqq = - bfq_setup_merge(bfqq, stable_merge_bfqq); - - if (new_bfqq) { - bic->stably_merged = true; - if (new_bfqq->bic) - new_bfqq->bic->stably_merged = - true; - } - return new_bfqq; - } else - return NULL; + bfqq_data->stable_merge_bfqq = NULL; + + return bfq_setup_stable_merge(bfqd, bfqq, + stable_merge_bfqq, + bfqq_data); } } @@ -3031,6 +3035,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, static void bfq_bfqq_save_state(struct bfq_queue *bfqq) { struct bfq_io_cq *bic = bfqq->bic; + unsigned int a_idx = bfqq->actuator_idx; + struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; /* * If !bfqq->bic, the queue is already shared or its requests @@ -3040,18 +3046,21 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) if (!bic) return; - bic->saved_last_serv_time_ns = bfqq->last_serv_time_ns; - bic->saved_inject_limit = bfqq->inject_limit; - bic->saved_decrease_time_jif = bfqq->decrease_time_jif; - - bic->saved_weight = bfqq->entity.orig_weight; - bic->saved_ttime = bfqq->ttime; - bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); - bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bic->saved_io_start_time = bfqq->io_start_time; - bic->saved_tot_idle_time = bfqq->tot_idle_time; - bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); + bfqq_data->saved_last_serv_time_ns = bfqq->last_serv_time_ns; + bfqq_data->saved_inject_limit = bfqq->inject_limit; + bfqq_data->saved_decrease_time_jif = bfqq->decrease_time_jif; + + bfqq_data->saved_weight = bfqq->entity.orig_weight; + bfqq_data->saved_ttime = bfqq->ttime; + bfqq_data->saved_has_short_ttime = + bfq_bfqq_has_short_ttime(bfqq); + bfqq_data->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bfqq_data->saved_io_start_time = bfqq->io_start_time; + bfqq_data->saved_tot_idle_time = bfqq->tot_idle_time; + bfqq_data->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); + bfqq_data->was_in_burst_list = + !hlist_unhashed(&bfqq->burst_list_node); + if (unlikely(bfq_bfqq_just_created(bfqq) && !bfq_bfqq_in_large_burst(bfqq) && bfqq->bfqd->low_latency)) { @@ -3064,17 +3073,21 @@ static void 
bfq_bfqq_save_state(struct bfq_queue *bfqq) * to bfqq, so that to avoid that bfqq unjustly fails * to enjoy weight raising if split soon. */ - bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; - bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now(); - bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); - bic->saved_last_wr_start_finish = jiffies; + bfqq_data->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; + bfqq_data->saved_wr_start_at_switch_to_srt = + bfq_smallest_from_now(); + bfqq_data->saved_wr_cur_max_time = + bfq_wr_duration(bfqq->bfqd); + bfqq_data->saved_last_wr_start_finish = jiffies; } else { - bic->saved_wr_coeff = bfqq->wr_coeff; - bic->saved_wr_start_at_switch_to_srt = + bfqq_data->saved_wr_coeff = bfqq->wr_coeff; + bfqq_data->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; - bic->saved_service_from_wr = bfqq->service_from_wr; - bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; - bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + bfqq_data->saved_service_from_wr = + bfqq->service_from_wr; + bfqq_data->saved_last_wr_start_finish = + bfqq->last_wr_start_finish; + bfqq_data->saved_wr_cur_max_time = bfqq->wr_cur_max_time; } } @@ -3180,7 +3193,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, /* * Merge queues (that is, let bic redirect its requests to new_bfqq) */ - bic_set_bfqq(bic, new_bfqq, true); + bic_set_bfqq(bic, new_bfqq, true, bfqq->actuator_idx); bfq_mark_bfqq_coop(new_bfqq); /* * new_bfqq now belongs to at least two bics (it is a shared queue): @@ -3598,13 +3611,13 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) * - start a new observation interval with this dispatch */ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && - bfqd->rq_in_driver == 0) + bfqd->tot_rq_in_driver == 0) goto update_rate_and_reset; /* Update sampling information */ bfqd->peak_rate_samples++; - if ((bfqd->rq_in_driver > 0 || + if ((bfqd->tot_rq_in_driver > 0 || now_ns - bfqd->last_completion < BFQ_MIN_TT) && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq)) bfqd->sequential_samples++; @@ -3869,10 +3882,8 @@ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, return false; return (bfqq->wr_coeff > 1 && - (bfqd->wr_busy_queues < - tot_busy_queues || - bfqd->rq_in_driver >= - bfqq->dispatched + 4)) || + (bfqd->wr_busy_queues < tot_busy_queues || + bfqd->tot_rq_in_driver >= bfqq->dispatched + 4)) || bfq_asymmetric_scenario(bfqd, bfqq) || tot_busy_queues == 1; } @@ -4138,8 +4149,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, * function to evaluate the I/O speed of a process. */ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool compensate, enum bfqq_expiration reason, - unsigned long *delta_ms) + bool compensate, unsigned long *delta_ms) { ktime_t delta_ktime; u32 delta_usecs; @@ -4335,7 +4345,7 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, /* * Check whether the process is slow (see bfq_bfqq_is_slow). 
*/ - slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); + slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, &delta); /* * As above explained, charge slow (typically seeky) and @@ -4643,6 +4653,8 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) { struct bfq_queue *bfqq, *in_serv_bfqq = bfqd->in_service_queue; unsigned int limit = in_serv_bfqq->inject_limit; + int i; + /* * If * - bfqq is not weight-raised and therefore does not carry @@ -4674,7 +4686,7 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) ) limit = 1; - if (bfqd->rq_in_driver >= limit) + if (bfqd->tot_rq_in_driver >= limit) return NULL; /* @@ -4689,11 +4701,12 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) * (and re-added only if it gets new requests, but then it * is assigned again enough budget for its new backlog). */ - list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) - if (!RB_EMPTY_ROOT(&bfqq->sort_list) && - (in_serv_always_inject || bfqq->wr_coeff > 1) && - bfq_serv_to_charge(bfqq->next_rq, bfqq) <= - bfq_bfqq_budget_left(bfqq)) { + for (i = 0; i < bfqd->num_actuators; i++) { + list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list) + if (!RB_EMPTY_ROOT(&bfqq->sort_list) && + (in_serv_always_inject || bfqq->wr_coeff > 1) && + bfq_serv_to_charge(bfqq->next_rq, bfqq) <= + bfq_bfqq_budget_left(bfqq)) { /* * Allow for only one large in-flight request * on non-rotational devices, for the @@ -4713,27 +4726,80 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) */ if (blk_queue_nonrot(bfqd->queue) && blk_rq_sectors(bfqq->next_rq) >= - BFQQ_SECT_THR_NONROT) - limit = min_t(unsigned int, 1, limit); - else - limit = in_serv_bfqq->inject_limit; - - if (bfqd->rq_in_driver < limit) { + BFQQ_SECT_THR_NONROT && + bfqd->tot_rq_in_driver >= 1) + continue; + else { bfqd->rqs_injected = true; return bfqq; } } + } + + return NULL; +} + +static struct bfq_queue * +bfq_find_active_bfqq_for_actuator(struct bfq_data *bfqd, int idx) +{ + struct bfq_queue *bfqq; + + if (bfqd->in_service_queue && + bfqd->in_service_queue->actuator_idx == idx) + return bfqd->in_service_queue; + + list_for_each_entry(bfqq, &bfqd->active_list[idx], bfqq_list) { + if (!RB_EMPTY_ROOT(&bfqq->sort_list) && + bfq_serv_to_charge(bfqq->next_rq, bfqq) <= + bfq_bfqq_budget_left(bfqq)) { + return bfqq; + } + } + + return NULL; +} + +/* + * Perform a linear scan of each actuator, until an actuator is found + * for which the following three conditions hold: the load of the + * actuator is below the threshold (see comments on + * actuator_load_threshold for details) and lower than that of the + * next actuator (comments on this extra condition below), and there + * is a queue that contains I/O for that actuator. On success, return + * that queue. + * + * Performing a plain linear scan entails a prioritization among + * actuators. The extra condition above breaks this prioritization and + * tends to distribute injection uniformly across actuators. + */ +static struct bfq_queue * +bfq_find_bfqq_for_underused_actuator(struct bfq_data *bfqd) +{ + int i; + + for (i = 0 ; i < bfqd->num_actuators; i++) { + if (bfqd->rq_in_driver[i] < bfqd->actuator_load_threshold && + (i == bfqd->num_actuators - 1 || + bfqd->rq_in_driver[i] < bfqd->rq_in_driver[i+1])) { + struct bfq_queue *bfqq = + bfq_find_active_bfqq_for_actuator(bfqd, i); + + if (bfqq) + return bfqq; + } + } return NULL; } + /* * Select a queue for service. If we have a current queue in service, * check whether to continue servicing it, or retrieve and set a new one. 
*/ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) { - struct bfq_queue *bfqq; + struct bfq_queue *bfqq, *inject_bfqq; struct request *next_rq; enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT; @@ -4755,6 +4821,15 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) goto expire; check_queue: + /* + * If some actuator is underutilized, but the in-service + * queue does not contain I/O for that actuator, then try to + * inject I/O for that actuator. + */ + inject_bfqq = bfq_find_bfqq_for_underused_actuator(bfqd); + if (inject_bfqq && inject_bfqq != bfqq) + return inject_bfqq; + /* * This loop is rarely executed more than once. Even when it * happens, it is much more convenient to re-execute this loop @@ -4814,11 +4889,8 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) */ if (bfq_bfqq_wait_request(bfqq) || (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { - struct bfq_queue *async_bfqq = - bfqq->bic && bfqq->bic->bfqq[0] && - bfq_bfqq_busy(bfqq->bic->bfqq[0]) && - bfqq->bic->bfqq[0]->next_rq ? - bfqq->bic->bfqq[0] : NULL; + unsigned int act_idx = bfqq->actuator_idx; + struct bfq_queue *async_bfqq = NULL; struct bfq_queue *blocked_bfqq = !hlist_empty(&bfqq->woken_list) ? container_of(bfqq->woken_list.first, @@ -4826,6 +4898,10 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) woken_list_node) : NULL; + if (bfqq->bic && bfqq->bic->bfqq[0][act_idx] && + bfq_bfqq_busy(bfqq->bic->bfqq[0][act_idx]) && + bfqq->bic->bfqq[0][act_idx]->next_rq) + async_bfqq = bfqq->bic->bfqq[0][act_idx]; /* * The next four mutually-exclusive ifs decide * whether to try injection, and choose the queue to @@ -4910,7 +4986,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic && bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <= bfq_bfqq_budget_left(async_bfqq)) - bfqq = bfqq->bic->bfqq[0]; + bfqq = async_bfqq; else if (bfqq->waker_bfqq && bfq_bfqq_busy(bfqq->waker_bfqq) && bfqq->waker_bfqq->next_rq && @@ -5041,7 +5117,7 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, bfq_dispatch_remove(bfqd->queue, rq); if (bfqq != bfqd->in_service_queue) - goto return_rq; + return rq; /* * If weight raising has to terminate for bfqq, then next @@ -5061,12 +5137,9 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, * belongs to CLASS_IDLE and other queues are waiting for * service. */ - if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq))) - goto return_rq; - - bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); + if (bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)) + bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); -return_rq: return rq; } @@ -5109,11 +5182,11 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) /* * We exploit the bfq_finish_requeue_request hook to - * decrement rq_in_driver, but + * decrement tot_rq_in_driver, but * bfq_finish_requeue_request will not be invoked on * this request. So, to avoid unbalance, just start - * this request, without incrementing rq_in_driver. As - * a negative consequence, rq_in_driver is deceptively + * this request, without incrementing tot_rq_in_driver. As + * a negative consequence, tot_rq_in_driver is deceptively * lower than it should be while this request is in * service. This may cause bfq_schedule_dispatch to be * invoked uselessly. 
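
To see how the per-actuator rq_in_driver[] counters incremented in the dispatch path above feed back into injection, here is a small stand-alone sketch of the scan that bfq_find_bfqq_for_underused_actuator() performs: the first actuator whose in-driver load is below the threshold and strictly lower than the next actuator's load wins. This is a user-space illustration, not part of the patch; the function name, array contents, and threshold constant are made up (the threshold mirrors bfqd->actuator_load_threshold, set to 4 in bfq_init_queue() later in this patch).

#include <stdio.h>

#define LOAD_THRESHOLD 4	/* stands in for bfqd->actuator_load_threshold */

/* Mimic the linear scan in bfq_find_bfqq_for_underused_actuator(): an
 * actuator qualifies if its load is below the threshold and lower than
 * the next actuator's (the last actuator only needs the threshold test).
 */
static int pick_underused_actuator(const int rq_in_driver[], int n)
{
	int i;

	for (i = 0; i < n; i++) {
		if (rq_in_driver[i] < LOAD_THRESHOLD &&
		    (i == n - 1 || rq_in_driver[i] < rq_in_driver[i + 1]))
			return i;
	}
	return -1;	/* no underused actuator: no injection this time */
}

int main(void)
{
	int load[] = { 3, 1, 6, 2 };

	/* Prints "inject on actuator 1": actuator 0 is below the threshold
	 * but busier than actuator 1, so the extra condition skips it and
	 * injection is steered to the less loaded actuator.
	 */
	printf("inject on actuator %d\n", pick_underused_actuator(load, 4));
	return 0;
}

Without the extra comparison against the next actuator, the scan would always privilege low indexes; with it, injection tends to spread uniformly across actuators, which is exactly the rationale given in the comment above.
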
@@ -5122,7 +5195,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) * bfq_finish_requeue_request hook, if defined, is * probably invoked also on this request. So, by * exploiting this hook, we could 1) increment - * rq_in_driver here, and 2) decrement it in + * tot_rq_in_driver here, and 2) decrement it in * bfq_finish_requeue_request. Such a solution would * let the value of the counter be always accurate, * but it would entail using an extra interface @@ -5151,7 +5224,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) * Of course, serving one request at a time may cause loss of * throughput. */ - if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) + if (bfqd->strict_guarantees && bfqd->tot_rq_in_driver > 0) goto exit; bfqq = bfq_select_queue(bfqd); @@ -5162,7 +5235,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) if (rq) { inc_in_driver_start_rq: - bfqd->rq_in_driver++; + bfqd->rq_in_driver[bfqq->actuator_idx]++; + bfqd->tot_rq_in_driver++; start_rq: rq->rq_flags |= RQF_STARTED; } @@ -5349,8 +5423,6 @@ void bfq_put_cooperator(struct bfq_queue *bfqq) */ __bfqq = bfqq->new_bfqq; while (__bfqq) { - if (__bfqq == bfqq) - break; next = __bfqq->new_bfqq; bfq_put_queue(__bfqq); __bfqq = next; @@ -5371,48 +5443,55 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_release_process_ref(bfqd, bfqq); } -static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) +static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync, + unsigned int actuator_idx) { - struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); + struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, actuator_idx); struct bfq_data *bfqd; if (bfqq) bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ if (bfqq && bfqd) { - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); - bic_set_bfqq(bic, NULL, is_sync); + bic_set_bfqq(bic, NULL, is_sync, actuator_idx); bfq_exit_bfqq(bfqd, bfqq); - spin_unlock_irqrestore(&bfqd->lock, flags); } } static void bfq_exit_icq(struct io_cq *icq) { struct bfq_io_cq *bic = icq_to_bic(icq); + struct bfq_data *bfqd = bic_to_bfqd(bic); + unsigned long flags; + unsigned int act_idx; + /* + * If bfqd and thus bfqd->num_actuators is not available any + * longer, then cycle over all possible per-actuator bfqqs in + * next loop. We rely on bic being zeroed on creation, and + * therefore on its unused per-actuator fields being NULL. + */ + unsigned int num_actuators = BFQ_MAX_ACTUATORS; + struct bfq_iocq_bfqq_data *bfqq_data = bic->bfqq_data; - if (bic->stable_merge_bfqq) { - struct bfq_data *bfqd = bic->stable_merge_bfqq->bfqd; + /* + * bfqd is NULL if scheduler already exited, and in that case + * this is the last time these queues are accessed. + */ + if (bfqd) { + spin_lock_irqsave(&bfqd->lock, flags); + num_actuators = bfqd->num_actuators; + } - /* - * bfqd is NULL if scheduler already exited, and in - * that case this is the last time bfqq is accessed. 
- */ - if (bfqd) { - unsigned long flags; + for (act_idx = 0; act_idx < num_actuators; act_idx++) { + if (bfqq_data[act_idx].stable_merge_bfqq) + bfq_put_stable_ref(bfqq_data[act_idx].stable_merge_bfqq); - spin_lock_irqsave(&bfqd->lock, flags); - bfq_put_stable_ref(bic->stable_merge_bfqq); - spin_unlock_irqrestore(&bfqd->lock, flags); - } else { - bfq_put_stable_ref(bic->stable_merge_bfqq); - } + bfq_exit_icq_bfqq(bic, true, act_idx); + bfq_exit_icq_bfqq(bic, false, act_idx); } - bfq_exit_icq_bfqq(bic, true); - bfq_exit_icq_bfqq(bic, false); + if (bfqd) + spin_unlock_irqrestore(&bfqd->lock, flags); } /* @@ -5489,25 +5568,27 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) bic->ioprio = ioprio; - bfqq = bic_to_bfqq(bic, false); + bfqq = bic_to_bfqq(bic, false, bfq_actuator_index(bfqd, bio)); if (bfqq) { struct bfq_queue *old_bfqq = bfqq; bfqq = bfq_get_queue(bfqd, bio, false, bic, true); - bic_set_bfqq(bic, bfqq, false); + bic_set_bfqq(bic, bfqq, false, bfq_actuator_index(bfqd, bio)); bfq_release_process_ref(bfqd, old_bfqq); } - bfqq = bic_to_bfqq(bic, true); + bfqq = bic_to_bfqq(bic, true, bfq_actuator_index(bfqd, bio)); if (bfqq) bfq_set_next_ioprio_data(bfqq, bic); } static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_io_cq *bic, pid_t pid, int is_sync) + struct bfq_io_cq *bic, pid_t pid, int is_sync, + unsigned int act_idx) { u64 now_ns = ktime_get_ns(); + bfqq->actuator_idx = act_idx; RB_CLEAR_NODE(&bfqq->entity.rb_node); INIT_LIST_HEAD(&bfqq->fifo); INIT_HLIST_NODE(&bfqq->burst_list_node); @@ -5567,22 +5648,24 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, /* first request is almost certainly seeky */ bfqq->seek_history = 1; + + bfqq->decrease_time_jif = jiffies; } static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, struct bfq_group *bfqg, - int ioprio_class, int ioprio) + int ioprio_class, int ioprio, int act_idx) { switch (ioprio_class) { case IOPRIO_CLASS_RT: - return &bfqg->async_bfqq[0][ioprio]; + return &bfqg->async_bfqq[0][ioprio][act_idx]; case IOPRIO_CLASS_NONE: ioprio = IOPRIO_BE_NORM; fallthrough; case IOPRIO_CLASS_BE: - return &bfqg->async_bfqq[1][ioprio]; + return &bfqg->async_bfqq[1][ioprio][act_idx]; case IOPRIO_CLASS_IDLE: - return &bfqg->async_idle_bfqq; + return &bfqg->async_idle_bfqq[act_idx]; default: return NULL; } @@ -5593,6 +5676,7 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_io_cq *bic, struct bfq_queue *last_bfqq_created) { + unsigned int a_idx = last_bfqq_created->actuator_idx; struct bfq_queue *new_bfqq = bfq_setup_merge(bfqq, last_bfqq_created); @@ -5600,8 +5684,8 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, return bfqq; if (new_bfqq->bic) - new_bfqq->bic->stably_merged = true; - bic->stably_merged = true; + new_bfqq->bic->bfqq_data[a_idx].stably_merged = true; + bic->bfqq_data[a_idx].stably_merged = true; /* * Reusing merge functions. This implies that @@ -5676,9 +5760,13 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, * it has been set already, but too long ago, then move it * forward to bfqq. Finally, move also if bfqq belongs to a * different group than last_bfqq_created, or if bfqq has a - * different ioprio or ioprio_class. If none of these - * conditions holds true, then try an early stable merge or - * schedule a delayed stable merge. + * different ioprio, ioprio_class or actuator_idx. 
If none of + * these conditions holds true, then try an early stable merge + * or schedule a delayed stable merge. As for the condition on + * actuator_idx, the reason is that, if queues associated with + * different actuators are merged, then control is lost on + * each actuator. Therefore some actuator may be + * underutilized, and throughput may decrease. * * A delayed merge is scheduled (instead of performing an * early merge), in case bfqq might soon prove to be more @@ -5696,7 +5784,8 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, bfqq->creation_time) || bfqq->entity.parent != last_bfqq_created->entity.parent || bfqq->ioprio != last_bfqq_created->ioprio || - bfqq->ioprio_class != last_bfqq_created->ioprio_class) + bfqq->ioprio_class != last_bfqq_created->ioprio_class || + bfqq->actuator_idx != last_bfqq_created->actuator_idx) *source_bfqq = bfqq; else if (time_after_eq(last_bfqq_created->creation_time + bfqd->bfq_burst_interval, @@ -5726,7 +5815,8 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, /* * Record the bfqq to merge to. */ - bic->stable_merge_bfqq = last_bfqq_created; + bic->bfqq_data[last_bfqq_created->actuator_idx].stable_merge_bfqq = + last_bfqq_created; } } @@ -5748,7 +5838,8 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, bfqg = bfq_bio_bfqg(bfqd, bio); if (!is_sync) { async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, - ioprio); + ioprio, + bfq_actuator_index(bfqd, bio)); bfqq = *async_bfqq; if (bfqq) goto out; @@ -5760,7 +5851,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, if (bfqq) { bfq_init_bfqq(bfqd, bfqq, bic, current->pid, - is_sync); + is_sync, bfq_actuator_index(bfqd, bio)); bfq_init_entity(&bfqq->entity, bfqg); bfq_log_bfqq(bfqd, bfqq, "allocated"); } else { @@ -6075,7 +6166,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) * then complete the merge and redirect it to * new_bfqq. */ - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) + if (bic_to_bfqq(RQ_BIC(rq), true, + bfq_actuator_index(bfqd, rq->bio)) == bfqq) bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq); @@ -6213,7 +6305,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) struct bfq_queue *bfqq = bfqd->in_service_queue; bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, - bfqd->rq_in_driver); + bfqd->tot_rq_in_driver); if (bfqd->hw_tag == 1) return; @@ -6224,7 +6316,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) * sum is not exact, as it's not taking into account deactivated * requests. 
*/ - if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) + if (bfqd->tot_rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) return; /* @@ -6235,7 +6327,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) if (bfqq && bfq_bfqq_has_short_ttime(bfqq) && bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] < BFQ_HW_QUEUE_THRESHOLD && - bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) + bfqd->tot_rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) return; if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) @@ -6256,7 +6348,8 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) bfq_update_hw_tag(bfqd); - bfqd->rq_in_driver--; + bfqd->rq_in_driver[bfqq->actuator_idx]--; + bfqd->tot_rq_in_driver--; bfqq->dispatched--; if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { @@ -6268,7 +6361,8 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) */ bfqq->budget_timeout = jiffies; - bfq_weights_tree_remove(bfqd, bfqq); + bfq_del_bfqq_in_groups_with_pending_reqs(bfqq); + bfq_weights_tree_remove(bfqq); } now_ns = ktime_get_ns(); @@ -6375,7 +6469,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) BFQQE_NO_MORE_REQUESTS); } - if (!bfqd->rq_in_driver) + if (!bfqd->tot_rq_in_driver) bfq_schedule_dispatch(bfqd); } @@ -6506,13 +6600,13 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, * conditions to do it, or we can lower the last base value * computed. * - * NOTE: (bfqd->rq_in_driver == 1) means that there is no I/O + * NOTE: (bfqd->tot_rq_in_driver == 1) means that there is no I/O * request in flight, because this function is in the code * path that handles the completion of a request of bfqq, and, * in particular, this function is executed before - * bfqd->rq_in_driver is decremented in such a code path. + * bfqd->tot_rq_in_driver is decremented in such a code path. 
*/ - if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) || + if ((bfqq->last_serv_time_ns == 0 && bfqd->tot_rq_in_driver == 1) || tot_time_ns < bfqq->last_serv_time_ns) { if (bfqq->last_serv_time_ns == 0) { /* @@ -6522,7 +6616,7 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, bfqq->inject_limit = max_t(unsigned int, 1, old_limit); } bfqq->last_serv_time_ns = tot_time_ns; - } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1) + } else if (!bfqd->rqs_injected && bfqd->tot_rq_in_driver == 1) /* * No I/O injected and no request still in service in * the drive: these are the exact conditions for @@ -6629,7 +6723,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) return bfqq; } - bic_set_bfqq(bic, NULL, true); + bic_set_bfqq(bic, NULL, true, bfqq->actuator_idx); bfq_put_cooperator(bfqq); @@ -6643,7 +6737,9 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, bool split, bool is_sync, bool *new_queue) { - struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); + unsigned int act_idx = bfq_actuator_index(bfqd, bio); + struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, act_idx); + struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[act_idx]; if (likely(bfqq && bfqq != &bfqd->oom_bfqq)) return bfqq; @@ -6655,14 +6751,14 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, bfq_put_queue(bfqq); bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, split); - bic_set_bfqq(bic, bfqq, is_sync); + bic_set_bfqq(bic, bfqq, is_sync, act_idx); if (split && is_sync) { - if ((bic->was_in_burst_list && bfqd->large_burst) || - bic->saved_in_large_burst) + if ((bfqq_data->was_in_burst_list && bfqd->large_burst) || + bfqq_data->saved_in_large_burst) bfq_mark_bfqq_in_large_burst(bfqq); else { bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) + if (bfqq_data->was_in_burst_list) /* * If bfqq was in the current * burst list before being @@ -6751,19 +6847,20 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) struct bfq_queue *bfqq; bool new_queue = false; bool bfqq_already_existing = false, split = false; + unsigned int a_idx = bfq_actuator_index(bfqd, bio); if (unlikely(!rq->elv.icq)) return NULL; /* - * Assuming that elv.priv[1] is set only if everything is set + * Assuming that RQ_BFQQ(rq) is set only if everything is set * for this rq. This holds true, because this function is * invoked only for insertion or merging, and, after such * events, a request cannot be manipulated any longer before * being removed from bfq. */ - if (rq->elv.priv[1]) - return rq->elv.priv[1]; + if (RQ_BFQQ(rq)) + return RQ_BFQQ(rq); bic = icq_to_bic(rq->elv.icq); @@ -6777,12 +6874,13 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) if (likely(!new_queue)) { /* If the queue was seeky for too long, break it apart. 
*/ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) && - !bic->stably_merged) { + !bic->bfqq_data[a_idx].stably_merged) { struct bfq_queue *old_bfqq = bfqq; /* Update bic before losing reference to bfqq */ if (bfq_bfqq_in_large_burst(bfqq)) - bic->saved_in_large_burst = true; + bic->bfqq_data[a_idx].saved_in_large_burst = + true; bfqq = bfq_split_bfqq(bic, bfqq); split = true; @@ -6965,13 +7063,15 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, */ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) { - int i, j; + int i, j, k; - for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_NR_LEVELS; j++) - __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); + for (k = 0; k < bfqd->num_actuators; k++) { + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_NR_LEVELS; j++) + __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j][k]); - __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); + __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq[k]); + } } /* @@ -7082,6 +7182,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) { struct bfq_data *bfqd; struct elevator_queue *eq; + unsigned int i; + struct blk_independent_access_ranges *ia_ranges = q->disk->ia_ranges; eq = elevator_alloc(q, e); if (!eq) @@ -7102,8 +7204,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. * Grab a permanent reference to it, so that the normal code flow * will not attempt to free it. + * Set zero as actuator index: we will pretend that + * all I/O requests are for the same actuator. */ - bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); + bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0, 0); bfqd->oom_bfqq.ref++; bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; @@ -7122,6 +7226,39 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->queue = q; + bfqd->num_actuators = 1; + /* + * If the disk supports multiple actuators, copy independent + * access ranges from the request queue structure. + */ + spin_lock_irq(&q->queue_lock); + if (ia_ranges) { + /* + * Check if the disk ia_ranges size exceeds the current bfq + * actuator limit. 
+ */ + if (ia_ranges->nr_ia_ranges > BFQ_MAX_ACTUATORS) { + pr_crit("nr_ia_ranges higher than act limit: iars=%d, max=%d.\n", + ia_ranges->nr_ia_ranges, BFQ_MAX_ACTUATORS); + pr_crit("Falling back to single actuator mode.\n"); + } else { + bfqd->num_actuators = ia_ranges->nr_ia_ranges; + + for (i = 0; i < bfqd->num_actuators; i++) { + bfqd->sector[i] = ia_ranges->ia_range[i].sector; + bfqd->nr_sectors[i] = + ia_ranges->ia_range[i].nr_sectors; + } + } + } + + /* Otherwise use single-actuator dev info */ + if (bfqd->num_actuators == 1) { + bfqd->sector[0] = 0; + bfqd->nr_sectors[0] = get_capacity(q->disk); + } + spin_unlock_irq(&q->queue_lock); + INIT_LIST_HEAD(&bfqd->dispatch); hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, @@ -7129,9 +7266,12 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->idle_slice_timer.function = bfq_idle_slice_timer; bfqd->queue_weights_tree = RB_ROOT_CACHED; +#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqd->num_groups_with_pending_reqs = 0; +#endif - INIT_LIST_HEAD(&bfqd->active_list); + INIT_LIST_HEAD(&bfqd->active_list[0]); + INIT_LIST_HEAD(&bfqd->active_list[1]); INIT_LIST_HEAD(&bfqd->idle_list); INIT_HLIST_HEAD(&bfqd->burst_list); @@ -7157,7 +7297,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) */ bfqd->bfq_wr_coeff = 30; bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); - bfqd->bfq_wr_max_time = 0; bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); bfqd->bfq_wr_max_softrt_rate = 7000; /* @@ -7176,6 +7315,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) ref_wr_duration[blk_queue_nonrot(bfqd->queue)]; bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3; + /* see comments on the definition of next field inside bfq_data */ + bfqd->actuator_load_threshold = 4; + spin_lock_init(&bfqd->lock); /* @@ -7473,6 +7615,7 @@ MODULE_ALIAS("bfq-iosched"); static int __init bfq_init(void) { int ret; + char msg[60] = "BFQ I/O-scheduler: BFQ-CachyOS v6.1"; #ifdef CONFIG_BFQ_GROUP_IOSCHED ret = blkcg_policy_register(&blkcg_policy_bfq); @@ -7504,6 +7647,11 @@ static int __init bfq_init(void) if (ret) goto slab_kill; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + strcat(msg, " (with cgroups support)"); +#endif + pr_info("%s", msg); + return 0; slab_kill: diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 71f721670ab6..75cc6a324267 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -33,6 +33,14 @@ */ #define BFQ_SOFTRT_WEIGHT_FACTOR 100 +/* + * Maximum number of actuators supported. This constant is used simply + * to define the size of the static array that will contain + * per-actuator data. The current value is hopefully a good upper + * bound to the possible number of actuators of any actual drive. + */ +#define BFQ_MAX_ACTUATORS 8 + struct bfq_entity; /** @@ -197,8 +205,10 @@ struct bfq_entity { /* flag, set to request a weight, ioprio or ioprio_class change */ int prio_changed; +#ifdef CONFIG_BFQ_GROUP_IOSCHED /* flag, set if the entity is counted in groups_with_pending_reqs */ bool in_groups_with_pending_reqs; +#endif /* last child queue of entity created (for non-leaf entities) */ struct bfq_queue *last_bfqq_created; @@ -225,12 +235,14 @@ struct bfq_ttime { * struct bfq_queue - leaf schedulable entity. * * A bfq_queue is a leaf request queue; it can be associated with an - * io_context or more, if it is async or shared between cooperating - * processes. 
@cgroup holds a reference to the cgroup, to be sure that it
- * does not disappear while a bfqq still references it (mostly to avoid
- * races between request issuing and task migration followed by cgroup
- * destruction).
- * All the fields are protected by the queue lock of the containing bfqd.
+ * io_context or more, if it is async or shared between cooperating
+ * processes. Besides, it contains I/O requests for only one actuator
+ * (an io_context is associated with a different bfq_queue for each
+ * actuator it generates I/O for). @cgroup holds a reference to the
+ * cgroup, to be sure that it does not disappear while a bfqq still
+ * references it (mostly to avoid races between request issuing and
+ * task migration followed by cgroup destruction). All the fields are
+ * protected by the queue lock of the containing bfqd.
  */
 struct bfq_queue {
 	/* reference counter */
@@ -395,24 +407,18 @@ struct bfq_queue {
 	 * the woken queues when this queue exits.
 	 */
 	struct hlist_head woken_list;
+
+	/* index of the actuator this queue is associated with */
+	unsigned int actuator_idx;
 };
 
 /**
- * struct bfq_io_cq - per (request_queue, io_context) structure.
- */
-struct bfq_io_cq {
-	/* associated io_cq structure */
-	struct io_cq icq; /* must be the first member */
-	/* array of two process queues, the sync and the async */
-	struct bfq_queue *bfqq[2];
-	/* per (request_queue, blkcg) ioprio */
-	int ioprio;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	uint64_t blkcg_serial_nr; /* the current blkcg serial */
-#endif
+* struct bfq_iocq_bfqq_data - bfqq data unique and persistent for associated bfq_io_cq
+*/
+struct bfq_iocq_bfqq_data {
 	/*
 	 * Snapshot of the has_short_ttime flag before merging; taken
-	 * to remember its value while the queue is merged, so as to
+	 * to remember its values while the queue is merged, so as to
 	 * be able to restore it in case of split.
 	 */
 	bool saved_has_short_ttime;
@@ -426,7 +432,7 @@ struct bfq_io_cq {
 	u64 saved_tot_idle_time;
 
 	/*
-	 * Same purpose as the previous fields for the value of the
+	 * Same purpose as the previous fields for the values of the
 	 * field keeping the queue's belonging to a large burst
 	 */
 	bool saved_in_large_burst;
@@ -464,6 +470,38 @@ struct bfq_io_cq {
 	struct bfq_queue *stable_merge_bfqq;
 
 	bool stably_merged;	/* non splittable if true */
+};
+
+/**
+ * struct bfq_io_cq - per (request_queue, io_context) structure.
+ */
+struct bfq_io_cq {
+	/* associated io_cq structure */
+	struct io_cq icq; /* must be the first member */
+	/*
+	 * Matrix of associated process queues: first row for async
+	 * queues, second row sync queues. Each row contains one
+	 * column for each actuator. An I/O request generated by the
+	 * process is inserted into the queue pointed by bfqq[i][j] if
+	 * the request is to be served by the j-th actuator of the
+	 * drive, where i==0 or i==1, depending on whether the request
+	 * is async or sync. So there is a distinct queue for each
+	 * actuator.
+	 */
+	struct bfq_queue *bfqq[2][BFQ_MAX_ACTUATORS];
+	/* per (request_queue, blkcg) ioprio */
+	int ioprio;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	uint64_t blkcg_serial_nr; /* the current blkcg serial */
+#endif
+
+	/*
+	 * Persistent data for associated synchronous process queues
+	 * (one queue per actuator, see field bfqq above). In
+	 * particular, each of these queues may undergo a merge.
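The bfqq[2][BFQ_MAX_ACTUATORS] matrix above gives every (sync, actuator) pair its own queue. A minimal sketch of the lookup and store, mirroring the actuator_idx parameter that bic_to_bfqq() and bic_set_bfqq() gain in this patch (the toy_* types are illustrative; the real accessors live in bfq-iosched.c):

#include <stdbool.h>

#define BFQ_MAX_ACTUATORS 8

struct toy_bfqq { int refcount; };

struct toy_bic {
	/* row 0: async queues, row 1: sync queues; one column per actuator */
	struct toy_bfqq *bfqq[2][BFQ_MAX_ACTUATORS];
};

static struct toy_bfqq *toy_bic_to_bfqq(struct toy_bic *bic, bool is_sync,
					unsigned int actuator_idx)
{
	return bic->bfqq[is_sync][actuator_idx];
}

static void toy_bic_set_bfqq(struct toy_bic *bic, struct toy_bfqq *bfqq,
			     bool is_sync, unsigned int actuator_idx)
{
	bic->bfqq[is_sync][actuator_idx] = bfqq;
}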
+ */ + struct bfq_iocq_bfqq_data bfqq_data[BFQ_MAX_ACTUATORS]; + unsigned int requests; /* Number of requests this process has in flight */ }; @@ -491,28 +529,29 @@ struct bfq_data { */ struct rb_root_cached queue_weights_tree; +#ifdef CONFIG_BFQ_GROUP_IOSCHED /* - * Number of groups with at least one descendant process that + * Number of groups with at least one process that * has at least one request waiting for completion. Note that * this accounts for also requests already dispatched, but not * yet completed. Therefore this number of groups may differ * (be larger) than the number of active groups, as a group is * considered active only if its corresponding entity has - * descendant queues with at least one request queued. This + * queues with at least one request queued. This * number is used to decide whether a scenario is symmetric. * For a detailed explanation see comments on the computation * of the variable asymmetric_scenario in the function * bfq_better_to_idle(). * * However, it is hard to compute this number exactly, for - * groups with multiple descendant processes. Consider a group - * that is inactive, i.e., that has no descendant process with + * groups with multiple processes. Consider a group + * that is inactive, i.e., that has no process with * pending I/O inside BFQ queues. Then suppose that * num_groups_with_pending_reqs is still accounting for this - * group, because the group has descendant processes with some + * group, because the group has processes with some * I/O request still in flight. num_groups_with_pending_reqs * should be decremented when the in-flight request of the - * last descendant process is finally completed (assuming that + * last process is finally completed (assuming that * nothing else has changed for the group in the meantime, in * terms of composition of the group and active/inactive state of child * groups and processes). To accomplish this, an additional @@ -521,7 +560,7 @@ struct bfq_data { * we resort to the following tradeoff between simplicity and * accuracy: for an inactive group that is still counted in * num_groups_with_pending_reqs, we decrement - * num_groups_with_pending_reqs when the first descendant + * num_groups_with_pending_reqs when the first * process of the group remains with no request waiting for * completion. * @@ -529,15 +568,16 @@ struct bfq_data { * carefulness: to avoid multiple decrements, we flag a group, * more precisely an entity representing a group, as still * counted in num_groups_with_pending_reqs when it becomes - * inactive. Then, when the first descendant queue of the + * inactive. Then, when the first queue of the * entity remains with no request waiting for completion, * num_groups_with_pending_reqs is decremented, and this flag * is reset. After this flag is reset for the entity, * num_groups_with_pending_reqs won't be decremented any - * longer in case a new descendant queue of the entity remains + * longer in case a new queue of the entity remains * with no request waiting for completion. 
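The decrement-once rule described above is easiest to see in a toy run. The model below mirrors the bfq_add_bfqq_in_groups_with_pending_reqs() and bfq_del_bfqq_in_groups_with_pending_reqs() helpers this patch adds in bfq-wf2q.c: a per-entity flag guards a per-group queue counter, and only that counter's 0 <-> 1 transitions move the global group counter.

#include <stdbool.h>
#include <stdio.h>

struct toy_group { int num_queues_with_pending_reqs; };

struct toy_queue {
	struct toy_group *group;
	bool in_groups_with_pending_reqs; /* per-entity flag */
};

static int num_groups_with_pending_reqs;

static void toy_add(struct toy_queue *q)
{
	if (!q->in_groups_with_pending_reqs) {
		q->in_groups_with_pending_reqs = true;
		if (!(q->group->num_queues_with_pending_reqs++))
			num_groups_with_pending_reqs++;
	}
}

static void toy_del(struct toy_queue *q)
{
	if (q->in_groups_with_pending_reqs) {
		q->in_groups_with_pending_reqs = false;
		if (!(--q->group->num_queues_with_pending_reqs))
			num_groups_with_pending_reqs--;
	}
}

int main(void)
{
	struct toy_group g = { 0 };
	struct toy_queue q1 = { &g, false }, q2 = { &g, false };

	toy_add(&q1);            /* first busy queue: group becomes counted */
	toy_add(&q2);            /* second queue: global counter unchanged */
	printf("groups pending: %d\n", num_groups_with_pending_reqs); /* 1 */
	toy_del(&q1);
	printf("groups pending: %d\n", num_groups_with_pending_reqs); /* 1 */
	toy_del(&q2);            /* last queue gone: group no longer counted */
	printf("groups pending: %d\n", num_groups_with_pending_reqs); /* 0 */
	return 0;
}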
*/ unsigned int num_groups_with_pending_reqs; +#endif /* * Per-class (RT, BE, IDLE) number of bfq_queues containing @@ -550,7 +590,12 @@ struct bfq_data { /* number of queued requests */ int queued; /* number of requests dispatched and waiting for completion */ - int rq_in_driver; + int tot_rq_in_driver; + /* + * number of requests dispatched and waiting for completion + * for each actuator + */ + int rq_in_driver[BFQ_MAX_ACTUATORS]; /* true if the device is non rotational and performs queueing */ bool nonrot_with_queueing; @@ -644,8 +689,13 @@ struct bfq_data { /* maximum budget allotted to a bfq_queue before rescheduling */ int bfq_max_budget; - /* list of all the bfq_queues active on the device */ - struct list_head active_list; + /* + * List of all the bfq_queues active for a specific actuator + * on the device. Keeping active queues separate on a + * per-actuator basis helps implementing per-actuator + * injection more efficiently. + */ + struct list_head active_list[BFQ_MAX_ACTUATORS]; /* list of all the bfq_queues idle on the device */ struct list_head idle_list; @@ -719,8 +769,6 @@ struct bfq_data { * is multiplied. */ unsigned int bfq_wr_coeff; - /* maximum duration of a weight-raising period (jiffies) */ - unsigned int bfq_wr_max_time; /* Maximum weight-raising duration for soft real-time processes */ unsigned int bfq_wr_rt_max_time; @@ -768,6 +816,42 @@ struct bfq_data { */ unsigned int word_depths[2][2]; unsigned int full_depth_shift; + + /* + * Number of independent actuators. This is equal to 1 in + * case of single-actuator drives. + */ + unsigned int num_actuators; + /* + * Disk independent access ranges for each actuator + * in this device. + */ + sector_t sector[BFQ_MAX_ACTUATORS]; + sector_t nr_sectors[BFQ_MAX_ACTUATORS]; + struct blk_independent_access_range ia_ranges[BFQ_MAX_ACTUATORS]; + + /* + * If the number of I/O requests queued in the device for a + * given actuator is below next threshold, then the actuator + * is deemed as underutilized. If this condition is found to + * hold for some actuator upon a dispatch, but (i) the + * in-service queue does not contain I/O for that actuator, + * while (ii) some other queue does contain I/O for that + * actuator, then the head I/O request of the latter queue is + * returned (injected), instead of the head request of the + * currently in-service queue. + * + * We set the threshold, empirically, to the minimum possible + * value for which an actuator is fully utilized, or close to + * be fully utilized. By doing so, injected I/O 'steals' as + * few drive-queue slots as possibile to the in-service + * queue. This reduces as much as possible the probability + * that the service of I/O from the in-service bfq_queue gets + * delayed because of slot exhaustion, i.e., because all the + * slots of the drive queue are filled with I/O injected from + * other queues (NCQ provides for 32 slots). + */ + unsigned int actuator_load_threshold; }; enum bfqq_state_flags { @@ -924,21 +1008,22 @@ struct bfq_group { char blkg_path[128]; /* reference counter (see comments in bfq_bic_update_cgroup) */ - int ref; + refcount_t ref; /* Is bfq_group still online? 
*/ bool online; struct bfq_entity entity; struct bfq_sched_data sched_data; - void *bfqd; + struct bfq_data *bfqd; - struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; - struct bfq_queue *async_idle_bfqq; + struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS]; + struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS]; struct bfq_entity *my_entity; int active_entities; + int num_queues_with_pending_reqs; struct rb_root rq_pos_tree; @@ -950,8 +1035,8 @@ struct bfq_group { struct bfq_entity entity; struct bfq_sched_data sched_data; - struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; - struct bfq_queue *async_idle_bfqq; + struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS]; + struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS]; struct rb_root rq_pos_tree; }; @@ -964,17 +1049,14 @@ struct bfq_group { extern const int bfq_timeout; -struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync); -void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync); +struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync, + unsigned int actuator_idx); +void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync, + unsigned int actuator_idx); struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); -void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct rb_root_cached *root); -void __bfq_weights_tree_remove(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct rb_root_cached *root); -void bfq_weights_tree_remove(struct bfq_data *bfqd, - struct bfq_queue *bfqq); +void bfq_weights_tree_add(struct bfq_queue *bfqq); +void bfq_weights_tree_remove(struct bfq_queue *bfqq); void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool compensate, enum bfqq_expiration reason); void bfq_put_queue(struct bfq_queue *bfqq); @@ -1078,6 +1160,8 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool expiration); void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration); void bfq_add_bfqq_busy(struct bfq_queue *bfqq); +void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq); +void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq); /* --------------- end of interface of B-WF2Q+ ---------------- */ diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 8fc3da4c23bb..7941b6f07391 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -218,6 +218,24 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) return false; } +static void bfq_inc_active_entities(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_group *bfqg = container_of(sd, struct bfq_group, sched_data); + + if (bfqg != bfqg->bfqd->root_group) + bfqg->active_entities++; +} + +static void bfq_dec_active_entities(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_group *bfqg = container_of(sd, struct bfq_group, sched_data); + + if (bfqg != bfqg->bfqd->root_group) + bfqg->active_entities--; +} + #else /* CONFIG_BFQ_GROUP_IOSCHED */ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) @@ -230,6 +248,14 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) return true; } +static void bfq_inc_active_entities(struct bfq_entity *entity) +{ +} + +static void bfq_dec_active_entities(struct bfq_entity *entity) +{ +} + #endif /* CONFIG_BFQ_GROUP_IOSCHED */ /* @@ -456,11 +482,6 @@ static void 
bfq_active_insert(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); struct rb_node *node = &entity->rb_node; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_sched_data *sd = NULL; - struct bfq_group *bfqg = NULL; - struct bfq_data *bfqd = NULL; -#endif bfq_insert(&st->active, entity); @@ -471,17 +492,10 @@ static void bfq_active_insert(struct bfq_service_tree *st, bfq_update_active_tree(node); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - sd = entity->sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - bfqd = (struct bfq_data *)bfqg->bfqd; -#endif if (bfqq) - list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - if (bfqg != bfqd->root_group) - bfqg->active_entities++; -#endif + list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list[bfqq->actuator_idx]); + + bfq_inc_active_entities(entity); } /** @@ -558,29 +572,16 @@ static void bfq_active_extract(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); struct rb_node *node; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_sched_data *sd = NULL; - struct bfq_group *bfqg = NULL; - struct bfq_data *bfqd = NULL; -#endif node = bfq_find_deepest(&entity->rb_node); bfq_extract(&st->active, entity); if (node) bfq_update_active_tree(node); - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - sd = entity->sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - bfqd = (struct bfq_data *)bfqg->bfqd; -#endif if (bfqq) list_del(&bfqq->bfqq_list); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - if (bfqg != bfqd->root_group) - bfqg->active_entities--; -#endif + + bfq_dec_active_entities(entity); } /** @@ -706,22 +707,6 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, if (entity->prio_changed) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); unsigned int prev_weight, new_weight; - struct bfq_data *bfqd = NULL; - struct rb_root_cached *root; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_sched_data *sd; - struct bfq_group *bfqg; -#endif - - if (bfqq) - bfqd = bfqq->bfqd; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { - sd = entity->my_sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - bfqd = (struct bfq_data *)bfqg->bfqd; - } -#endif /* Matches the smp_wmb() in bfq_group_set_weight. */ smp_rmb(); @@ -770,19 +755,15 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, * queue, remove the entity from its old weight counter (if * there is a counter associated with the entity). */ - if (prev_weight != new_weight && bfqq) { - root = &bfqd->queue_weights_tree; - __bfq_weights_tree_remove(bfqd, bfqq, root); - } + if (prev_weight != new_weight && bfqq) + bfq_weights_tree_remove(bfqq); entity->weight = new_weight; /* * Add the entity, if it is not a weight-raised queue, * to the counter associated with its new weight. */ - if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1) { - /* If we get here, root has been initialized. 
*/ - bfq_weights_tree_add(bfqd, bfqq, root); - } + if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1) + bfq_weights_tree_add(bfqq); new_st->wsum += entity->weight; @@ -984,19 +965,6 @@ static void __bfq_activate_entity(struct bfq_entity *entity, entity->on_st_or_in_serv = true; } -#ifdef CONFIG_BFQ_GROUP_IOSCHED - if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */ - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - struct bfq_data *bfqd = bfqg->bfqd; - - if (!entity->in_groups_with_pending_reqs) { - entity->in_groups_with_pending_reqs = true; - bfqd->num_groups_with_pending_reqs++; - } - } -#endif - bfq_update_fin_time_enqueue(entity, st, backshifted); } @@ -1082,12 +1050,12 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) } static void __bfq_activate_requeue_entity(struct bfq_entity *entity, - struct bfq_sched_data *sd, bool non_blocking_wait_rq) { struct bfq_service_tree *st = bfq_entity_service_tree(entity); - if (sd->in_service_entity == entity || entity->tree == &st->active) + if (entity->sched_data->in_service_entity == entity || + entity->tree == &st->active) /* * in service or already queued on the active tree, * requeue or reposition @@ -1119,14 +1087,10 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity, bool non_blocking_wait_rq, bool requeue, bool expiration) { - struct bfq_sched_data *sd; - for_each_entity(entity) { - sd = entity->sched_data; - __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); - - if (!bfq_update_next_in_service(sd, entity, expiration) && - !requeue) + __bfq_activate_requeue_entity(entity, non_blocking_wait_rq); + if (!bfq_update_next_in_service(entity->sched_data, entity, + expiration) && !requeue) break; } } @@ -1646,6 +1610,32 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq == bfqd->in_service_queue, expiration); } +void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq) +{ +#ifdef CONFIG_BFQ_GROUP_IOSCHED + struct bfq_entity *entity = &bfqq->entity; + + if (!entity->in_groups_with_pending_reqs) { + entity->in_groups_with_pending_reqs = true; + if (!(bfqq_group(bfqq)->num_queues_with_pending_reqs++)) + bfqq->bfqd->num_groups_with_pending_reqs++; + } +#endif +} + +void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq) +{ +#ifdef CONFIG_BFQ_GROUP_IOSCHED + struct bfq_entity *entity = &bfqq->entity; + + if (entity->in_groups_with_pending_reqs) { + entity->in_groups_with_pending_reqs = false; + if (!(--bfqq_group(bfqq)->num_queues_with_pending_reqs)) + bfqq->bfqd->num_groups_with_pending_reqs--; + } +#endif +} + /* * Called when the bfqq no longer has requests pending, remove it from * the service tree. As a special case, it can be invoked during an @@ -1668,8 +1658,14 @@ void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration) bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); - if (!bfqq->dispatched) - bfq_weights_tree_remove(bfqd, bfqq); + if (!bfqq->dispatched) { + bfq_del_bfqq_in_groups_with_pending_reqs(bfqq); + /* + * Next function is invoked last, because it causes bfqq to be + * freed. DO NOT use bfqq after the next function invocation. 
+ */ + bfq_weights_tree_remove(bfqq); + } } /* @@ -1686,10 +1682,11 @@ void bfq_add_bfqq_busy(struct bfq_queue *bfqq) bfq_mark_bfqq_busy(bfqq); bfqd->busy_queues[bfqq->ioprio_class - 1]++; - if (!bfqq->dispatched) + if (!bfqq->dispatched) { + bfq_add_bfqq_in_groups_with_pending_reqs(bfqq); if (bfqq->wr_coeff == 1) - bfq_weights_tree_add(bfqd, bfqq, - &bfqd->queue_weights_tree); + bfq_weights_tree_add(bfqq); + } if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues++; -- 2.40.0 From d3a08dfc47f09c8d759585ccabed43a94b349937 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Tue, 14 Feb 2023 20:39:48 +0100 Subject: [PATCH 03/17] bitmap Signed-off-by: Peter Jung --- include/linux/bitmap.h | 46 ++++++------- include/linux/cpumask.h | 144 +++++++++++++++++++-------------------- include/linux/find.h | 40 +++++------ include/linux/nodemask.h | 86 +++++++++++------------ 4 files changed, 158 insertions(+), 158 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 7d6d73b78147..40e53a2ecc0d 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -189,7 +189,7 @@ unsigned long bitmap_find_next_zero_area_off(unsigned long *map, * the bit offset of all zero areas this function finds is multiples of that * power of 2. A @align_mask of 0 means no alignment is required. */ -static inline unsigned long +static __always_inline unsigned long bitmap_find_next_zero_area(unsigned long *map, unsigned long size, unsigned long start, @@ -237,7 +237,7 @@ extern int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp, #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) -static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) +static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); @@ -247,7 +247,7 @@ static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) memset(dst, 0, len); } -static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) +static __always_inline void bitmap_fill(unsigned long *dst, unsigned int nbits) { unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); @@ -257,7 +257,7 @@ static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) memset(dst, 0xff, len); } -static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, +static __always_inline void bitmap_copy(unsigned long *dst, const unsigned long *src, unsigned int nbits) { unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); @@ -271,7 +271,7 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, /* * Copy bitmap and clear tail bits in last word. 
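A note on why this patch converts bitmap.h wholesale from inline to __always_inline: the small_const_nbits() branches seen above only collapse to single-word operations if the helper is really inlined, so that nbits remains a compile-time constant at the branch. A self-contained sketch of the pattern; the two macros are re-derived here to keep the example standalone:

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define BITS_PER_LONG (CHAR_BIT * sizeof(long))

/* same idea as the kernel's small_const_nbits(): a compile-time
 * constant that fits in a single word */
#define small_const_nbits(nbits) \
	(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)

#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))

__attribute__((always_inline))
static inline bool toy_bitmap_empty(const unsigned long *src, unsigned int nbits)
{
	if (small_const_nbits(nbits))
		/* single-word test: folds to one AND at compile time */
		return !(*src & BITMAP_LAST_WORD_MASK(nbits));

	/* generic multi-word path (the kernel calls find_first_bit() here) */
	for (unsigned int i = 0; i < nbits / BITS_PER_LONG; i++)
		if (src[i])
			return false;
	if (nbits % BITS_PER_LONG)
		return !(src[nbits / BITS_PER_LONG] & BITMAP_LAST_WORD_MASK(nbits));
	return true;
}

int main(void)
{
	unsigned long w = 0;

	printf("%d\n", toy_bitmap_empty(&w, 16)); /* constant nbits: fast path */
	return 0;
}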
*/ -static inline void bitmap_copy_clear_tail(unsigned long *dst, +static __always_inline void bitmap_copy_clear_tail(unsigned long *dst, const unsigned long *src, unsigned int nbits) { bitmap_copy(dst, src, nbits); @@ -317,7 +317,7 @@ void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits); bitmap_copy_clear_tail((unsigned long *)(buf), (const unsigned long *)(bitmap), (nbits)) #endif -static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, +static __always_inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) @@ -325,7 +325,7 @@ static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, return __bitmap_and(dst, src1, src2, nbits); } -static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, +static __always_inline void bitmap_or(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) @@ -334,7 +334,7 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, __bitmap_or(dst, src1, src2, nbits); } -static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, +static __always_inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) @@ -343,7 +343,7 @@ static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, __bitmap_xor(dst, src1, src2, nbits); } -static inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, +static __always_inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) @@ -351,7 +351,7 @@ static inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, return __bitmap_andnot(dst, src1, src2, nbits); } -static inline void bitmap_complement(unsigned long *dst, const unsigned long *src, +static __always_inline void bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) @@ -367,7 +367,7 @@ static inline void bitmap_complement(unsigned long *dst, const unsigned long *sr #endif #define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1) -static inline bool bitmap_equal(const unsigned long *src1, +static __always_inline bool bitmap_equal(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) @@ -387,7 +387,7 @@ static inline bool bitmap_equal(const unsigned long *src1, * * Returns: True if (*@src1 | *@src2) == *@src3, false otherwise */ -static inline bool bitmap_or_equal(const unsigned long *src1, +static __always_inline bool bitmap_or_equal(const unsigned long *src1, const unsigned long *src2, const unsigned long *src3, unsigned int nbits) @@ -398,7 +398,7 @@ static inline bool bitmap_or_equal(const unsigned long *src1, return !(((*src1 | *src2) ^ *src3) & BITMAP_LAST_WORD_MASK(nbits)); } -static inline bool bitmap_intersects(const unsigned long *src1, +static __always_inline bool bitmap_intersects(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { @@ -408,7 +408,7 @@ static inline bool bitmap_intersects(const unsigned long *src1, return __bitmap_intersects(src1, src2, nbits); } -static inline bool bitmap_subset(const unsigned long *src1, +static __always_inline bool bitmap_subset(const unsigned long *src1, const unsigned long *src2, unsigned 
int nbits) { if (small_const_nbits(nbits)) @@ -417,7 +417,7 @@ static inline bool bitmap_subset(const unsigned long *src1, return __bitmap_subset(src1, src2, nbits); } -static inline bool bitmap_empty(const unsigned long *src, unsigned nbits) +static __always_inline bool bitmap_empty(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); @@ -425,7 +425,7 @@ static inline bool bitmap_empty(const unsigned long *src, unsigned nbits) return find_first_bit(src, nbits) == nbits; } -static inline bool bitmap_full(const unsigned long *src, unsigned int nbits) +static __always_inline bool bitmap_full(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits)); @@ -482,7 +482,7 @@ static __always_inline void bitmap_clear(unsigned long *map, unsigned int start, __bitmap_clear(map, start, nbits); } -static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, +static __always_inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits) { if (small_const_nbits(nbits)) @@ -491,7 +491,7 @@ static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *s __bitmap_shift_right(dst, src, shift, nbits); } -static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, +static __always_inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits) { if (small_const_nbits(nbits)) @@ -500,7 +500,7 @@ static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *sr __bitmap_shift_left(dst, src, shift, nbits); } -static inline void bitmap_replace(unsigned long *dst, +static __always_inline void bitmap_replace(unsigned long *dst, const unsigned long *old, const unsigned long *new, const unsigned long *mask, @@ -512,7 +512,7 @@ static inline void bitmap_replace(unsigned long *dst, __bitmap_replace(dst, old, new, mask, nbits); } -static inline void bitmap_next_set_region(unsigned long *bitmap, +static __always_inline void bitmap_next_set_region(unsigned long *bitmap, unsigned int *rs, unsigned int *re, unsigned int end) { @@ -563,7 +563,7 @@ static inline void bitmap_next_set_region(unsigned long *bitmap, * That is ``(u32 *)(&val)[0]`` gets the upper 32 bits, * but we expect the lower 32-bits of u64. */ -static inline void bitmap_from_u64(unsigned long *dst, u64 mask) +static __always_inline void bitmap_from_u64(unsigned long *dst, u64 mask) { bitmap_from_arr64(dst, &mask, 64); } @@ -576,7 +576,7 @@ static inline void bitmap_from_u64(unsigned long *dst, u64 mask) * Returns the 8-bit value located at the @start bit offset within the @src * memory region. 
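The 8-bit accessors documented here come down to a word index plus a shift and mask, since @start is guaranteed to be a multiple of 8. A user-space sketch of the pair, with BIT_WORD() re-derived; it follows the same index/offset structure as the kernel functions:

#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG (CHAR_BIT * sizeof(long))
#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)

static unsigned long toy_get_value8(const unsigned long *map, unsigned long start)
{
	const unsigned long offset = start % BITS_PER_LONG;

	return (map[BIT_WORD(start)] >> offset) & 0xFF;
}

static void toy_set_value8(unsigned long *map, unsigned long value,
			   unsigned long start)
{
	const unsigned long offset = start % BITS_PER_LONG;

	map[BIT_WORD(start)] &= ~(0xFFUL << offset);
	map[BIT_WORD(start)] |= value << offset;
}

int main(void)
{
	unsigned long map[2] = { 0 };

	toy_set_value8(map, 0xAB, 8);              /* second byte of word 0 */
	printf("0x%lx\n", toy_get_value8(map, 8)); /* 0xab */
	return 0;
}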
*/ -static inline unsigned long bitmap_get_value8(const unsigned long *map, +static __always_inline unsigned long bitmap_get_value8(const unsigned long *map, unsigned long start) { const size_t index = BIT_WORD(start); @@ -591,7 +591,7 @@ static inline unsigned long bitmap_get_value8(const unsigned long *map, * @value: the 8-bit value; values wider than 8 bits may clobber bitmap * @start: bit offset of the 8-bit value; must be a multiple of 8 */ -static inline void bitmap_set_value8(unsigned long *map, unsigned long value, +static __always_inline void bitmap_set_value8(unsigned long *map, unsigned long value, unsigned long start) { const size_t index = BIT_WORD(start); diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index c2aa0aa26b45..9543b22d6dc2 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -41,7 +41,7 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; extern unsigned int nr_cpu_ids; #endif -static inline void set_nr_cpu_ids(unsigned int nr) +static __always_inline void set_nr_cpu_ids(unsigned int nr) { #if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS) WARN_ON(nr != nr_cpu_ids); @@ -124,7 +124,7 @@ static __always_inline unsigned int cpumask_check(unsigned int cpu) * * Returns >= nr_cpu_ids if no cpus set. */ -static inline unsigned int cpumask_first(const struct cpumask *srcp) +static __always_inline unsigned int cpumask_first(const struct cpumask *srcp) { return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); } @@ -135,7 +135,7 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) * * Returns >= nr_cpu_ids if all cpus are set. */ -static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) +static __always_inline unsigned int cpumask_first_zero(const struct cpumask *srcp) { return find_first_zero_bit(cpumask_bits(srcp), nr_cpumask_bits); } @@ -147,7 +147,7 @@ static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) * * Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). */ -static inline +static __always_inline unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2) { return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits); @@ -159,7 +159,7 @@ unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask * * Returns >= nr_cpumask_bits if no CPUs set. */ -static inline unsigned int cpumask_last(const struct cpumask *srcp) +static __always_inline unsigned int cpumask_last(const struct cpumask *srcp) { return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits); } @@ -171,7 +171,7 @@ static inline unsigned int cpumask_last(const struct cpumask *srcp) * * Returns >= nr_cpu_ids if no further cpus set. */ -static inline +static __always_inline unsigned int cpumask_next(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. */ @@ -187,7 +187,7 @@ unsigned int cpumask_next(int n, const struct cpumask *srcp) * * Returns >= nr_cpu_ids if no further cpus unset. */ -static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) +static __always_inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. 
*/ if (n != -1) @@ -197,18 +197,18 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) #if NR_CPUS == 1 /* Uniprocessor: there is only one valid CPU */ -static inline unsigned int cpumask_local_spread(unsigned int i, int node) +static __always_inline unsigned int cpumask_local_spread(unsigned int i, int node) { return 0; } -static inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, +static __always_inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p) { return cpumask_first_and(src1p, src2p); } -static inline unsigned int cpumask_any_distribute(const struct cpumask *srcp) +static __always_inline unsigned int cpumask_any_distribute(const struct cpumask *srcp) { return cpumask_first(srcp); } @@ -227,7 +227,7 @@ unsigned int cpumask_any_distribute(const struct cpumask *srcp); * * Returns >= nr_cpu_ids if no further cpus set in both. */ -static inline +static __always_inline unsigned int cpumask_next_and(int n, const struct cpumask *src1p, const struct cpumask *src2p) { @@ -259,7 +259,7 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p, for_each_clear_bit(cpu, cpumask_bits(mask), nr_cpumask_bits) #if NR_CPUS == 1 -static inline +static __always_inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) { cpumask_check(start); @@ -335,7 +335,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta * Often used to find any cpu but smp_processor_id() in a mask. * Returns >= nr_cpu_ids if no cpus set. */ -static inline +static __always_inline unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) { unsigned int i; @@ -354,7 +354,7 @@ unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) * * Returns >= nr_cpu_ids if such cpu doesn't exist. */ -static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp) +static __always_inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp) { return find_nth_bit(cpumask_bits(srcp), nr_cpumask_bits, cpumask_check(cpu)); } @@ -367,7 +367,7 @@ static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *s * * Returns >= nr_cpu_ids if such cpu doesn't exist. */ -static inline +static __always_inline unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1, const struct cpumask *srcp2) { @@ -383,7 +383,7 @@ unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1, * * Returns >= nr_cpu_ids if such cpu doesn't exist. 
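cpumask_nth() above delegates to find_nth_bit(), whose contract is: return the index of the n-th set bit (0-based), or the full size when fewer than n+1 bits are set. A naive single-word reference version; the kernel implementation works a word at a time:

#include <stdio.h>

/* 0-based n-th set bit of a value, or `size` if fewer than n+1 bits set */
static unsigned int toy_nth_bit(unsigned long word, unsigned int size,
				unsigned int n)
{
	for (unsigned int bit = 0; bit < size; bit++) {
		if (!(word & (1UL << bit)))
			continue;
		if (n-- == 0)
			return bit;
	}
	return size;
}

int main(void)
{
	/* 0x1A has bits 1, 3 and 4 set */
	printf("%u\n", toy_nth_bit(0x1AUL, 32, 0)); /* 1 */
	printf("%u\n", toy_nth_bit(0x1AUL, 32, 2)); /* 4 */
	printf("%u\n", toy_nth_bit(0x1AUL, 32, 3)); /* 32: no 4th set bit */
	return 0;
}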
*/ -static inline +static __always_inline unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1, const struct cpumask *srcp2) { @@ -476,7 +476,7 @@ static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask * * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ -static inline void cpumask_setall(struct cpumask *dstp) +static __always_inline void cpumask_setall(struct cpumask *dstp) { bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits); } @@ -485,7 +485,7 @@ static inline void cpumask_setall(struct cpumask *dstp) * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ -static inline void cpumask_clear(struct cpumask *dstp) +static __always_inline void cpumask_clear(struct cpumask *dstp) { bitmap_zero(cpumask_bits(dstp), nr_cpumask_bits); } @@ -498,7 +498,7 @@ static inline void cpumask_clear(struct cpumask *dstp) * * If *@dstp is empty, returns false, else returns true */ -static inline bool cpumask_and(struct cpumask *dstp, +static __always_inline bool cpumask_and(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { @@ -512,7 +512,7 @@ static inline bool cpumask_and(struct cpumask *dstp, * @src1p: the first input * @src2p: the second input */ -static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, +static __always_inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p), @@ -525,7 +525,7 @@ static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, * @src1p: the first input * @src2p: the second input */ -static inline void cpumask_xor(struct cpumask *dstp, +static __always_inline void cpumask_xor(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { @@ -541,7 +541,7 @@ static inline void cpumask_xor(struct cpumask *dstp, * * If *@dstp is empty, returns false, else returns true */ -static inline bool cpumask_andnot(struct cpumask *dstp, +static __always_inline bool cpumask_andnot(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { @@ -554,7 +554,7 @@ static inline bool cpumask_andnot(struct cpumask *dstp, * @dstp: the cpumask result * @srcp: the input to invert */ -static inline void cpumask_complement(struct cpumask *dstp, +static __always_inline void cpumask_complement(struct cpumask *dstp, const struct cpumask *srcp) { bitmap_complement(cpumask_bits(dstp), cpumask_bits(srcp), @@ -566,7 +566,7 @@ static inline void cpumask_complement(struct cpumask *dstp, * @src1p: the first input * @src2p: the second input */ -static inline bool cpumask_equal(const struct cpumask *src1p, +static __always_inline bool cpumask_equal(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p), @@ -579,7 +579,7 @@ static inline bool cpumask_equal(const struct cpumask *src1p, * @src2p: the second input * @src3p: the third input */ -static inline bool cpumask_or_equal(const struct cpumask *src1p, +static __always_inline bool cpumask_or_equal(const struct cpumask *src1p, const struct cpumask *src2p, const struct cpumask *src3p) { @@ -592,7 +592,7 @@ static inline bool cpumask_or_equal(const struct cpumask *src1p, * @src1p: the first input * @src2p: the second input */ -static inline bool cpumask_intersects(const struct cpumask *src1p, +static __always_inline bool cpumask_intersects(const struct cpumask 
*src1p, const struct cpumask *src2p) { return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p), @@ -606,7 +606,7 @@ static inline bool cpumask_intersects(const struct cpumask *src1p, * * Returns true if *@src1p is a subset of *@src2p, else returns false */ -static inline bool cpumask_subset(const struct cpumask *src1p, +static __always_inline bool cpumask_subset(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p), @@ -617,7 +617,7 @@ static inline bool cpumask_subset(const struct cpumask *src1p, * cpumask_empty - *srcp == 0 * @srcp: the cpumask to that all cpus < nr_cpu_ids are clear. */ -static inline bool cpumask_empty(const struct cpumask *srcp) +static __always_inline bool cpumask_empty(const struct cpumask *srcp) { return bitmap_empty(cpumask_bits(srcp), nr_cpumask_bits); } @@ -626,7 +626,7 @@ static inline bool cpumask_empty(const struct cpumask *srcp) * cpumask_full - *srcp == 0xFFFFFFFF... * @srcp: the cpumask to that all cpus < nr_cpu_ids are set. */ -static inline bool cpumask_full(const struct cpumask *srcp) +static __always_inline bool cpumask_full(const struct cpumask *srcp) { return bitmap_full(cpumask_bits(srcp), nr_cpumask_bits); } @@ -635,7 +635,7 @@ static inline bool cpumask_full(const struct cpumask *srcp) * cpumask_weight - Count of bits in *srcp * @srcp: the cpumask to count bits (< nr_cpu_ids) in. */ -static inline unsigned int cpumask_weight(const struct cpumask *srcp) +static __always_inline unsigned int cpumask_weight(const struct cpumask *srcp) { return bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits); } @@ -645,7 +645,7 @@ static inline unsigned int cpumask_weight(const struct cpumask *srcp) * @srcp1: the cpumask to count bits (< nr_cpu_ids) in. * @srcp2: the cpumask to count bits (< nr_cpu_ids) in. */ -static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, +static __always_inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, const struct cpumask *srcp2) { return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits); @@ -657,7 +657,7 @@ static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, * @srcp: the input to shift * @n: the number of bits to shift by */ -static inline void cpumask_shift_right(struct cpumask *dstp, +static __always_inline void cpumask_shift_right(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n, @@ -670,7 +670,7 @@ static inline void cpumask_shift_right(struct cpumask *dstp, * @srcp: the input to shift * @n: the number of bits to shift by */ -static inline void cpumask_shift_left(struct cpumask *dstp, +static __always_inline void cpumask_shift_left(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n, @@ -682,7 +682,7 @@ static inline void cpumask_shift_left(struct cpumask *dstp, * @dstp: the result * @srcp: the input cpumask */ -static inline void cpumask_copy(struct cpumask *dstp, +static __always_inline void cpumask_copy(struct cpumask *dstp, const struct cpumask *srcp) { bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits); @@ -719,7 +719,7 @@ static inline void cpumask_copy(struct cpumask *dstp, * * Returns -errno, or 0 for success. 
*/ -static inline int cpumask_parse_user(const char __user *buf, int len, +static __always_inline int cpumask_parse_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); @@ -733,7 +733,7 @@ static inline int cpumask_parse_user(const char __user *buf, int len, * * Returns -errno, or 0 for success. */ -static inline int cpumask_parselist_user(const char __user *buf, int len, +static __always_inline int cpumask_parselist_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parselist_user(buf, len, cpumask_bits(dstp), @@ -747,7 +747,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len, * * Returns -errno, or 0 for success. */ -static inline int cpumask_parse(const char *buf, struct cpumask *dstp) +static __always_inline int cpumask_parse(const char *buf, struct cpumask *dstp) { return bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp), nr_cpumask_bits); } @@ -759,7 +759,7 @@ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) * * Returns -errno, or 0 for success. */ -static inline int cpulist_parse(const char *buf, struct cpumask *dstp) +static __always_inline int cpulist_parse(const char *buf, struct cpumask *dstp) { return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits); } @@ -767,7 +767,7 @@ static inline int cpulist_parse(const char *buf, struct cpumask *dstp) /** * cpumask_size - size to allocate for a 'struct cpumask' in bytes */ -static inline unsigned int cpumask_size(void) +static __always_inline unsigned int cpumask_size(void) { return BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long); } @@ -820,7 +820,7 @@ typedef struct cpumask *cpumask_var_t; bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); -static inline +static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node); @@ -836,13 +836,13 @@ bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) * * See alloc_cpumask_var_node. 
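For context, the usual calling pattern for the allocator trio above, sketched in kernel style (this fragment assumes a kernel build with <linux/cpumask.h> and <linux/gfp.h>; under CONFIG_CPUMASK_OFFSTACK=y the allocation is real and can fail, otherwise the calls collapse to the always-succeeding stubs shown further down):

static int toy_use_cpumask(void)
{
	cpumask_var_t mask;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM; /* only reachable in the off-stack configuration */

	cpumask_set_cpu(0, mask);
	/* ... operate on mask ... */

	free_cpumask_var(mask);
	return 0;
}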
*/ -static inline +static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return alloc_cpumask_var_node(mask, flags, NUMA_NO_NODE); } -static inline +static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return alloc_cpumask_var(mask, flags | __GFP_ZERO); @@ -852,7 +852,7 @@ void alloc_bootmem_cpumask_var(cpumask_var_t *mask); void free_cpumask_var(cpumask_var_t mask); void free_bootmem_cpumask_var(cpumask_var_t mask); -static inline bool cpumask_available(cpumask_var_t mask) +static __always_inline bool cpumask_available(cpumask_var_t mask) { return mask != NULL; } @@ -863,43 +863,43 @@ typedef struct cpumask cpumask_var_t[1]; #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) #define __cpumask_var_read_mostly -static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) +static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return true; } -static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, +static __always_inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { return true; } -static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) +static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { cpumask_clear(*mask); return true; } -static inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, +static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { cpumask_clear(*mask); return true; } -static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) +static __always_inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) { } -static inline void free_cpumask_var(cpumask_var_t mask) +static __always_inline void free_cpumask_var(cpumask_var_t mask) { } -static inline void free_bootmem_cpumask_var(cpumask_var_t mask) +static __always_inline void free_bootmem_cpumask_var(cpumask_var_t mask) { } -static inline bool cpumask_available(cpumask_var_t mask) +static __always_inline bool cpumask_available(cpumask_var_t mask) { return true; } @@ -929,12 +929,12 @@ void init_cpu_present(const struct cpumask *src); void init_cpu_possible(const struct cpumask *src); void init_cpu_online(const struct cpumask *src); -static inline void reset_cpu_possible_mask(void) +static __always_inline void reset_cpu_possible_mask(void) { bitmap_zero(cpumask_bits(&__cpu_possible_mask), NR_CPUS); } -static inline void +static __always_inline void set_cpu_possible(unsigned int cpu, bool possible) { if (possible) @@ -943,7 +943,7 @@ set_cpu_possible(unsigned int cpu, bool possible) cpumask_clear_cpu(cpu, &__cpu_possible_mask); } -static inline void +static __always_inline void set_cpu_present(unsigned int cpu, bool present) { if (present) @@ -954,7 +954,7 @@ set_cpu_present(unsigned int cpu, bool present) void set_cpu_online(unsigned int cpu, bool online); -static inline void +static __always_inline void set_cpu_active(unsigned int cpu, bool active) { if (active) @@ -963,7 +963,7 @@ set_cpu_active(unsigned int cpu, bool active) cpumask_clear_cpu(cpu, &__cpu_active_mask); } -static inline void +static __always_inline void set_cpu_dying(unsigned int cpu, bool dying) { if (dying) @@ -986,7 +986,7 @@ set_cpu_dying(unsigned int cpu, bool dying) ((struct cpumask *)(1 ? 
(bitmap) \ : (void *)sizeof(__check_is_bitmap(bitmap)))) -static inline int __check_is_bitmap(const unsigned long *bitmap) +static __always_inline int __check_is_bitmap(const unsigned long *bitmap) { return 1; } @@ -1001,7 +1001,7 @@ static inline int __check_is_bitmap(const unsigned long *bitmap) extern const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)]; -static inline const struct cpumask *get_cpu_mask(unsigned int cpu) +static __always_inline const struct cpumask *get_cpu_mask(unsigned int cpu) { const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; p -= cpu / BITS_PER_LONG; @@ -1017,7 +1017,7 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu) * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held * region. */ -static inline unsigned int num_online_cpus(void) +static __always_inline unsigned int num_online_cpus(void) { return atomic_read(&__num_online_cpus); } @@ -1025,27 +1025,27 @@ static inline unsigned int num_online_cpus(void) #define num_present_cpus() cpumask_weight(cpu_present_mask) #define num_active_cpus() cpumask_weight(cpu_active_mask) -static inline bool cpu_online(unsigned int cpu) +static __always_inline bool cpu_online(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_online_mask); } -static inline bool cpu_possible(unsigned int cpu) +static __always_inline bool cpu_possible(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_possible_mask); } -static inline bool cpu_present(unsigned int cpu) +static __always_inline bool cpu_present(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_present_mask); } -static inline bool cpu_active(unsigned int cpu) +static __always_inline bool cpu_active(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_active_mask); } -static inline bool cpu_dying(unsigned int cpu) +static __always_inline bool cpu_dying(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_dying_mask); } @@ -1057,27 +1057,27 @@ static inline bool cpu_dying(unsigned int cpu) #define num_present_cpus() 1U #define num_active_cpus() 1U -static inline bool cpu_online(unsigned int cpu) +static __always_inline bool cpu_online(unsigned int cpu) { return cpu == 0; } -static inline bool cpu_possible(unsigned int cpu) +static __always_inline bool cpu_possible(unsigned int cpu) { return cpu == 0; } -static inline bool cpu_present(unsigned int cpu) +static __always_inline bool cpu_present(unsigned int cpu) { return cpu == 0; } -static inline bool cpu_active(unsigned int cpu) +static __always_inline bool cpu_active(unsigned int cpu) { return cpu == 0; } -static inline bool cpu_dying(unsigned int cpu) +static __always_inline bool cpu_dying(unsigned int cpu) { return false; } @@ -1111,7 +1111,7 @@ static inline bool cpu_dying(unsigned int cpu) * Returns the length of the (null-terminated) @buf string, zero if * nothing is copied. */ -static inline ssize_t +static __always_inline ssize_t cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) { return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask), @@ -1134,7 +1134,7 @@ cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) * Returns the length of how many bytes have been copied, excluding * terminating '\0'. 
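The get_cpu_mask() pointer arithmetic above deserves a gloss: row 1 + cpu % BITS_PER_LONG of cpu_bit_bitmap has bit cpu % BITS_PER_LONG set in word 0, and backing the pointer up by cpu / BITS_PER_LONG words shifts that word to position cpu / BITS_PER_LONG of the returned mask, so exactly bit cpu reads as set. A user-space model (the backing-up step relies on the rows being one contiguous array, which is also why row 0 is kept empty):

#include <stdio.h>
#include <limits.h>

#define BPL       (CHAR_BIT * sizeof(long))  /* stands in for BITS_PER_LONG */
#define TOY_CPUS  128
#define TOY_WORDS ((TOY_CPUS + BPL - 1) / BPL)

/* row k >= 1 has bit (k - 1) set in word 0 only; row 0 stays all-zero so
 * the returned pointer may safely back into it */
static unsigned long table[BPL + 1][TOY_WORDS];

static const unsigned long *toy_get_cpu_mask(unsigned int cpu)
{
	const unsigned long *p = table[1 + cpu % BPL];

	return p - cpu / BPL; /* works because the rows are contiguous */
}

static int toy_test_bit(const unsigned long *map, unsigned int bit)
{
	return (map[bit / BPL] >> (bit % BPL)) & 1;
}

int main(void)
{
	unsigned int k;

	for (k = 1; k <= BPL; k++)
		table[k][0] = 1UL << (k - 1);

	printf("%d\n", toy_test_bit(toy_get_cpu_mask(0), 0));   /* 1 */
	printf("%d\n", toy_test_bit(toy_get_cpu_mask(70), 70)); /* 1 */
	printf("%d\n", toy_test_bit(toy_get_cpu_mask(70), 0));  /* 0 */
	return 0;
}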
*/ -static inline ssize_t +static __always_inline ssize_t cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, loff_t off, size_t count) { @@ -1149,7 +1149,7 @@ cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, * Everything is same with the above cpumap_print_bitmask_to_buf() * except the print format. */ -static inline ssize_t +static __always_inline ssize_t cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, loff_t off, size_t count) { diff --git a/include/linux/find.h b/include/linux/find.h index ccaf61a0f5fd..db2f2851601d 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -45,7 +45,7 @@ unsigned long _find_next_bit_le(const unsigned long *addr, unsigned * Returns the bit number for the next set bit * If no bits are set, returns @size. */ -static inline +static __always_inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { @@ -74,7 +74,7 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size, * Returns the bit number for the next set bit * If no bits are set, returns @size. */ -static inline +static __always_inline unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) @@ -105,7 +105,7 @@ unsigned long find_next_and_bit(const unsigned long *addr1, * Returns the bit number for the next set bit * If no bits are set, returns @size. */ -static inline +static __always_inline unsigned long find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) @@ -134,7 +134,7 @@ unsigned long find_next_andnot_bit(const unsigned long *addr1, * Returns the bit number of the next zero bit * If no bits are zero, returns @size. */ -static inline +static __always_inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { @@ -161,7 +161,7 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, * Returns the bit number of the first set bit. * If no bits are set, returns @size. */ -static inline +static __always_inline unsigned long find_first_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { @@ -187,7 +187,7 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) * Returns the bit number of the N'th set bit. * If no such, returns @size. */ -static inline +static __always_inline unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n) { if (n >= size) @@ -212,7 +212,7 @@ unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsign * Returns the bit number of the N'th set bit. * If no such, returns @size. */ -static inline +static __always_inline unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { @@ -239,7 +239,7 @@ unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ -static inline +static __always_inline unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { @@ -265,7 +265,7 @@ unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned lon * Returns the bit number for the next set bit * If no bits are set, returns @size. 
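For reference, the single-word fast path these find_* helpers take when small_const_nbits(size) holds; the body sits outside this hunk's context lines, so the sketch below is re-derived from the mainline implementation, with compiler builtins standing in for the kernel's GENMASK() and __ffs():

#include <stdio.h>
#include <limits.h>

#define BPL (CHAR_BIT * sizeof(long))

/* GENMASK(h, l): bits l..h set, as in linux/bits.h */
#define GENMASK(h, l) (((~0UL) << (l)) & (~0UL >> (BPL - 1 - (h))))

static unsigned long toy_find_next_bit(unsigned long word, unsigned long size,
				       unsigned long offset)
{
	unsigned long val;

	if (offset >= size)
		return size;

	val = word & GENMASK(size - 1, offset);
	return val ? (unsigned long)__builtin_ctzl(val) : size; /* __ffs() */
}

int main(void)
{
	/* 0x208 has bits 3 and 9 set */
	printf("%lu\n", toy_find_next_bit(0x208UL, 16, 0));  /* 3 */
	printf("%lu\n", toy_find_next_bit(0x208UL, 16, 4));  /* 9 */
	printf("%lu\n", toy_find_next_bit(0x208UL, 16, 10)); /* 16 */
	return 0;
}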
*/ -static inline +static __always_inline unsigned long find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size) @@ -289,7 +289,7 @@ unsigned long find_first_and_bit(const unsigned long *addr1, * Returns the bit number of the first cleared bit. * If no bits are zero, returns @size. */ -static inline +static __always_inline unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { @@ -310,7 +310,7 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) * * Returns the bit number of the last set bit, or size. */ -static inline +static __always_inline unsigned long find_last_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { @@ -333,7 +333,7 @@ unsigned long find_last_bit(const unsigned long *addr, unsigned long size) * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ -static inline +static __always_inline unsigned long find_next_and_bit_wrap(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) @@ -356,7 +356,7 @@ unsigned long find_next_and_bit_wrap(const unsigned long *addr1, * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ -static inline +static __always_inline unsigned long find_next_bit_wrap(const unsigned long *addr, unsigned long size, unsigned long offset) { @@ -373,7 +373,7 @@ unsigned long find_next_bit_wrap(const unsigned long *addr, * Helper for for_each_set_bit_wrap(). Make sure you're doing right thing * before using it alone. */ -static inline +static __always_inline unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size, unsigned long start, unsigned long n) { @@ -414,19 +414,19 @@ extern unsigned long find_next_clump8(unsigned long *clump, #if defined(__LITTLE_ENDIAN) -static inline unsigned long find_next_zero_bit_le(const void *addr, +static __always_inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_zero_bit(addr, size, offset); } -static inline unsigned long find_next_bit_le(const void *addr, +static __always_inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_bit(addr, size, offset); } -static inline unsigned long find_first_zero_bit_le(const void *addr, +static __always_inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { return find_first_zero_bit(addr, size); @@ -435,7 +435,7 @@ static inline unsigned long find_first_zero_bit_le(const void *addr, #elif defined(__BIG_ENDIAN) #ifndef find_next_zero_bit_le -static inline +static __always_inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { @@ -454,7 +454,7 @@ unsigned long find_next_zero_bit_le(const void *addr, unsigned #endif #ifndef find_first_zero_bit_le -static inline +static __always_inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { if (small_const_nbits(size)) { @@ -468,7 +468,7 @@ unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) #endif #ifndef find_next_bit_le -static inline +static __always_inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index efef68c9352a..4e7e7ae9dd11 
100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -107,11 +107,11 @@ extern nodemask_t _unused_nodemask_arg_; */ #define nodemask_pr_args(maskp) __nodemask_pr_numnodes(maskp), \ __nodemask_pr_bits(maskp) -static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) +static __always_inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) { return m ? MAX_NUMNODES : 0; } -static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) +static __always_inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) { return m ? m->bits : NULL; } @@ -132,19 +132,19 @@ static __always_inline void __node_set(int node, volatile nodemask_t *dstp) } #define node_clear(node, dst) __node_clear((node), &(dst)) -static inline void __node_clear(int node, volatile nodemask_t *dstp) +static __always_inline void __node_clear(int node, volatile nodemask_t *dstp) { clear_bit(node, dstp->bits); } #define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES) -static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) +static __always_inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) { bitmap_fill(dstp->bits, nbits); } #define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES) -static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) +static __always_inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) { bitmap_zero(dstp->bits, nbits); } @@ -154,14 +154,14 @@ static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) #define node_test_and_set(node, nodemask) \ __node_test_and_set((node), &(nodemask)) -static inline bool __node_test_and_set(int node, nodemask_t *addr) +static __always_inline bool __node_test_and_set(int node, nodemask_t *addr) { return test_and_set_bit(node, addr->bits); } #define nodes_and(dst, src1, src2) \ __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) -static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, +static __always_inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); @@ -169,7 +169,7 @@ static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_or(dst, src1, src2) \ __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES) -static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, +static __always_inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); @@ -177,7 +177,7 @@ static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_xor(dst, src1, src2) \ __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES) -static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, +static __always_inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); @@ -185,7 +185,7 @@ static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_andnot(dst, src1, src2) \ __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) -static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, +static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); @@ -193,7 +193,7 @@ static inline 
void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_complement(dst, src) \ __nodes_complement(&(dst), &(src), MAX_NUMNODES) -static inline void __nodes_complement(nodemask_t *dstp, +static __always_inline void __nodes_complement(nodemask_t *dstp, const nodemask_t *srcp, unsigned int nbits) { bitmap_complement(dstp->bits, srcp->bits, nbits); @@ -201,7 +201,7 @@ static inline void __nodes_complement(nodemask_t *dstp, #define nodes_equal(src1, src2) \ __nodes_equal(&(src1), &(src2), MAX_NUMNODES) -static inline bool __nodes_equal(const nodemask_t *src1p, +static __always_inline bool __nodes_equal(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_equal(src1p->bits, src2p->bits, nbits); @@ -209,7 +209,7 @@ static inline bool __nodes_equal(const nodemask_t *src1p, #define nodes_intersects(src1, src2) \ __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) -static inline bool __nodes_intersects(const nodemask_t *src1p, +static __always_inline bool __nodes_intersects(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_intersects(src1p->bits, src2p->bits, nbits); @@ -217,33 +217,33 @@ static inline bool __nodes_intersects(const nodemask_t *src1p, #define nodes_subset(src1, src2) \ __nodes_subset(&(src1), &(src2), MAX_NUMNODES) -static inline bool __nodes_subset(const nodemask_t *src1p, +static __always_inline bool __nodes_subset(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_subset(src1p->bits, src2p->bits, nbits); } #define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) -static inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits) +static __always_inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits) { return bitmap_empty(srcp->bits, nbits); } #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) -static inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits) +static __always_inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits) { return bitmap_full(srcp->bits, nbits); } #define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES) -static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) +static __always_inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) { return bitmap_weight(srcp->bits, nbits); } #define nodes_shift_right(dst, src, n) \ __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES) -static inline void __nodes_shift_right(nodemask_t *dstp, +static __always_inline void __nodes_shift_right(nodemask_t *dstp, const nodemask_t *srcp, int n, int nbits) { bitmap_shift_right(dstp->bits, srcp->bits, n, nbits); @@ -251,7 +251,7 @@ static inline void __nodes_shift_right(nodemask_t *dstp, #define nodes_shift_left(dst, src, n) \ __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES) -static inline void __nodes_shift_left(nodemask_t *dstp, +static __always_inline void __nodes_shift_left(nodemask_t *dstp, const nodemask_t *srcp, int n, int nbits) { bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); @@ -261,13 +261,13 @@ static inline void __nodes_shift_left(nodemask_t *dstp, > MAX_NUMNODES, then the silly min_ts could be dropped. 
*/ #define first_node(src) __first_node(&(src)) -static inline unsigned int __first_node(const nodemask_t *srcp) +static __always_inline unsigned int __first_node(const nodemask_t *srcp) { return min_t(unsigned int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES)); } #define next_node(n, src) __next_node((n), &(src)) -static inline unsigned int __next_node(int n, const nodemask_t *srcp) +static __always_inline unsigned int __next_node(int n, const nodemask_t *srcp) { return min_t(unsigned int, MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); } @@ -277,7 +277,7 @@ static inline unsigned int __next_node(int n, const nodemask_t *srcp) * the first node in src if needed. Returns MAX_NUMNODES if src is empty. */ #define next_node_in(n, src) __next_node_in((n), &(src)) -static inline unsigned int __next_node_in(int node, const nodemask_t *srcp) +static __always_inline unsigned int __next_node_in(int node, const nodemask_t *srcp) { unsigned int ret = __next_node(node, srcp); @@ -286,7 +286,7 @@ static inline unsigned int __next_node_in(int node, const nodemask_t *srcp) return ret; } -static inline void init_nodemask_of_node(nodemask_t *mask, int node) +static __always_inline void init_nodemask_of_node(nodemask_t *mask, int node) { nodes_clear(*mask); node_set(node, *mask); @@ -304,7 +304,7 @@ static inline void init_nodemask_of_node(nodemask_t *mask, int node) }) #define first_unset_node(mask) __first_unset_node(&(mask)) -static inline unsigned int __first_unset_node(const nodemask_t *maskp) +static __always_inline unsigned int __first_unset_node(const nodemask_t *maskp) { return min_t(unsigned int, MAX_NUMNODES, find_first_zero_bit(maskp->bits, MAX_NUMNODES)); @@ -338,21 +338,21 @@ static inline unsigned int __first_unset_node(const nodemask_t *maskp) #define nodemask_parse_user(ubuf, ulen, dst) \ __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES) -static inline int __nodemask_parse_user(const char __user *buf, int len, +static __always_inline int __nodemask_parse_user(const char __user *buf, int len, nodemask_t *dstp, int nbits) { return bitmap_parse_user(buf, len, dstp->bits, nbits); } #define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES) -static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) +static __always_inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) { return bitmap_parselist(buf, dstp->bits, nbits); } #define node_remap(oldbit, old, new) \ __node_remap((oldbit), &(old), &(new), MAX_NUMNODES) -static inline int __node_remap(int oldbit, +static __always_inline int __node_remap(int oldbit, const nodemask_t *oldp, const nodemask_t *newp, int nbits) { return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits); @@ -360,7 +360,7 @@ static inline int __node_remap(int oldbit, #define nodes_remap(dst, src, old, new) \ __nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES) -static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, +static __always_inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, const nodemask_t *oldp, const nodemask_t *newp, int nbits) { bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits); @@ -368,7 +368,7 @@ static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, #define nodes_onto(dst, orig, relmap) \ __nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES) -static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, +static __always_inline void __nodes_onto(nodemask_t *dstp, const 
nodemask_t *origp, const nodemask_t *relmapp, int nbits) { bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits); @@ -376,7 +376,7 @@ static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, #define nodes_fold(dst, orig, sz) \ __nodes_fold(&(dst), &(orig), sz, MAX_NUMNODES) -static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, +static __always_inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, int sz, int nbits) { bitmap_fold(dstp->bits, origp->bits, sz, nbits); @@ -418,22 +418,22 @@ enum node_states { extern nodemask_t node_states[NR_NODE_STATES]; #if MAX_NUMNODES > 1 -static inline int node_state(int node, enum node_states state) +static __always_inline int node_state(int node, enum node_states state) { return node_isset(node, node_states[state]); } -static inline void node_set_state(int node, enum node_states state) +static __always_inline void node_set_state(int node, enum node_states state) { __node_set(node, &node_states[state]); } -static inline void node_clear_state(int node, enum node_states state) +static __always_inline void node_clear_state(int node, enum node_states state) { __node_clear(node, &node_states[state]); } -static inline int num_node_state(enum node_states state) +static __always_inline int num_node_state(enum node_states state) { return nodes_weight(node_states[state]); } @@ -443,11 +443,11 @@ static inline int num_node_state(enum node_states state) #define first_online_node first_node(node_states[N_ONLINE]) #define first_memory_node first_node(node_states[N_MEMORY]) -static inline unsigned int next_online_node(int nid) +static __always_inline unsigned int next_online_node(int nid) { return next_node(nid, node_states[N_ONLINE]); } -static inline unsigned int next_memory_node(int nid) +static __always_inline unsigned int next_memory_node(int nid) { return next_node(nid, node_states[N_MEMORY]); } @@ -455,13 +455,13 @@ static inline unsigned int next_memory_node(int nid) extern unsigned int nr_node_ids; extern unsigned int nr_online_nodes; -static inline void node_set_online(int nid) +static __always_inline void node_set_online(int nid) { node_set_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); } -static inline void node_set_offline(int nid) +static __always_inline void node_set_offline(int nid) { node_clear_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); @@ -469,20 +469,20 @@ static inline void node_set_offline(int nid) #else -static inline int node_state(int node, enum node_states state) +static __always_inline int node_state(int node, enum node_states state) { return node == 0; } -static inline void node_set_state(int node, enum node_states state) +static __always_inline void node_set_state(int node, enum node_states state) { } -static inline void node_clear_state(int node, enum node_states state) +static __always_inline void node_clear_state(int node, enum node_states state) { } -static inline int num_node_state(enum node_states state) +static __always_inline int num_node_state(enum node_states state) { return 1; } @@ -502,7 +502,7 @@ static inline int num_node_state(enum node_states state) #endif -static inline int node_random(const nodemask_t *maskp) +static __always_inline int node_random(const nodemask_t *maskp) { #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1) int w, bit; -- 2.40.0 From cca30d98b0c2aa7af0777c4c3a890cde75500520 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Sat, 11 Mar 2023 14:51:48 +0100 Subject: [PATCH 04/17] cachy Signed-off-by: Peter Jung --- 
.gitignore | 1 + .../admin-guide/kernel-parameters.txt | 2 +- Documentation/dontdiff | 1 + Makefile | 8 +- arch/arc/configs/axs101_defconfig | 1 + arch/arc/configs/axs103_defconfig | 1 + arch/arc/configs/axs103_smp_defconfig | 1 + arch/arc/configs/haps_hs_defconfig | 1 + arch/arc/configs/haps_hs_smp_defconfig | 1 + arch/arc/configs/hsdk_defconfig | 1 + arch/arc/configs/nsim_700_defconfig | 1 + arch/arc/configs/nsimosci_defconfig | 1 + arch/arc/configs/nsimosci_hs_defconfig | 1 + arch/arc/configs/nsimosci_hs_smp_defconfig | 1 + arch/arc/configs/tb10x_defconfig | 1 + arch/arc/configs/vdk_hs38_defconfig | 1 + arch/arc/configs/vdk_hs38_smp_defconfig | 1 + arch/x86/Kconfig.cpu | 416 ++++++++++- arch/x86/Makefile | 46 +- arch/x86/Makefile.postlink | 41 ++ arch/x86/boot/compressed/.gitignore | 1 - arch/x86/boot/compressed/Makefile | 10 +- arch/x86/include/asm/vermagic.h | 72 ++ block/elevator.c | 7 +- drivers/Makefile | 15 +- drivers/i2c/busses/Kconfig | 9 + drivers/i2c/busses/Makefile | 1 + drivers/i2c/busses/i2c-nct6775.c | 647 ++++++++++++++++++ drivers/i2c/busses/i2c-piix4.c | 4 +- drivers/md/dm-crypt.c | 5 + include/linux/pagemap.h | 2 +- include/linux/user_namespace.h | 4 + include/net/netns/ipv4.h | 1 + include/trace/events/tcp.h | 7 + init/Kconfig | 39 ++ kernel/Kconfig.hz | 24 + kernel/fork.c | 14 + kernel/module/Kconfig | 28 +- kernel/module/decompress.c | 92 ++- kernel/rcu/Kconfig | 4 +- kernel/rcu/rcutorture.c | 2 +- kernel/rcu/tree.c | 6 +- kernel/rcu/tree_nocb.h | 4 +- kernel/rcu/tree_plugin.h | 4 +- kernel/sched/fair.c | 20 +- kernel/sysctl.c | 12 + kernel/user_namespace.c | 7 + lib/Kconfig.debug | 29 +- mm/Kconfig | 2 +- mm/compaction.c | 4 + mm/page-writeback.c | 8 + mm/swap.c | 5 + mm/vmpressure.c | 4 + mm/vmscan.c | 8 + net/ipv4/sysctl_net_ipv4.c | 7 + net/ipv4/tcp_input.c | 36 + net/ipv4/tcp_ipv4.c | 2 + scripts/Makefile.debug | 6 +- scripts/Makefile.lib | 13 +- scripts/Makefile.modinst | 7 +- 60 files changed, 1636 insertions(+), 64 deletions(-) create mode 100644 arch/x86/Makefile.postlink create mode 100644 drivers/i2c/busses/i2c-nct6775.c diff --git a/.gitignore b/.gitignore index 5da004814678..cb59d89372c0 100644 --- a/.gitignore +++ b/.gitignore @@ -61,6 +61,7 @@ modules.order /vmlinux /vmlinux.32 /vmlinux.map +/vmlinux.relocs /vmlinux.symvers /vmlinux-gdb.py /vmlinuz diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6b838869554b..0cef8c4c1f28 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4737,7 +4737,7 @@ overwritten. rcutree.kthread_prio= [KNL,BOOT] - Set the SCHED_FIFO priority of the RCU per-CPU + Set the SCHED_RR priority of the RCU per-CPU kthreads (rcuc/N). 
This value is also used for the priority of the RCU boost threads (rcub/N) and for the RCU grace-period kthreads (rcu_bh, diff --git a/Documentation/dontdiff b/Documentation/dontdiff index 352ff53a2306..7c210744d84c 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff @@ -255,6 +255,7 @@ vmlinux.aout vmlinux.bin.all vmlinux.lds vmlinux.map +vmlinux.relocs vmlinux.symvers vmlinuz voffset.h diff --git a/Makefile b/Makefile index a842ec6d1932..2bd738a8e481 100644 --- a/Makefile +++ b/Makefile @@ -830,6 +830,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE KBUILD_CFLAGS += -O2 KBUILD_RUSTFLAGS += -Copt-level=2 +else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 +KBUILD_CFLAGS += -O3 +KBUILD_RUSTFLAGS += -Copt-level=3 else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE KBUILD_CFLAGS += -Os KBUILD_RUSTFLAGS += -Copt-level=s @@ -1066,11 +1069,6 @@ KBUILD_CFLAGS += -fno-strict-overflow # Make sure -fstack-check isn't enabled (like gentoo apparently did) KBUILD_CFLAGS += -fno-stack-check -# conserve stack if available -ifdef CONFIG_CC_IS_GCC -KBUILD_CFLAGS += -fconserve-stack -endif - # Prohibit date/time macros, which would make the build non-deterministic KBUILD_CFLAGS += -Werror=date-time diff --git a/arch/arc/configs/axs101_defconfig b/arch/arc/configs/axs101_defconfig index 81764160451f..2c15d3bf747a 100644 --- a/arch/arc/configs/axs101_defconfig +++ b/arch/arc/configs/axs101_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/axs103_defconfig b/arch/arc/configs/axs103_defconfig index d5181275490e..7d868e148d9a 100644 --- a/arch/arc/configs/axs103_defconfig +++ b/arch/arc/configs/axs103_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/axs103_smp_defconfig b/arch/arc/configs/axs103_smp_defconfig index 2f336d99a8cf..777a9f21eb6b 100644 --- a/arch/arc/configs/axs103_smp_defconfig +++ b/arch/arc/configs/axs103_smp_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/haps_hs_defconfig b/arch/arc/configs/haps_hs_defconfig index 899b2fd5c71d..bda15a876849 100644 --- a/arch/arc/configs/haps_hs_defconfig +++ b/arch/arc/configs/haps_hs_defconfig @@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EXPERT=y CONFIG_PERF_EVENTS=y # CONFIG_COMPAT_BRK is not set diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig index 0d32aac8069f..dbd74fea69aa 100644 --- a/arch/arc/configs/haps_hs_smp_defconfig +++ b/arch/arc/configs/haps_hs_smp_defconfig @@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/hsdk_defconfig b/arch/arc/configs/hsdk_defconfig 
index d18378d2c2a6..2396ca417182 100644 --- a/arch/arc/configs/hsdk_defconfig +++ b/arch/arc/configs/hsdk_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y CONFIG_BLK_DEV_RAM=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig index 3e9829775992..5044609540cc 100644 --- a/arch/arc/configs/nsim_700_defconfig +++ b/arch/arc/configs/nsim_700_defconfig @@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig index 502c87f351c8..748c809d1c4c 100644 --- a/arch/arc/configs/nsimosci_defconfig +++ b/arch/arc/configs/nsimosci_defconfig @@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y diff --git a/arch/arc/configs/nsimosci_hs_defconfig b/arch/arc/configs/nsimosci_hs_defconfig index f721cc3997d0..205c32b0074c 100644 --- a/arch/arc/configs/nsimosci_hs_defconfig +++ b/arch/arc/configs/nsimosci_hs_defconfig @@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y diff --git a/arch/arc/configs/nsimosci_hs_smp_defconfig b/arch/arc/configs/nsimosci_hs_smp_defconfig index 1419fc946a08..2477b7c80977 100644 --- a/arch/arc/configs/nsimosci_hs_smp_defconfig +++ b/arch/arc/configs/nsimosci_hs_smp_defconfig @@ -8,6 +8,7 @@ CONFIG_IKCONFIG_PROC=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_PERF_EVENTS=y # CONFIG_COMPAT_BRK is not set CONFIG_KPROBES=y diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig index 6f0d2be9d926..cf02ad0fc210 100644 --- a/arch/arc/configs/tb10x_defconfig +++ b/arch/arc/configs/tb10x_defconfig @@ -14,6 +14,7 @@ CONFIG_INITRAMFS_SOURCE="../tb10x-rootfs.cpio" CONFIG_INITRAMFS_ROOT_UID=2100 CONFIG_INITRAMFS_ROOT_GID=501 # CONFIG_RD_GZIP is not set +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_KALLSYMS_ALL=y # CONFIG_AIO is not set CONFIG_EMBEDDED=y diff --git a/arch/arc/configs/vdk_hs38_defconfig b/arch/arc/configs/vdk_hs38_defconfig index d3ef189c75f8..922b1b24f518 100644 --- a/arch/arc/configs/vdk_hs38_defconfig +++ b/arch/arc/configs/vdk_hs38_defconfig @@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/vdk_hs38_smp_defconfig b/arch/arc/configs/vdk_hs38_smp_defconfig index 944b347025fd..ed64319f7eb2 100644 --- a/arch/arc/configs/vdk_hs38_smp_defconfig +++ b/arch/arc/configs/vdk_hs38_smp_defconfig @@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 542377cd419d..08d887d1220d 100644 --- 
a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -157,7 +157,7 @@ config MPENTIUM4 config MK6 - bool "K6/K6-II/K6-III" + bool "AMD K6/K6-II/K6-III" depends on X86_32 help Select this for an AMD K6-family processor. Enables use of @@ -165,7 +165,7 @@ config MK6 flags to GCC. config MK7 - bool "Athlon/Duron/K7" + bool "AMD Athlon/Duron/K7" depends on X86_32 help Select this for an AMD Athlon K7-family processor. Enables use of @@ -173,12 +173,106 @@ config MK7 flags to GCC. config MK8 - bool "Opteron/Athlon64/Hammer/K8" + bool "AMD Opteron/Athlon64/Hammer/K8" help Select this for an AMD Opteron or Athlon64 Hammer-family processor. Enables use of some extended instructions, and passes appropriate optimization flags to GCC. +config MK8SSE3 + bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3" + help + Select this for AMD Opteron or Athlon64 Hammer-family processors with SSE3 support. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. + +config MK10 + bool "AMD 61xx/7x50/PhenomX3/X4/II/K10" + help + Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50, + Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. + +config MBARCELONA + bool "AMD Barcelona" + help + Select this for AMD Family 10h Barcelona processors. + + Enables -march=barcelona + +config MBOBCAT + bool "AMD Bobcat" + help + Select this for AMD Family 14h Bobcat processors. + + Enables -march=btver1 + +config MJAGUAR + bool "AMD Jaguar" + help + Select this for AMD Family 16h Jaguar processors. + + Enables -march=btver2 + +config MBULLDOZER + bool "AMD Bulldozer" + help + Select this for AMD Family 15h Bulldozer processors. + + Enables -march=bdver1 + +config MPILEDRIVER + bool "AMD Piledriver" + help + Select this for AMD Family 15h Piledriver processors. + + Enables -march=bdver2 + +config MSTEAMROLLER + bool "AMD Steamroller" + help + Select this for AMD Family 15h Steamroller processors. + + Enables -march=bdver3 + +config MEXCAVATOR + bool "AMD Excavator" + help + Select this for AMD Family 15h Excavator processors. + + Enables -march=bdver4 + +config MZEN + bool "AMD Zen" + help + Select this for AMD Family 17h Zen processors. + + Enables -march=znver1 + +config MZEN2 + bool "AMD Zen 2" + help + Select this for AMD Family 17h Zen 2 processors. + + Enables -march=znver2 + +config MZEN3 + bool "AMD Zen 3" + depends on (CC_IS_GCC && GCC_VERSION >= 100300) || (CC_IS_CLANG && CLANG_VERSION >= 120000) + help + Select this for AMD Family 19h Zen 3 processors. + + Enables -march=znver3 + +config MZEN4 + bool "AMD Zen 4" + depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) + help + Select this for AMD Family 19h Zen 4 processors. + + Enables -march=znver4 + config MCRUSOE bool "Crusoe" depends on X86_32 @@ -270,7 +364,7 @@ config MPSC in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. config MCORE2 - bool "Core 2/newer Xeon" + bool "Intel Core 2" help Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and @@ -278,6 +372,8 @@ config MCORE2 family in /proc/cpuinfo. Newer ones have 6 and older ones 15 (not a typo) + Enables -march=core2 + config MATOM bool "Intel Atom" help Select this for the Intel Atom platform. Intel Atom CPUs have an in-order pipelining architecture and thus can benefit from accordingly optimized code. Use a recent GCC with specific Atom support in order to fully benefit from selecting this option.
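Most of the entries above and below boil down to passing a single -march= value, and several are gated on toolchain versions (e.g. MZEN4 wants GCC >= 13 or Clang >= 15.5). A quick way to confirm the installed compiler actually accepts a given value before selecting the matching option — a sketch only, with gcc and znver4 as example choices:

    $ gcc -march=znver4 -E -x c /dev/null -o /dev/null && echo "znver4 supported"
    $ gcc -march=native -Q --help=target | grep -- '-march='   # what native resolves to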
+config MNEHALEM + bool "Intel Nehalem" + select X86_P6_NOP + help + + Select this for 1st Gen Core processors in the Nehalem family. + + Enables -march=nehalem + +config MWESTMERE + bool "Intel Westmere" + select X86_P6_NOP + help + + Select this for the Intel Westmere formerly Nehalem-C family. + + Enables -march=westmere + +config MSILVERMONT + bool "Intel Silvermont" + select X86_P6_NOP + help + + Select this for the Intel Silvermont platform. + + Enables -march=silvermont + +config MGOLDMONT + bool "Intel Goldmont" + select X86_P6_NOP + help + + Select this for the Intel Goldmont platform including Apollo Lake and Denverton. + + Enables -march=goldmont + +config MGOLDMONTPLUS + bool "Intel Goldmont Plus" + select X86_P6_NOP + help + + Select this for the Intel Goldmont Plus platform including Gemini Lake. + + Enables -march=goldmont-plus + +config MSANDYBRIDGE + bool "Intel Sandy Bridge" + select X86_P6_NOP + help + + Select this for 2nd Gen Core processors in the Sandy Bridge family. + + Enables -march=sandybridge + +config MIVYBRIDGE + bool "Intel Ivy Bridge" + select X86_P6_NOP + help + + Select this for 3rd Gen Core processors in the Ivy Bridge family. + + Enables -march=ivybridge + +config MHASWELL + bool "Intel Haswell" + select X86_P6_NOP + help + + Select this for 4th Gen Core processors in the Haswell family. + + Enables -march=haswell + +config MBROADWELL + bool "Intel Broadwell" + select X86_P6_NOP + help + + Select this for 5th Gen Core processors in the Broadwell family. + + Enables -march=broadwell + +config MSKYLAKE + bool "Intel Skylake" + select X86_P6_NOP + help + + Select this for 6th Gen Core processors in the Skylake family. + + Enables -march=skylake + +config MSKYLAKEX + bool "Intel Skylake X" + select X86_P6_NOP + help + + Select this for 6th Gen Core processors in the Skylake X family. + + Enables -march=skylake-avx512 + +config MCANNONLAKE + bool "Intel Cannon Lake" + select X86_P6_NOP + help + + Select this for 8th Gen Core processors + + Enables -march=cannonlake + +config MICELAKE + bool "Intel Ice Lake" + select X86_P6_NOP + help + + Select this for 10th Gen Core processors in the Ice Lake family. + + Enables -march=icelake-client + +config MCASCADELAKE + bool "Intel Cascade Lake" + select X86_P6_NOP + help + + Select this for Xeon processors in the Cascade Lake family. + + Enables -march=cascadelake + +config MCOOPERLAKE + bool "Intel Cooper Lake" + depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) + select X86_P6_NOP + help + + Select this for Xeon processors in the Cooper Lake family. + + Enables -march=cooperlake + +config MTIGERLAKE + bool "Intel Tiger Lake" + depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) + select X86_P6_NOP + help + + Select this for third-generation 10 nm process processors in the Tiger Lake family. + + Enables -march=tigerlake + +config MSAPPHIRERAPIDS + bool "Intel Sapphire Rapids" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) + select X86_P6_NOP + help + + Select this for third-generation 10 nm process processors in the Sapphire Rapids family. + + Enables -march=sapphirerapids + +config MROCKETLAKE + bool "Intel Rocket Lake" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) + select X86_P6_NOP + help + + Select this for eleventh-generation processors in the Rocket Lake family. 
+ + Enables -march=rocketlake + +config MALDERLAKE + bool "Intel Alder Lake" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) + select X86_P6_NOP + help + + Select this for twelfth-generation processors in the Alder Lake family. + + Enables -march=alderlake + +config MRAPTORLAKE + bool "Intel Raptor Lake" + depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) + select X86_P6_NOP + help + + Select this for thirteenth-generation processors in the Raptor Lake family. + + Enables -march=raptorlake + +config MMETEORLAKE + bool "Intel Meteor Lake" + depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) + select X86_P6_NOP + help + + Select this for fourteenth-generation processors in the Meteor Lake family. + + Enables -march=meteorlake + config GENERIC_CPU bool "Generic-x86-64" depends on X86_64 @@ -294,6 +586,50 @@ config GENERIC_CPU Generic x86-64 CPU. Run equally well on all x86-64 CPUs. +config GENERIC_CPU2 + bool "Generic-x86-64-v2" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) + depends on X86_64 + help + Generic x86-64 CPU. + Run equally well on all x86-64 CPUs with min support of x86-64-v2. + +config GENERIC_CPU3 + bool "Generic-x86-64-v3" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) + depends on X86_64 + help + Generic x86-64-v3 CPU with v3 instructions. + Run equally well on all x86-64 CPUs with min support of x86-64-v3. + +config GENERIC_CPU4 + bool "Generic-x86-64-v4" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) + depends on X86_64 + help + Generic x86-64 CPU with v4 instructions. + Run equally well on all x86-64 CPUs with min support of x86-64-v4. + +config MNATIVE_INTEL + bool "Intel-Native optimizations autodetected by the compiler" + help + + Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects + the optimum settings to use based on your processor. Do NOT use this + for AMD CPUs. Intel Only! + + Enables -march=native + +config MNATIVE_AMD + bool "AMD-Native optimizations autodetected by the compiler" + help + + Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects + the optimum settings to use based on your processor. Do NOT use this + for Intel CPUs. AMD Only! 
+ + Enables -march=native + endchoice config X86_GENERIC @@ -318,9 +654,17 @@ config X86_INTERNODE_CACHE_SHIFT config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC - default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU + default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || MK8SSE3 || MK10 \ + || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \ + || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \ + || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \ + || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ + || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE \ + || MNATIVE_INTEL || MNATIVE_AMD || X86_GENERIC || GENERIC_CPU || GENERIC_CPU2 || GENERIC_CPU3 \ + || GENERIC_CPU4 default "4" if MELAN || M486SX || M486 || MGEODEGX1 - default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX + default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII \ + || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX config X86_F00F_BUG def_bool y @@ -332,15 +676,27 @@ config X86_INVD_BUG config X86_ALIGNMENT_16 def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1 + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC \ + || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1 config X86_INTEL_USERCOPY def_bool y - depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 + depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC \ + || MK8 || MK7 || MEFFICEON || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \ + || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \ + || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \ + || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL config X86_USE_PPRO_CHECKSUM def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \ + || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX \ + || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \ + || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \ + || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE \ + || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE \ + || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \ + || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL || MNATIVE_AMD # # P6_NOPs are a relatively minor optimization that require a family >= @@ -356,32 +712,62 @@ config X86_USE_PPRO_CHECKSUM config X86_P6_NOP def_bool y depends on X86_64 - depends on (MCORE2 || MPENTIUM4 || MPSC) + depends on (MCORE2 || 
MPENTIUM4 || MPSC || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \ + || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE \ + || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE \ + || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL) config X86_TSC def_bool y - depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 + depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \ + || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 \ + || MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \ + || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \ + || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL \ + || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ + || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL \ + || MNATIVE_AMD) || X86_64 config X86_CMPXCHG64 def_bool y - depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 + depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ + || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 || MK8SSE3 || MK10 \ + || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN \ + || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS \ + || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE \ + || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \ + || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL || MNATIVE_AMD # this should be set for all -march=.. options where the compiler # generates cmov. 
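Whether a given -march makes the compiler emit cmov can be spot-checked in isolation; a sketch (gcc assumed, and the exact instruction selection varies by compiler version and optimization level):

    $ echo 'int f(int a, int b) { return a > b ? a : b; }' | \
          gcc -O2 -march=core2 -S -x c -o - - | grep cmov
            cmovge  %edi, %eax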
config X86_CMOV def_bool y - depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) + depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ + || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX || MK8SSE3 || MK10 \ + || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR \ + || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \ + || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \ + || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \ + || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL || MNATIVE_AMD) config X86_MINIMUM_CPU_FAMILY int default "64" if X86_64 - default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8) + default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ + || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8 || MK8SSE3 \ + || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \ + || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \ + || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \ + || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ + || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE \ + || MNATIVE_INTEL || MNATIVE_AMD) default "5" if X86_32 && X86_CMPXCHG64 default "4" config X86_DEBUGCTLMSR def_bool y - depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486SX || M486) && !UML + depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 \ + || M486SX || M486) && !UML config IA32_FEAT_CTL def_bool y diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 3419ffa2a350..84cfee1f749f 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -67,7 +67,8 @@ export BITS # # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 # -KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx +KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-avx2 \ + -mno-avx512f -O3 KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 ifeq ($(CONFIG_X86_KERNEL_IBT),y) @@ -151,8 +152,47 @@ else # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) cflags-$(CONFIG_MK8) += -march=k8 cflags-$(CONFIG_MPSC) += -march=nocona - cflags-$(CONFIG_MCORE2) += -march=core2 - cflags-$(CONFIG_MATOM) += -march=atom + cflags-$(CONFIG_MK8SSE3) += -march=k8-sse3 + cflags-$(CONFIG_MK10) += -march=amdfam10 + cflags-$(CONFIG_MBARCELONA) += -march=barcelona + cflags-$(CONFIG_MBOBCAT) += -march=btver1 + cflags-$(CONFIG_MJAGUAR) += -march=btver2 + cflags-$(CONFIG_MBULLDOZER) += -march=bdver1 + cflags-$(CONFIG_MPILEDRIVER) += -march=bdver2 -mno-tbm + cflags-$(CONFIG_MSTEAMROLLER) += -march=bdver3 -mno-tbm + cflags-$(CONFIG_MEXCAVATOR) += -march=bdver4 -mno-tbm + cflags-$(CONFIG_MZEN) += -march=znver1 + cflags-$(CONFIG_MZEN2) += -march=znver2 + cflags-$(CONFIG_MZEN3) += -march=znver3 + cflags-$(CONFIG_MZEN4) += -march=znver4 + cflags-$(CONFIG_MNATIVE_INTEL) +=
-march=native + cflags-$(CONFIG_MNATIVE_AMD) += -march=native + cflags-$(CONFIG_MATOM) += -march=bonnell + cflags-$(CONFIG_MCORE2) += -march=core2 + cflags-$(CONFIG_MNEHALEM) += -march=nehalem + cflags-$(CONFIG_MWESTMERE) += -march=westmere + cflags-$(CONFIG_MSILVERMONT) += -march=silvermont + cflags-$(CONFIG_MGOLDMONT) += -march=goldmont + cflags-$(CONFIG_MGOLDMONTPLUS) += -march=goldmont-plus + cflags-$(CONFIG_MSANDYBRIDGE) += -march=sandybridge + cflags-$(CONFIG_MIVYBRIDGE) += -march=ivybridge + cflags-$(CONFIG_MHASWELL) += -march=haswell + cflags-$(CONFIG_MBROADWELL) += -march=broadwell + cflags-$(CONFIG_MSKYLAKE) += -march=skylake + cflags-$(CONFIG_MSKYLAKEX) += -march=skylake-avx512 + cflags-$(CONFIG_MCANNONLAKE) += -march=cannonlake + cflags-$(CONFIG_MICELAKE) += -march=icelake-client + cflags-$(CONFIG_MCASCADELAKE) += -march=cascadelake + cflags-$(CONFIG_MCOOPERLAKE) += -march=cooperlake + cflags-$(CONFIG_MTIGERLAKE) += -march=tigerlake + cflags-$(CONFIG_MSAPPHIRERAPIDS) += -march=sapphirerapids + cflags-$(CONFIG_MROCKETLAKE) += -march=rocketlake + cflags-$(CONFIG_MALDERLAKE) += -march=alderlake + cflags-$(CONFIG_MRAPTORLAKE) += -march=raptorlake + cflags-$(CONFIG_MMETEORLAKE) += -march=meteorlake + cflags-$(CONFIG_GENERIC_CPU2) += -march=x86-64-v2 + cflags-$(CONFIG_GENERIC_CPU3) += -march=x86-64-v3 + cflags-$(CONFIG_GENERIC_CPU4) += -march=x86-64-v4 cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic KBUILD_CFLAGS += $(cflags-y) diff --git a/arch/x86/Makefile.postlink b/arch/x86/Makefile.postlink new file mode 100644 index 000000000000..b38ffa4defb3 --- /dev/null +++ b/arch/x86/Makefile.postlink @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: GPL-2.0 +# =========================================================================== +# Post-link x86 pass +# =========================================================================== +# +# 1. Separate relocations from vmlinux into vmlinux.relocs. +# 2. Strip relocations from vmlinux. 
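Spelled out, the rule below amounts to roughly this two-step sequence on the finished image (only when CONFIG_X86_NEED_RELOCS=y; $(OBJCOPY) resolves to the target objcopy):

    arch/x86/tools/relocs vmlinux > vmlinux.relocs
    arch/x86/tools/relocs --abs-relocs vmlinux
    objcopy --remove-section='.rel.*' --remove-section='.rel__*' \
            --remove-section='.rela.*' --remove-section='.rela__*' vmlinux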
+ +PHONY := __archpost +__archpost: + +-include include/config/auto.conf +include scripts/Kbuild.include + +CMD_RELOCS = arch/x86/tools/relocs +quiet_cmd_relocs = RELOCS $@.relocs + cmd_relocs = $(CMD_RELOCS) $@ > $@.relocs;$(CMD_RELOCS) --abs-relocs $@ + +quiet_cmd_strip_relocs = RSTRIP $@ + cmd_strip_relocs = $(OBJCOPY) --remove-section='.rel.*' --remove-section='.rel__*' --remove-section='.rela.*' --remove-section='.rela__*' $@ + +# `@true` prevents complaint when there is nothing to be done + +vmlinux: FORCE + @true +ifeq ($(CONFIG_X86_NEED_RELOCS),y) + $(call cmd,relocs) + $(call cmd,strip_relocs) +endif + +%.ko: FORCE + @true + +clean: + @rm -f vmlinux.relocs + +PHONY += FORCE clean + +FORCE: + +.PHONY: $(PHONY) diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore index 25805199a506..b2968175fc27 100644 --- a/arch/x86/boot/compressed/.gitignore +++ b/arch/x86/boot/compressed/.gitignore @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only relocs vmlinux.bin.all -vmlinux.relocs vmlinux.lds mkpiggy piggy.S diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 3a261abb6d15..75a467a408d2 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -121,14 +121,12 @@ $(obj)/vmlinux.bin: vmlinux FORCE targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs -CMD_RELOCS = arch/x86/tools/relocs -quiet_cmd_relocs = RELOCS $@ - cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $< -$(obj)/vmlinux.relocs: vmlinux FORCE - $(call if_changed,relocs) +# vmlinux.relocs is created by the vmlinux postlink step. +vmlinux.relocs: vmlinux + @true vmlinux.bin.all-y := $(obj)/vmlinux.bin -vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs +vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += vmlinux.relocs $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE $(call if_changed,gzip) diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h index 75884d2cdec3..18021e8c0c28 100644 --- a/arch/x86/include/asm/vermagic.h +++ b/arch/x86/include/asm/vermagic.h @@ -17,6 +17,52 @@ #define MODULE_PROC_FAMILY "586MMX " #elif defined CONFIG_MCORE2 #define MODULE_PROC_FAMILY "CORE2 " +#elif defined CONFIG_MNATIVE_INTEL +#define MODULE_PROC_FAMILY "NATIVE_INTEL " +#elif defined CONFIG_MNATIVE_AMD +#define MODULE_PROC_FAMILY "NATIVE_AMD " +#elif defined CONFIG_MNEHALEM +#define MODULE_PROC_FAMILY "NEHALEM " +#elif defined CONFIG_MWESTMERE +#define MODULE_PROC_FAMILY "WESTMERE " +#elif defined CONFIG_MSILVERMONT +#define MODULE_PROC_FAMILY "SILVERMONT " +#elif defined CONFIG_MGOLDMONT +#define MODULE_PROC_FAMILY "GOLDMONT " +#elif defined CONFIG_MGOLDMONTPLUS +#define MODULE_PROC_FAMILY "GOLDMONTPLUS " +#elif defined CONFIG_MSANDYBRIDGE +#define MODULE_PROC_FAMILY "SANDYBRIDGE " +#elif defined CONFIG_MIVYBRIDGE +#define MODULE_PROC_FAMILY "IVYBRIDGE " +#elif defined CONFIG_MHASWELL +#define MODULE_PROC_FAMILY "HASWELL " +#elif defined CONFIG_MBROADWELL +#define MODULE_PROC_FAMILY "BROADWELL " +#elif defined CONFIG_MSKYLAKE +#define MODULE_PROC_FAMILY "SKYLAKE " +#elif defined CONFIG_MSKYLAKEX +#define MODULE_PROC_FAMILY "SKYLAKEX " +#elif defined CONFIG_MCANNONLAKE +#define MODULE_PROC_FAMILY "CANNONLAKE " +#elif defined CONFIG_MICELAKE +#define MODULE_PROC_FAMILY "ICELAKE " +#elif defined CONFIG_MCASCADELAKE +#define MODULE_PROC_FAMILY "CASCADELAKE " +#elif defined CONFIG_MCOOPERLAKE +#define MODULE_PROC_FAMILY "COOPERLAKE " +#elif defined CONFIG_MTIGERLAKE 
+#define MODULE_PROC_FAMILY "TIGERLAKE " +#elif defined CONFIG_MSAPPHIRERAPIDS +#define MODULE_PROC_FAMILY "SAPPHIRERAPIDS " +#elif defined CONFIG_MROCKETLAKE +#define MODULE_PROC_FAMILY "ROCKETLAKE " +#elif defined CONFIG_MALDERLAKE +#define MODULE_PROC_FAMILY "ALDERLAKE " +#elif defined CONFIG_MRAPTORLAKE +#define MODULE_PROC_FAMILY "RAPTORLAKE " +#elif defined CONFIG_MMETEORLAKE +#define MODULE_PROC_FAMILY "METEORLAKE " #elif defined CONFIG_MATOM #define MODULE_PROC_FAMILY "ATOM " #elif defined CONFIG_M686 @@ -35,6 +81,32 @@ #define MODULE_PROC_FAMILY "K7 " #elif defined CONFIG_MK8 #define MODULE_PROC_FAMILY "K8 " +#elif defined CONFIG_MK8SSE3 +#define MODULE_PROC_FAMILY "K8SSE3 " +#elif defined CONFIG_MK10 +#define MODULE_PROC_FAMILY "K10 " +#elif defined CONFIG_MBARCELONA +#define MODULE_PROC_FAMILY "BARCELONA " +#elif defined CONFIG_MBOBCAT +#define MODULE_PROC_FAMILY "BOBCAT " +#elif defined CONFIG_MBULLDOZER +#define MODULE_PROC_FAMILY "BULLDOZER " +#elif defined CONFIG_MPILEDRIVER +#define MODULE_PROC_FAMILY "PILEDRIVER " +#elif defined CONFIG_MSTEAMROLLER +#define MODULE_PROC_FAMILY "STEAMROLLER " +#elif defined CONFIG_MJAGUAR +#define MODULE_PROC_FAMILY "JAGUAR " +#elif defined CONFIG_MEXCAVATOR +#define MODULE_PROC_FAMILY "EXCAVATOR " +#elif defined CONFIG_MZEN +#define MODULE_PROC_FAMILY "ZEN " +#elif defined CONFIG_MZEN2 +#define MODULE_PROC_FAMILY "ZEN2 " +#elif defined CONFIG_MZEN3 +#define MODULE_PROC_FAMILY "ZEN3 " +#elif defined CONFIG_MZEN4 +#define MODULE_PROC_FAMILY "ZEN4 " #elif defined CONFIG_MELAN #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE diff --git a/block/elevator.c b/block/elevator.c index bd71f0fc4e4b..389cb51389af 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -640,8 +640,13 @@ static struct elevator_type *elevator_get_default(struct request_queue *q) if (q->nr_hw_queues != 1 && !blk_mq_is_shared_tags(q->tag_set->flags)) +#if defined(CONFIG_CACHY) && defined(CONFIG_MQ_IOSCHED_KYBER) + return elevator_get(q, "kyber", false); +#elif defined(CONFIG_CACHY) + return elevator_get(q, "mq-deadline", false); +#else return NULL; - +#endif return elevator_get(q, "mq-deadline", false); } diff --git a/drivers/Makefile b/drivers/Makefile index bdf1c66141c9..1e1a0832fb48 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -59,15 +59,8 @@ obj-y += char/ # iommu/ comes before gpu as gpu are using iommu controllers obj-y += iommu/ -# gpu/ comes after char for AGP vs DRM startup and after iommu -obj-y += gpu/ - obj-$(CONFIG_CONNECTOR) += connector/ -# i810fb and intelfb depend on char/agp/ -obj-$(CONFIG_FB_I810) += video/fbdev/i810/ -obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ - obj-$(CONFIG_PARPORT) += parport/ obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ @@ -79,6 +72,14 @@ obj-y += macintosh/ obj-y += scsi/ obj-y += nvme/ obj-$(CONFIG_ATA) += ata/ + +# gpu/ comes after char for AGP vs DRM startup and after iommu +obj-y += gpu/ + +# i810fb and intelfb depend on char/agp/ +obj-$(CONFIG_FB_I810) += video/fbdev/i810/ +obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ + obj-$(CONFIG_TARGET_CORE) += target/ obj-$(CONFIG_MTD) += mtd/ obj-$(CONFIG_SPI) += spi/ diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index e50f9603d189..ef652f896599 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -229,6 +229,15 @@ config I2C_CHT_WC combined with a FUSB302 Type-C port-controller as such it is advised to also select CONFIG_TYPEC_FUSB302=m.
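Once the driver introduced below is built as a module (the Kconfig help names it i2c-nct6775), exercising it would look roughly like this — a hypothetical session; the bus number and base address depend on the board:

    $ modprobe i2c-nct6775
    $ i2cdetect -l | grep NCT67
    i2c-1   smbus   SMBus NCT67xx adapter at 0290   SMBus adapter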
+config I2C_NCT6775 + tristate "Nuvoton NCT6775 and compatible SMBus controller" + help + If you say yes to this option, support will be included for the + Nuvoton NCT6775 and compatible SMBus controllers. + + This driver can also be built as a module. If so, the module + will be called i2c-nct6775. + config I2C_NFORCE2 tristate "Nvidia nForce2, nForce3 and nForce4" depends on PCI diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile index e73cdb1d2b5a..052ccd05c13c 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_I2C_CHT_WC) += i2c-cht-wc.o obj-$(CONFIG_I2C_I801) += i2c-i801.o obj-$(CONFIG_I2C_ISCH) += i2c-isch.o obj-$(CONFIG_I2C_ISMT) += i2c-ismt.o +obj-$(CONFIG_I2C_NCT6775) += i2c-nct6775.o obj-$(CONFIG_I2C_NFORCE2) += i2c-nforce2.o obj-$(CONFIG_I2C_NFORCE2_S4985) += i2c-nforce2-s4985.o obj-$(CONFIG_I2C_NVIDIA_GPU) += i2c-nvidia-gpu.o diff --git a/drivers/i2c/busses/i2c-nct6775.c b/drivers/i2c/busses/i2c-nct6775.c new file mode 100644 index 000000000000..0462f0952043 --- /dev/null +++ b/drivers/i2c/busses/i2c-nct6775.c @@ -0,0 +1,647 @@ +/* + * i2c-nct6775 - Driver for the SMBus master functionality of + * Nuvoton NCT677x Super-I/O chips + * + * Copyright (C) 2019 Adam Honse <calcprogrammer1@gmail.com> + * + * Derived from nct6775 hwmon driver + * Copyright (C) 2012 Guenter Roeck <linux@roeck-us.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/jiffies.h> +#include <linux/platform_device.h> +#include <linux/hwmon.h> +#include <linux/hwmon-sysfs.h> +#include <linux/hwmon-vid.h> +#include <linux/err.h> +#include <linux/mutex.h> +#include <linux/delay.h> +#include <linux/ioport.h> +#include <linux/i2c.h> +#include <linux/acpi.h> +#include <linux/bitops.h> +#include <linux/dmi.h> +#include <linux/io.h> +#include <linux/nospec.h> + +#define DRVNAME "i2c-nct6775" + +/* Nuvoton SMBus address offsets */ +#define SMBHSTDAT (0 + nuvoton_nct6793d_smba) +#define SMBBLKSZ (1 + nuvoton_nct6793d_smba) +#define SMBHSTCMD (2 + nuvoton_nct6793d_smba) +#define SMBHSTIDX (3 + nuvoton_nct6793d_smba) //Index field is the Command field on other controllers +#define SMBHSTCTL (4 + nuvoton_nct6793d_smba) +#define SMBHSTADD (5 + nuvoton_nct6793d_smba) +#define SMBHSTERR (9 + nuvoton_nct6793d_smba) +#define SMBHSTSTS (0xE + nuvoton_nct6793d_smba) + +/* Command register */ +#define NCT6793D_READ_BYTE 0 +#define NCT6793D_READ_WORD 1 +#define NCT6793D_READ_BLOCK 2 +#define NCT6793D_BLOCK_WRITE_READ_PROC_CALL 3 +#define NCT6793D_PROC_CALL 4 +#define NCT6793D_WRITE_BYTE 8 +#define NCT6793D_WRITE_WORD 9 +#define NCT6793D_WRITE_BLOCK 10 + +/* Control register */ +#define NCT6793D_MANUAL_START 128 +#define NCT6793D_SOFT_RESET 64 + +/* Error register */ +#define NCT6793D_NO_ACK 32 + +/* Status register */ +#define NCT6793D_FIFO_EMPTY 1 +#define NCT6793D_FIFO_FULL 2 +#define NCT6793D_MANUAL_ACTIVE 4 + +#define NCT6775_LD_SMBUS 0x0B + +/* Other settings */ +#define MAX_RETRIES 400 + +enum kinds { nct6106, nct6775, nct6776, nct6779, nct6791, nct6792, nct6793, + nct6795, nct6796, nct6798 }; + +struct nct6775_sio_data { + int sioreg; + enum kinds kind; +}; + +/* used to set data->name = nct6775_device_names[data->sio_kind] */ +static const char * const nct6775_device_names[] = { + "nct6106", + "nct6775", + "nct6776", + "nct6779", + "nct6791", + "nct6792", + "nct6793", + "nct6795", + "nct6796", + "nct6798", +}; + +static const char * const nct6775_sio_names[] __initconst = { + "NCT6106D", + "NCT6775F", + "NCT6776D/F", + "NCT6779D", + "NCT6791D", + "NCT6792D", + "NCT6793D", + "NCT6795D", + "NCT6796D", + "NCT6798D", +}; + +#define SIO_REG_LDSEL 0x07 /* Logical device select */ +#define SIO_REG_DEVID 0x20 /* Device ID (2 bytes) */ +#define SIO_REG_SMBA 0x62 /* SMBus base address register */ + +#define SIO_NCT6106_ID 0xc450 +#define SIO_NCT6775_ID 0xb470 +#define SIO_NCT6776_ID 0xc330 +#define SIO_NCT6779_ID 0xc560 +#define SIO_NCT6791_ID 0xc800 +#define SIO_NCT6792_ID 0xc910 +#define SIO_NCT6793_ID 0xd120 +#define SIO_NCT6795_ID 0xd350 +#define SIO_NCT6796_ID 0xd420 +#define SIO_NCT6798_ID 0xd428 +#define SIO_ID_MASK 0xFFF0 + +static inline void +superio_outb(int ioreg, int reg, int val) +{ + outb(reg, ioreg); + outb(val, ioreg + 1); +} + +static inline int +superio_inb(int ioreg, int reg) +{ + outb(reg, ioreg); + return inb(ioreg + 1); +} + +static inline void +superio_select(int ioreg, int ld) +{ + outb(SIO_REG_LDSEL, ioreg); + outb(ld, ioreg + 1); +} + +static inline int +superio_enter(int ioreg) +{ + /* + * Try to reserve <ioreg> and <ioreg + 1> for exclusive access.
+ */ + if (!request_muxed_region(ioreg, 2, DRVNAME)) + return -EBUSY; + + outb(0x87, ioreg); + outb(0x87, ioreg); + + return 0; +} + +static inline void +superio_exit(int ioreg) +{ + outb(0xaa, ioreg); + outb(0x02, ioreg); + outb(0x02, ioreg + 1); + release_region(ioreg, 2); +} + +/* + * ISA constants + */ + +#define IOREGION_ALIGNMENT (~7) +#define IOREGION_LENGTH 2 +#define ADDR_REG_OFFSET 0 +#define DATA_REG_OFFSET 1 + +#define NCT6775_REG_BANK 0x4E +#define NCT6775_REG_CONFIG 0x40 + +static struct i2c_adapter *nct6775_adapter; + +struct i2c_nct6775_adapdata { + unsigned short smba; +}; + +/* Return negative errno on error. */ +static s32 nct6775_access(struct i2c_adapter * adap, u16 addr, + unsigned short flags, char read_write, + u8 command, int size, union i2c_smbus_data * data) +{ + struct i2c_nct6775_adapdata *adapdata = i2c_get_adapdata(adap); + unsigned short nuvoton_nct6793d_smba = adapdata->smba; + int i, len, cnt; + union i2c_smbus_data tmp_data; + int timeout = 0; + + tmp_data.word = 0; + cnt = 0; + len = 0; + + outb_p(NCT6793D_SOFT_RESET, SMBHSTCTL); + + switch (size) { + case I2C_SMBUS_QUICK: + outb_p((addr << 1) | read_write, + SMBHSTADD); + break; + case I2C_SMBUS_BYTE_DATA: + tmp_data.byte = data->byte; + case I2C_SMBUS_BYTE: + outb_p((addr << 1) | read_write, + SMBHSTADD); + outb_p(command, SMBHSTIDX); + if (read_write == I2C_SMBUS_WRITE) { + outb_p(tmp_data.byte, SMBHSTDAT); + outb_p(NCT6793D_WRITE_BYTE, SMBHSTCMD); + } + else { + outb_p(NCT6793D_READ_BYTE, SMBHSTCMD); + } + break; + case I2C_SMBUS_WORD_DATA: + outb_p((addr << 1) | read_write, + SMBHSTADD); + outb_p(command, SMBHSTIDX); + if (read_write == I2C_SMBUS_WRITE) { + outb_p(data->word & 0xff, SMBHSTDAT); + outb_p((data->word & 0xff00) >> 8, SMBHSTDAT); + outb_p(NCT6793D_WRITE_WORD, SMBHSTCMD); + } + else { + outb_p(NCT6793D_READ_WORD, SMBHSTCMD); + } + break; + case I2C_SMBUS_BLOCK_DATA: + outb_p((addr << 1) | read_write, + SMBHSTADD); + outb_p(command, SMBHSTIDX); + if (read_write == I2C_SMBUS_WRITE) { + len = data->block[0]; + if (len == 0 || len > I2C_SMBUS_BLOCK_MAX) + return -EINVAL; + outb_p(len, SMBBLKSZ); + + cnt = 1; + if (len >= 4) { + for (i = cnt; i <= 4; i++) { + outb_p(data->block[i], SMBHSTDAT); + } + + len -= 4; + cnt += 4; + } + else { + for (i = cnt; i <= len; i++ ) { + outb_p(data->block[i], SMBHSTDAT); + } + + len = 0; + } + + outb_p(NCT6793D_WRITE_BLOCK, SMBHSTCMD); + } + else { + return -ENOTSUPP; + } + break; + default: + dev_warn(&adap->dev, "Unsupported transaction %d\n", size); + return -EOPNOTSUPP; + } + + outb_p(NCT6793D_MANUAL_START, SMBHSTCTL); + + while ((size == I2C_SMBUS_BLOCK_DATA) && (len > 0)) { + if (read_write == I2C_SMBUS_WRITE) { + timeout = 0; + while ((inb_p(SMBHSTSTS) & NCT6793D_FIFO_EMPTY) == 0) + { + if(timeout > MAX_RETRIES) + { + return -ETIMEDOUT; + } + usleep_range(250, 500); + timeout++; + } + + //Load more bytes into FIFO + if (len >= 4) { + for (i = cnt; i <= (cnt + 4); i++) { + outb_p(data->block[i], SMBHSTDAT); + } + + len -= 4; + cnt += 4; + } + else { + for (i = cnt; i <= (cnt + len); i++) { + outb_p(data->block[i], SMBHSTDAT); + } + + len = 0; + } + } + else { + return -ENOTSUPP; + } + + } + + //wait for manual mode to complete + timeout = 0; + while ((inb_p(SMBHSTSTS) & NCT6793D_MANUAL_ACTIVE) != 0) + { + if(timeout > MAX_RETRIES) + { + return -ETIMEDOUT; + } + usleep_range(250, 500); + timeout++; + } + + if ((inb_p(SMBHSTERR) & NCT6793D_NO_ACK) != 0) { + return -ENXIO; + } + else if ((read_write == I2C_SMBUS_WRITE) || (size == I2C_SMBUS_QUICK)) { + 
return 0; + } + + switch (size) { + case I2C_SMBUS_QUICK: + case I2C_SMBUS_BYTE_DATA: + data->byte = inb_p(SMBHSTDAT); + break; + case I2C_SMBUS_WORD_DATA: + data->word = inb_p(SMBHSTDAT) + (inb_p(SMBHSTDAT) << 8); + break; + } + return 0; +} + +static u32 nct6775_func(struct i2c_adapter *adapter) +{ + return I2C_FUNC_SMBUS_QUICK | I2C_FUNC_SMBUS_BYTE | + I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA | + I2C_FUNC_SMBUS_BLOCK_DATA; +} + +static const struct i2c_algorithm smbus_algorithm = { + .smbus_xfer = nct6775_access, + .functionality = nct6775_func, +}; + +static int nct6775_add_adapter(unsigned short smba, const char *name, struct i2c_adapter **padap) +{ + struct i2c_adapter *adap; + struct i2c_nct6775_adapdata *adapdata; + int retval; + + adap = kzalloc(sizeof(*adap), GFP_KERNEL); + if (adap == NULL) { + return -ENOMEM; + } + + adap->owner = THIS_MODULE; + adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; + adap->algo = &smbus_algorithm; + + adapdata = kzalloc(sizeof(*adapdata), GFP_KERNEL); + if (adapdata == NULL) { + kfree(adap); + return -ENOMEM; + } + + adapdata->smba = smba; + + snprintf(adap->name, sizeof(adap->name), + "SMBus NCT67xx adapter%s at %04x", name, smba); + + i2c_set_adapdata(adap, adapdata); + + retval = i2c_add_adapter(adap); + if (retval) { + kfree(adapdata); + kfree(adap); + return retval; + } + + *padap = adap; + return 0; +} + +static void nct6775_remove_adapter(struct i2c_adapter *adap) +{ + struct i2c_nct6775_adapdata *adapdata = i2c_get_adapdata(adap); + + if (adapdata->smba) { + i2c_del_adapter(adap); + kfree(adapdata); + kfree(adap); + } +} + +//static SIMPLE_DEV_PM_OPS(nct6775_dev_pm_ops, nct6775_suspend, nct6775_resume); + +/* + * when Super-I/O functions move to a separate file, the Super-I/O + * bus will manage the lifetime of the device and this module will only keep + * track of the nct6775 driver. 
+ * must keep track of the device
+ */
+static struct platform_device *pdev[2];
+
+static int nct6775_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct nct6775_sio_data *sio_data = dev_get_platdata(dev);
+	struct resource *res;
+
+	res = platform_get_resource(pdev, IORESOURCE_IO, 0);
+	if (!res)
+		return -ENODEV;
+	if (!devm_request_region(&pdev->dev, res->start, IOREGION_LENGTH,
+				 DRVNAME))
+		return -EBUSY;
+
+	switch (sio_data->kind) {
+	case nct6791:
+	case nct6792:
+	case nct6793:
+	case nct6795:
+	case nct6796:
+	case nct6798:
+		return nct6775_add_adapter(res->start, "", &nct6775_adapter);
+	default:
+		return -ENODEV;
+	}
+}
+/*
+static void nct6791_enable_io_mapping(int sioaddr)
+{
+	int val;
+
+	val = superio_inb(sioaddr, NCT6791_REG_HM_IO_SPACE_LOCK_ENABLE);
+	if (val & 0x10) {
+		pr_info("Enabling hardware monitor logical device mappings.\n");
+		superio_outb(sioaddr, NCT6791_REG_HM_IO_SPACE_LOCK_ENABLE,
+			     val & ~0x10);
+	}
+}*/
+
+static struct platform_driver i2c_nct6775_driver = {
+	.driver = {
+		.name	= DRVNAME,
+//		.pm	= &nct6775_dev_pm_ops,
+	},
+	.probe		= nct6775_probe,
+};
+
+static void __exit i2c_nct6775_exit(void)
+{
+	int i;
+
+	if (nct6775_adapter)
+		nct6775_remove_adapter(nct6775_adapter);
+
+	for (i = 0; i < ARRAY_SIZE(pdev); i++) {
+		if (pdev[i])
+			platform_device_unregister(pdev[i]);
+	}
+	platform_driver_unregister(&i2c_nct6775_driver);
+}
+
+/* nct6775_find() looks for a supported NCT67xx chip in the Super-I/O config space */
+static int __init nct6775_find(int sioaddr, struct nct6775_sio_data *sio_data)
+{
+	u16 val;
+	int err;
+	int addr;
+
+	err = superio_enter(sioaddr);
+	if (err)
+		return err;
+
+	val = (superio_inb(sioaddr, SIO_REG_DEVID) << 8) |
+	       superio_inb(sioaddr, SIO_REG_DEVID + 1);
+
+	switch (val & SIO_ID_MASK) {
+	case SIO_NCT6106_ID:
+		sio_data->kind = nct6106;
+		break;
+	case SIO_NCT6775_ID:
+		sio_data->kind = nct6775;
+		break;
+	case SIO_NCT6776_ID:
+		sio_data->kind = nct6776;
+		break;
+	case SIO_NCT6779_ID:
+		sio_data->kind = nct6779;
+		break;
+	case SIO_NCT6791_ID:
+		sio_data->kind = nct6791;
+		break;
+	case SIO_NCT6792_ID:
+		sio_data->kind = nct6792;
+		break;
+	case SIO_NCT6793_ID:
+		sio_data->kind = nct6793;
+		break;
+	case SIO_NCT6795_ID:
+		sio_data->kind = nct6795;
+		break;
+	case SIO_NCT6796_ID:
+		sio_data->kind = nct6796;
+		break;
+	case SIO_NCT6798_ID:
+		sio_data->kind = nct6798;
+		break;
+	default:
+		if (val != 0xffff)
+			pr_debug("unsupported chip ID: 0x%04x\n", val);
+		superio_exit(sioaddr);
+		return -ENODEV;
+	}
+
+	/* We have a known chip, find the SMBus I/O address */
+	superio_select(sioaddr, NCT6775_LD_SMBUS);
+	val = (superio_inb(sioaddr, SIO_REG_SMBA) << 8)
+	    | superio_inb(sioaddr, SIO_REG_SMBA + 1);
+	addr = val & IOREGION_ALIGNMENT;
+	if (addr == 0) {
+		pr_err("Refusing to enable a Super-I/O device with a base I/O port 0\n");
+		superio_exit(sioaddr);
+		return -ENODEV;
+	}
+
+	//if (sio_data->kind == nct6791 || sio_data->kind == nct6792 ||
+	//    sio_data->kind == nct6793 || sio_data->kind == nct6795 ||
+	//    sio_data->kind == nct6796)
+	//	nct6791_enable_io_mapping(sioaddr);
+
+	superio_exit(sioaddr);
+	pr_info("Found %s or compatible chip at %#x:%#x\n",
+		nct6775_sio_names[sio_data->kind], sioaddr, addr);
+	sio_data->sioreg = sioaddr;
+
+	return addr;
+}
+
+static int __init i2c_nct6775_init(void)
+{
+	int i, err;
+	bool found = false;
+	int address;
+	struct resource res;
+	struct nct6775_sio_data sio_data;
+	int sioaddr[2] = { 0x2e, 0x4e };
+
+	err = 
platform_driver_register(&i2c_nct6775_driver); + if (err) + return err; + + /* + * initialize sio_data->kind and sio_data->sioreg. + * + * when Super-I/O functions move to a separate file, the Super-I/O + * driver will probe 0x2e and 0x4e and auto-detect the presence of a + * nct6775 hardware monitor, and call probe() + */ + for (i = 0; i < ARRAY_SIZE(pdev); i++) { + address = nct6775_find(sioaddr[i], &sio_data); + if (address <= 0) + continue; + + found = true; + + pdev[i] = platform_device_alloc(DRVNAME, address); + if (!pdev[i]) { + err = -ENOMEM; + goto exit_device_unregister; + } + + err = platform_device_add_data(pdev[i], &sio_data, + sizeof(struct nct6775_sio_data)); + if (err) + goto exit_device_put; + + memset(&res, 0, sizeof(res)); + res.name = DRVNAME; + res.start = address; + res.end = address + IOREGION_LENGTH - 1; + res.flags = IORESOURCE_IO; + + err = acpi_check_resource_conflict(&res); + if (err) { + platform_device_put(pdev[i]); + pdev[i] = NULL; + continue; + } + + err = platform_device_add_resources(pdev[i], &res, 1); + if (err) + goto exit_device_put; + + /* platform_device_add calls probe() */ + err = platform_device_add(pdev[i]); + if (err) + goto exit_device_put; + } + if (!found) { + err = -ENODEV; + goto exit_unregister; + } + + return 0; + +exit_device_put: + platform_device_put(pdev[i]); +exit_device_unregister: + while (--i >= 0) { + if (pdev[i]) + platform_device_unregister(pdev[i]); + } +exit_unregister: + platform_driver_unregister(&i2c_nct6775_driver); + return err; +} + +MODULE_AUTHOR("Adam Honse "); +MODULE_DESCRIPTION("SMBus driver for NCT6775F and compatible chips"); +MODULE_LICENSE("GPL"); + +module_init(i2c_nct6775_init); +module_exit(i2c_nct6775_exit); diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c index 809fbd014cd6..d54b35b147ee 100644 --- a/drivers/i2c/busses/i2c-piix4.c +++ b/drivers/i2c/busses/i2c-piix4.c @@ -568,11 +568,11 @@ static int piix4_transaction(struct i2c_adapter *piix4_adapter) if (srvrworks_csb5_delay) /* Extra delay for SERVERWORKS_CSB5 */ usleep_range(2000, 2100); else - usleep_range(250, 500); + usleep_range(25, 50); while ((++timeout < MAX_TIMEOUT) && ((temp = inb_p(SMBHSTSTS)) & 0x01)) - usleep_range(250, 500); + usleep_range(25, 50); /* If the SMBus is still busy, we give up */ if (timeout == MAX_TIMEOUT) { diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 2653516bcdef..973fe8f80051 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3207,6 +3207,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } +#ifdef CONFIG_CACHY + set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); + set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); +#endif + ret = crypt_ctr_cipher(ti, argv[0], argv[1]); if (ret < 0) goto bad; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bbccb4044222..0a11325c35fa 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1181,7 +1181,7 @@ struct readahead_control { ._index = i, \ } -#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) +#define VM_READAHEAD_PAGES (SZ_8M / PAGE_SIZE) void page_cache_ra_unbounded(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_count); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 45f09bec02c4..87b20e2ee274 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -148,6 +148,8 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, #ifdef CONFIG_USER_NS 
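+/* Runtime knob: the kernel.unprivileged_userns_clone sysctl toggles this */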
+extern int unprivileged_userns_clone; + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) @@ -181,6 +183,8 @@ extern bool current_in_userns(const struct user_namespace *target_ns); struct ns_common *ns_get_owner(struct ns_common *ns); #else +#define unprivileged_userns_clone 0 + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { return &init_user_ns; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 1b8004679445..d6d7f9942b97 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -188,6 +188,7 @@ struct netns_ipv4 { int sysctl_udp_rmem_min; u8 sysctl_fib_notify_on_flag_change; + unsigned int sysctl_tcp_collapse_max_bytes; #ifdef CONFIG_NET_L3_MASTER_DEV u8 sysctl_udp_l3mdev_accept; diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 901b440238d5..7026df84a0f6 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -187,6 +187,13 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust, TP_ARGS(sk) ); +DEFINE_EVENT(tcp_event_sk, tcp_collapse_max_bytes_exceeded, + + TP_PROTO(struct sock *sk), + + TP_ARGS(sk) +); + TRACE_EVENT(tcp_retransmit_synack, TP_PROTO(const struct sock *sk, const struct request_sock *req), diff --git a/init/Kconfig b/init/Kconfig index 0c214af99085..b5ed2afd0d0a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -123,6 +123,10 @@ config THREAD_INFO_IN_TASK menu "General setup" +config CACHY + bool "Some kernel tweaks by CachyOS" + default y + config BROKEN bool @@ -346,6 +350,19 @@ config KERNEL_UNCOMPRESSED endchoice +menu "ZSTD compression options" + depends on KERNEL_ZSTD + +config ZSTD_COMPRESSION_LEVEL + int "Compression level (1-22)" + range 1 22 + default "22" + help + Choose a compression level for zstd kernel compression. + Default is 22, which is the maximum. + +endmenu + config DEFAULT_INIT string "Default init path" default "" @@ -1251,6 +1268,22 @@ config USER_NS If unsure, say N. +config USER_NS_UNPRIVILEGED + bool "Allow unprivileged users to create namespaces" + default y + depends on USER_NS + help + When disabled, unprivileged users will not be able to create + new namespaces. Allowing users to create their own namespaces + has been part of several recent local privilege escalation + exploits, so if you need user namespaces but are + paranoid^Wsecurity-conscious you want to disable this. + + This setting can be overridden at runtime via the + kernel.unprivileged_userns_clone sysctl. + + If unsure, say Y. + config PID_NS bool "PID Namespaces" default y @@ -1418,6 +1451,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE with the "-O2" compiler flag for best performance and most helpful compile-time warnings. +config CC_OPTIMIZE_FOR_PERFORMANCE_O3 + bool "Optimize more for performance (-O3)" + help + Choosing this option will pass "-O3" to your compiler to optimize + the kernel yet more for performance. + config CC_OPTIMIZE_FOR_SIZE bool "Optimize for size (-Os)" help diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 38ef6d06888e..0f78364efd4f 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -40,6 +40,27 @@ choice on SMP and NUMA systems and exactly dividing by both PAL and NTSC frame rates for video and multimedia work. + config HZ_500 + bool "500 HZ" + help + 500 Hz is a balanced timer frequency. Provides fast interactivity + on desktops with good smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. 
+ + config HZ_600 + bool "600 HZ" + help + 600 Hz is a balanced timer frequency. Provides fast interactivity + on desktops with good smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + + config HZ_750 + bool "750 HZ" + help + 750 Hz is a balanced timer frequency. Provides fast interactivity + on desktops with good smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + config HZ_1000 bool "1000 HZ" help @@ -53,6 +74,9 @@ config HZ default 100 if HZ_100 default 250 if HZ_250 default 300 if HZ_300 + default 500 if HZ_500 + default 600 if HZ_600 + default 750 if HZ_750 default 1000 if HZ_1000 config SCHED_HRTICK diff --git a/kernel/fork.c b/kernel/fork.c index a6d243a50be3..b9f15288f64a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -98,6 +98,10 @@ #include #include +#ifdef CONFIG_USER_NS +#include +#endif + #include #include #include @@ -2011,6 +2015,10 @@ static __latent_entropy struct task_struct *copy_process( if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return ERR_PTR(-EINVAL); + if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. @@ -3171,6 +3179,12 @@ int ksys_unshare(unsigned long unshare_flags) if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; + if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { + err = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto bad_unshare_out; + } + err = check_unshare_flags(unshare_flags); if (err) goto bad_unshare_out; diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 26ea5d04f56c..ecf2798c5ccf 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -219,11 +219,37 @@ config MODULE_COMPRESS_ZSTD endchoice +menu "ZSTD module compression options" + depends on MODULE_COMPRESS_ZSTD + +config MODULE_COMPRESS_ZSTD_LEVEL + int "Compression level (1-19)" + range 1 19 + default 9 + help + Compression level used by zstd for compressing modules. + +config MODULE_COMPRESS_ZSTD_ULTRA + bool "Enable ZSTD ultra compression" + help + Compress modules with ZSTD using the highest possible compression. + +config MODULE_COMPRESS_ZSTD_LEVEL_ULTRA + int "Compression level (20-22)" + depends on MODULE_COMPRESS_ZSTD_ULTRA + range 20 22 + default 20 + help + Ultra compression level used by zstd for compressing modules. 
+
+endmenu
+
 config MODULE_DECOMPRESS
 	bool "Support in-kernel module decompression"
-	depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ
+	depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD
 	select ZLIB_INFLATE if MODULE_COMPRESS_GZIP
 	select XZ_DEC if MODULE_COMPRESS_XZ
+	select ZSTD_DECOMPRESS if MODULE_COMPRESS_ZSTD
 	help
 	  Support for decompressing kernel modules by the kernel itself
diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c
index 720e719253cd..bb79ac1a6d8f 100644
--- a/kernel/module/decompress.c
+++ b/kernel/module/decompress.c
@@ -50,7 +50,7 @@ static struct page *module_get_next_page(struct load_info *info)
 	return page;
 }
 
-#ifdef CONFIG_MODULE_COMPRESS_GZIP
+#if defined(CONFIG_MODULE_COMPRESS_GZIP)
 #include <linux/zlib.h>
 #define MODULE_COMPRESSION	gzip
 #define MODULE_DECOMPRESS_FN	module_gzip_decompress
@@ -141,7 +141,7 @@ static ssize_t module_gzip_decompress(struct load_info *info,
 	kfree(s.workspace);
 	return retval;
 }
-#elif CONFIG_MODULE_COMPRESS_XZ
+#elif defined(CONFIG_MODULE_COMPRESS_XZ)
 #include <linux/xz.h>
 #define MODULE_COMPRESSION	xz
 #define MODULE_DECOMPRESS_FN	module_xz_decompress
@@ -199,6 +199,94 @@ static ssize_t module_xz_decompress(struct load_info *info,
 	xz_dec_end(xz_dec);
 	return retval;
 }
+#elif defined(CONFIG_MODULE_COMPRESS_ZSTD)
+#include <linux/zstd.h>
+#define MODULE_COMPRESSION	zstd
+#define MODULE_DECOMPRESS_FN	module_zstd_decompress
+
+static ssize_t module_zstd_decompress(struct load_info *info,
+				      const void *buf, size_t size)
+{
+	static const u8 signature[] = { 0x28, 0xb5, 0x2f, 0xfd };
+	ZSTD_outBuffer zstd_dec;
+	ZSTD_inBuffer zstd_buf;
+	zstd_frame_header header;
+	size_t wksp_size;
+	void *wksp = NULL;
+	ZSTD_DStream *dstream;
+	size_t ret;
+	size_t new_size = 0;
+	int retval;
+
+	if (size < sizeof(signature) ||
+	    memcmp(buf, signature, sizeof(signature))) {
+		pr_err("not a zstd compressed module\n");
+		return -EINVAL;
+	}
+
+	zstd_buf.src = buf;
+	zstd_buf.pos = 0;
+	zstd_buf.size = size;
+
+	ret = zstd_get_frame_header(&header, zstd_buf.src, zstd_buf.size);
+	if (ret != 0) {
+		pr_err("ZSTD-compressed data has an incomplete frame header\n");
+		retval = -EINVAL;
+		goto out;
+	}
+	if (header.windowSize > (1 << ZSTD_WINDOWLOG_MAX)) {
+		pr_err("ZSTD-compressed data has too large a window size\n");
+		retval = -EINVAL;
+		goto out;
+	}
+
+	wksp_size = zstd_dstream_workspace_bound(header.windowSize);
+	wksp = kmalloc(wksp_size, GFP_KERNEL);
+	if (!wksp) {
+		retval = -ENOMEM;
+		goto out;
+	}
+
+	dstream = zstd_init_dstream(header.windowSize, wksp, wksp_size);
+	if (!dstream) {
+		pr_err("Can't initialize ZSTD stream\n");
+		retval = -ENOMEM;
+		goto out;
+	}
+
+	do {
+		struct page *page = module_get_next_page(info);
+
+		if (IS_ERR(page)) {
+			retval = PTR_ERR(page);
+			goto out;
+		}
+
+		zstd_dec.dst = kmap_local_page(page);
+		zstd_dec.pos = 0;
+		zstd_dec.size = PAGE_SIZE;
+
+		ret = zstd_decompress_stream(dstream, &zstd_dec, &zstd_buf);
+		kunmap_local(zstd_dec.dst);
+		retval = zstd_get_error_code(ret);
+		if (retval)
+			break;
+
+		new_size += zstd_dec.pos;
+	} while (zstd_dec.pos == PAGE_SIZE && ret != 0);
+
+	if (retval) {
+		pr_err("ZSTD-decompression failed with status %d\n", retval);
+		retval = -EINVAL;
+		goto out;
+	}
+
+	retval = new_size;
+
+ out:
+	kfree(wksp);
+	return retval;
+}
 #else
 #error "Unexpected configuration for CONFIG_MODULE_DECOMPRESS"
 #endif
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index d471d22a5e21..e945e522969e 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -282,9 +282,9 @@ config RCU_NOCB_CPU_CB_BOOST
 	depends on RCU_NOCB_CPU && RCU_BOOST
 	default y if PREEMPT_RT
 	help
-	  Use this option to invoke offloaded callbacks as SCHED_FIFO
+	  Use this option to invoke offloaded callbacks as SCHED_RR
 	  to avoid starvation by heavy SCHED_OTHER background load.
-	  Of course, running as SCHED_FIFO during callback floods will
+	  Of course, running as SCHED_RR during callback floods will
 	  cause the rcuo[ps] kthreads to monopolize the CPU for hundreds
 	  of milliseconds or more.  Therefore, when enabling this option,
 	  it is your responsibility to ensure that latency-sensitive
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 503c2aa845a4..b58e64ac6080 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -2362,7 +2362,7 @@ static int rcutorture_booster_init(unsigned int cpu)
 		t = per_cpu(ksoftirqd, cpu);
 		WARN_ON_ONCE(!t);
 		sp.sched_priority = 2;
-		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+		sched_setscheduler_nocheck(t, SCHED_RR, &sp);
 	}
 
 	/* Don't allow time recalculation while creating a new task. */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 14d9384fba05..4fe2c6d8d705 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4401,8 +4401,8 @@ static void __init rcu_start_exp_gp_kworkers(void)
 		return;
 	}
 
-	sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
-	sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO,
+	sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_RR, &param);
+	sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_RR,
 				   &param);
 }
 
@@ -4440,7 +4440,7 @@ static int __init rcu_spawn_gp_kthread(void)
 		return 0;
 	if (kthread_prio) {
 		sp.sched_priority = kthread_prio;
-		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+		sched_setscheduler_nocheck(t, SCHED_RR, &sp);
 	}
 	rnp = rcu_get_root();
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 0a5f0ef41484..035c5fe00370 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1319,7 +1319,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
 	}
 	WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
 	if (kthread_prio)
-		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+		sched_setscheduler_nocheck(t, SCHED_RR, &sp);
 	}
 	mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
 
@@ -1330,7 +1330,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
 		goto end;
 
 	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio)
-		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+		sched_setscheduler_nocheck(t, SCHED_RR, &sp);
 
 	WRITE_ONCE(rdp->nocb_cb_kthread, t);
 	WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index e3142ee35fc6..f543a01cdd58 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1007,7 +1007,7 @@ static void rcu_cpu_kthread_setup(unsigned int cpu)
 	struct sched_param sp;
 
 	sp.sched_priority = kthread_prio;
-	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+	sched_setscheduler_nocheck(current, SCHED_RR, &sp);
 #endif /* #ifdef CONFIG_RCU_BOOST */
 
 	WRITE_ONCE(rdp->rcuc_activity, jiffies);
@@ -1206,7 +1206,7 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
 	rnp->boost_kthread_task = t;
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	sp.sched_priority = kthread_prio;
-	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+	sched_setscheduler_nocheck(t, SCHED_RR, &sp);
 	wake_up_process(t);	/* get to TASK_INTERRUPTIBLE quickly.
*/ out: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2c3d0d49c80e..a1e6338637cf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -69,9 +69,13 @@ * * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) */ +#ifdef CONFIG_CACHY +unsigned int sysctl_sched_latency = 3000000ULL; +static unsigned int normalized_sysctl_sched_latency = 3000000ULL; +#else unsigned int sysctl_sched_latency = 6000000ULL; static unsigned int normalized_sysctl_sched_latency = 6000000ULL; - +#endif /* * The initial- and re-scaling of tunables is configurable * @@ -90,8 +94,13 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; * * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ +#ifdef CONFIG_CACHY +unsigned int sysctl_sched_min_granularity = 400000ULL; +static unsigned int normalized_sysctl_sched_min_granularity = 400000ULL; +#else unsigned int sysctl_sched_min_granularity = 750000ULL; static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; +#endif /* * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks. @@ -121,8 +130,13 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; * * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) */ +#ifdef CONFIG_CACHY +unsigned int sysctl_sched_wakeup_granularity = 500000UL; +static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; +#else unsigned int sysctl_sched_wakeup_granularity = 1000000UL; static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; +#endif const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -175,8 +189,12 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ +#ifdef CONFIG_CACHY +static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; +#else static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +#endif #ifdef CONFIG_SYSCTL static struct ctl_table sched_fair_sysctls[] = { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c6d9dec11b74..1c7c7c953876 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -96,6 +96,9 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); #ifdef CONFIG_PERF_EVENTS static const int six_hundred_forty_kb = 640 * 1024; #endif +#ifdef CONFIG_USER_NS +#include +#endif static const int ngroups_max = NGROUPS_MAX; @@ -1659,6 +1662,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_USER_NS + { + .procname = "unprivileged_userns_clone", + .data = &unprivileged_userns_clone, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 54211dbd516c..16ca0c151629 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -22,6 +22,13 @@ #include #include +/* sysctl */ +#ifdef CONFIG_USER_NS_UNPRIVILEGED +int unprivileged_userns_clone = 1; +#else +int unprivileged_userns_clone; +#endif + static struct kmem_cache *user_ns_cachep __read_mostly; static DEFINE_MUTEX(userns_state_mutex); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 12dfe6691dd5..47d465423ec3 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -312,8 +312,21 @@ config DEBUG_INFO_REDUCED DEBUG_INFO build and compile times are reduced too. Only works with newer gcc versions. -config DEBUG_INFO_COMPRESSED - bool "Compressed debugging information" +choice + prompt "Compressed Debug information" + help + Compress the resulting debug info. 
Results in smaller debug info sections, + but requires that consumers are able to decompress the results. + + If unsure, choose DEBUG_INFO_COMPRESSED_NONE. + +config DEBUG_INFO_COMPRESSED_NONE + bool "Don't compress debug information" + help + Don't compress debug info sections. + +config DEBUG_INFO_COMPRESSED_ZLIB + bool "Compress debugging information with zlib" depends on $(cc-option,-gz=zlib) depends on $(ld-option,--compress-debug-sections=zlib) help @@ -327,6 +340,18 @@ config DEBUG_INFO_COMPRESSED preferable to setting $KDEB_COMPRESS to "none" which would be even larger. +config DEBUG_INFO_COMPRESSED_ZSTD + bool "Compress debugging information with zstd" + depends on $(cc-option,-gz=zstd) + depends on $(ld-option,--compress-debug-sections=zstd) + help + Compress the debug information using zstd. This may provide better + compression than zlib, for about the same time costs, but requires newer + toolchain support. Requires GCC 13.0+ or Clang 16.0+, binutils 2.40+, and + zstd. + +endchoice # "Compressed Debug information" + config DEBUG_INFO_SPLIT bool "Produce split debuginfo in .dwo files" depends on $(cc-option,-gsplit-dwarf) diff --git a/mm/Kconfig b/mm/Kconfig index 57e1d8c5b505..d66ec503a7ca 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -576,7 +576,7 @@ config COMPACTION config COMPACT_UNEVICTABLE_DEFAULT int depends on COMPACTION - default 0 if PREEMPT_RT + default 0 if PREEMPT_RT || CACHY default 1 # diff --git a/mm/compaction.c b/mm/compaction.c index 8238e83385a7..d0b16a5b30f7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2717,7 +2717,11 @@ static void compact_nodes(void) * aggressively the kernel should compact memory in the * background. It takes values in the range [0, 100]. */ +#ifdef CONFIG_CACHY +unsigned int __read_mostly sysctl_compaction_proactiveness; +#else unsigned int __read_mostly sysctl_compaction_proactiveness = 20; +#endif int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7e9d8d857ecc..5cd27d0871b6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -70,7 +70,11 @@ static long ratelimit_pages = 32; /* * Start background writeback (via writeback threads) at this percentage */ +#ifdef CONFIG_CACHY +static int dirty_background_ratio = 5; +#else static int dirty_background_ratio = 10; +#endif /* * dirty_background_bytes starts at 0 (disabled) so that it is a function of @@ -98,7 +102,11 @@ static unsigned long vm_dirty_bytes; /* * The interval between `kupdate'-style writebacks */ +#ifdef CONFIG_CACHY +unsigned int dirty_writeback_interval = 10 * 100; /* centiseconds */ +#else unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ +#endif EXPORT_SYMBOL_GPL(dirty_writeback_interval); diff --git a/mm/swap.c b/mm/swap.c index 955930f41d20..6e0976bf6b7d 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1113,6 +1113,10 @@ EXPORT_SYMBOL(pagevec_lookup_range_tag); */ void __init swap_setup(void) { +#ifdef CONFIG_CACHY + /* Only swap-in pages requested, avoid readahead */ + page_cluster = 0; +#else unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ @@ -1124,4 +1128,5 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ +#endif } diff --git a/mm/vmpressure.c b/mm/vmpressure.c index b52644771cc4..11a4b0e3b583 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -43,7 +43,11 @@ 
static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; * essence, they are percents: the higher the value, the more number * unsuccessful reclaims there were. */ +#ifdef CONFIG_CACHY +static const unsigned int vmpressure_level_med = 65; +#else static const unsigned int vmpressure_level_med = 60; +#endif static const unsigned int vmpressure_level_critical = 95; /* diff --git a/mm/vmscan.c b/mm/vmscan.c index dc66f6715bfc..f50a427e15fc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -189,7 +189,11 @@ struct scan_control { /* * From 0 .. 200. Higher means more swappy. */ +#ifdef CONFIG_CACHY +int vm_swappiness = 20; +#else int vm_swappiness = 60; +#endif static void set_task_reclaim_state(struct task_struct *task, struct reclaim_state *rs) @@ -4529,7 +4533,11 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned } /* to protect the working set of the last N jiffies */ +#ifdef CONFIG_CACHY +static unsigned long lru_gen_min_ttl __read_mostly = HZ; +#else static unsigned long lru_gen_min_ttl __read_mostly; +#endif static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 9b8a6db7a66b..0e364b98c4b3 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1384,6 +1384,13 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, + { + .procname = "tcp_collapse_max_bytes", + .data = &init_net.ipv4.sysctl_tcp_collapse_max_bytes, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8a455eb0c552..71cdd9e884f4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5402,6 +5402,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk) static int tcp_prune_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); @@ -5413,6 +5414,39 @@ static int tcp_prune_queue(struct sock *sk) if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; + /* For context and additional information about this patch, see the + * blog post at + * + * sysctl: net.ipv4.tcp_collapse_max_bytes + * + * If tcp_collapse_max_bytes is non-zero, attempt to collapse the + * queue to free up memory if the current amount of memory allocated + * is less than tcp_collapse_max_bytes. Otherwise, the packet is + * dropped without attempting to collapse the queue. + * + * If tcp_collapse_max_bytes is zero, this feature is disabled + * and the default Linux behavior is used. The default Linux + * behavior is to always perform the attempt to collapse the + * queue to free up memory. + * + * When the receive queue is small, we want to collapse the + * queue. There are two reasons for this: (a) the latency of + * performing the collapse will be small on a small queue, and + * (b) we want to avoid sending a congestion signal (via a + * packet drop) to the sender when the receive queue is small. + * + * The result is that we avoid latency spikes caused by the + * time it takes to perform the collapse logic when the receive + * queue is large and full, while preserving existing behavior + * and performance for all other cases. 
+ */ + if (net->ipv4.sysctl_tcp_collapse_max_bytes && + (atomic_read(&sk->sk_rmem_alloc) > net->ipv4.sysctl_tcp_collapse_max_bytes)) { + /* We are dropping the packet */ + trace_tcp_collapse_max_bytes_exceeded(sk); + goto do_not_collapse; + } + tcp_collapse_ofo_queue(sk); if (!skb_queue_empty(&sk->sk_receive_queue)) tcp_collapse(sk, &sk->sk_receive_queue, NULL, @@ -5431,6 +5465,8 @@ static int tcp_prune_queue(struct sock *sk) if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; +do_not_collapse: + /* If we are really being abused, tell the caller to silently * drop receive data on the floor. It will get retransmitted * and hopefully then we'll have sufficient space. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index da46357f501b..77967b2db29e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3215,6 +3215,8 @@ static int __net_init tcp_sk_init(struct net *net) else net->ipv4.tcp_congestion_control = &tcp_reno; + net->ipv4.sysctl_tcp_collapse_max_bytes = 0; + return 0; } diff --git a/scripts/Makefile.debug b/scripts/Makefile.debug index 332c486f705f..059ff38fe0cb 100644 --- a/scripts/Makefile.debug +++ b/scripts/Makefile.debug @@ -27,10 +27,14 @@ else DEBUG_RUSTFLAGS += -Cdebuginfo=2 endif -ifdef CONFIG_DEBUG_INFO_COMPRESSED +ifdef CONFIG_DEBUG_INFO_COMPRESSED_ZLIB DEBUG_CFLAGS += -gz=zlib KBUILD_AFLAGS += -gz=zlib KBUILD_LDFLAGS += --compress-debug-sections=zlib +else ifdef CONFIG_DEBUG_INFO_COMPRESSED_ZSTD +DEBUG_CFLAGS += -gz=zstd +KBUILD_AFLAGS += -gz=zstd +KBUILD_LDFLAGS += --compress-debug-sections=zstd endif KBUILD_CFLAGS += $(DEBUG_CFLAGS) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 3aa384cec76b..0d5963a95572 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -526,14 +526,21 @@ quiet_cmd_xzmisc = XZMISC $@ # decompression is used, like initramfs decompression, zstd22 should likely not # be used because it would require zstd to allocate a 128 MB buffer. 
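+# zstd levels above 19 additionally need --ultra, so build the final argument
+# here (e.g. CONFIG_ZSTD_COMPRESSION_LEVEL=22 expands to "-22 --ultra" below).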
+ifdef CONFIG_ZSTD_COMPRESSION_LEVEL +zstd_comp_val := $(CONFIG_ZSTD_COMPRESSION_LEVEL) +ifeq ($(shell test $(zstd_comp_val) -gt 19; echo $$?),0) +zstd_comp_val += --ultra +endif +endif + quiet_cmd_zstd = ZSTD $@ - cmd_zstd = cat $(real-prereqs) | $(ZSTD) -19 > $@ + cmd_zstd = cat $(real-prereqs) | $(ZSTD) -T0 -19 > $@ quiet_cmd_zstd22 = ZSTD22 $@ - cmd_zstd22 = cat $(real-prereqs) | $(ZSTD) -22 --ultra > $@ + cmd_zstd22 = cat $(real-prereqs) | $(ZSTD) -T0 -22 --ultra > $@ quiet_cmd_zstd22_with_size = ZSTD22 $@ - cmd_zstd22_with_size = { cat $(real-prereqs) | $(ZSTD) -22 --ultra; $(size_append); } > $@ + cmd_zstd22_with_size = { cat $(real-prereqs) | $(ZSTD) -T0 -$(zstd_comp_val); $(size_append); } > $@ # ASM offsets # --------------------------------------------------------------------------- diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst index 10df89b9ef67..8b00761aed5a 100644 --- a/scripts/Makefile.modinst +++ b/scripts/Makefile.modinst @@ -100,8 +100,13 @@ quiet_cmd_gzip = GZIP $@ cmd_gzip = $(KGZIP) -n -f $< quiet_cmd_xz = XZ $@ cmd_xz = $(XZ) --lzma2=dict=2MiB -f $< +ifdef CONFIG_MODULE_COMPRESS_ZSTD_ULTRA quiet_cmd_zstd = ZSTD $@ - cmd_zstd = $(ZSTD) -T0 --rm -f -q $< + cmd_zstd = $(ZSTD) -$(CONFIG_MODULE_COMPRESS_ZSTD_LEVEL_ULTRA) --ultra --zstd=wlog=21 -T0 --rm -f -q $< +else +quiet_cmd_zstd = ZSTD $@ + cmd_zstd = $(ZSTD) -$(CONFIG_MODULE_COMPRESS_ZSTD_LEVEL) --zstd=wlog=21 -T0 --rm -f -q $< +endif $(dst)/%.ko.gz: $(dst)/%.ko FORCE $(call cmd,gzip) -- 2.40.0 From 3d55bb5dbf5d659d7b7ecea9ab8f0b26478a5272 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 6 Feb 2023 09:54:19 +0100 Subject: [PATCH 05/17] clr Signed-off-by: Peter Jung --- arch/x86/kernel/tsc.c | 3 ++ arch/x86/mm/fault.c | 4 +- drivers/cpufreq/intel_pstate.c | 7 ++++ drivers/idle/intel_idle.c | 50 ++++++++++++------------ drivers/input/serio/i8042.c | 10 ++--- drivers/net/dummy.c | 2 +- drivers/pci/pci.c | 2 +- drivers/powercap/intel_rapl_common.c | 2 +- drivers/thermal/intel/intel_powerclamp.c | 10 +++++ fs/xattr.c | 15 +++---- include/linux/jbd2.h | 2 +- include/linux/wait.h | 2 + include/uapi/linux/if_bonding.h | 2 +- init/do_mounts.c | 16 +++++++- kernel/locking/rwsem.c | 4 +- kernel/sched/wait.c | 24 ++++++++++++ kernel/watchdog.c | 2 +- lib/raid6/algos.c | 4 +- mm/ksm.c | 11 ++++-- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/tcp.c | 4 +- 21 files changed, 123 insertions(+), 55 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index cafacb2e58cc..c2f80184fd33 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1569,6 +1569,9 @@ unsigned long calibrate_delay_is_known(void) if (!constant_tsc || !mask) return 0; + if (cpu != 0) + return cpu_data(0).loops_per_jiffy; + sibling = cpumask_any_but(mask, cpu); if (sibling < nr_cpu_ids) return cpu_data(sibling).loops_per_jiffy; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7b0d4ab894c8..1a14f52added 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -799,9 +799,9 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", + printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx cpu %i", loglvl, tsk->comm, task_pid_nr(tsk), address, - (void *)regs->ip, (void *)regs->sp, error_code); + (void *)regs->ip, (void *)regs->sp, error_code, raw_smp_processor_id()); print_vma_addr(KERN_CONT " in ", regs->ip); diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 
6ff73c30769f..46516074bfd0 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -365,6 +365,13 @@ static void intel_pstate_set_itmt_prio(int cpu) * update them at any time after it has been called. */ sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu); + /* + * On some systems with overclocking enabled, CPPC.highest_perf is hardcoded to 0xff. + * In this case we can't use CPPC.highest_perf to enable ITMT. + * In this case we can look at MSR_HWP_CAPABILITIES bits [8:0] to decide. + */ + if (cppc_perf.highest_perf == 0xff) + cppc_perf.highest_perf = HWP_HIGHEST_PERF(READ_ONCE(all_cpu_data[cpu]->hwp_cap_cached)); if (max_highest_perf <= min_highest_perf) { if (cppc_perf.highest_perf > max_highest_perf) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index f060ac7376e6..1cd277c8f77f 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -572,7 +572,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, - .target_residency = 20, + .target_residency = 120, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -580,7 +580,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x10", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 33, - .target_residency = 100, + .target_residency = 900, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -588,7 +588,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 133, - .target_residency = 400, + .target_residency = 1000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -596,7 +596,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x32", .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 166, - .target_residency = 500, + .target_residency = 1500, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -604,7 +604,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x40", .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 300, - .target_residency = 900, + .target_residency = 2000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -612,7 +612,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x50", .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 600, - .target_residency = 1800, + .target_residency = 5000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -620,7 +620,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x60", .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 2600, - .target_residency = 7700, + .target_residency = 9000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -640,7 +640,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, - .target_residency = 20, + .target_residency = 120, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -648,7 +648,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x10", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 40, - .target_residency = 100, + .target_residency = 1000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -656,7 +656,7 @@ static 
struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 133, - .target_residency = 400, + .target_residency = 1000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -664,7 +664,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x32", .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 166, - .target_residency = 500, + .target_residency = 2000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -672,7 +672,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x40", .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 300, - .target_residency = 900, + .target_residency = 4000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -680,7 +680,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x50", .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 600, - .target_residency = 1800, + .target_residency = 7000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -688,7 +688,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x60", .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 2600, - .target_residency = 7700, + .target_residency = 9000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -709,7 +709,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, - .target_residency = 20, + .target_residency = 120, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -717,7 +717,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x10", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 70, - .target_residency = 100, + .target_residency = 1000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -725,7 +725,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 85, - .target_residency = 200, + .target_residency = 600, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -733,7 +733,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x33", .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 124, - .target_residency = 800, + .target_residency = 3000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -741,7 +741,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x40", .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 200, - .target_residency = 800, + .target_residency = 3200, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -749,7 +749,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x50", .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 480, - .target_residency = 5000, + .target_residency = 9000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -757,7 +757,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x60", .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 890, - .target_residency = 5000, + .target_residency = 9000, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { 
@@ -778,7 +778,7 @@ static struct cpuidle_state skx_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, - .target_residency = 20, + .target_residency = 300, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -807,7 +807,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 4, - .target_residency = 4, + .target_residency = 40, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -815,7 +815,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 170, - .target_residency = 600, + .target_residency = 900, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -981,7 +981,7 @@ static struct cpuidle_state spr_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 2, - .target_residency = 4, + .target_residency = 40, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c index 6dac7c1853a5..fab04cd8a7a0 100644 --- a/drivers/input/serio/i8042.c +++ b/drivers/input/serio/i8042.c @@ -621,7 +621,7 @@ static int i8042_enable_kbd_port(void) if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { i8042_ctr &= ~I8042_CTR_KBDINT; i8042_ctr |= I8042_CTR_KBDDIS; - pr_err("Failed to enable KBD port\n"); + pr_info("Failed to enable KBD port\n"); return -EIO; } @@ -640,7 +640,7 @@ static int i8042_enable_aux_port(void) if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { i8042_ctr &= ~I8042_CTR_AUXINT; i8042_ctr |= I8042_CTR_AUXDIS; - pr_err("Failed to enable AUX port\n"); + pr_info("Failed to enable AUX port\n"); return -EIO; } @@ -732,7 +732,7 @@ static int i8042_check_mux(void) i8042_ctr &= ~I8042_CTR_AUXINT; if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { - pr_err("Failed to disable AUX port, can't use MUX\n"); + pr_info("Failed to disable AUX port, can't use MUX\n"); return -EIO; } @@ -955,7 +955,7 @@ static int i8042_controller_selftest(void) do { if (i8042_command(¶m, I8042_CMD_CTL_TEST)) { - pr_err("i8042 controller selftest timeout\n"); + pr_info("i8042 controller selftest timeout\n"); return -ENODEV; } @@ -977,7 +977,7 @@ static int i8042_controller_selftest(void) pr_info("giving up on controller selftest, continuing anyway...\n"); return 0; #else - pr_err("i8042 controller selftest failed\n"); + pr_info("i8042 controller selftest failed\n"); return -EIO; #endif } diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c index aa0fc00faecb..b93a4d71be29 100644 --- a/drivers/net/dummy.c +++ b/drivers/net/dummy.c @@ -43,7 +43,7 @@ #define DRV_NAME "dummy" -static int numdummies = 1; +static int numdummies = 0; /* fake multicast ability */ static void set_multicast_list(struct net_device *dev) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 98d841a7b45b..8b4fcab1677e 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -62,7 +62,7 @@ struct pci_pme_device { struct pci_dev *dev; }; -#define PME_TIMEOUT 1000 /* How long between PME checks */ +#define PME_TIMEOUT 4000 /* How long between PME checks */ static void pci_dev_d3_sleep(struct pci_dev *dev) { diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 26d00b1853b4..3e239d6548b5 100644 --- a/drivers/powercap/intel_rapl_common.c +++ 
b/drivers/powercap/intel_rapl_common.c @@ -1518,7 +1518,7 @@ static int __init rapl_init(void) id = x86_match_cpu(rapl_ids); if (!id) { - pr_err("driver does not support CPU family %d model %d\n", + pr_info("driver does not support CPU family %d model %d\n", boot_cpu_data.x86, boot_cpu_data.x86_model); return -ENODEV; diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c index 2f4cbfdf26a0..2d297a1cfa34 100644 --- a/drivers/thermal/intel/intel_powerclamp.c +++ b/drivers/thermal/intel/intel_powerclamp.c @@ -636,6 +636,11 @@ static const struct thermal_cooling_device_ops powerclamp_cooling_ops = { .set_cur_state = powerclamp_set_cur_state, }; +static const struct x86_cpu_id amd_cpu[] = { + { X86_VENDOR_AMD }, + {}, +}; + static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = { X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL), {} @@ -645,6 +650,11 @@ MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids); static int __init powerclamp_probe(void) { + if (x86_match_cpu(amd_cpu)){ + pr_info("Intel PowerClamp does not support AMD CPUs\n"); + return -ENODEV; + } + if (!x86_match_cpu(intel_powerclamp_ids)) { pr_err("CPU does not support MWAIT\n"); return -ENODEV; diff --git a/fs/xattr.c b/fs/xattr.c index 427b8cea1f96..118db06a8370 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -122,16 +122,17 @@ xattr_permission(struct user_namespace *mnt_userns, struct inode *inode, } /* - * In the user.* namespace, only regular files and directories can have - * extended attributes. For sticky directories, only the owner and - * privileged users can write attributes. + * In the user.* namespace, only regular files, symbolic links, and + * directories can have extended attributes. For symbolic links and + * sticky directories, only the owner and privileged users can write + * attributes. */ if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { - if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode)) return (mask & MAY_WRITE) ? -EPERM : -ENODATA; - if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && - (mask & MAY_WRITE) && - !inode_owner_or_capable(mnt_userns, inode)) + if (((S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX)) + || S_ISLNK(inode->i_mode)) && (mask & MAY_WRITE) + && !inode_owner_or_capable(mnt_userns, inode)) return -EPERM; } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 0b7242370b56..16b8fc483b3d 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -45,7 +45,7 @@ /* * The default maximum commit age, in seconds. 
*/ -#define JBD2_DEFAULT_MAX_COMMIT_AGE 5 +#define JBD2_DEFAULT_MAX_COMMIT_AGE 30 #ifdef CONFIG_JBD2_DEBUG /* diff --git a/include/linux/wait.h b/include/linux/wait.h index a0307b516b09..edc21128f387 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -165,6 +165,7 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); +extern void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); @@ -1192,6 +1193,7 @@ do { \ */ void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); +void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h index d174914a837d..bf8e2af101a3 100644 --- a/include/uapi/linux/if_bonding.h +++ b/include/uapi/linux/if_bonding.h @@ -82,7 +82,7 @@ #define BOND_STATE_ACTIVE 0 /* link is active */ #define BOND_STATE_BACKUP 1 /* link is backup */ -#define BOND_DEFAULT_MAX_BONDS 1 /* Default maximum number of devices to support */ +#define BOND_DEFAULT_MAX_BONDS 0 /* Default maximum number of devices to support */ #define BOND_DEFAULT_TX_QUEUES 16 /* Default number of tx queues per device */ diff --git a/init/do_mounts.c b/init/do_mounts.c index 811e94daf0a8..06fef7f97c02 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -283,8 +283,18 @@ dev_t name_to_dev_t(const char *name) if (strcmp(name, "/dev/ram") == 0) return Root_RAM0; #ifdef CONFIG_BLOCK - if (strncmp(name, "PARTUUID=", 9) == 0) - return devt_from_partuuid(name + 9); + if (strncmp(name, "PARTUUID=", 9) == 0) { + dev_t res; + int needtowait = 40<<1; + res = devt_from_partuuid(name + 9); + while (!res && needtowait) { + /* waiting 0.5 sec */ + msleep(500); + res = devt_from_partuuid(name + 9); + needtowait--; + } + return res; + } if (strncmp(name, "PARTLABEL=", 10) == 0) return devt_from_partlabel(name + 10); if (strncmp(name, "/dev/", 5) == 0) @@ -612,7 +622,9 @@ void __init prepare_namespace(void) * For example, it is not atypical to wait 5 seconds here * for the touchpad of a laptop to initialize. 
*/ + async_synchronize_full(); wait_for_device_probe(); + async_synchronize_full(); md_run_setup(); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 84d5b649b95f..e341ca8731f7 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -754,6 +754,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) struct task_struct *new, *owner; unsigned long flags, new_flags; enum owner_state state; + int i = 0; lockdep_assert_preemption_disabled(); @@ -790,7 +791,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) break; } - cpu_relax(); + if (i++ > 1000) + cpu_relax(); } return state; diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 133b74730738..1647fb8662eb 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -47,6 +47,17 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_ } EXPORT_SYMBOL_GPL(add_wait_queue_priority); +void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) +{ + unsigned long flags; + + wq_entry->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&wq_head->lock, flags); + __add_wait_queue(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue_exclusive_lifo); + void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; @@ -293,6 +304,19 @@ prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_ent } EXPORT_SYMBOL(prepare_to_wait_exclusive); +void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) +{ + unsigned long flags; + + wq_entry->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&wq_head->lock, flags); + if (list_empty(&wq_entry->entry)) + __add_wait_queue(wq_head, wq_entry); + set_current_state(state); + spin_unlock_irqrestore(&wq_head->lock, flags); +} +EXPORT_SYMBOL(prepare_to_wait_exclusive_lifo); + void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) { wq_entry->flags = flags; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 8e61f21e7e33..be1439d38f26 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -41,7 +41,7 @@ unsigned long __read_mostly watchdog_enabled; int __read_mostly watchdog_user_enabled = 1; int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; int __read_mostly soft_watchdog_user_enabled = 1; -int __read_mostly watchdog_thresh = 10; +int __read_mostly watchdog_thresh = 40; static int __read_mostly nmi_watchdog_available; struct cpumask watchdog_cpumask __read_mostly; diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 39b74221f4a7..ec3eab8cd6b1 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -128,8 +128,10 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void) for (best = NULL, algo = raid6_recov_algos; *algo; algo++) if (!best || (*algo)->priority > best->priority) - if (!(*algo)->valid || (*algo)->valid()) + if (!(*algo)->valid || (*algo)->valid()) { best = *algo; + break; + } if (best) { raid6_2data_recov = best->data2; diff --git a/mm/ksm.c b/mm/ksm.c index c19fcca9bc03..7009cf42be76 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2427,9 +2427,14 @@ static int ksm_scan_thread(void *nothing) if (ksmd_should_run()) { sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); - wait_event_interruptible_timeout(ksm_iter_wait, - sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), - msecs_to_jiffies(sleep_ms)); + if (sleep_ms >= 1000) + wait_event_interruptible_timeout(ksm_iter_wait, + sleep_ms != 
READ_ONCE(ksm_thread_sleep_millisecs),
+				round_jiffies_relative(msecs_to_jiffies(sleep_ms)));
+		else
+			wait_event_interruptible_timeout(ksm_iter_wait,
+				sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
+				msecs_to_jiffies(sleep_ms));
 	} else {
 		wait_event_freezable(ksm_thread_wait,
 			ksmd_should_run() || kthread_should_stop());
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7152ede18f11..5624ebc517a9 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -606,7 +606,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 	 * having to remove and re-insert us on the wait queue.
 	 */
 	for (;;) {
-		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
+		prepare_to_wait_exclusive_lifo(sk_sleep(sk), &wait,
 					  TASK_INTERRUPTIBLE);
 		release_sock(sk);
 		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 34207ec358d8..4fff39d5e182 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4813,8 +4813,8 @@ void __init tcp_init(void)
 	tcp_init_mem();
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
 	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
-	max_wshare = min(4UL*1024*1024, limit);
-	max_rshare = min(6UL*1024*1024, limit);
+	max_wshare = min(16UL*1024*1024, limit);
+	max_rshare = min(16UL*1024*1024, limit);
 
 	init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
 	init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
-- 
2.40.0

From 9b63f073cd891284685c103dd795cbe84d6a099f Mon Sep 17 00:00:00 2001
From: Peter Jung
Date: Tue, 14 Mar 2023 20:51:15 +0100
Subject: [PATCH 06/17] fixes

Signed-off-by: Peter Jung
---
 Documentation/ABI/stable/sysfs-block          |   10 +
 .../testing/sysfs-class-led-trigger-blkdev    |   78 ++
 Documentation/leds/index.rst                  |    1 +
 Documentation/leds/ledtrig-blkdev.rst         |  158 +++
 arch/x86/mm/tlb.c                             |    2 +-
 arch/x86/power/hibernate.c                    |    2 +-
 block/bfq-iosched.c                           |    2 +
 block/blk-sysfs.c                             |    3 +
 block/blk-wbt.c                               |   26 +-
 block/blk-wbt.h                               |   17 +-
 block/elevator.c                              |    8 +-
 block/elevator.h                              |    5 +-
 drivers/leds/trigger/Kconfig                  |    9 +
 drivers/leds/trigger/Makefile                 |    1 +
 drivers/leds/trigger/ledtrig-blkdev.c         | 1220 +++++++++++++++++
 fs/eventpoll.c                                |    2 +-
 include/linux/maple_tree.h                    |    6 -
 include/linux/pageblock-flags.h               |    2 +-
 include/uapi/linux/futex.h                    |   13 +
 kernel/futex/syscalls.c                       |   75 +-
 kernel/kthread.c                              |    5 +
 kernel/padata.c                               |    4 +-
 lib/maple_tree.c                              |   44 +-
 lib/string.c                                  |   10 +-
 mm/compaction.c                               |   75 +-
 mm/internal.h                                 |    6 +-
 mm/memory-tiers.c                             |    2 +-
 mm/z3fold.c                                   |    2 -
 mm/zsmalloc.c                                 |    3 -
 scripts/Makefile.vmlinux_o                    |    2 +-
 scripts/mod/modpost.c                         |    4 +-
 sound/pci/hda/cs35l41_hda.c                   |    2 +-
 tools/objtool/elf.c                           |   27 +-
 tools/objtool/include/objtool/elf.h           |    2 +
 34 files changed, 1712 insertions(+), 116 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-class-led-trigger-blkdev
 create mode 100644 Documentation/leds/ledtrig-blkdev.rst
 create mode 100644 drivers/leds/trigger/ledtrig-blkdev.c

diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index cd14ecb3c9a5..853cb2601242 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -101,6 +101,16 @@ Description:
 		devices that support receiving integrity metadata.
 
 
+What:		/sys/block/<disk>/linked_leds
+Date:		October 2022
+Contact:	Ian Pilcher
+Description:
+		Directory that contains symbolic links to all LEDs that
+		are associated with (linked to) this block device by the
+		blkdev LED trigger.  Only present when at least one LED
+		is linked.  (See Documentation/leds/ledtrig-blkdev.rst.)
+ + What: /sys/block/<disk>/<partition>/alignment_offset Date: April 2009 Contact: Martin K. Petersen diff --git a/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev new file mode 100644 index 000000000000..45275eb0bad3 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev @@ -0,0 +1,78 @@ +What: /sys/class/leds/<led>/blink_time +Date: October 2022 +Contact: Ian Pilcher +Description: + Time (in milliseconds) that the LED will be on during a single + "blink". + +What: /sys/class/leds/<led>/check_interval +Date: October 2022 +Contact: Ian Pilcher +Description: + Interval (in milliseconds) between checks of the block devices + linked to this LED. The LED will be blinked if the correct type + of activity (see blink_on_{read,write,discard,flush} attributes) + has occurred on any of the linked devices since the previous + check. + +What: /sys/class/leds/<led>/blink_on_read +Date: October 2022 +Contact: Ian Pilcher +Description: + Boolean that determines whether the LED will blink in response + to read activity on any of its linked block devices. + +What: /sys/class/leds/<led>/blink_on_write +Date: October 2022 +Contact: Ian Pilcher +Description: + Boolean that determines whether the LED will blink in response + to write activity on any of its linked block devices. + +What: /sys/class/leds/<led>/blink_on_discard +Date: October 2022 +Contact: Ian Pilcher +Description: + Boolean that determines whether the LED will blink in response + to discard activity on any of its linked block devices. + +What: /sys/class/leds/<led>/blink_on_flush +Date: October 2022 +Contact: Ian Pilcher +Description: + Boolean that determines whether the LED will blink in response + to cache flush activity on any of its linked block devices. + +What: /sys/class/leds/<led>/link_dev_by_path +Date: October 2022 +Contact: Ian Pilcher +Description: + Associate a block device with this LED by writing the path to + the device special file (e.g. /dev/sda) to this attribute. + Symbolic links are followed. + +What: /sys/class/leds/<led>/unlink_dev_by_path +Date: October 2022 +Contact: Ian Pilcher +Description: + Remove the association between this LED and a block device by + writing the path to the device special file (e.g. /dev/sda) to + this attribute. Symbolic links are followed. + +What: /sys/class/leds/<led>/unlink_dev_by_name +Date: October 2022 +Contact: Ian Pilcher +Description: + Remove the association between this LED and a block device by + writing the kernel name of the device (e.g. sda) to this + attribute. + +What: /sys/class/leds/<led>/linked_devices +Date: October 2022 +Contact: Ian Pilcher +Description: + Directory containing links to all block devices that are + associated with this LED. (Note that the names of the + symbolic links in this directory are *kernel* names, which + may not match the device special file paths written to + link_dev_by_path and unlink_dev_by_path.) diff --git a/Documentation/leds/index.rst b/Documentation/leds/index.rst index e5d63b940045..e3c24e468cbc 100644 --- a/Documentation/leds/index.rst +++ b/Documentation/leds/index.rst @@ -10,6 +10,7 @@ LEDs leds-class leds-class-flash leds-class-multicolor + ledtrig-blkdev ledtrig-oneshot ledtrig-transient ledtrig-usbport diff --git a/Documentation/leds/ledtrig-blkdev.rst b/Documentation/leds/ledtrig-blkdev.rst new file mode 100644 index 000000000000..9ff5b99de451 --- /dev/null +++ b/Documentation/leds/ledtrig-blkdev.rst @@ -0,0 +1,158 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +================================= +Block Device (blkdev) LED Trigger +================================= + +Available when ``CONFIG_LEDS_TRIGGER_BLKDEV=y`` or +``CONFIG_LEDS_TRIGGER_BLKDEV=m``. + +See also: + +* ``Documentation/ABI/testing/sysfs-class-led-trigger-blkdev`` +* ``Documentation/ABI/stable/sysfs-block`` (``/sys/block/<disk>/linked_leds``) + +Overview +======== + +.. note:: + The examples below use ``<LED>`` to refer to the name of a + system-specific LED. If no suitable LED is available on a test + system (in a virtual machine, for example), it is possible to + use a userspace LED. (See ``Documentation/leds/uleds.rst``.) + +Verify that the ``blkdev`` LED trigger is available:: + + # grep blkdev /sys/class/leds/<LED>/trigger + ... rfkill-none blkdev + +(If the previous command produces no output, you may need to load the trigger +module - ``modprobe ledtrig_blkdev``. If the module is not available, check +the value of ``CONFIG_LEDS_TRIGGER_BLKDEV`` in your kernel configuration.) + +Associate the LED with the ``blkdev`` LED trigger:: + + # echo blkdev > /sys/class/leds/<LED>/trigger + + # cat /sys/class/leds/<LED>/trigger + ... rfkill-none [blkdev] + +Note that several new device attributes are available in the +``/sys/class/leds/<LED>`` directory. + +* ``link_dev_by_path``, ``unlink_dev_by_path``, and ``unlink_dev_by_name`` are + used to manage the set of block devices associated with this LED. The LED + will blink when activity occurs on any of its linked devices. + +* ``blink_on_read``, ``blink_on_write``, ``blink_on_discard``, and + ``blink_on_flush`` are boolean values that determine whether the LED will + blink when a particular type of activity is detected on one of its linked + block devices. + +* ``blink_time`` is the duration (in milliseconds) of each blink of this LED. + (The minimum value is 10 milliseconds.) + +* ``check_interval`` is the interval (in milliseconds) at which block devices + linked to this LED will be checked for activity and the LED blinked (if the + correct type of activity has occurred). + +* The ``linked_devices`` directory will contain a symbolic link to every device + that is associated with this LED. + +Link a block device to the LED:: + + # echo /dev/sda > /sys/class/leds/<LED>/link_dev_by_path + + # ls /sys/class/leds/<LED>/linked_devices + sda + +(The value written to ``link_dev_by_path`` must be the path of the device +special file, such as ``/dev/sda``, that represents the block device - or the +path of a symbolic link to such a device special file.) + +Activity on the device will now cause the LED to blink. The duration of each +blink (in milliseconds) can be adjusted by setting +``/sys/class/leds/<LED>/blink_time``. (But see **check_interval and +blink_time** below.) + +Associate a second device with the LED:: + + # echo /dev/sdb > /sys/class/leds/<LED>/link_dev_by_path + + # ls /sys/class/leds/<LED>/linked_devices + sda sdb + +When a block device is linked to one or more LEDs, the LEDs are linked from +the device's ``linked_leds`` directory:: + + # ls /sys/class/block/sd{a,b}/linked_leds + /sys/class/block/sda/linked_leds: + <LED> + + /sys/class/block/sdb/linked_leds: + <LED> + +(The ``linked_leds`` directory only exists when the block device is linked to +at least one LED.) + +``check_interval`` and ``blink_time`` +===================================== + +* By default, linked block devices are checked for activity every 100 + milliseconds. This interval can be changed for an LED via the + ``/sys/class/leds/<LED>/check_interval`` attribute. 
(The minimum value is 25 + milliseconds.) + +* All block devices associated with an LED are checked for activity every + ``check_interval`` milliseconds, and a blink is triggered if the correct type + of activity (as determined by the LED's ``blink_on_*`` attributes) is + detected. The duration of an LED's blink is determined by its ``blink_time`` + attribute. Thus (when the correct type of activity is detected), the LED will + be on for ``blink_time`` milliseconds and off for + ``check_interval - blink_time`` milliseconds. + +* The LED subsystem ignores new blink requests for an LED that is already + in the process of blinking, so setting a ``blink_time`` greater than or equal + to ``check_interval`` will cause some blinks to be missed. + +* Because of processing times, scheduling latencies, etc., avoiding missed + blinks actually requires a difference of at least a few milliseconds between + the ``blink_time`` and ``check_interval``. The required difference is likely + to vary from system to system. As a reference, a Thecus N5550 NAS requires a + difference of 7 milliseconds (e.g. ``check_interval == 100``, + ``blink_time == 93``). + +* The default values (``check_interval == 100``, ``blink_time == 75``) cause the + LED associated with a continuously active device to blink rapidly. For a more + "always on" effect, increase the ``blink_time`` (but not too much; see the + previous bullet). + +Other Notes +=========== + +* Many (possibly all) types of block devices work with this trigger, including: + + * SCSI (including SATA and USB) hard disk drives and SSDs + * SCSI (including SATA and USB) optical drives + * NVMe SSDs + * SD cards + * loopback block devices (``/dev/loop*``) + * device mapper devices, such as LVM logical volumes + * MD RAID devices + * zRAM compressed RAM-disks + * partitions on block devices that support them + +* The names of the symbolic links in ``/sys/class/leds/<LED>/linked_devices`` + are **kernel** names, which may not match the paths used for + ``link_dev_by_path`` and ``unlink_dev_by_path``. This is most likely when a + symbolic link is used to refer to the device (as is common with logical + volumes), but it can be true for any device, because nothing prevents the + creation of device special files with arbitrary names (e.g. + ``sudo mknod /foo b 8 0``). + + Kernel names can be used to unlink block devices from LEDs by writing them to + the LED's ``unlink_dev_by_name`` attribute. + +* The ``blkdev`` LED trigger supports many-to-many device/LED associations. + A device can be associated with multiple LEDs, and an LED can be associated + with multiple devices. 
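As a companion to the sysfs walk-through above, here is a minimal userspace sketch that drives the same attributes programmatically. It is illustrative only: the LED name ``led0`` and the device ``/dev/sda`` are assumptions, and error handling is reduced to the essentials::

  #include <stdio.h>
  #include <stdlib.h>

  /* Write a single value to a sysfs attribute; returns 0 on success. */
  static int write_attr(const char *path, const char *value)
  {
          FILE *f = fopen(path, "w");
          int ok;

          if (f == NULL)
                  return -1;
          ok = fputs(value, f) != EOF;
          if (fclose(f) == EOF)
                  ok = 0;
          return ok ? 0 : -1;
  }

  int main(void)
  {
          /* Associate the (assumed) LED "led0" with the blkdev trigger. */
          if (write_attr("/sys/class/leds/led0/trigger", "blkdev"))
                  return EXIT_FAILURE;

          /* Link /dev/sda to the LED ... */
          if (write_attr("/sys/class/leds/led0/link_dev_by_path", "/dev/sda"))
                  return EXIT_FAILURE;

          /* ... and blink only on write activity. */
          write_attr("/sys/class/leds/led0/blink_on_read", "0");
          write_attr("/sys/class/leds/led0/blink_on_write", "1");

          return EXIT_SUCCESS;
  }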
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index c1e31e9a85d7..92d73ccede70 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -1205,7 +1205,7 @@ void __flush_tlb_all(void) */ VM_WARN_ON_ONCE(preemptible()); - if (boot_cpu_has(X86_FEATURE_PGE)) { + if (cpu_feature_enabled(X86_FEATURE_PGE)) { __flush_tlb_global(); } else { /* diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index e94e0050a583..6f955eb1e163 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -159,7 +159,7 @@ int relocate_restore_code(void) if (!relocated_restore_code) return -ENOMEM; - memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE); + __memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE); /* Make the page containing the relocated code executable */ pgd = (pgd_t *)__va(read_cr3_pa()) + diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 6cae412a33a0..72840a0fe8ae 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7157,6 +7157,7 @@ static void bfq_exit_queue(struct elevator_queue *e) #endif blk_stat_disable_accounting(bfqd->queue); + clear_bit(ELEVATOR_FLAG_DISABLE_WBT, &e->flags); wbt_enable_default(bfqd->queue); kfree(bfqd); @@ -7344,6 +7345,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) /* We dispatch from request queue wide instead of hw queue */ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); + set_bit(ELEVATOR_FLAG_DISABLE_WBT, &eq->flags); wbt_disable_default(q); blk_stat_enable_accounting(q); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e71b3b43927c..7b98c7074771 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -470,6 +470,9 @@ static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) if (!wbt_rq_qos(q)) return -EINVAL; + if (wbt_disabled(q)) + return sprintf(page, "0\n"); + return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); } diff --git a/block/blk-wbt.c b/block/blk-wbt.c index c293e08b301f..68a774d7a7c9 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -27,6 +27,7 @@ #include "blk-wbt.h" #include "blk-rq-qos.h" +#include "elevator.h" #define CREATE_TRACE_POINTS #include @@ -422,6 +423,14 @@ static void wbt_update_limits(struct rq_wb *rwb) rwb_wake_all(rwb); } +bool wbt_disabled(struct request_queue *q) +{ + struct rq_qos *rqos = wbt_rq_qos(q); + + return !rqos || RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT || + RQWB(rqos)->enable_state == WBT_STATE_OFF_MANUAL; +} + u64 wbt_get_min_lat(struct request_queue *q) { struct rq_qos *rqos = wbt_rq_qos(q); @@ -435,8 +444,13 @@ void wbt_set_min_lat(struct request_queue *q, u64 val) struct rq_qos *rqos = wbt_rq_qos(q); if (!rqos) return; + RQWB(rqos)->min_lat_nsec = val; - RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL; + if (val) + RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL; + else + RQWB(rqos)->enable_state = WBT_STATE_OFF_MANUAL; + wbt_update_limits(RQWB(rqos)); } @@ -638,11 +652,15 @@ void wbt_set_write_cache(struct request_queue *q, bool write_cache_on) */ void wbt_enable_default(struct request_queue *q) { - struct rq_qos *rqos = wbt_rq_qos(q); + struct rq_qos *rqos; + bool disable_flag = q->elevator && + test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags); /* Throttling already enabled? 
*/ + rqos = wbt_rq_qos(q); if (rqos) { - if (RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) + if (!disable_flag && + RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT; return; } @@ -651,7 +669,7 @@ void wbt_enable_default(struct request_queue *q) if (!blk_queue_registered(q)) return; - if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ)) + if (queue_is_mq(q) && !disable_flag) wbt_init(q); } EXPORT_SYMBOL_GPL(wbt_enable_default); diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 7e44eccc676d..e3ea6e7e2900 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -28,13 +28,15 @@ enum { }; /* - * Enable states. Either off, or on by default (done at init time), - * or on through manual setup in sysfs. + * If the current state is WBT_STATE_ON/OFF_DEFAULT, it can be changed to any + * other state; if the current state is WBT_STATE_ON/OFF_MANUAL, it can only be + * changed to WBT_STATE_OFF/ON_MANUAL. */ enum { - WBT_STATE_ON_DEFAULT = 1, - WBT_STATE_ON_MANUAL = 2, - WBT_STATE_OFF_DEFAULT + WBT_STATE_ON_DEFAULT = 1, /* on by default */ + WBT_STATE_ON_MANUAL = 2, /* on manually by sysfs */ + WBT_STATE_OFF_DEFAULT = 3, /* off by default */ + WBT_STATE_OFF_MANUAL = 4, /* off manually by sysfs */ }; struct rq_wb { @@ -94,6 +96,7 @@ void wbt_enable_default(struct request_queue *); u64 wbt_get_min_lat(struct request_queue *q); void wbt_set_min_lat(struct request_queue *q, u64 val); +bool wbt_disabled(struct request_queue *); void wbt_set_write_cache(struct request_queue *, bool); @@ -125,6 +128,10 @@ static inline u64 wbt_default_latency_nsec(struct request_queue *q) { return 0; } +static inline bool wbt_disabled(struct request_queue *q) +{ + return true; +} #endif /* CONFIG_BLK_WBT */ diff --git a/block/elevator.c b/block/elevator.c index 389cb51389af..3b762fa45e07 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -512,7 +512,7 @@ int elv_register_queue(struct request_queue *q, bool uevent) if (uevent) kobject_uevent(&e->kobj, KOBJ_ADD); - e->registered = 1; + set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags); } return error; } @@ -523,13 +523,9 @@ void elv_unregister_queue(struct request_queue *q) lockdep_assert_held(&q->sysfs_lock); - if (e && e->registered) { - struct elevator_queue *e = q->elevator; - + if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) { kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); - - e->registered = 0; } } diff --git a/block/elevator.h b/block/elevator.h index 3f0593b3bf9d..75382471222d 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -100,10 +100,13 @@ struct elevator_queue void *elevator_data; struct kobject kobj; struct mutex sysfs_lock; - unsigned int registered:1; + unsigned long flags; DECLARE_HASHTABLE(hash, ELV_HASH_BITS); }; +#define ELEVATOR_FLAG_REGISTERED 0 +#define ELEVATOR_FLAG_DISABLE_WBT 1 + /* * block elevator interface */ diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig index dc6816d36d06..bda249068182 100644 --- a/drivers/leds/trigger/Kconfig +++ b/drivers/leds/trigger/Kconfig @@ -154,4 +154,13 @@ config LEDS_TRIGGER_TTY When build as a module this driver will be called ledtrig-tty. +config LEDS_TRIGGER_BLKDEV + tristate "LED Trigger for block devices" + depends on BLOCK + help + The blkdev LED trigger allows LEDs to be controlled by block device + activity (reads and writes). + + See Documentation/leds/ledtrig-blkdev.rst. 
+ endif # LEDS_TRIGGERS diff --git a/drivers/leds/trigger/Makefile b/drivers/leds/trigger/Makefile index 25c4db97cdd4..d53bab5d93f1 100644 --- a/drivers/leds/trigger/Makefile +++ b/drivers/leds/trigger/Makefile @@ -16,3 +16,4 @@ obj-$(CONFIG_LEDS_TRIGGER_NETDEV) += ledtrig-netdev.o obj-$(CONFIG_LEDS_TRIGGER_PATTERN) += ledtrig-pattern.o obj-$(CONFIG_LEDS_TRIGGER_AUDIO) += ledtrig-audio.o obj-$(CONFIG_LEDS_TRIGGER_TTY) += ledtrig-tty.o +obj-$(CONFIG_LEDS_TRIGGER_BLKDEV) += ledtrig-blkdev.o diff --git a/drivers/leds/trigger/ledtrig-blkdev.c b/drivers/leds/trigger/ledtrig-blkdev.c new file mode 100644 index 000000000000..8614e308fadc --- /dev/null +++ b/drivers/leds/trigger/ledtrig-blkdev.c @@ -0,0 +1,1220 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Block device LED trigger + * + * Copyright 2021-2022 Ian Pilcher + */ + +#include <linux/blkdev.h> +#include <linux/leds.h> +#include <linux/module.h> +#include <linux/part_stat.h> +#include <linux/xarray.h> + +/** + * DOC: Overview + * + * The ``blkdev`` LED trigger works by periodically checking the activity + * counters of block devices that have been linked to one or more LEDs and + * blinking those LED(s) if the correct type of activity has occurred. The + * periodic check is scheduled with the Linux kernel's deferred work facility. + * + * Trigger-specific data about block devices and LEDs is stored in two data + * structures --- &struct blkdev_trig_bdev (a "BTB") and &struct blkdev_trig_led + * (a "BTL"). Each structure contains a &struct xarray that holds links to any + * linked devices of the other type. I.e. &blkdev_trig_bdev.linked_btls + * contains links to all BTLs whose LEDs have been linked to the BTB's block + * device, and &blkdev_trig_led.linked_btbs contains links to all BTBs whose + * block devices have been linked to the BTL's LED. Thus, a block device can + * be linked to more than one LED, and an LED can be linked to more than one + * block device. + */ + +/* Default, minimum & maximum blink duration (milliseconds) */ +#define BLKDEV_TRIG_BLINK_DEF 75 +#define BLKDEV_TRIG_BLINK_MIN 10 +#define BLKDEV_TRIG_BLINK_MAX 86400000 /* 24 hours */ + +/* Default, minimum & maximum activity check interval (milliseconds) */ +#define BLKDEV_TRIG_CHECK_DEF 100 +#define BLKDEV_TRIG_CHECK_MIN 25 +#define BLKDEV_TRIG_CHECK_MAX 86400000 /* 24 hours */ + +/* + * If blkdev_trig_check() can't lock the mutex, how long to wait before trying + * again (milliseconds) + */ +#define BLKDEV_TRIG_CHECK_RETRY 5 + +/* Mode argument for calls to blkdev_get_by_path() and blkdev_put() */ +#define BLKDEV_TRIG_FMODE 0 + +/** + * struct blkdev_trig_bdev - Trigger-specific data about a block device. + * @last_checked: Time (in jiffies) at which the trigger last checked this + * block device for activity. + * @last_activity: Time (in jiffies) at which the trigger last detected + * activity of each type. + * @ios: Activity counter values for each type, corresponding to + * the timestamps in &last_activity. + * @index: &xarray index, so the BTB can be included in one or more + * &blkdev_trig_led.linked_btbs. + * @bdev: The block device. + * @linked_btls: The BTLs that represent the LEDs linked to the BTB's + * block device. + * + * Every block device linked to at least one LED gets a "BTB." A BTB is created + * when a block device that is not currently linked to any LEDs is linked to an + * LED. + * + * A BTB is freed when one of the following occurs: + * + * * The number of LEDs linked to the block device becomes zero, because it has + * been unlinked from its last LED using the trigger's &sysfs interface. 
+ * + * The number of LEDs linked to the block device becomes zero, because the + * last LED to which it was linked has been disassociated from the trigger + * (which happens automatically if the LED device is removed from the system). + * + * * The BTB's block device is removed from the system. To accommodate this + * scenario, BTBs are created as device resources, so that the release + * function will be called by the driver core when the device is removed. + */ +struct blkdev_trig_bdev { + unsigned long last_checked; + unsigned long last_activity[NR_STAT_GROUPS]; + unsigned long ios[NR_STAT_GROUPS]; + unsigned long index; + struct block_device *bdev; + struct xarray linked_btls; +}; + +/** + * struct blkdev_trig_led - Trigger-specific data about an LED. + * @last_checked: Time (in jiffies) at which the trigger last checked the + * block devices linked to this LED for activity. + * @index: &xarray index, so the BTL can be included in one or more + * &blkdev_trig_bdev.linked_btls. + * @mode: Bitmask for types of block device activity that will + * cause this LED to blink --- reads, writes, discards, + * etc. + * @led: The LED device. + * @blink_msec: Duration of a blink (milliseconds). + * @check_jiffies: Interval at which block devices linked to this LED + * should be checked for activity (jiffies). + * @linked_btbs: The BTBs that represent the block devices linked to the + * BTL's LED. + * @all_btls_node: The BTL's node in the module's list of all BTLs. + * + * Every LED associated with the block device trigger gets a "BTL." A BTL is + * created when the trigger is "activated" on an LED (usually by writing + * ``blkdev`` to the LED's &sysfs &trigger attribute). A BTL is freed when its + * LED is disassociated from the trigger, either through the trigger's &sysfs + * interface or because the LED device is removed from the system. 
+ */ +struct blkdev_trig_led { + unsigned long last_checked; + unsigned long index; + unsigned long mode; /* must be ulong for atomic bit ops */ + struct led_classdev *led; + unsigned int blink_msec; + unsigned int check_jiffies; + struct xarray linked_btbs; + struct hlist_node all_btls_node; +}; + +/* Protects everything except atomic LED attributes */ +static DEFINE_MUTEX(blkdev_trig_mutex); + +/* BTB device resource release function */ +static void blkdev_trig_btb_release(struct device *dev, void *res); + +/* Index for next BTB or BTL */ +static unsigned long blkdev_trig_next_index; + +/* All LEDs associated with the trigger */ +static HLIST_HEAD(blkdev_trig_all_btls); + +/* Delayed work to periodically check for activity & blink LEDs */ +static void blkdev_trig_check(struct work_struct *work); +static DECLARE_DELAYED_WORK(blkdev_trig_work, blkdev_trig_check); + +/* When is the delayed work scheduled to run next (jiffies) */ +static unsigned long blkdev_trig_next_check; + +/* Total number of BTB-to-BTL links */ +static unsigned int blkdev_trig_link_count; + +/* Empty sysfs attribute list for next 2 declarations */ +static struct attribute *blkdev_trig_attrs_empty[] = { NULL }; + +/* linked_leds sysfs directory for block devs linked to 1 or more LEDs */ +static const struct attribute_group blkdev_trig_linked_leds = { + .name = "linked_leds", + .attrs = blkdev_trig_attrs_empty, +}; + +/* linked_devices sysfs directory for each LED associated with the trigger */ +static const struct attribute_group blkdev_trig_linked_devs = { + .name = "linked_devices", + .attrs = blkdev_trig_attrs_empty, +}; + + +/* + * + * Delayed work to check for activity & blink LEDs + * + */ + +/** + * blkdev_trig_blink() - Blink an LED, if the correct type of activity has + * occurred on the block device. + * @btl: The BTL that represents the LED + * @btb: The BTB that represents the block device + * + * Context: Process context. Caller must hold &blkdev_trig_mutex. + * Return: &true if the LED is blinked, &false if not. + */ +static bool blkdev_trig_blink(const struct blkdev_trig_led *btl, + const struct blkdev_trig_bdev *btb) +{ + unsigned long mode, mask, delay_on, delay_off; + enum stat_group i; + + mode = READ_ONCE(btl->mode); + + for (i = STAT_READ, mask = 1; i <= STAT_FLUSH; ++i, mask <<= 1) { + + if (!(mode & mask)) + continue; + + if (time_before_eq(btb->last_activity[i], btl->last_checked)) + continue; + + delay_on = READ_ONCE(btl->blink_msec); + delay_off = 1; /* 0 leaves LED turned on */ + + led_blink_set_oneshot(btl->led, &delay_on, &delay_off, 0); + return true; + } + + return false; +} + +/** + * blkdev_trig_update_btb() - Update a BTB's activity counters and timestamps. + * @btb: The BTB + * @now: Timestamp (in jiffies) + * + * Context: Process context. Caller must hold &blkdev_trig_mutex. + */ +static void blkdev_trig_update_btb(struct blkdev_trig_bdev *btb, + unsigned long now) +{ + unsigned long new_ios; + enum stat_group i; + + for (i = STAT_READ; i <= STAT_FLUSH; ++i) { + + new_ios = part_stat_read(btb->bdev, ios[i]); + + if (new_ios != btb->ios[i]) { + btb->ios[i] = new_ios; + btb->last_activity[i] = now; + } + } + + btb->last_checked = now; +} + +/** + * blkdev_trig_check() - Check linked devices for activity and blink LEDs. + * @work: Delayed work (&blkdev_trig_work) + * + * Context: Process context. Takes and releases &blkdev_trig_mutex. 
*/ +static void blkdev_trig_check(struct work_struct *work) +{ + struct blkdev_trig_led *btl; + struct blkdev_trig_bdev *btb; + unsigned long index, delay, now, led_check, led_delay; + bool blinked; + + if (!mutex_trylock(&blkdev_trig_mutex)) { + delay = msecs_to_jiffies(BLKDEV_TRIG_CHECK_RETRY); + goto exit_reschedule; + } + + now = jiffies; + delay = ULONG_MAX; + + hlist_for_each_entry (btl, &blkdev_trig_all_btls, all_btls_node) { + + led_check = btl->last_checked + btl->check_jiffies; + + if (time_before_eq(led_check, now)) { + + blinked = false; + + xa_for_each (&btl->linked_btbs, index, btb) { + + if (btb->last_checked != now) + blkdev_trig_update_btb(btb, now); + if (!blinked) + blinked = blkdev_trig_blink(btl, btb); + } + + btl->last_checked = now; + led_delay = btl->check_jiffies; + + } else { + led_delay = led_check - now; + } + + if (led_delay < delay) + delay = led_delay; + } + + mutex_unlock(&blkdev_trig_mutex); + +exit_reschedule: + WARN_ON_ONCE(delay == ULONG_MAX); + WARN_ON_ONCE(!schedule_delayed_work(&blkdev_trig_work, delay)); +} + +/** + * blkdev_trig_sched_led() - Set the schedule of the delayed work when a new + * LED is added to the schedule. + * @btl: The BTL that represents the LED + * + * Called when the number of block devices to which an LED is linked becomes + * non-zero. + * + * Context: Process context. Caller must hold &blkdev_trig_mutex. + */ +static void blkdev_trig_sched_led(const struct blkdev_trig_led *btl) +{ + unsigned long delay = READ_ONCE(btl->check_jiffies); + unsigned long check_by = jiffies + delay; + + /* + * If no other LED-to-block device links exist, simply schedule the + * delayed work according to this LED's check_interval attribute + * (check_jiffies). + */ + if (blkdev_trig_link_count == 0) { + WARN_ON(!schedule_delayed_work(&blkdev_trig_work, delay)); + blkdev_trig_next_check = check_by; + return; + } + + /* + * If the next check is already scheduled to occur soon enough to + * accommodate this LED's check_interval, the schedule doesn't need + * to be changed. + */ + if (time_after_eq(check_by, blkdev_trig_next_check)) + return; + + /* + * Modify the schedule, so that the delayed work runs soon enough for + * this LED. + */ + WARN_ON(!mod_delayed_work(system_wq, &blkdev_trig_work, delay)); + blkdev_trig_next_check = check_by; +} + + +/* + * + * Linking and unlinking LEDs and block devices + * + */ + +/** + * blkdev_trig_link() - Link a block device to an LED. + * @btl: The BTL that represents the LED + * @btb: The BTB that represents the block device + * + * Context: Process context. Caller must hold &blkdev_trig_mutex. + * Return: &0 on success, negative &errno on error. 
+ */ +static int blkdev_trig_link(struct blkdev_trig_led *btl, + struct blkdev_trig_bdev *btb) +{ + bool led_first_link; + int err; + + led_first_link = xa_empty(&btl->linked_btbs); + + err = xa_insert(&btb->linked_btls, btl->index, btl, GFP_KERNEL); + if (err) + return err; + + err = xa_insert(&btl->linked_btbs, btb->index, btb, GFP_KERNEL); + if (err) + goto error_erase_btl; + + /* Create /sys/class/block//linked_leds/ symlink */ + err = sysfs_add_link_to_group(bdev_kobj(btb->bdev), + blkdev_trig_linked_leds.name, + &btl->led->dev->kobj, btl->led->name); + if (err) + goto error_erase_btb; + + /* Create /sys/class/leds//linked_devices/ symlink */ + err = sysfs_add_link_to_group(&btl->led->dev->kobj, + blkdev_trig_linked_devs.name, + bdev_kobj(btb->bdev), + dev_name(&btb->bdev->bd_device)); + if (err) + goto error_remove_symlink; + + /* + * If this is the first block device linked to this LED, the delayed + * work schedule may need to be changed. + */ + if (led_first_link) + blkdev_trig_sched_led(btl); + + ++blkdev_trig_link_count; + + return 0; + +error_remove_symlink: + sysfs_remove_link_from_group(bdev_kobj(btb->bdev), + blkdev_trig_linked_leds.name, + btl->led->name); +error_erase_btb: + xa_erase(&btl->linked_btbs, btb->index); +error_erase_btl: + xa_erase(&btb->linked_btls, btl->index); + return err; +} + +/** + * blkdev_trig_put_btb() - Remove and free a BTB, if it is no longer needed. + * @btb: The BTB + * + * Does nothing if the BTB (block device) is still linked to at least one LED. + * + * Context: Process context. Caller must hold &blkdev_trig_mutex. + */ +static void blkdev_trig_put_btb(struct blkdev_trig_bdev *btb) +{ + struct block_device *bdev = btb->bdev; + int err; + + if (xa_empty(&btb->linked_btls)) { + + sysfs_remove_group(bdev_kobj(bdev), &blkdev_trig_linked_leds); + err = devres_destroy(&bdev->bd_device, blkdev_trig_btb_release, + NULL, NULL); + WARN_ON(err); + } +} + +/** + * _blkdev_trig_unlink_always() - Perform the unconditionally required steps of + * unlinking a block device from an LED. + * @btl: The BTL that represents the LED + * @btb: The BTB that represents the block device + * + * When a block device is unlinked from an LED, certain steps must be performed + * only if the block device is **not** being released. This function performs + * those steps that are **always** required, whether or not the block device is + * being released. + * + * Context: Process context. Caller must hold &blkdev_trig_mutex. + */ +static void _blkdev_trig_unlink_always(struct blkdev_trig_led *btl, + struct blkdev_trig_bdev *btb) +{ + --blkdev_trig_link_count; + + if (blkdev_trig_link_count == 0) + WARN_ON(!cancel_delayed_work_sync(&blkdev_trig_work)); + + xa_erase(&btb->linked_btls, btl->index); + xa_erase(&btl->linked_btbs, btb->index); + + /* Remove /sys/class/leds//linked_devices/ symlink */ + sysfs_remove_link_from_group(&btl->led->dev->kobj, + blkdev_trig_linked_devs.name, + dev_name(&btb->bdev->bd_device)); +} + +/** + * blkdev_trig_unlink_norelease() - Unlink an LED from a block device that is + * **not** being released. + * @btl: The BTL that represents the LED. + * @btb: The BTB that represents the block device. + * + * Context: Process context. Caller must hold &blkdev_trig_mutex. 
+ */ +static void blkdev_trig_unlink_norelease(struct blkdev_trig_led *btl, + struct blkdev_trig_bdev *btb) +{ + _blkdev_trig_unlink_always(btl, btb); + + /* Remove /sys/class/block//linked_leds/ symlink */ + sysfs_remove_link_from_group(bdev_kobj(btb->bdev), + blkdev_trig_linked_leds.name, + btl->led->name); + + blkdev_trig_put_btb(btb); +} + +/** + * blkdev_trig_unlink_release() - Unlink an LED from a block device that is + * being released. + * @btl: The BTL that represents the LED + * @btb: The BTB that represents the block device + * + * Context: Process context. Caller must hold &blkdev_trig_mutex. + */ +static void blkdev_trig_unlink_release(struct blkdev_trig_led *btl, + struct blkdev_trig_bdev *btb) +{ + _blkdev_trig_unlink_always(btl, btb); + + /* + * If the BTB is being released, the driver core has already removed the + * device's attribute groups, and the BTB will be freed automatically, + * so there's nothing else to do. + */ +} + + +/* + * + * BTB creation + * + */ + +/** + * blkdev_trig_btb_release() - BTB device resource release function. + * @dev: The block device + * @res: The BTB + * + * Called by the driver core when a block device with a BTB is removed. + * + * Context: Process context. Takes and releases &blkdev_trig_mutex. + */ +static void blkdev_trig_btb_release(struct device *dev, void *res) +{ + struct blkdev_trig_bdev *btb = res; + struct blkdev_trig_led *btl; + unsigned long index; + + mutex_lock(&blkdev_trig_mutex); + + xa_for_each (&btb->linked_btls, index, btl) + blkdev_trig_unlink_release(btl, btb); + + mutex_unlock(&blkdev_trig_mutex); +} + +/** + * blkdev_trig_get_bdev() - Get a block device by path. + * @path: The value written to an LED's &link_dev_by_path or + * &unlink_dev_by_path attribute, which should be the path to a + * special file that represents a block device + * @len: The number of characters in &path (not including its + * terminating null) + * + * The caller must call blkdev_put() when finished with the device. + * + * Context: Process context. + * Return: The block device, or an error pointer. + */ +static struct block_device *blkdev_trig_get_bdev(const char *path, size_t len) +{ + struct block_device *bdev; + char *buf; + + buf = kmemdup(path, len + 1, GFP_KERNEL); /* +1 to include null */ + if (buf == NULL) + return ERR_PTR(-ENOMEM); + + bdev = blkdev_get_by_path(strim(buf), BLKDEV_TRIG_FMODE, THIS_MODULE); + kfree(buf); + return bdev; +} + +/** + * blkdev_trig_get_btb() - Find or create the BTB for a block device. + * @path: The value written to an LED's &link_dev_by_path attribute, + * which should be the path to a special file that represents a + * block device + * @len: The number of characters in &path + * + * If a new BTB is created, because the block device was not previously linked + * to any LEDs, the block device's &linked_leds &sysfs directory is created. + * + * Context: Process context. Caller must hold &blkdev_trig_mutex. + * Return: Pointer to the BTB, error pointer on error. 
*/ +static struct blkdev_trig_bdev *blkdev_trig_get_btb(const char *path, + size_t len) +{ + struct block_device *bdev; + struct blkdev_trig_bdev *btb; + int err; + + bdev = blkdev_trig_get_bdev(path, len); + if (IS_ERR(bdev)) + return ERR_CAST(bdev); + + btb = devres_find(&bdev->bd_device, blkdev_trig_btb_release, + NULL, NULL); + if (btb != NULL) { + err = 0; + goto exit_put_bdev; + } + + if (blkdev_trig_next_index == ULONG_MAX) { + err = -EOVERFLOW; + goto exit_put_bdev; + } + + btb = devres_alloc(blkdev_trig_btb_release, sizeof(*btb), GFP_KERNEL); + if (btb == NULL) { + err = -ENOMEM; + goto exit_put_bdev; + } + + err = sysfs_create_group(bdev_kobj(bdev), &blkdev_trig_linked_leds); + if (err) + goto exit_free_btb; + + btb->index = blkdev_trig_next_index++; + btb->bdev = bdev; + xa_init(&btb->linked_btls); + + /* Populate BTB activity counters */ + blkdev_trig_update_btb(btb, jiffies); + + devres_add(&bdev->bd_device, btb); + +exit_free_btb: + if (err) + devres_free(btb); +exit_put_bdev: + blkdev_put(bdev, BLKDEV_TRIG_FMODE); + return err ? ERR_PTR(err) : btb; +} + + +/* + * + * Activating and deactivating the trigger on an LED + * + */ + +/** + * blkdev_trig_activate() - Called by the LEDs subsystem when an LED is + * associated with the trigger. + * @led: The LED + * + * Context: Process context. Takes and releases &blkdev_trig_mutex. + * Return: &0 on success, negative &errno on error. + */ +static int blkdev_trig_activate(struct led_classdev *led) +{ + struct blkdev_trig_led *btl; + int err; + + btl = kzalloc(sizeof(*btl), GFP_KERNEL); + if (btl == NULL) + return -ENOMEM; + + err = mutex_lock_interruptible(&blkdev_trig_mutex); + if (err) + goto exit_free; + + if (blkdev_trig_next_index == ULONG_MAX) { + err = -EOVERFLOW; + goto exit_unlock; + } + + btl->index = blkdev_trig_next_index++; + btl->last_checked = jiffies; + btl->mode = -1; /* set all bits */ + btl->led = led; + btl->blink_msec = BLKDEV_TRIG_BLINK_DEF; + btl->check_jiffies = msecs_to_jiffies(BLKDEV_TRIG_CHECK_DEF); + xa_init(&btl->linked_btbs); + + hlist_add_head(&btl->all_btls_node, &blkdev_trig_all_btls); + led_set_trigger_data(led, btl); + +exit_unlock: + mutex_unlock(&blkdev_trig_mutex); +exit_free: + if (err) + kfree(btl); + return err; +} + +/** + * blkdev_trig_deactivate() - Called by the LEDs subsystem when an LED is + * disassociated from the trigger. + * @led: The LED + * + * The LEDs subsystem also calls this function when an LED associated with the + * trigger is removed or when the trigger is unregistered (if the module is + * unloaded). + * + * Context: Process context. Takes and releases &blkdev_trig_mutex. + */ +static void blkdev_trig_deactivate(struct led_classdev *led) +{ + struct blkdev_trig_led *btl = led_get_trigger_data(led); + struct blkdev_trig_bdev *btb; + unsigned long index; + + mutex_lock(&blkdev_trig_mutex); + + xa_for_each (&btl->linked_btbs, index, btb) + blkdev_trig_unlink_norelease(btl, btb); + + hlist_del(&btl->all_btls_node); + kfree(btl); + + mutex_unlock(&blkdev_trig_mutex); +} + + +/* + * + * Link-related attribute store functions + * + */ + +/** + * link_dev_by_path_store() - &link_dev_by_path device attribute store function. + * @dev: The LED device + * @attr: The &link_dev_by_path attribute (&dev_attr_link_dev_by_path) + * @buf: The value written to the attribute, which should be the path to + * a special file that represents a block device to be linked to + * the LED (e.g. ``/dev/sda``) + * @count: The number of characters in &buf + * + * Context: Process context. 
Takes and releases &blkdev_trig_mutex. + * Return: &count on success, negative &errno on error. + */ +static ssize_t link_dev_by_path_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); + struct blkdev_trig_bdev *btb; + int err; + + err = mutex_lock_interruptible(&blkdev_trig_mutex); + if (err) + return err; + + btb = blkdev_trig_get_btb(buf, count); + if (IS_ERR(btb)) { + err = PTR_ERR(btb); + goto exit_unlock; + } + + if (xa_load(&btb->linked_btls, btl->index) != NULL) { + err = -EEXIST; + goto exit_put_btb; + } + + err = blkdev_trig_link(btl, btb); + +exit_put_btb: + if (err) + blkdev_trig_put_btb(btb); +exit_unlock: + mutex_unlock(&blkdev_trig_mutex); + return err ? : count; +} + +/** + * unlink_dev_by_path_store() - &unlink_dev_by_path device attribute store + * function. + * @dev: The LED device + * @attr: The &unlink_dev_by_path attribute (&dev_attr_unlink_dev_by_path) + * @buf: The value written to the attribute, which should be the path to + * a special file that represents a block device to be unlinked + * from the LED (e.g. ``/dev/sda``) + * @count: The number of characters in &buf + * + * Context: Process context. Takes and releases &blkdev_trig_mutex. + * Return: &count on success, negative &errno on error. + */ +static ssize_t unlink_dev_by_path_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); + struct block_device *bdev; + struct blkdev_trig_bdev *btb; + int err; + + bdev = blkdev_trig_get_bdev(buf, count); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + err = mutex_lock_interruptible(&blkdev_trig_mutex); + if (err) + goto exit_put_bdev; + + btb = devres_find(&bdev->bd_device, blkdev_trig_btb_release, + NULL, NULL); + if (btb == NULL) { + err = -EUNATCH; /* bdev isn't linked to any LED */ + goto exit_unlock; + } + + if (xa_load(&btb->linked_btls, btl->index) == NULL) { + err = -EUNATCH; /* bdev isn't linked to this LED */ + goto exit_unlock; + } + + blkdev_trig_unlink_norelease(btl, btb); + +exit_unlock: + mutex_unlock(&blkdev_trig_mutex); +exit_put_bdev: + blkdev_put(bdev, BLKDEV_TRIG_FMODE); + return err ? : count; +} + +/** + * unlink_dev_by_name_store() - &unlink_dev_by_name device attribute store + * function. + * @dev: The LED device + * @attr: The &unlink_dev_by_name attribute (&dev_attr_unlink_dev_by_name) + * @buf: The value written to the attribute, which should be the kernel + * name of a block device to be unlinked from the LED (e.g. + * ``sda``) + * @count: The number of characters in &buf + * + * Context: Process context. Takes and releases &blkdev_trig_mutex. + * Return: &count on success, negative &errno on error. + */ +static ssize_t unlink_dev_by_name_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); + struct blkdev_trig_bdev *btb; + unsigned long index; + int err; + + err = mutex_lock_interruptible(&blkdev_trig_mutex); + if (err) + return err; + + err = -EUNATCH; + + xa_for_each (&btl->linked_btbs, index, btb) { + + if (sysfs_streq(dev_name(&btb->bdev->bd_device), buf)) { + blkdev_trig_unlink_norelease(btl, btb); + err = 0; + break; + } + } + + mutex_unlock(&blkdev_trig_mutex); + return err ? : count; +} + + +/* + * + * Atomic attribute show & store functions + * + */ + +/** + * blink_time_show() - &blink_time device attribute show function. 
+ * @dev: The LED device + * @attr: The &blink_time attribute (&dev_attr_blink_time) + * @buf: Output buffer + * + * Writes the value of &blkdev_trig_led.blink_msec to &buf. + * + * Context: Process context. + * Return: The number of characters written to &buf. + */ +static ssize_t blink_time_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + const struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); + + return sprintf(buf, "%u\n", READ_ONCE(btl->blink_msec)); +} + +/** + * blink_time_store() - &blink_time device attribute store function. + * @dev: The LED device + * @attr: The &blink_time attribute (&dev_attr_blink_time) + * @buf: The new value (as written to the &sysfs attribute) + * @count: The number of characters in &buf + * + * Sets &blkdev_trig_led.blink_msec to the value in &buf. + * + * Context: Process context. + * Return: &count on success, negative &errno on error. + */ +static ssize_t blink_time_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); + unsigned int value; + int err; + + err = kstrtouint(buf, 0, &value); + if (err) + return err; + + if (value < BLKDEV_TRIG_BLINK_MIN || value > BLKDEV_TRIG_BLINK_MAX) + return -ERANGE; + + WRITE_ONCE(btl->blink_msec, value); + return count; +} + +/** + * check_interval_show() - &check_interval device attribute show function. + * @dev: The LED device + * @attr: The &check_interval attribute (&dev_attr_check_interval) + * @buf: Output buffer + * + * Writes the value of &blkdev_trig_led.check_jiffies (converted to + * milliseconds) to &buf. + * + * Context: Process context. + * Return: The number of characters written to &buf. + */ +static ssize_t check_interval_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); + + return sprintf(buf, "%u\n", + jiffies_to_msecs(READ_ONCE(btl->check_jiffies))); +} + +/** + * check_interval_store() - &check_interval device attribute store function + * @dev: The LED device + * @attr: The &check_interval attribute (&dev_attr_check_interval) + * @buf: The new value (as written to the &sysfs attribute) + * @count: The number of characters in &buf + * + * Sets &blkdev_trig_led.check_jiffies to the value in &buf (after converting + * from milliseconds). + * + * Context: Process context. + * Return: &count on success, negative &errno on error. + */ +static ssize_t check_interval_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct blkdev_trig_led *led = led_trigger_get_drvdata(dev); + unsigned int value; + int err; + + err = kstrtouint(buf, 0, &value); + if (err) + return err; + + if (value < BLKDEV_TRIG_CHECK_MIN || value > BLKDEV_TRIG_CHECK_MAX) + return -ERANGE; + + WRITE_ONCE(led->check_jiffies, msecs_to_jiffies(value)); + + return count; +} + +/** + * blkdev_trig_mode_show() - Helper for boolean attribute show functions. + * @led: The LED + * @buf: Output buffer + * @bit: Which bit to show + * + * Context: Process context. + * Return: The number of characters written to &buf. + */ +static int blkdev_trig_mode_show(const struct blkdev_trig_led *led, char *buf, + enum stat_group bit) +{ + return sprintf(buf, READ_ONCE(led->mode) & (1 << bit) ? "Y\n" : "N\n"); +} + +/** + * blkdev_trig_mode_store() - Helper for boolean attribute store functions. 
+ * @led: The LED + * @buf: The new value (as written to the &sysfs attribute) + * @count: The number of characters in &buf + * @bit: Which bit to set + * + * Context: Process context. + * Return: &count on success, negative &errno on error. + */ +static int blkdev_trig_mode_store(struct blkdev_trig_led *led, + const char *buf, size_t count, + enum stat_group bit) +{ + bool set; + int err; + + err = kstrtobool(buf, &set); + if (err) + return err; + + if (set) + set_bit(bit, &led->mode); + else + clear_bit(bit, &led->mode); + + return count; +} + +/** + * blink_on_read_show() - &blink_on_read device attribute show function. + * @dev: The LED device + * @attr: The &blink_on_read attribute (&dev_attr_blink_on_read) + * @buf: Output buffer + * + * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_READ bit in + * &blkdev_trig_led.mode is set or cleared. + * + * Context: Process context. + * Return: The number of characters written to &buf. + */ +static ssize_t blink_on_read_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return blkdev_trig_mode_show(led_trigger_get_drvdata(dev), + buf, STAT_READ); +} + +/** + * blink_on_read_store() - &blink_on_read device attribute store function. + * @dev: The LED device + * @attr: The &blink_on_read attribute (&dev_attr_blink_on_read) + * @buf: The new value (as written to the &sysfs attribute) + * @count: The number of characters in &buf + * + * Sets the &STAT_READ bit in &blkdev_trig_led.mode to the value in &buf + * (interpreted as a boolean). + * + * Context: Process context. + * Return: &count on success, negative &errno on error. + */ +static ssize_t blink_on_read_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + return blkdev_trig_mode_store(led_trigger_get_drvdata(dev), + buf, count, STAT_READ); +} + +/** + * blink_on_write_show() - &blink_on_write device attribute show function. + * @dev: The LED device + * @attr: The &blink_on_write attribute (&dev_attr_blink_on_write) + * @buf: Output buffer + * + * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_WRITE bit + * in &blkdev_trig_led.mode is set or cleared. + * + * Context: Process context. + * Return: The number of characters written to &buf. + */ +static ssize_t blink_on_write_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return blkdev_trig_mode_show(led_trigger_get_drvdata(dev), + buf, STAT_WRITE); +} + +/** + * blink_on_write_store() - &blink_on_write device attribute store function. + * @dev: The LED device + * @attr: The &blink_on_write attribute (&dev_attr_blink_on_write) + * @buf: The new value (as written to the &sysfs attribute) + * @count: The number of characters in &buf + * + * Sets the &STAT_WRITE bit in &blkdev_trig_led.mode to the value in &buf + * (interpreted as a boolean). + * + * Context: Process context. + * Return: &count on success, negative &errno on error. + */ +static ssize_t blink_on_write_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + return blkdev_trig_mode_store(led_trigger_get_drvdata(dev), + buf, count, STAT_WRITE); +} + +/** + * blink_on_flush_show() - &blink_on_flush device attribute show function. + * @dev: The LED device + * @attr: The &blink_on_flush attribute (&dev_attr_blink_on_flush) + * @buf: Output buffer + * + * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_FLUSH bit in + * &blkdev_trig_led.mode is set or cleared. + * + * Context: Process context. 
+ * Return: The number of characters written to &buf. + */ +static ssize_t blink_on_flush_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return blkdev_trig_mode_show(led_trigger_get_drvdata(dev), + buf, STAT_FLUSH); +} + +/** + * blink_on_flush_store() - &blink_on_flush device attribute store function. + * @dev: The LED device + * @attr: The &blink_on_flush attribute (&dev_attr_blink_on_flush) + * @buf: The new value (as written to the &sysfs attribute) + * @count: The number of characters in &buf + * + * Sets the &STAT_FLUSH bit in &blkdev_trig_led.mode to the value in &buf + * (interpreted as a boolean). + * + * Context: Process context. + * Return: &count on success, negative &errno on error. + */ +static ssize_t blink_on_flush_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + return blkdev_trig_mode_store(led_trigger_get_drvdata(dev), + buf, count, STAT_FLUSH); +} + +/** + * blink_on_discard_show() - &blink_on_discard device attribute show function. + * @dev: The LED device + * @attr: The &blink_on_discard attribute (&dev_attr_blink_on_discard) + * @buf: Output buffer + * + * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_DISCARD bit in + * &blkdev_trig_led.mode is set or cleared. + * + * Context: Process context. + * Return: The number of characters written to &buf. + */ +static ssize_t blink_on_discard_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return blkdev_trig_mode_show(led_trigger_get_drvdata(dev), + buf, STAT_DISCARD); +} + +/** + * blink_on_discard_store() - &blink_on_discard device attribute store function. + * @dev: The LED device + * @attr: The &blink_on_discard attribute (&dev_attr_blink_on_discard) + * @buf: The new value (as written to the &sysfs attribute) + * @count: The number of characters in &buf + * + * Sets the &STAT_DISCARD bit in &blkdev_trig_led.mode to the value in &buf + * (interpreted as a boolean). + * + * Context: Process context. + * Return: &count on success, negative &errno on error. + */ +static ssize_t blink_on_discard_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + return blkdev_trig_mode_store(led_trigger_get_drvdata(dev), + buf, count, STAT_DISCARD); +} + +/* Device attributes */ +static DEVICE_ATTR_WO(link_dev_by_path); +static DEVICE_ATTR_WO(unlink_dev_by_path); +static DEVICE_ATTR_WO(unlink_dev_by_name); +static DEVICE_ATTR_RW(blink_time); +static DEVICE_ATTR_RW(check_interval); +static DEVICE_ATTR_RW(blink_on_read); +static DEVICE_ATTR_RW(blink_on_write); +static DEVICE_ATTR_RW(blink_on_flush); +static DEVICE_ATTR_RW(blink_on_discard); + +/* Device attributes in LED directory (/sys/class/leds/<led>/...) */ +static struct attribute *blkdev_trig_attrs[] = { + &dev_attr_link_dev_by_path.attr, + &dev_attr_unlink_dev_by_path.attr, + &dev_attr_unlink_dev_by_name.attr, + &dev_attr_blink_time.attr, + &dev_attr_check_interval.attr, + &dev_attr_blink_on_read.attr, + &dev_attr_blink_on_write.attr, + &dev_attr_blink_on_flush.attr, + &dev_attr_blink_on_discard.attr, + NULL +}; + +/* Unnamed attribute group == no subdirectory */ +static const struct attribute_group blkdev_trig_attr_group = { + .attrs = blkdev_trig_attrs, +}; + +/* Attribute groups for the trigger */ +static const struct attribute_group *blkdev_trig_attr_groups[] = { + &blkdev_trig_attr_group, /* /sys/class/leds/<led>/... 
*/ + &blkdev_trig_linked_devs, /* /sys/class/leds//linked_devices/ */ + NULL +}; + +/* Trigger registration data */ +static struct led_trigger blkdev_trig_trigger = { + .name = "blkdev", + .activate = blkdev_trig_activate, + .deactivate = blkdev_trig_deactivate, + .groups = blkdev_trig_attr_groups, +}; + +/** + * blkdev_trig_init() - Block device LED trigger initialization. + * + * Registers the ``blkdev`` LED trigger. + * + * Return: &0 on success, negative &errno on failure. + */ +static int __init blkdev_trig_init(void) +{ + return led_trigger_register(&blkdev_trig_trigger); +} +module_init(blkdev_trig_init); + +/** + * blkdev_trig_exit() - Block device LED trigger module exit. + * + * Unregisters the ``blkdev`` LED trigger. + */ +static void __exit blkdev_trig_exit(void) +{ + led_trigger_unregister(&blkdev_trig_trigger); +} +module_exit(blkdev_trig_exit); + +MODULE_DESCRIPTION("Block device LED trigger"); +MODULE_AUTHOR("Ian Pilcher "); +MODULE_LICENSE("GPL v2"); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 64659b110973..8b5ca9f8f4bb 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1760,7 +1760,7 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, { int ret = default_wake_function(wq_entry, mode, sync, key); - list_del_init(&wq_entry->entry); + list_del_init_careful(&wq_entry->entry); return ret; } diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index e594db58a0f1..815a27661517 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -12,7 +12,6 @@ #include #include /* #define CONFIG_MAPLE_RCU_DISABLED */ -/* #define CONFIG_DEBUG_MAPLE_TREE_VERBOSE */ /* * Allocated nodes are mutable until they have been inserted into the tree, @@ -483,9 +482,6 @@ static inline bool mas_is_paused(struct ma_state *mas) return mas->node == MAS_PAUSE; } -void mas_dup_tree(struct ma_state *oldmas, struct ma_state *mas); -void mas_dup_store(struct ma_state *mas, void *entry); - /* * This finds an empty area from the highest address to the lowest. * AKA "Topdown" version, @@ -517,7 +513,6 @@ static inline void mas_reset(struct ma_state *mas) * entry. * * Note: may return the zero entry. 
- * */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) @@ -639,7 +634,6 @@ static inline void mt_set_in_rcu(struct maple_tree *mt) } static inline unsigned int mt_height(const struct maple_tree *mt) - { return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; } diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 5f1ae07d724b..97cda629c9e9 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -48,7 +48,7 @@ extern unsigned int pageblock_order; #else /* CONFIG_HUGETLB_PAGE */ /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */ -#define pageblock_order (MAX_ORDER-1) +#define pageblock_order PAGE_ALLOC_COSTLY_ORDER #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h index 71a5df8d2689..cf7fc988c24c 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h @@ -22,6 +22,7 @@ #define FUTEX_WAIT_REQUEUE_PI 11 #define FUTEX_CMP_REQUEUE_PI 12 #define FUTEX_LOCK_PI2 13 +#define FUTEX_WAIT_MULTIPLE 31 #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256 @@ -68,6 +69,18 @@ struct futex_waitv { __u32 __reserved; }; +/** + * struct futex_wait_block - Block of futexes to be waited for + * @uaddr: User address of the futex + * @val: Futex value expected by userspace + * @bitset: Bitset for the optional bitmasked wakeup + */ +struct futex_wait_block { + __u32 __user *uaddr; + __u32 val; + __u32 bitset; +}; + /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index a8074079b09e..26d6da72d494 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -142,6 +142,7 @@ static __always_inline bool futex_cmd_has_timeout(u32 cmd) case FUTEX_LOCK_PI2: case FUTEX_WAIT_BITSET: case FUTEX_WAIT_REQUEUE_PI: + case FUTEX_WAIT_MULTIPLE: return true; } return false; @@ -154,13 +155,79 @@ futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) return -EINVAL; *t = timespec64_to_ktime(*ts); - if (cmd == FUTEX_WAIT) + if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) *t = ktime_add_safe(ktime_get(), *t); else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); return 0; } +/** + * futex_read_wait_block - Read an array of futex_wait_block from userspace + * @uaddr: Userspace address of the block + * @count: Number of blocks to be read + * + * This function creates and allocates an array of futex_q (we zero it to + * initialize the fields) and then, for each futex_wait_block element from + * userspace, fills a futex_q element with the proper values. 
+ */ +inline struct futex_vector *futex_read_wait_block(u32 __user *uaddr, u32 count) +{ + unsigned int i; + struct futex_vector *futexv; + struct futex_wait_block fwb; + struct futex_wait_block __user *entry = + (struct futex_wait_block __user *)uaddr; + + if (!count || count > FUTEX_WAITV_MAX) + return ERR_PTR(-EINVAL); + + futexv = kcalloc(count, sizeof(*futexv), GFP_KERNEL); + if (!futexv) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { + kfree(futexv); + return ERR_PTR(-EFAULT); + } + + futexv[i].w.flags = FUTEX_32; + futexv[i].w.val = fwb.val; + futexv[i].w.uaddr = (uintptr_t) (fwb.uaddr); + futexv[i].q = futex_q_init; + } + + return futexv; +} + +int futex_wait_multiple(struct futex_vector *vs, unsigned int count, + struct hrtimer_sleeper *to); + +int futex_opcode_31(ktime_t *abs_time, u32 __user *uaddr, int count) +{ + int ret; + struct futex_vector *vs; + struct hrtimer_sleeper *to = NULL, timeout; + + to = futex_setup_timer(abs_time, &timeout, 0, 0); + + vs = futex_read_wait_block(uaddr, count); + + if (IS_ERR(vs)) + return PTR_ERR(vs); + + ret = futex_wait_multiple(vs, count, abs_time ? to : NULL); + kfree(vs); + + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + + return ret; +} + SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, const struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, u32, val3) @@ -180,6 +247,9 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, tp = &t; } + if (cmd == FUTEX_WAIT_MULTIPLE) + return futex_opcode_31(tp, uaddr, val); + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } @@ -373,6 +443,9 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, tp = &t; } + if (cmd == FUTEX_WAIT_MULTIPLE) + return futex_opcode_31(tp, uaddr, val); + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } #endif /* CONFIG_COMPAT_32BIT_TIME */ diff --git a/kernel/kthread.c b/kernel/kthread.c index f97fd01a2932..7e6751b29101 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1382,6 +1382,10 @@ EXPORT_SYMBOL_GPL(kthread_flush_worker); * Flush and destroy @worker. The simple flush is enough because the kthread * worker API is used only in trivial scenarios. There are no multi-step state * machines needed. + * + * Note that this function is not responsible for handling delayed work, so + * the caller is responsible for queuing or canceling all delayed work items + * before invoking this function. 
*/ void kthread_destroy_worker(struct kthread_worker *worker) { @@ -1393,6 +1397,7 @@ void kthread_destroy_worker(struct kthread_worker *worker) kthread_flush_worker(worker); kthread_stop(task); + WARN_ON(!list_empty(&worker->delayed_work_list)); WARN_ON(!list_empty(&worker->work_list)); kfree(worker); } diff --git a/kernel/padata.c b/kernel/padata.c index de90af5fcbe6..6bfae19c8a08 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -45,7 +45,7 @@ struct padata_mt_job_state { }; static void padata_free_pd(struct parallel_data *pd); -static void __init padata_mt_helper(struct work_struct *work); +static void padata_mt_helper(struct work_struct *work); static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) { @@ -430,7 +430,7 @@ static int padata_setup_cpumasks(struct padata_instance *pinst) return err; } -static void __init padata_mt_helper(struct work_struct *w) +static void padata_mt_helper(struct work_struct *w) { struct padata_work *pw = container_of(w, struct padata_work, pw_work); struct padata_mt_job_state *ps = pw->pw_data; diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 69cb44b035ec..c8cfa467b6b8 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -183,7 +183,6 @@ static void ma_free_rcu(struct maple_node *node) call_rcu(&node->rcu, mt_free_rcu); } - static void mas_set_height(struct ma_state *mas) { unsigned int new_flags = mas->tree->ma_flags; @@ -323,14 +322,19 @@ static inline void *mte_safe_root(const struct maple_enode *node) return (void *)((unsigned long)node & ~MAPLE_ROOT_NODE); } -static inline void mte_set_full(const struct maple_enode *node) +static inline void *mte_set_full(const struct maple_enode *node) +{ + return (void *)((unsigned long)node & ~MAPLE_ENODE_NULL); +} + +static inline void *mte_clear_full(const struct maple_enode *node) { - node = (void *)((unsigned long)node & ~MAPLE_ENODE_NULL); + return (void *)((unsigned long)node | MAPLE_ENODE_NULL); } -static inline void mte_clear_full(const struct maple_enode *node) +static inline bool mte_has_null(const struct maple_enode *node) { - node = (void *)((unsigned long)node | MAPLE_ENODE_NULL); + return (unsigned long)node & MAPLE_ENODE_NULL; } static inline bool ma_is_root(struct maple_node *node) @@ -432,7 +436,7 @@ enum maple_type mte_parent_enum(struct maple_enode *p_enode, return 0; /* Validated in the caller. */ p_type &= MAPLE_NODE_MASK; - p_type = p_type & ~(MAPLE_PARENT_ROOT | mte_parent_slot_mask(p_type)); + p_type = p_type & ~mte_parent_slot_mask(p_type); switch (p_type) { case MAPLE_PARENT_RANGE64: /* or MAPLE_PARENT_ARANGE64 */ @@ -463,7 +467,7 @@ static inline void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, unsigned char slot) { - unsigned long val = (unsigned long) parent; + unsigned long val = (unsigned long)parent; unsigned long shift; unsigned long type; enum maple_type p_type = mte_node_type(parent); @@ -497,10 +501,9 @@ void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, */ static inline unsigned int mte_parent_slot(const struct maple_enode *enode) { - unsigned long val = (unsigned long) mte_to_node(enode)->parent; + unsigned long val = (unsigned long)mte_to_node(enode)->parent; - /* Root. 
*/ - if (val & 1) + if (val & MA_ROOT_PARENT) return 0; /* @@ -1273,8 +1276,6 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) if (mas->alloc && !(((unsigned long)mas->alloc & 0x1))) mas->alloc->total = success; mas_set_err(mas, -ENOMEM); - return; - } /* @@ -1882,10 +1883,9 @@ static inline int mab_calc_split(struct ma_state *mas, /* Avoid ending a node on a NULL entry */ split = mab_no_null_split(bn, split, slot_count); - if (!(*mid_split)) - return split; - *mid_split = mab_no_null_split(bn, *mid_split, slot_count); + if (unlikely(*mid_split)) + *mid_split = mab_no_null_split(bn, *mid_split, slot_count); return split; } @@ -2942,7 +2942,7 @@ static inline void *mtree_range_walk(struct ma_state *mas) mas->min = prev_min; mas->max = prev_max; mas->node = last; - return (void *) next; + return (void *)next; dead_node: mas_reset(mas); @@ -3462,7 +3462,6 @@ static inline bool mas_push_data(struct ma_state *mas, int height, */ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) { - struct maple_subtree_state mast; int height = 0; unsigned char mid_split, split = 0; @@ -3888,7 +3887,7 @@ static inline void *mtree_lookup_walk(struct ma_state *mas) goto dead_node; } while (!ma_is_leaf(type)); - return (void *) next; + return (void *)next; dead_node: mas_reset(mas); @@ -4706,15 +4705,11 @@ static inline void *mas_next_nentry(struct ma_state *mas, static inline void mas_rewalk(struct ma_state *mas, unsigned long index) { - retry: mas_set(mas, index); mas_state_walk(mas); if (mas_is_start(mas)) goto retry; - - return; - } /* @@ -5615,7 +5610,6 @@ static void mas_wr_store_setup(struct ma_wr_state *wr_mas) mas_reset(wr_mas->mas); } } - } /* Interface */ @@ -6729,7 +6723,7 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry, if (i < (MAPLE_RANGE64_SLOTS - 1)) last = node->pivot[i]; - else if (!node->slot[i] && max != mt_max[mte_node_type(entry)]) + else if (!node->slot[i] && max != mt_node_max(entry)) break; if (last == 0 && i > 0) break; @@ -6836,7 +6830,7 @@ void mt_dump(const struct maple_tree *mt) if (!xa_is_node(entry)) mt_dump_entry(entry, 0, 0, 0); else if (entry) - mt_dump_node(mt, entry, 0, mt_max[mte_node_type(entry)], 0); + mt_dump_node(mt, entry, 0, mt_node_max(entry), 0); } EXPORT_SYMBOL_GPL(mt_dump); diff --git a/lib/string.c b/lib/string.c index 3371d26a0e39..2382658d393b 100644 --- a/lib/string.c +++ b/lib/string.c @@ -562,13 +562,11 @@ EXPORT_SYMBOL(strcspn); */ char *strpbrk(const char *cs, const char *ct) { - const char *sc1, *sc2; + const char *sc; - for (sc1 = cs; *sc1 != '\0'; ++sc1) { - for (sc2 = ct; *sc2 != '\0'; ++sc2) { - if (*sc1 == *sc2) - return (char *)sc1; - } + for (sc = cs; *sc != '\0'; ++sc) { + if (strchr(ct, *sc)) + return (char *)sc; } return NULL; } diff --git a/mm/compaction.c b/mm/compaction.c index d0b16a5b30f7..3613d7f174dc 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -122,7 +122,6 @@ bool PageMovable(struct page *page) return false; } -EXPORT_SYMBOL(PageMovable); void __SetPageMovable(struct page *page, const struct movable_operations *mops) { @@ -1102,12 +1101,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* * Avoid isolating too much unless this block is being - * rescanned (e.g. dirty/writeback pages, parallel allocation) + * fully scanned (e.g. dirty/writeback pages, parallel allocation) * or a lock is contended. For contention, isolate quickly to * potentially remove one source of contention. 
*/ if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX && - !cc->rescan && !cc->contended) { + !cc->finish_pageblock && !cc->contended) { ++low_pfn; break; } @@ -1172,14 +1171,14 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } /* - * Updated the cached scanner pfn once the pageblock has been scanned + * Update the cached scanner pfn once the pageblock has been scanned. * Pages will either be migrated in which case there is no point * scanning in the near future or migration failed in which case the * failure reason may persist. The block is marked for skipping if * there were no pages isolated in the block or if the block is * rescanned twice in a row. */ - if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) { + if (low_pfn == end_pfn && (!nr_isolated || cc->finish_pageblock)) { if (valid_page && !skip_updated) set_pageblock_skip(valid_page); update_cached_migrate(cc, low_pfn); @@ -1762,6 +1761,13 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) if (cc->ignore_skip_hint) return pfn; + /* + * If the pageblock should be finished then do not select a different + * pageblock. + */ + if (cc->finish_pageblock) + return pfn; + /* * If the migrate_pfn is not at the start of a zone or the start * of a pageblock then assume this is a continuation of a previous @@ -1839,7 +1845,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) pfn = cc->zone->zone_start_pfn; cc->fast_search_fail = 0; found_block = true; - set_pageblock_skip(freepage); break; } } @@ -2375,19 +2380,20 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) unsigned long iteration_start_pfn = cc->migrate_pfn; /* - * Avoid multiple rescans which can happen if a page cannot be - * isolated (dirty/writeback in async mode) or if the migrated - * pages are being allocated before the pageblock is cleared. - * The first rescan will capture the entire pageblock for - * migration. If it fails, it'll be marked skip and scanning - * will proceed as normal. + * Avoid multiple rescans of the same pageblock which can + * happen if a page cannot be isolated (dirty/writeback in + * async mode) or if the migrated pages are being allocated + * before the pageblock is cleared. The first rescan will + * capture the entire pageblock for migration. If it fails, + * it'll be marked skip and scanning will proceed as normal. */ - cc->rescan = false; + cc->finish_pageblock = false; if (pageblock_start_pfn(last_migrated_pfn) == pageblock_start_pfn(iteration_start_pfn)) { - cc->rescan = true; + cc->finish_pageblock = true; } +rescan: switch (isolate_migratepages(cc)) { case ISOLATE_ABORT: ret = COMPACT_CONTENDED; @@ -2430,18 +2436,37 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) goto out; } /* - * We failed to migrate at least one page in the current - * order-aligned block, so skip the rest of it. + * If an ASYNC or SYNC_LIGHT fails to migrate a page + * within the current order-aligned block, scan the + * remainder of the pageblock. This will mark the + * pageblock "skip" to avoid rescanning in the near + * future. This will isolate more pages than necessary + * for the request but avoid loops due to + * fast_find_migrateblock revisiting blocks that were + * recently partially scanned. 
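+			 * Note that the rescan is bounded: finish_pageblock is
+			 * only set here when it was previously false, so each
+			 * pageblock triggers at most one extra rescan pass.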
*/ - if (cc->direct_compaction && - (cc->mode == MIGRATE_ASYNC)) { - cc->migrate_pfn = block_end_pfn( - cc->migrate_pfn - 1, cc->order); - /* Draining pcplists is useless in this case */ - last_migrated_pfn = 0; + if (cc->direct_compaction && !cc->finish_pageblock && + (cc->mode < MIGRATE_SYNC)) { + cc->finish_pageblock = true; + + /* + * Draining pcplists does not help THP if + * any page failed to migrate. Even after + * drain, the pageblock will not be free. + */ + if (cc->order == COMPACTION_HPAGE_ORDER) + last_migrated_pfn = 0; + + goto rescan; } } + /* Stop if a page has been captured */ + if (capc && capc->page) { + ret = COMPACT_SUCCESS; + break; + } + check_drain: /* * Has the migration scanner moved away from the previous @@ -2460,12 +2485,6 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) last_migrated_pfn = 0; } } - - /* Stop if a page has been captured */ - if (capc && capc->page) { - ret = COMPACT_SUCCESS; - break; - } } out: diff --git a/mm/internal.h b/mm/internal.h index 6b7ef495b56d..fcb6c233e085 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -422,7 +422,11 @@ struct compact_control { bool proactive_compaction; /* kcompactd proactive compaction */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock contention */ - bool rescan; /* Rescanning the same pageblock */ + bool finish_pageblock; /* Scan the remainder of a pageblock. Used + * when there are potentially transient + * isolation or migration failures to + * ensure forward progress. + */ bool alloc_contig; /* alloc_contig_range allocation */ }; diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index ba863f46759d..96022973c9ba 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -645,7 +645,7 @@ static int __init memory_tier_init(void) * than default DRAM tier. */ default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM); - if (!default_dram_type) + if (IS_ERR(default_dram_type)) panic("%s() failed to allocate default DRAM tier\n", __func__); /* diff --git a/mm/z3fold.c b/mm/z3fold.c index cf71da10d04e..82d5fe16394b 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1463,7 +1463,6 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) struct z3fold_header *zhdr; struct z3fold_pool *pool; - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(PageIsolated(page), page); if (test_bit(PAGE_HEADLESS, &page->private)) @@ -1503,7 +1502,6 @@ static int z3fold_page_migrate(struct page *newpage, struct page *page, struct z3fold_header *zhdr, *new_zhdr; struct z3fold_pool *pool; - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index d03941cace2c..56f58a96b129 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1827,7 +1827,6 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) * Page is locked so zspage couldn't be destroyed. For detail, look at * lock_zspage in free_zspage. 
*/ - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(PageIsolated(page), page); zspage = get_zspage(page); @@ -1859,7 +1858,6 @@ static int zs_page_migrate(struct page *newpage, struct page *page, if (mode == MIGRATE_SYNC_NO_COPY) return -EINVAL; - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); /* The page is locked, so this pointer must remain valid */ @@ -1929,7 +1927,6 @@ static void zs_page_putback(struct page *page) { struct zspage *zspage; - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); zspage = get_zspage(page); diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o index 0edfdb40364b..ae52d3b3f063 100644 --- a/scripts/Makefile.vmlinux_o +++ b/scripts/Makefile.vmlinux_o @@ -19,7 +19,7 @@ quiet_cmd_gen_initcalls_lds = GEN $@ .tmp_initcalls.lds: $(srctree)/scripts/generate_initcall_order.pl \ vmlinux.a $(KBUILD_VMLINUX_LIBS) FORCE - $(call if_changed,gen_initcalls_lds) + +$(call if_changed,gen_initcalls_lds) targets := .tmp_initcalls.lds diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 2c80da0220c3..c861beabc128 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -823,10 +823,10 @@ static void check_section(const char *modname, struct elf_info *elf, #define ALL_EXIT_SECTIONS EXIT_SECTIONS, ALL_XXXEXIT_SECTIONS #define DATA_SECTIONS ".data", ".data.rel" -#define TEXT_SECTIONS ".text", ".text.unlikely", ".sched.text", \ +#define TEXT_SECTIONS ".text", ".text.*", ".sched.text", \ ".kprobes.text", ".cpuidle.text", ".noinstr.text" #define OTHER_TEXT_SECTIONS ".ref.text", ".head.text", ".spinlock.text", \ - ".fixup", ".entry.text", ".exception.text", ".text.*", \ + ".fixup", ".entry.text", ".exception.text", \ ".coldtext", ".softirqentry.text" #define INIT_SECTIONS ".init.*" diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c index a5b10a6a33a5..7e6d983a0089 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -1240,7 +1240,7 @@ static int cs35l41_no_acpi_dsd(struct cs35l41_hda *cs35l41, struct device *physd if (strncmp(hid, "CLSA0100", 8) == 0) { hw_cfg->bst_type = CS35L41_EXT_BOOST_NO_VSPK_SWITCH; - } else if (strncmp(hid, "CLSA0101", 8) == 0) { + } else if (strncmp(hid, "CLSA0101", 8) == 0 || strncmp(hid, "CSC3551", 7) == 0) { hw_cfg->bst_type = CS35L41_EXT_BOOST; hw_cfg->gpio1.func = CS35l41_VSPK_SWITCH; hw_cfg->gpio1.valid = true; diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 7e24b09b1163..ae64935581ff 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -374,6 +374,7 @@ static void elf_add_symbol(struct elf *elf, struct symbol *sym) struct list_head *entry; struct rb_node *pnode; + INIT_LIST_HEAD(&sym->reloc_list); INIT_LIST_HEAD(&sym->pv_target); sym->alias = sym; @@ -570,6 +571,7 @@ int elf_add_reloc(struct elf *elf, struct section *sec, unsigned long offset, reloc->sym = sym; reloc->addend = addend; + list_add_tail(&reloc->sym_reloc_entry, &sym->reloc_list); list_add_tail(&reloc->list, &sec->reloc->reloc_list); elf_hash_add(reloc, &reloc->hash, reloc_hash(reloc)); @@ -586,21 +588,10 @@ int elf_add_reloc(struct elf *elf, struct section *sec, unsigned long offset, */ static void elf_dirty_reloc_sym(struct elf *elf, struct symbol *sym) { - struct section *sec; - - list_for_each_entry(sec, &elf->sections, list) { - struct reloc *reloc; - - if (sec->changed) - continue; + struct reloc *reloc; - list_for_each_entry(reloc, &sec->reloc_list, list) { - if (reloc->sym == sym) { - sec->changed = 
true; - break; - } - } - } + list_for_each_entry(reloc, &sym->reloc_list, sym_reloc_entry) + reloc->sec->changed = true; } /* @@ -850,11 +841,12 @@ static int read_rela_reloc(struct section *sec, int i, struct reloc *reloc, unsi static int read_relocs(struct elf *elf) { + unsigned long nr_reloc, max_reloc = 0, tot_reloc = 0; struct section *sec; struct reloc *reloc; - int i; unsigned int symndx; - unsigned long nr_reloc, max_reloc = 0, tot_reloc = 0; + struct symbol *sym; + int i; if (!elf_alloc_hash(reloc, elf->text_size / 16)) return -1; @@ -895,13 +887,14 @@ static int read_relocs(struct elf *elf) reloc->sec = sec; reloc->idx = i; - reloc->sym = find_symbol_by_index(elf, symndx); + reloc->sym = sym = find_symbol_by_index(elf, symndx); if (!reloc->sym) { WARN("can't find reloc entry symbol %d for %s", symndx, sec->name); return -1; } + list_add_tail(&reloc->sym_reloc_entry, &sym->reloc_list); list_add_tail(&reloc->list, &sec->reloc_list); elf_hash_add(reloc, &reloc->hash, reloc_hash(reloc)); diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index 16f4067b82ae..2b893f8bc1bc 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -61,6 +61,7 @@ struct symbol { u8 fentry : 1; u8 profiling_func : 1; struct list_head pv_target; + struct list_head reloc_list; }; struct reloc { @@ -72,6 +73,7 @@ struct reloc { }; struct section *sec; struct symbol *sym; + struct list_head sym_reloc_entry; unsigned long offset; unsigned int type; s64 addend; -- 2.40.0 From 33b4cea92558c9f43d3b9ebcac92721c79c317b2 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Fri, 16 Dec 2022 14:24:11 +0100 Subject: [PATCH 07/17] fs-patches Signed-off-by: Peter Jung --- Documentation/filesystems/ntfs3.rst | 19 ++ fs/btrfs/block-group.c | 111 +++++++-- fs/btrfs/block-group.h | 6 + fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 14 ++ fs/btrfs/super.c | 2 + fs/btrfs/volumes.c | 3 + fs/btrfs/volumes.h | 2 + fs/ext4/namei.c | 7 + fs/iomap/buffered-io.c | 254 ++++++++++++++++++- fs/iomap/direct-io.c | 3 +- fs/iomap/iter.c | 19 +- fs/ntfs3/attrib.c | 372 +++++++++++++++++----------- fs/ntfs3/bitfunc.c | 4 +- fs/ntfs3/bitmap.c | 164 +++++++++--- fs/ntfs3/dir.c | 4 +- fs/ntfs3/file.c | 203 ++++----------- fs/ntfs3/frecord.c | 26 +- fs/ntfs3/fslog.c | 27 +- fs/ntfs3/fsntfs.c | 180 +++++++++++--- fs/ntfs3/index.c | 121 +++++++-- fs/ntfs3/inode.c | 201 ++++++++++++--- fs/ntfs3/namei.c | 238 ++++++++++++++++++ fs/ntfs3/ntfs.h | 6 +- fs/ntfs3/ntfs_fs.h | 41 ++- fs/ntfs3/record.c | 14 +- fs/ntfs3/run.c | 28 +-- fs/ntfs3/super.c | 132 ++++++---- fs/ntfs3/upcase.c | 12 + fs/ntfs3/xattr.c | 250 ++++++++++++++----- fs/xfs/libxfs/xfs_bmap.c | 8 +- fs/xfs/libxfs/xfs_btree.c | 7 +- fs/xfs/libxfs/xfs_btree.h | 1 - fs/xfs/libxfs/xfs_errortag.h | 18 +- fs/xfs/libxfs/xfs_refcount.c | 146 +++++++++-- fs/xfs/libxfs/xfs_sb.c | 4 +- fs/xfs/scrub/agheader.c | 47 ++-- fs/xfs/scrub/agheader_repair.c | 81 ++++-- fs/xfs/scrub/attr.c | 11 +- fs/xfs/scrub/bitmap.c | 11 +- fs/xfs/scrub/bmap.c | 147 ++++++++--- fs/xfs/scrub/btree.c | 14 +- fs/xfs/scrub/common.c | 48 ++-- fs/xfs/scrub/common.h | 2 +- fs/xfs/scrub/dabtree.c | 4 +- fs/xfs/scrub/dir.c | 10 +- fs/xfs/scrub/fscounters.c | 109 +++++++- fs/xfs/scrub/inode.c | 2 +- fs/xfs/scrub/quota.c | 8 +- fs/xfs/scrub/refcount.c | 12 +- fs/xfs/scrub/repair.c | 51 ++-- fs/xfs/scrub/scrub.c | 6 +- fs/xfs/scrub/scrub.h | 18 +- fs/xfs/scrub/symlink.c | 2 +- fs/xfs/xfs_aops.c | 32 ++- fs/xfs/xfs_bmap_util.c | 10 +- fs/xfs/xfs_bmap_util.h | 2 +- 
fs/xfs/xfs_buf.c | 1 + fs/xfs/xfs_buf_item.c | 2 + fs/xfs/xfs_error.c | 46 +++- fs/xfs/xfs_error.h | 13 + fs/xfs/xfs_extent_busy.c | 1 + fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_fsmap.c | 4 +- fs/xfs/xfs_icache.c | 16 ++ fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_ioctl.c | 4 +- fs/xfs/xfs_iomap.c | 185 ++++++++------ fs/xfs/xfs_iomap.h | 6 +- fs/xfs/xfs_log.c | 46 ++-- fs/xfs/xfs_mount.c | 15 ++ fs/xfs/xfs_pnfs.c | 6 +- fs/xfs/xfs_qm.c | 16 +- fs/xfs/xfs_rtalloc.c | 60 ++++- fs/xfs/xfs_super.c | 2 +- fs/xfs/xfs_trace.c | 2 + fs/xfs/xfs_trace.h | 86 +++++++ fs/xfs/xfs_trans_ail.c | 4 +- fs/xfs/xfs_xattr.c | 2 +- include/linux/iomap.h | 47 +++- 80 files changed, 2899 insertions(+), 944 deletions(-) diff --git a/Documentation/filesystems/ntfs3.rst b/Documentation/filesystems/ntfs3.rst index d67ccd22c63b..5aa102bd72c2 100644 --- a/Documentation/filesystems/ntfs3.rst +++ b/Documentation/filesystems/ntfs3.rst @@ -25,6 +25,11 @@ versions up to 3.1. File system type to use on mount is *ntfs3*. Note: Applied to empty files, this allows to switch type between sparse(0x200), compressed(0x800) and normal. + - *system.ntfs_attrib_be* gets/sets ntfs file/dir attributes. + + Same value as system.ntfs_attrib but always represent as big-endian + (endianness of system.ntfs_attrib is the same as of the CPU). + Mount Options ============= @@ -75,6 +80,20 @@ this table marked with no it means default is without **no**. - Files with the Windows-specific SYSTEM (FILE_ATTRIBUTE_SYSTEM) attribute will be marked as system immutable files. + * - hide_dot_files + - Updates the Windows-specific HIDDEN (FILE_ATTRIBUTE_HIDDEN) attribute + when creating and moving or renaming files. Files whose names start + with a dot will have the HIDDEN attribute set and files whose names + do not start with a dot will have it unset. + + * - windows_names + - Prevents the creation of files and directories with a name not allowed + by Windows, either because it contains some not allowed character (which + are the characters " * / : < > ? \\ | and those whose code is less than + 0x20), because the name (with or without extension) is a reserved file + name (CON, AUX, NUL, PRN, LPT1-9, COM1-9) or because the last character + is a space or a dot. Existing such files can still be read and renamed. + * - discard - Enable support of the TRIM command for improved performance on delete operations, which is recommended for use with the solid-state drives diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 4b69945755e4..e4221323f344 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1527,6 +1527,30 @@ static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) return true; } +static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed) +{ + const struct btrfs_space_info *space_info = bg->space_info; + const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); + const u64 new_val = bg->used; + const u64 old_val = new_val + bytes_freed; + u64 thresh; + + if (reclaim_thresh == 0) + return false; + + thresh = div_factor_fine(bg->length, reclaim_thresh); + + /* + * If we were below the threshold before don't reclaim, we are likely a + * brand new block group and we don't want to relocate new block groups. 
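+	 *
+	 * For illustration (hypothetical numbers): with a 1GiB block group
+	 * and bg_reclaim_threshold = 75, thresh is 768MiB, so reclaim is
+	 * triggered only when usage was at least 768MiB before this free
+	 * and drops below 768MiB because of it.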
+ */ + if (old_val < thresh) + return false; + if (new_val >= thresh) + return false; + return true; +} + void btrfs_reclaim_bgs_work(struct work_struct *work) { struct btrfs_fs_info *fs_info = @@ -1594,6 +1618,40 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) up_write(&space_info->groups_sem); goto next; } + if (bg->used == 0) { + /* + * It is possible that we trigger relocation on a block + * group as its extents are deleted and it first goes + * below the threshold, then shortly after goes empty. + * + * In this case, relocating it does delete it, but has + * some overhead in relocation specific metadata, looking + * for the non-existent extents and running some extra + * transactions, which we can avoid by using one of the + * other mechanisms for dealing with empty block groups. + */ + if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_mark_bg_unused(bg); + spin_unlock(&bg->lock); + up_write(&space_info->groups_sem); + goto next; + + } + /* + * The block group might no longer meet the reclaim condition by + * the time we get around to reclaiming it, so to avoid + * reclaiming overly full block_groups, skip reclaiming them. + * + * Since the decision making process also depends on the amount + * being freed, pass in a fake giant value to skip that extra + * check, which is more meaningful when adding to the list in + * the first place. + */ + if (!should_reclaim_block_group(bg, bg->length)) { + spin_unlock(&bg->lock); + up_write(&space_info->groups_sem); + goto next; + } spin_unlock(&bg->lock); /* Get out fast, in case we're unmounting the filesystem */ @@ -2002,6 +2060,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, cache->length = key->offset; cache->used = btrfs_stack_block_group_used(bgi); + cache->commit_used = cache->used; cache->flags = btrfs_stack_block_group_flags(bgi); cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); @@ -2693,6 +2752,25 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_block_group_item bgi; struct btrfs_key key; + u64 old_commit_used; + u64 used; + + /* + * Block group items update can be triggered out of commit transaction + * critical section, thus we need a consistent view of used bytes. + * We cannot use cache->used directly outside of the spin lock, as it + * may be changed. + */ + spin_lock(&cache->lock); + old_commit_used = cache->commit_used; + used = cache->used; + /* No change in used bytes, can safely skip it. */ + if (cache->commit_used == used) { + spin_unlock(&cache->lock); + return 0; + } + cache->commit_used = used; + spin_unlock(&cache->lock); key.objectid = cache->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; @@ -2707,7 +2785,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; bi = btrfs_item_ptr_offset(leaf, path->slots[0]); - btrfs_set_stack_block_group_used(&bgi, cache->used); + btrfs_set_stack_block_group_used(&bgi, used); btrfs_set_stack_block_group_chunk_objectid(&bgi, cache->global_root_id); btrfs_set_stack_block_group_flags(&bgi, cache->flags); @@ -2715,6 +2793,12 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); fail: btrfs_release_path(path); + /* We didn't update the block group item, need to revert @commit_used. 
*/ + if (ret < 0) { + spin_lock(&cache->lock); + cache->commit_used = old_commit_used; + spin_unlock(&cache->lock); + } return ret; } @@ -3212,31 +3296,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) return ret; } -static inline bool should_reclaim_block_group(struct btrfs_block_group *bg, - u64 bytes_freed) -{ - const struct btrfs_space_info *space_info = bg->space_info; - const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); - const u64 new_val = bg->used; - const u64 old_val = new_val + bytes_freed; - u64 thresh; - - if (reclaim_thresh == 0) - return false; - - thresh = div_factor_fine(bg->length, reclaim_thresh); - - /* - * If we were below the threshold before don't reclaim, we are likely a - * brand new block group and we don't want to relocate new block groups. - */ - if (old_val < thresh) - return false; - if (new_val >= thresh) - return false; - return true; -} - int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc) { diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 8fb14b99a1d1..e83eef195a89 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -99,6 +99,12 @@ struct btrfs_block_group { u64 cache_generation; u64 global_root_id; + /* + * The last committed used bytes of this block group, if the above @used + * is still the same as @commit_used, we don't need to update block + * group item of this block group. + */ + u64 commit_used; /* * If the free space extent count exceeds this number, convert the block * group to bitmaps. diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a3febabacec0..b93088b761fa 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1625,6 +1625,7 @@ enum { BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), + BTRFS_MOUNT_NODISCARD = (1UL << 31), }; #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 883a3671a977..1fb141ddfb0f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3751,6 +3751,20 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations"); } + /* + * For devices supporting discard turn on discard=async automatically, + * unless it's already set or disabled. This could be turned off by + * nodiscard for the same mount. 
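+	 *
+	 * For illustration: mounting a discard-capable device with no
+	 * discard options enables discard=async, while mounting the same
+	 * device with -o nodiscard leaves every discard mode off.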
+ */ + if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_test_opt(fs_info, DISCARD_ASYNC) || + btrfs_test_opt(fs_info, NODISCARD)) && + fs_info->fs_devices->discardable) { + btrfs_set_and_info(fs_info, DISCARD_ASYNC, + "auto enabling async discard"); + btrfs_clear_opt(fs_info->mount_opt, NODISCARD); + } + /* * Mount does not set all options immediately, we can do it now and do * not have to wait for transaction commit diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index abfd7c897075..887f36a48330 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -918,12 +918,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, ret = -EINVAL; goto out; } + btrfs_clear_opt(info->mount_opt, NODISCARD); break; case Opt_nodiscard: btrfs_clear_and_info(info, DISCARD_SYNC, "turning off discard"); btrfs_clear_and_info(info, DISCARD_ASYNC, "turning off async discard"); + btrfs_set_opt(info->mount_opt, NODISCARD); break; case Opt_space_cache: case Opt_space_cache_version: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f02b8cbd6ec4..711ed164130e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -642,6 +642,9 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, if (!bdev_nonrot(bdev)) fs_devices->rotating = true; + if (bdev_max_discard_sectors(bdev)) + fs_devices->discardable = true; + device->bdev = bdev; clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); device->mode = flags; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 099def5613b8..a20ee7d57831 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -354,6 +354,8 @@ struct btrfs_fs_devices { * nonrot flag set */ bool rotating; + /* Devices support TRIM/discard commands */ + bool discardable; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 800d631c920b..34606a08f0d6 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3883,6 +3883,11 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) goto end_rename; } + /* + * We need to protect against old.inode directory getting + * converted from inline directory format into a normal one. + */ + inode_lock_nested(old.inode, I_MUTEX_NONDIR2); retval = ext4_rename_dir_prepare(handle, &old); if (retval) { inode_unlock(old.inode); @@ -4019,6 +4024,8 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, } else { ext4_journal_stop(handle); } + if (old.dir_bh) + inode_unlock(old.inode); release_bh: brelse(old.dir_bh); brelse(old.bh); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 91ee0b308e13..356193e44cf0 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -584,7 +584,7 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter, return iomap_read_inline_data(iter, folio); } -static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, +static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, size_t len, struct folio **foliop) { const struct iomap_page_ops *page_ops = iter->iomap.page_ops; @@ -618,6 +618,27 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM; goto out_no_page; } + + /* + * Now we have a locked folio, before we do anything with it we need to + * check that the iomap we have cached is not stale. The inode extent + * mapping can change due to concurrent IO in flight (e.g. 
+	 * IOMAP_UNWRITTEN state can change and memory reclaim could have
+	 * reclaimed a previously partially written page at this index after IO
+	 * completion before this write reaches this file offset) and hence we
+	 * could do the wrong thing here (zero a page range incorrectly or fail
+	 * to zero) and corrupt data.
+	 */
+	if (page_ops && page_ops->iomap_valid) {
+		bool iomap_valid = page_ops->iomap_valid(iter->inode,
+							&iter->iomap);
+		if (!iomap_valid) {
+			iter->iomap.flags |= IOMAP_F_STALE;
+			status = 0;
+			goto out_unlock;
+		}
+	}
+
 	if (pos + len > folio_pos(folio) + folio_size(folio))
 		len = folio_pos(folio) + folio_size(folio) - pos;
 
@@ -773,6 +794,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 		status = iomap_write_begin(iter, pos, bytes, &folio);
 		if (unlikely(status))
 			break;
+		if (iter->iomap.flags & IOMAP_F_STALE)
+			break;
 
 		page = folio_file_page(folio, pos >> PAGE_SHIFT);
 		if (mapping_writably_mapped(mapping))
@@ -832,6 +855,231 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
 }
 EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
 
+/*
+ * Scan the data range passed to us for dirty page cache folios. If we find a
+ * dirty folio, punch out the preceding range and update the offset from which
+ * the next punch will start from.
+ *
+ * We can punch out storage reservations under clean pages because they either
+ * contain data that has been written back - in which case the delalloc punch
+ * over that range is a no-op - or they have been read faults in which case they
+ * contain zeroes and we can remove the delalloc backing range and any new
+ * writes to those pages will do the normal hole filling operation...
+ *
+ * This makes the logic simple: we only need to keep the delalloc extents over
+ * the dirty ranges of the page cache.
+ *
+ * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
+ * simplify range iterations.
+ */
+static int iomap_write_delalloc_scan(struct inode *inode,
+		loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
+		int (*punch)(struct inode *inode, loff_t offset, loff_t length))
+{
+	while (start_byte < end_byte) {
+		struct folio	*folio;
+
+		/* grab locked page */
+		folio = filemap_lock_folio(inode->i_mapping,
+				start_byte >> PAGE_SHIFT);
+		if (!folio) {
+			start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) +
+					PAGE_SIZE;
+			continue;
+		}
+
+		/* if dirty, punch up to offset */
+		if (folio_test_dirty(folio)) {
+			if (start_byte > *punch_start_byte) {
+				int	error;
+
+				error = punch(inode, *punch_start_byte,
+						start_byte - *punch_start_byte);
+				if (error) {
+					folio_unlock(folio);
+					folio_put(folio);
+					return error;
+				}
+			}
+
+			/*
+			 * Make sure the next punch start is correctly bound to
+			 * the end of this data range, not the end of the folio.
+			 */
+			*punch_start_byte = min_t(loff_t, end_byte,
+					folio_next_index(folio) << PAGE_SHIFT);
+		}
+
+		/* move offset to start of next folio in range */
+		start_byte = folio_next_index(folio) << PAGE_SHIFT;
+		folio_unlock(folio);
+		folio_put(folio);
+	}
+	return 0;
+}
+
+/*
+ * Punch out all the delalloc blocks in the range given except for those that
+ * have dirty data still pending in the page cache - those are going to be
+ * written and so must still retain the delalloc backing for writeback.
+ *
+ * As we are scanning the page cache for data, we don't need to reimplement the
+ * wheel - mapping_seek_hole_data() does exactly what we need to identify the
+ * start and end of data ranges correctly even for sub-folio block sizes. This
+ * byte range based iteration is especially convenient because it means we
+ * don't have to care about variable size folios, nor where the start or end of
+ * the data range lies within a folio, if they lie within the same folio or even
+ * if there are multiple discontiguous data ranges within the folio.
+ *
+ * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so
+ * can return data ranges that exist in the cache beyond EOF. e.g. a page fault
+ * spanning EOF will initialise the post-EOF data to zeroes and mark it up to
+ * date. A write page fault can then mark it dirty. If we then fail a write()
+ * beyond EOF into that up to date cached range, we allocate a delalloc block
+ * beyond EOF and then have to punch it out. Because the range is up to date,
+ * mapping_seek_hole_data() will return it, and we will skip the punch because
+ * the folio is dirty. This is incorrect - we always need to punch out delalloc
+ * beyond EOF in this case as writeback will never write back and convert that
+ * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF,
+ * resulting in always punching out the range from the EOF to the end of the
+ * range the iomap spans.
+ *
+ * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it
+ * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA
+ * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte)
+ * returns the end of the data range (data_end). Using closed intervals would
+ * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
+ * the code to subtle off-by-one bugs....
+ */
+static int iomap_write_delalloc_release(struct inode *inode,
+		loff_t start_byte, loff_t end_byte,
+		int (*punch)(struct inode *inode, loff_t pos, loff_t length))
+{
+	loff_t punch_start_byte = start_byte;
+	loff_t scan_end_byte = min(i_size_read(inode), end_byte);
+	int error = 0;
+
+	/*
+	 * Lock the mapping to avoid races with page faults re-instantiating
+	 * folios and dirtying them via ->page_mkwrite whilst we walk the
+	 * cache and perform delalloc extent removal. Failing to do this can
+	 * leave dirty pages with no space reservation in the cache.
+	 */
+	filemap_invalidate_lock(inode->i_mapping);
+	while (start_byte < scan_end_byte) {
+		loff_t		data_end;
+
+		start_byte = mapping_seek_hole_data(inode->i_mapping,
+				start_byte, scan_end_byte, SEEK_DATA);
+		/*
+		 * If there is no more data to scan, all that is left is to
+		 * punch out the remaining range.
+		 */
+		if (start_byte == -ENXIO || start_byte == scan_end_byte)
+			break;
+		if (start_byte < 0) {
+			error = start_byte;
+			goto out_unlock;
+		}
+		WARN_ON_ONCE(start_byte < punch_start_byte);
+		WARN_ON_ONCE(start_byte > scan_end_byte);
+
+		/*
+		 * We find the end of this contiguous cached data range by
+		 * seeking from start_byte to the beginning of the next hole.
+		 */
+		data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
+				scan_end_byte, SEEK_HOLE);
+		if (data_end < 0) {
+			error = data_end;
+			goto out_unlock;
+		}
+		WARN_ON_ONCE(data_end <= start_byte);
+		WARN_ON_ONCE(data_end > scan_end_byte);
+
+		error = iomap_write_delalloc_scan(inode, &punch_start_byte,
+				start_byte, data_end, punch);
+		if (error)
+			goto out_unlock;
+
+		/* The next data search starts at the end of this one. */
+		start_byte = data_end;
+	}
+
+	if (punch_start_byte < end_byte)
+		error = punch(inode, punch_start_byte,
+				end_byte - punch_start_byte);
+out_unlock:
+	filemap_invalidate_unlock(inode->i_mapping);
+	return error;
+}
+
+/*
+ * When a short write occurs, the filesystem may need to remove reserved space
+ * that was allocated in ->iomap_begin from its ->iomap_end method. For
+ * filesystems that use delayed allocation, we need to punch out delalloc
+ * extents from the range that are not dirty in the page cache. As the write can
+ * race with page faults, there can be dirty pages over the delalloc extent
+ * outside the range of a short write but still within the delalloc extent
+ * allocated for this iomap.
+ *
+ * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
+ * simplify range iterations.
+ *
+ * The punch() callback *must* only punch delalloc extents in the range passed
+ * to it. It must skip over all other types of extents in the range and leave
+ * them completely unchanged. It must do this punch atomically with respect to
+ * other extent modifications.
+ *
+ * The punch() callback may be called with a folio locked to prevent writeback
+ * extent allocation racing at the edge of the range we are currently punching.
+ * The locked folio may or may not cover the range being punched, so it is not
+ * safe for the punch() callback to lock folios itself.
+ *
+ * Lock order is:
+ *
+ * inode->i_rwsem (shared or exclusive)
+ * inode->i_mapping->invalidate_lock (exclusive)
+ * folio_lock()
+ * ->punch
+ *   internal filesystem allocation lock
+ */
+int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
+		struct iomap *iomap, loff_t pos, loff_t length,
+		ssize_t written,
+		int (*punch)(struct inode *inode, loff_t pos, loff_t length))
+{
+	loff_t start_byte;
+	loff_t end_byte;
+	int blocksize = i_blocksize(inode);
+
+	if (iomap->type != IOMAP_DELALLOC)
+		return 0;
+
+	/* If we didn't reserve the blocks, we're not allowed to punch them. */
+	if (!(iomap->flags & IOMAP_F_NEW))
+		return 0;
+
+	/*
+	 * start_byte refers to the first unused block after a short write. If
+	 * nothing was written, round offset down to point at the first block in
+	 * the range.
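+	 *
+	 * For illustration (hypothetical numbers), with 4096-byte blocks:
+	 * pos = 6144 and written = 1000 give
+	 * start_byte = round_up(7144, 4096) = 8192, while written = 0
+	 * gives start_byte = round_down(6144, 4096) = 4096.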
+ */ + if (unlikely(!written)) + start_byte = round_down(pos, blocksize); + else + start_byte = round_up(pos + written, blocksize); + end_byte = round_up(pos + length, blocksize); + + /* Nothing to do if we've written the entire delalloc extent */ + if (start_byte >= end_byte) + return 0; + + return iomap_write_delalloc_release(inode, start_byte, end_byte, + punch); +} +EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); + static loff_t iomap_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; @@ -856,6 +1104,8 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) return status; + if (iter->iomap.flags & IOMAP_F_STALE) + break; status = iomap_write_end(iter, pos, bytes, bytes, folio); if (WARN_ON_ONCE(status == 0)) @@ -911,6 +1161,8 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) status = iomap_write_begin(iter, pos, bytes, &folio); if (status) return status; + if (iter->iomap.flags & IOMAP_F_STALE) + break; offset = offset_in_folio(folio, pos); if (bytes > folio_size(folio) - offset) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 4eb559a16c9e..9804714b1751 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -240,7 +240,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, { const struct iomap *iomap = &iter->iomap; struct inode *inode = iter->inode; - unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); unsigned int fs_block_size = i_blocksize(inode), pad; loff_t length = iomap_length(iter); loff_t pos = iter->pos; @@ -252,7 +251,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, size_t copied = 0; size_t orig_count; - if ((pos | length) & ((1 << blkbits) - 1) || + if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) return -EINVAL; diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index a1c7592d2ade..79a0614eaab7 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -7,12 +7,28 @@ #include #include "trace.h" +/* + * Advance to the next range we need to map. + * + * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully + * processed - it was aborted because the extent the iomap spanned may have been + * changed during the operation. In this case, the iteration behaviour is to + * remap the unprocessed range of the iter, and that means we may need to remap + * even when we've made no progress (i.e. iter->processed = 0). Hence the + * "finished iterating" case needs to distinguish between + * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we + * need to remap the entire remaining range. 
+ */ static inline int iomap_iter_advance(struct iomap_iter *iter) { + bool stale = iter->iomap.flags & IOMAP_F_STALE; + /* handle the previous iteration (if any) */ if (iter->iomap.length) { - if (iter->processed <= 0) + if (iter->processed < 0) return iter->processed; + if (!iter->processed && !stale) + return 0; if (WARN_ON_ONCE(iter->processed > iomap_length(iter))) return -EIO; iter->pos += iter->processed; @@ -33,6 +49,7 @@ static inline void iomap_iter_done(struct iomap_iter *iter) WARN_ON_ONCE(iter->iomap.offset > iter->pos); WARN_ON_ONCE(iter->iomap.length == 0); WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); + WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE); trace_iomap_iter_dstmap(iter->inode, &iter->iomap); if (iter->srcmap.type != IOMAP_HOLE) diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index 63169529b52c..5e6bafb10f42 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -54,33 +54,6 @@ static inline u64 get_pre_allocated(u64 size) return ret; } -/* - * attr_must_be_resident - * - * Return: True if attribute must be resident. - */ -static inline bool attr_must_be_resident(struct ntfs_sb_info *sbi, - enum ATTR_TYPE type) -{ - const struct ATTR_DEF_ENTRY *de; - - switch (type) { - case ATTR_STD: - case ATTR_NAME: - case ATTR_ID: - case ATTR_LABEL: - case ATTR_VOL_INFO: - case ATTR_ROOT: - case ATTR_EA_INFO: - return true; - default: - de = ntfs_query_def(sbi, type); - if (de && (de->flags & NTFS_ATTR_MUST_BE_RESIDENT)) - return true; - return false; - } -} - /* * attr_load_runs - Load all runs stored in @attr. */ @@ -176,7 +149,7 @@ static int run_deallocate_ex(struct ntfs_sb_info *sbi, struct runs_tree *run, int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run, CLST vcn, CLST lcn, CLST len, CLST *pre_alloc, enum ALLOCATE_OPT opt, CLST *alen, const size_t fr, - CLST *new_lcn) + CLST *new_lcn, CLST *new_len) { int err; CLST flen, vcn0 = vcn, pre = pre_alloc ? *pre_alloc : 0; @@ -196,20 +169,36 @@ int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run, if (err) goto out; - if (new_lcn && vcn == vcn0) - *new_lcn = lcn; + if (vcn == vcn0) { + /* Return the first fragment. */ + if (new_lcn) + *new_lcn = lcn; + if (new_len) + *new_len = flen; + } /* Add new fragment into run storage. 
*/ - if (!run_add_entry(run, vcn, lcn, flen, opt == ALLOCATE_MFT)) { + if (!run_add_entry(run, vcn, lcn, flen, opt & ALLOCATE_MFT)) { /* Undo last 'ntfs_look_for_free_space' */ mark_as_free_ex(sbi, lcn, len, false); err = -ENOMEM; goto out; } + if (opt & ALLOCATE_ZERO) { + u8 shift = sbi->cluster_bits - SECTOR_SHIFT; + + err = blkdev_issue_zeroout(sbi->sb->s_bdev, + (sector_t)lcn << shift, + (sector_t)flen << shift, + GFP_NOFS, 0); + if (err) + goto out; + } + vcn += flen; - if (flen >= len || opt == ALLOCATE_MFT || + if (flen >= len || (opt & ALLOCATE_MFT) || (fr && run->count - cnt >= fr)) { *alen = vcn - vcn0; return 0; @@ -284,7 +273,8 @@ int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr, const char *data = resident_data(attr); err = attr_allocate_clusters(sbi, run, 0, 0, len, NULL, - ALLOCATE_DEF, &alen, 0, NULL); + ALLOCATE_DEF, &alen, 0, NULL, + NULL); if (err) goto out1; @@ -424,6 +414,7 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST alen, vcn, lcn, new_alen, old_alen, svcn, evcn; CLST next_svcn, pre_alloc = -1, done = 0; bool is_ext, is_bad = false; + bool dirty = false; u32 align; struct MFT_REC *rec; @@ -444,8 +435,10 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, return err; /* Return if file is still resident. */ - if (!attr_b->non_res) + if (!attr_b->non_res) { + dirty = true; goto ok1; + } /* Layout of records may be changed, so do a full search. */ goto again; @@ -468,7 +461,7 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, if (keep_prealloc && new_size < old_size) { attr_b->nres.data_size = cpu_to_le64(new_size); - mi_b->dirty = true; + mi_b->dirty = dirty = true; goto ok; } @@ -514,7 +507,7 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, if (new_alloc <= old_alloc) { attr_b->nres.data_size = cpu_to_le64(new_size); - mi_b->dirty = true; + mi_b->dirty = dirty = true; goto ok; } @@ -579,13 +572,13 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, /* ~3 bytes per fragment. */ err = attr_allocate_clusters( sbi, run, vcn, lcn, to_allocate, &pre_alloc, - is_mft ? ALLOCATE_MFT : 0, &alen, + is_mft ? ALLOCATE_MFT : ALLOCATE_DEF, &alen, is_mft ? 0 : (sbi->record_size - le32_to_cpu(rec->used) + 8) / 3 + 1, - NULL); + NULL, NULL); if (err) goto out; } @@ -605,7 +598,7 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, next_svcn = le64_to_cpu(attr->nres.evcn) + 1; new_alloc_tmp = (u64)next_svcn << cluster_bits; attr_b->nres.alloc_size = cpu_to_le64(new_alloc_tmp); - mi_b->dirty = true; + mi_b->dirty = dirty = true; if (next_svcn >= vcn && !to_allocate) { /* Normal way. Update attribute and exit. */ @@ -691,7 +684,7 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, old_valid = old_size = old_alloc = (u64)vcn << cluster_bits; attr_b->nres.valid_size = attr_b->nres.data_size = attr_b->nres.alloc_size = cpu_to_le64(old_size); - mi_b->dirty = true; + mi_b->dirty = dirty = true; goto again_1; } @@ -753,7 +746,7 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, attr_b->nres.valid_size = attr_b->nres.alloc_size; } - mi_b->dirty = true; + mi_b->dirty = dirty = true; err = run_deallocate_ex(sbi, run, vcn, evcn - vcn + 1, &dlen, true); @@ -814,16 +807,9 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, if (ret) *ret = attr_b; - /* Update inode_set_bytes. 
*/ if (((type == ATTR_DATA && !name_len) || (type == ATTR_ALLOC && name == I30_NAME))) { - bool dirty = false; - - if (ni->vfs_inode.i_size != new_size) { - ni->vfs_inode.i_size = new_size; - dirty = true; - } - + /* Update inode_set_bytes. */ if (attr_b->non_res) { new_alloc = le64_to_cpu(attr_b->nres.alloc_size); if (inode_get_bytes(&ni->vfs_inode) != new_alloc) { @@ -832,6 +818,7 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, } } + /* Don't forget to update duplicate information in parent. */ if (dirty) { ni->ni_flags |= NI_FLAG_UPDATE_PARENT; mark_inode_dirty(&ni->vfs_inode); @@ -882,8 +869,19 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, return err; } +/* + * attr_data_get_block - Returns 'lcn' and 'len' for given 'vcn'. + * + * @new == NULL means just to get current mapping for 'vcn' + * @new != NULL means allocate real cluster if 'vcn' maps to hole + * @zero - zeroout new allocated clusters + * + * NOTE: + * - @new != NULL is called only for sparsed or compressed attributes. + * - new allocated clusters are zeroed via blkdev_issue_zeroout. + */ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, - CLST *len, bool *new) + CLST *len, bool *new, bool zero) { int err = 0; struct runs_tree *run = &ni->file.run; @@ -892,29 +890,29 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, struct ATTRIB *attr = NULL, *attr_b; struct ATTR_LIST_ENTRY *le, *le_b; struct mft_inode *mi, *mi_b; - CLST hint, svcn, to_alloc, evcn1, next_svcn, asize, end; - u64 total_size; - u32 clst_per_frame; - bool ok; + CLST hint, svcn, to_alloc, evcn1, next_svcn, asize, end, vcn0, alen; + CLST alloc, evcn; + unsigned fr; + u64 total_size, total_size0; + int step = 0; if (new) *new = false; + /* Try to find in cache. */ down_read(&ni->file.run_lock); - ok = run_lookup_entry(run, vcn, lcn, len, NULL); + if (!run_lookup_entry(run, vcn, lcn, len, NULL)) + *len = 0; up_read(&ni->file.run_lock); - if (ok && (*lcn != SPARSE_LCN || !new)) { - /* Normal way. */ - return 0; + if (*len) { + if (*lcn != SPARSE_LCN || !new) + return 0; /* Fast normal way without allocation. */ + else if (clen > *len) + clen = *len; } - if (!clen) - clen = 1; - - if (ok && clen > *len) - clen = *len; - + /* No cluster in cache or we need to allocate cluster in hole. */ sbi = ni->mi.sbi; cluster_bits = sbi->cluster_bits; @@ -936,16 +934,15 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, asize = le64_to_cpu(attr_b->nres.alloc_size) >> cluster_bits; if (vcn >= asize) { - err = -EINVAL; + if (new) { + err = -EINVAL; + } else { + *len = 1; + *lcn = SPARSE_LCN; + } goto out; } - clst_per_frame = 1u << attr_b->nres.c_unit; - to_alloc = (clen + clst_per_frame - 1) & ~(clst_per_frame - 1); - - if (vcn + to_alloc > asize) - to_alloc = asize - vcn; - svcn = le64_to_cpu(attr_b->nres.svcn); evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; @@ -964,36 +961,68 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, evcn1 = le64_to_cpu(attr->nres.evcn) + 1; } + /* Load in cache actual information. */ err = attr_load_runs(attr, ni, run, NULL); if (err) goto out; - if (!ok) { - ok = run_lookup_entry(run, vcn, lcn, len, NULL); - if (ok && (*lcn != SPARSE_LCN || !new)) { - /* Normal way. */ - err = 0; - goto ok; - } + if (!*len) { + if (run_lookup_entry(run, vcn, lcn, len, NULL)) { + if (*lcn != SPARSE_LCN || !new) + goto ok; /* Slow normal way without allocation. 
*/
-		if (!ok && !new) {
-			*len = 0;
-			err = 0;
+			if (clen > *len)
+				clen = *len;
+		} else if (!new) {
+			/* Here we may return -ENOENT.
+			 * In any case caller gets zero length. */
 			goto ok;
 		}
-
-		if (ok && clen > *len) {
-			clen = *len;
-			to_alloc = (clen + clst_per_frame - 1) &
-				   ~(clst_per_frame - 1);
-		}
 	}
 
 	if (!is_attr_ext(attr_b)) {
+		/* The code below is only for sparsed or compressed attributes. */
 		err = -EINVAL;
 		goto out;
 	}
 
+	vcn0 = vcn;
+	to_alloc = clen;
+	fr = (sbi->record_size - le32_to_cpu(mi->mrec->used) + 8) / 3 + 1;
+	/* Allocate frame aligned clusters.
+	 * ntfs.sys usually uses 16 clusters per frame for sparsed or compressed.
+	 * ntfs3 uses 1 cluster per frame for newly created sparsed files. */
+	if (attr_b->nres.c_unit) {
+		CLST clst_per_frame = 1u << attr_b->nres.c_unit;
+		CLST cmask = ~(clst_per_frame - 1);
+
+		/* Get frame aligned vcn and to_alloc. */
+		vcn = vcn0 & cmask;
+		to_alloc = ((vcn0 + clen + clst_per_frame - 1) & cmask) - vcn;
+		if (fr < clst_per_frame)
+			fr = clst_per_frame;
+		zero = true;
+
+		/* Check if 'vcn' and 'vcn0' in different attribute segments. */
+		if (vcn < svcn || evcn1 <= vcn) {
+			/* Load attribute for truncated vcn. */
+			attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0,
+					    &vcn, &mi);
+			if (!attr) {
+				err = -EINVAL;
+				goto out;
+			}
+			svcn = le64_to_cpu(attr->nres.svcn);
+			evcn1 = le64_to_cpu(attr->nres.evcn) + 1;
+			err = attr_load_runs(attr, ni, run, NULL);
+			if (err)
+				goto out;
+		}
+	}
+
+	if (vcn + to_alloc > asize)
+		to_alloc = asize - vcn;
+
 	/* Get the last LCN to allocate from. */
 	hint = 0;
 
@@ -1007,18 +1036,35 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
 			hint = -1;
 	}
 
-	err = attr_allocate_clusters(
-		sbi, run, vcn, hint + 1, to_alloc, NULL, 0, len,
-		(sbi->record_size - le32_to_cpu(mi->mrec->used) + 8) / 3 + 1,
-		lcn);
+	/* Allocate and zeroout new clusters. */
+	err = attr_allocate_clusters(sbi, run, vcn, hint + 1, to_alloc, NULL,
+				     zero ? ALLOCATE_ZERO : ALLOCATE_DEF, &alen,
+				     fr, lcn, len);
 	if (err)
 		goto out;
 	*new = true;
+	step = 1;
 
-	end = vcn + *len;
+	end = vcn + alen;
+	/* Save 'total_size0' to restore if error. */
+	total_size0 = le64_to_cpu(attr_b->nres.total_size);
+	total_size = total_size0 + ((u64)alen << cluster_bits);
 
-	total_size = le64_to_cpu(attr_b->nres.total_size) +
-		     ((u64)*len << cluster_bits);
+	if (vcn != vcn0) {
+		if (!run_lookup_entry(run, vcn0, lcn, len, NULL)) {
+			err = -EINVAL;
+			goto out;
+		}
+		if (*lcn == SPARSE_LCN) {
+			/* Internal error. Should not happen. */
+			WARN_ON(1);
+			err = -EINVAL;
+			goto out;
+		}
+		/* Check case when vcn0 + len overlaps new allocated clusters. */
+		if (vcn0 + *len > end)
+			*len = end - vcn0;
+	}
 
 repack:
 	err = mi_pack_runs(mi, attr, run, max(end, evcn1) - svcn);
@@ -1044,7 +1090,7 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
 		if (!ni->attr_list.size) {
 			err = ni_create_attr_list(ni);
 			if (err)
-				goto out;
+				goto undo1;
 
 			/* Layout of records is changed. */
 			le_b = NULL;
 			attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL,
@@ -1061,67 +1107,83 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
 		}
 	}
 
+	/*
+	 * The code below may require an additional cluster (to extend the
+	 * attribute list) and/or one MFT record. It is too complex to undo
+	 * operations if -ENOSPC occurs deep inside 'ni_insert_nonresident'.
+	 * Return -ENOSPC in advance here if there is no free cluster and no
+	 * free MFT record.
+	 */
+	if (!ntfs_check_for_free_space(sbi, 1, 1)) {
+		/* Undo step 1.
*/ + err = -ENOSPC; + goto undo1; + } + + step = 2; svcn = evcn1; /* Estimate next attribute. */ attr = ni_find_attr(ni, attr, &le, ATTR_DATA, NULL, 0, &svcn, &mi); - if (attr) { - CLST alloc = bytes_to_cluster( - sbi, le64_to_cpu(attr_b->nres.alloc_size)); - CLST evcn = le64_to_cpu(attr->nres.evcn); - - if (end < next_svcn) - end = next_svcn; - while (end > evcn) { - /* Remove segment [svcn : evcn). */ - mi_remove_attr(NULL, mi, attr); - - if (!al_remove_le(ni, le)) { - err = -EINVAL; - goto out; - } + if (!attr) { + /* Insert new attribute segment. */ + goto ins_ext; + } - if (evcn + 1 >= alloc) { - /* Last attribute segment. */ - evcn1 = evcn + 1; - goto ins_ext; - } + /* Try to update existed attribute segment. */ + alloc = bytes_to_cluster(sbi, le64_to_cpu(attr_b->nres.alloc_size)); + evcn = le64_to_cpu(attr->nres.evcn); - if (ni_load_mi(ni, le, &mi)) { - attr = NULL; - goto out; - } + if (end < next_svcn) + end = next_svcn; + while (end > evcn) { + /* Remove segment [svcn : evcn). */ + mi_remove_attr(NULL, mi, attr); - attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, - &le->id); - if (!attr) { - err = -EINVAL; - goto out; - } - svcn = le64_to_cpu(attr->nres.svcn); - evcn = le64_to_cpu(attr->nres.evcn); + if (!al_remove_le(ni, le)) { + err = -EINVAL; + goto out; } - if (end < svcn) - end = svcn; + if (evcn + 1 >= alloc) { + /* Last attribute segment. */ + evcn1 = evcn + 1; + goto ins_ext; + } - err = attr_load_runs(attr, ni, run, &end); - if (err) + if (ni_load_mi(ni, le, &mi)) { + attr = NULL; goto out; + } - evcn1 = evcn + 1; - attr->nres.svcn = cpu_to_le64(next_svcn); - err = mi_pack_runs(mi, attr, run, evcn1 - next_svcn); - if (err) + attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, &le->id); + if (!attr) { + err = -EINVAL; goto out; + } + svcn = le64_to_cpu(attr->nres.svcn); + evcn = le64_to_cpu(attr->nres.evcn); + } - le->vcn = cpu_to_le64(next_svcn); - ni->attr_list.dirty = true; - mi->dirty = true; + if (end < svcn) + end = svcn; + + err = attr_load_runs(attr, ni, run, &end); + if (err) + goto out; + + evcn1 = evcn + 1; + attr->nres.svcn = cpu_to_le64(next_svcn); + err = mi_pack_runs(mi, attr, run, evcn1 - next_svcn); + if (err) + goto out; + + le->vcn = cpu_to_le64(next_svcn); + ni->attr_list.dirty = true; + mi->dirty = true; + next_svcn = le64_to_cpu(attr->nres.evcn) + 1; - next_svcn = le64_to_cpu(attr->nres.evcn) + 1; - } ins_ext: if (evcn1 > next_svcn) { err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run, @@ -1133,10 +1195,26 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, ok: run_truncate_around(run, vcn); out: + if (err && step > 1) { + /* Too complex to restore. */ + _ntfs_bad_inode(&ni->vfs_inode); + } up_write(&ni->file.run_lock); ni_unlock(ni); return err; + +undo1: + /* Undo step1. 
*/ + attr_b->nres.total_size = cpu_to_le64(total_size0); + inode_set_bytes(&ni->vfs_inode, total_size0); + + if (run_deallocate_ex(sbi, run, vcn, alen, NULL, false) || + !run_add_entry(run, vcn, SPARSE_LCN, alen, false) || + mi_pack_runs(mi, attr, run, max(end, evcn1) - svcn)) { + _ntfs_bad_inode(&ni->vfs_inode); + } + goto out; } int attr_data_read_resident(struct ntfs_inode *ni, struct page *page) @@ -1543,7 +1621,7 @@ int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size, struct ATTRIB *attr = NULL, *attr_b; struct ATTR_LIST_ENTRY *le, *le_b; struct mft_inode *mi, *mi_b; - CLST svcn, evcn1, next_svcn, lcn, len; + CLST svcn, evcn1, next_svcn, len; CLST vcn, end, clst_data; u64 total_size, valid_size, data_size; @@ -1619,8 +1697,9 @@ int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size, } err = attr_allocate_clusters(sbi, run, vcn + clst_data, - hint + 1, len - clst_data, NULL, 0, - &alen, 0, &lcn); + hint + 1, len - clst_data, NULL, + ALLOCATE_DEF, &alen, 0, NULL, + NULL); if (err) goto out; @@ -2308,7 +2387,8 @@ int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes) if (!attr_b->non_res) { /* Still resident. */ - char *data = Add2Ptr(attr_b, attr_b->res.data_off); + char *data = Add2Ptr(attr_b, + le16_to_cpu(attr_b->res.data_off)); memmove(data + bytes, data, bytes); memset(data, 0, bytes); @@ -2400,8 +2480,8 @@ int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes) if (vbo <= ni->i_valid) ni->i_valid += bytes; - attr_b->nres.data_size = le64_to_cpu(data_size + bytes); - attr_b->nres.alloc_size = le64_to_cpu(alloc_size + bytes); + attr_b->nres.data_size = cpu_to_le64(data_size + bytes); + attr_b->nres.alloc_size = cpu_to_le64(alloc_size + bytes); /* ni->valid may be not equal valid_size (temporary). */ if (ni->i_valid > data_size + bytes) diff --git a/fs/ntfs3/bitfunc.c b/fs/ntfs3/bitfunc.c index 50d838093790..25a4d4896aa9 100644 --- a/fs/ntfs3/bitfunc.c +++ b/fs/ntfs3/bitfunc.c @@ -30,7 +30,7 @@ static const u8 zero_mask[] = { 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, * * Return: True if all bits [bit, bit+nbits) are zeros "0". */ -bool are_bits_clear(const ulong *lmap, size_t bit, size_t nbits) +bool are_bits_clear(const void *lmap, size_t bit, size_t nbits) { size_t pos = bit & 7; const u8 *map = (u8 *)lmap + (bit >> 3); @@ -78,7 +78,7 @@ bool are_bits_clear(const ulong *lmap, size_t bit, size_t nbits) * * Return: True if all bits [bit, bit+nbits) are ones "1". */ -bool are_bits_set(const ulong *lmap, size_t bit, size_t nbits) +bool are_bits_set(const void *lmap, size_t bit, size_t nbits) { u8 mask; size_t pos = bit & 7; diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index 45f95c1cb258..723fb64e6531 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -59,14 +59,14 @@ void ntfs3_exit_bitmap(void) * * Return: -1 if not found. */ -static size_t wnd_scan(const ulong *buf, size_t wbit, u32 wpos, u32 wend, +static size_t wnd_scan(const void *buf, size_t wbit, u32 wpos, u32 wend, size_t to_alloc, size_t *prev_tail, size_t *b_pos, size_t *b_len) { while (wpos < wend) { size_t free_len; u32 free_bits, end; - u32 used = find_next_zero_bit(buf, wend, wpos); + u32 used = find_next_zero_bit_le(buf, wend, wpos); if (used >= wend) { if (*b_len < *prev_tail) { @@ -92,7 +92,7 @@ static size_t wnd_scan(const ulong *buf, size_t wbit, u32 wpos, u32 wend, * Now we have a fragment [wpos, wend) staring with 0. 
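The scan above now has to use the _le bit helpers because on-disk NTFS bitmaps are little-endian regardless of host byte order. Condensed to its core idea (find_clear_run() is a hypothetical simplification that drops wnd_scan's best-fragment bookkeeping):

static size_t find_clear_run(const void *buf, u32 nbits, u32 want)
{
	u32 pos = 0;

	while (pos < nbits) {
		u32 zero = find_next_zero_bit_le(buf, nbits, pos);
		u32 end;

		if (zero >= nbits)
			break;
		end = find_next_bit_le(buf, min(nbits, zero + want), zero);
		if (end - zero >= want)
			return zero;		// long enough run found
		pos = end + 1;
	}
	return MINUS_ONE_T;			// ntfs3's "not found" value
}
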
*/ end = wpos + to_alloc - *prev_tail; - free_bits = find_next_bit(buf, min(end, wend), wpos); + free_bits = find_next_bit_le(buf, min(end, wend), wpos); free_len = *prev_tail + free_bits - wpos; @@ -504,7 +504,6 @@ static int wnd_rescan(struct wnd_bitmap *wnd) u8 cluster_bits = sbi->cluster_bits; u32 wbits = 8 * sb->s_blocksize; u32 used, frb; - const ulong *buf; size_t wpos, wbit, iw, vbo; struct buffer_head *bh = NULL; CLST lcn, clen; @@ -558,9 +557,7 @@ static int wnd_rescan(struct wnd_bitmap *wnd) goto out; } - buf = (ulong *)bh->b_data; - - used = bitmap_weight(buf, wbits); + used = ntfs_bitmap_weight_le(bh->b_data, wbits); if (used < wbits) { frb = wbits - used; wnd->free_bits[iw] = frb; @@ -574,7 +571,7 @@ static int wnd_rescan(struct wnd_bitmap *wnd) wbits = wnd->nbits - wbit; do { - used = find_next_zero_bit(buf, wbits, wpos); + used = find_next_zero_bit_le(bh->b_data, wbits, wpos); if (used > wpos && prev_tail) { wnd_add_free_ext(wnd, wbit + wpos - prev_tail, @@ -590,7 +587,7 @@ static int wnd_rescan(struct wnd_bitmap *wnd) break; } - frb = find_next_bit(buf, wbits, wpos); + frb = find_next_bit_le(bh->b_data, wbits, wpos); if (frb >= wbits) { /* Keep last free block. */ prev_tail += frb - wpos; @@ -718,7 +715,6 @@ int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits) while (iw < wnd->nwnd && bits) { u32 tail, op; - ulong *buf; if (iw + 1 == wnd->nwnd) wbits = wnd->bits_last; @@ -732,11 +728,9 @@ int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits) break; } - buf = (ulong *)bh->b_data; - lock_buffer(bh); - __bitmap_clear(buf, wbit, op); + ntfs_bitmap_clear_le(bh->b_data, wbit, op); wnd->free_bits[iw] += op; @@ -771,7 +765,6 @@ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) while (iw < wnd->nwnd && bits) { u32 tail, op; - ulong *buf; if (unlikely(iw + 1 == wnd->nwnd)) wbits = wnd->bits_last; @@ -784,11 +777,10 @@ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) err = PTR_ERR(bh); break; } - buf = (ulong *)bh->b_data; lock_buffer(bh); - __bitmap_set(buf, wbit, op); + ntfs_bitmap_set_le(bh->b_data, wbit, op); wnd->free_bits[iw] -= op; set_buffer_uptodate(bh); @@ -808,6 +800,44 @@ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) return err; } +/* + * wnd_set_used_safe - Mark the bits range from bit to bit + bits as used. + * + * Unlikely wnd_set_used/wnd_set_free this function is not full trusted. + * It scans every bit in bitmap and marks free bit as used. + * @done - how many bits were marked as used. + * + * NOTE: normally *done should be 0. + */ +int wnd_set_used_safe(struct wnd_bitmap *wnd, size_t bit, size_t bits, + size_t *done) +{ + size_t i, from = 0, len = 0; + int err = 0; + + *done = 0; + for (i = 0; i < bits; i++) { + if (wnd_is_free(wnd, bit + i, 1)) { + if (!len) + from = bit + i; + len += 1; + } else if (len) { + err = wnd_set_used(wnd, from, len); + *done += len; + len = 0; + if (err) + break; + } + } + + if (len) { + /* last fragment. 
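Unlike wnd_set_used(), the helper above does not trust the caller's range: every bit is re-checked, only the bits still free are flipped, and *done counts how many (per the comment above, normally none should be). A userspace model of the run-coalescing loop, with a plain bool array standing in for the window bitmap and error handling omitted:

#include <stdbool.h>
#include <stddef.h>

// map[i] == true means "free", mirroring wnd_is_free()
static void set_used_safe(bool *map, size_t bit, size_t bits, size_t *done)
{
	size_t i, j, from = 0, len = 0;

	*done = 0;
	for (i = 0; i < bits; i++) {
		if (map[bit + i]) {		// still free: extend pending run
			if (!len)
				from = bit + i;
			len += 1;
		} else if (len) {		// run ended: commit it in one go
			for (j = 0; j < len; j++)
				map[from + j] = false;
			*done += len;
			len = 0;
		}
	}
	for (j = 0; j < len; j++)		// trailing run, if any
		map[from + j] = false;
	*done += len;
}
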
*/ + err = wnd_set_used(wnd, from, len); + *done += len; + } + return err; +} + /* * wnd_is_free_hlp * @@ -836,7 +866,7 @@ static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits) if (IS_ERR(bh)) return false; - ret = are_bits_clear((ulong *)bh->b_data, wbit, op); + ret = are_bits_clear(bh->b_data, wbit, op); put_bh(bh); if (!ret) @@ -928,7 +958,7 @@ bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) if (IS_ERR(bh)) goto out; - ret = are_bits_set((ulong *)bh->b_data, wbit, op); + ret = are_bits_set(bh->b_data, wbit, op); put_bh(bh); if (!ret) goto out; @@ -959,7 +989,6 @@ size_t wnd_find(struct wnd_bitmap *wnd, size_t to_alloc, size_t hint, size_t fnd, max_alloc, b_len, b_pos; size_t iw, prev_tail, nwnd, wbit, ebit, zbit, zend; size_t to_alloc0 = to_alloc; - const ulong *buf; const struct e_node *e; const struct rb_node *pr, *cr; u8 log2_bits; @@ -1185,14 +1214,13 @@ size_t wnd_find(struct wnd_bitmap *wnd, size_t to_alloc, size_t hint, continue; } - buf = (ulong *)bh->b_data; - /* Scan range [wbit, zbit). */ if (wpos < wzbit) { /* Scan range [wpos, zbit). */ - fnd = wnd_scan(buf, wbit, wpos, wzbit, - to_alloc, &prev_tail, - &b_pos, &b_len); + fnd = wnd_scan(bh->b_data, wbit, wpos, + wzbit, to_alloc, + &prev_tail, &b_pos, + &b_len); if (fnd != MINUS_ONE_T) { put_bh(bh); goto found; @@ -1203,7 +1231,7 @@ size_t wnd_find(struct wnd_bitmap *wnd, size_t to_alloc, size_t hint, /* Scan range [zend, ebit). */ if (wzend < wbits) { - fnd = wnd_scan(buf, wbit, + fnd = wnd_scan(bh->b_data, wbit, max(wzend, wpos), wbits, to_alloc, &prev_tail, &b_pos, &b_len); @@ -1242,11 +1270,9 @@ size_t wnd_find(struct wnd_bitmap *wnd, size_t to_alloc, size_t hint, continue; } - buf = (ulong *)bh->b_data; - /* Scan range [wpos, eBits). */ - fnd = wnd_scan(buf, wbit, wpos, wbits, to_alloc, &prev_tail, - &b_pos, &b_len); + fnd = wnd_scan(bh->b_data, wbit, wpos, wbits, to_alloc, + &prev_tail, &b_pos, &b_len); put_bh(bh); if (fnd != MINUS_ONE_T) goto found; @@ -1324,7 +1350,7 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits) new_last = wbits; if (new_wnd != wnd->nwnd) { - new_free = kmalloc(new_wnd * sizeof(u16), GFP_NOFS); + new_free = kmalloc_array(new_wnd, sizeof(u16), GFP_NOFS); if (!new_free) return -ENOMEM; @@ -1344,7 +1370,6 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits) size_t frb; u64 vbo, lbo, bytes; struct buffer_head *bh; - ulong *buf; if (iw + 1 == new_wnd) wbits = new_last; @@ -1361,10 +1386,9 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits) return -EIO; lock_buffer(bh); - buf = (ulong *)bh->b_data; - __bitmap_clear(buf, b0, blocksize * 8 - b0); - frb = wbits - bitmap_weight(buf, wbits); + ntfs_bitmap_clear_le(bh->b_data, b0, blocksize * 8 - b0); + frb = wbits - ntfs_bitmap_weight_le(bh->b_data, wbits); wnd->total_zeroes += frb - wnd->free_bits[iw]; wnd->free_bits[iw] = frb; @@ -1411,7 +1435,6 @@ int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range) CLST lcn_from = bytes_to_cluster(sbi, range->start); size_t iw = lcn_from >> (sb->s_blocksize_bits + 3); u32 wbit = lcn_from & (wbits - 1); - const ulong *buf; CLST lcn_to; if (!minlen) @@ -1446,10 +1469,8 @@ int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range) break; } - buf = (ulong *)bh->b_data; - for (; wbit < wbits; wbit++) { - if (!test_bit(wbit, buf)) { + if (!test_bit_le(wbit, bh->b_data)) { if (!len) lcn = lcn_wnd + wbit; len += 1; @@ -1481,3 +1502,70 @@ int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range) return err; } + +#if 
BITS_PER_LONG == 64 +typedef __le64 bitmap_ulong; +#define cpu_to_ul(x) cpu_to_le64(x) +#define ul_to_cpu(x) le64_to_cpu(x) +#else +typedef __le32 bitmap_ulong; +#define cpu_to_ul(x) cpu_to_le32(x) +#define ul_to_cpu(x) le32_to_cpu(x) +#endif + +void ntfs_bitmap_set_le(void *map, unsigned int start, int len) +{ + bitmap_ulong *p = (bitmap_ulong *)map + BIT_WORD(start); + const unsigned int size = start + len; + int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); + bitmap_ulong mask_to_set = cpu_to_ul(BITMAP_FIRST_WORD_MASK(start)); + + while (len - bits_to_set >= 0) { + *p |= mask_to_set; + len -= bits_to_set; + bits_to_set = BITS_PER_LONG; + mask_to_set = cpu_to_ul(~0UL); + p++; + } + if (len) { + mask_to_set &= cpu_to_ul(BITMAP_LAST_WORD_MASK(size)); + *p |= mask_to_set; + } +} + +void ntfs_bitmap_clear_le(void *map, unsigned int start, int len) +{ + bitmap_ulong *p = (bitmap_ulong *)map + BIT_WORD(start); + const unsigned int size = start + len; + int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); + bitmap_ulong mask_to_clear = cpu_to_ul(BITMAP_FIRST_WORD_MASK(start)); + + while (len - bits_to_clear >= 0) { + *p &= ~mask_to_clear; + len -= bits_to_clear; + bits_to_clear = BITS_PER_LONG; + mask_to_clear = cpu_to_ul(~0UL); + p++; + } + if (len) { + mask_to_clear &= cpu_to_ul(BITMAP_LAST_WORD_MASK(size)); + *p &= ~mask_to_clear; + } +} + +unsigned int ntfs_bitmap_weight_le(const void *bitmap, int bits) +{ + const ulong *bmp = bitmap; + unsigned int k, lim = bits / BITS_PER_LONG; + unsigned int w = 0; + + for (k = 0; k < lim; k++) + w += hweight_long(bmp[k]); + + if (bits % BITS_PER_LONG) { + w += hweight_long(ul_to_cpu(((bitmap_ulong *)bitmap)[k]) & + BITMAP_LAST_WORD_MASK(bits)); + } + + return w; +} diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c index fb438d604040..063a6654199b 100644 --- a/fs/ntfs3/dir.c +++ b/fs/ntfs3/dir.c @@ -26,8 +26,8 @@ int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len, if (!nls) { /* UTF-16 -> UTF-8 */ - ret = utf16s_to_utf8s(name, len, UTF16_LITTLE_ENDIAN, buf, - buf_len); + ret = utf16s_to_utf8s((wchar_t *)name, len, UTF16_LITTLE_ENDIAN, + buf, buf_len); buf[ret] = '\0'; return ret; } diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index f31c0389a2e7..e41b8d54e175 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -122,31 +122,15 @@ static int ntfs_extend_initialized_size(struct file *file, bits = sbi->cluster_bits; vcn = pos >> bits; - err = attr_data_get_block(ni, vcn, 0, &lcn, &clen, - NULL); + err = attr_data_get_block(ni, vcn, 1, &lcn, &clen, NULL, + false); if (err) goto out; if (lcn == SPARSE_LCN) { - loff_t vbo = (loff_t)vcn << bits; - loff_t to = vbo + ((loff_t)clen << bits); - - if (to <= new_valid) { - ni->i_valid = to; - pos = to; - goto next; - } - - if (vbo < pos) { - pos = vbo; - } else { - to = (new_valid >> bits) << bits; - if (pos < to) { - ni->i_valid = to; - pos = to; - goto next; - } - } + pos = ((loff_t)clen + vcn) << bits; + ni->i_valid = pos; + goto next; } } @@ -196,18 +180,18 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) struct address_space *mapping = inode->i_mapping; u32 blocksize = 1 << inode->i_blkbits; pgoff_t idx = vbo >> PAGE_SHIFT; - u32 z_start = vbo & (PAGE_SIZE - 1); + u32 from = vbo & (PAGE_SIZE - 1); pgoff_t idx_end = (vbo_to + PAGE_SIZE - 1) >> PAGE_SHIFT; loff_t page_off; struct buffer_head *head, *bh; - u32 bh_next, bh_off, z_end; + u32 bh_next, bh_off, to; sector_t iblock; struct page *page; - for (; idx < idx_end; idx += 1, z_start = 0) { + for (; 
idx < idx_end; idx += 1, from = 0) { page_off = (loff_t)idx << PAGE_SHIFT; - z_end = (page_off + PAGE_SIZE) > vbo_to ? (vbo_to - page_off) - : PAGE_SIZE; + to = (page_off + PAGE_SIZE) > vbo_to ? (vbo_to - page_off) + : PAGE_SIZE; iblock = page_off >> inode->i_blkbits; page = find_or_create_page(mapping, idx, @@ -224,7 +208,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) do { bh_next = bh_off + blocksize; - if (bh_next <= z_start || bh_off >= z_end) + if (bh_next <= from || bh_off >= to) continue; if (!buffer_mapped(bh)) { @@ -258,7 +242,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) } while (bh_off = bh_next, iblock += 1, head != (bh = bh->b_this_page)); - zero_user_segment(page, z_start, z_end); + zero_user_segment(page, from, to); unlock_page(page); put_page(page); @@ -269,81 +253,6 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) return err; } -/* - * ntfs_sparse_cluster - Helper function to zero a new allocated clusters. - * - * NOTE: 512 <= cluster size <= 2M - */ -void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn, - CLST len) -{ - struct address_space *mapping = inode->i_mapping; - struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info; - u64 vbo = (u64)vcn << sbi->cluster_bits; - u64 bytes = (u64)len << sbi->cluster_bits; - u32 blocksize = 1 << inode->i_blkbits; - pgoff_t idx0 = page0 ? page0->index : -1; - loff_t vbo_clst = vbo & sbi->cluster_mask_inv; - loff_t end = ntfs_up_cluster(sbi, vbo + bytes); - pgoff_t idx = vbo_clst >> PAGE_SHIFT; - u32 from = vbo_clst & (PAGE_SIZE - 1); - pgoff_t idx_end = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; - loff_t page_off; - u32 to; - bool partial; - struct page *page; - - for (; idx < idx_end; idx += 1, from = 0) { - page = idx == idx0 ? page0 : grab_cache_page(mapping, idx); - - if (!page) - continue; - - page_off = (loff_t)idx << PAGE_SHIFT; - to = (page_off + PAGE_SIZE) > end ? 
(end - page_off) - : PAGE_SIZE; - partial = false; - - if ((from || PAGE_SIZE != to) && - likely(!page_has_buffers(page))) { - create_empty_buffers(page, blocksize, 0); - } - - if (page_has_buffers(page)) { - struct buffer_head *head, *bh; - u32 bh_off = 0; - - bh = head = page_buffers(page); - do { - u32 bh_next = bh_off + blocksize; - - if (from <= bh_off && bh_next <= to) { - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - } else if (!buffer_uptodate(bh)) { - partial = true; - } - bh_off = bh_next; - } while (head != (bh = bh->b_this_page)); - } - - zero_user_segment(page, from, to); - - if (!partial) { - if (!PageUptodate(page)) - SetPageUptodate(page); - set_page_dirty(page); - } - - if (idx != idx0) { - unlock_page(page); - put_page(page); - } - cond_resched(); - } - mark_inode_dirty(inode); -} - /* * ntfs_file_mmap - file_operations::mmap */ @@ -385,13 +294,9 @@ static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma) for (; vcn < end; vcn += len) { err = attr_data_get_block(ni, vcn, 1, &lcn, - &len, &new); + &len, &new, true); if (err) goto out; - - if (!new) - continue; - ntfs_sparse_cluster(inode, NULL, vcn, 1); } } @@ -432,7 +337,6 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count, err = ntfs_set_size(inode, end); if (err) goto out; - inode->i_size = end; } if (extend_init && !is_compressed(ni)) { @@ -535,7 +439,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) struct ntfs_sb_info *sbi = sb->s_fs_info; struct ntfs_inode *ni = ntfs_i(inode); loff_t end = vbo + len; - loff_t vbo_down = round_down(vbo, PAGE_SIZE); + loff_t vbo_down = round_down(vbo, max_t(unsigned long, + sbi->cluster_size, PAGE_SIZE)); bool is_supported_holes = is_sparsed(ni) || is_compressed(ni); loff_t i_size, new_size; bool map_locked; @@ -588,11 +493,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) u32 frame_size; loff_t mask, vbo_a, end_a, tmp; - err = filemap_write_and_wait_range(mapping, vbo, end - 1); - if (err) - goto out; - - err = filemap_write_and_wait_range(mapping, end, LLONG_MAX); + err = filemap_write_and_wait_range(mapping, vbo_down, + LLONG_MAX); if (err) goto out; @@ -685,47 +587,45 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) if (err) goto out; - /* - * Allocate clusters, do not change 'valid' size. - */ - err = ntfs_set_size(inode, new_size); - if (err) - goto out; + if (new_size > i_size) { + /* + * Allocate clusters, do not change 'valid' size. + */ + err = ntfs_set_size(inode, new_size); + if (err) + goto out; + } if (is_supported_holes) { - CLST vcn_v = ni->i_valid >> sbi->cluster_bits; CLST vcn = vbo >> sbi->cluster_bits; CLST cend = bytes_to_cluster(sbi, end); + CLST cend_v = bytes_to_cluster(sbi, ni->i_valid); CLST lcn, clen; bool new; + if (cend_v > cend) + cend_v = cend; + /* - * Allocate but do not zero new clusters. (see below comments) - * This breaks security: One can read unused on-disk areas. + * Allocate and zero new clusters. * Zeroing these clusters may be too long. - * Maybe we should check here for root rights? + */ + for (; vcn < cend_v; vcn += clen) { + err = attr_data_get_block(ni, vcn, cend_v - vcn, + &lcn, &clen, &new, + true); + if (err) + goto out; + } + /* + * Allocate but not zero new clusters. */ for (; vcn < cend; vcn += clen) { err = attr_data_get_block(ni, vcn, cend - vcn, - &lcn, &clen, &new); + &lcn, &clen, &new, + false); if (err) goto out; - if (!new || vcn >= vcn_v) - continue; - - /* - * Unwritten area. 
- * NTFS is not able to store several unwritten areas. - * Activate 'ntfs_sparse_cluster' to zero new allocated clusters. - * - * Dangerous in case: - * 1G of sparsed clusters + 1 cluster of data => - * valid_size == 1G + 1 cluster - * fallocate(1G) will zero 1G and this can be very long - * xfstest 016/086 will fail without 'ntfs_sparse_cluster'. - */ - ntfs_sparse_cluster(inode, NULL, vcn, - min(vcn_v - vcn, clen)); } } @@ -736,6 +636,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) &ni->file.run, i_size, &ni->i_valid, true, NULL); ni_unlock(ni); + } else if (new_size > i_size) { + inode->i_size = new_size; } } @@ -779,7 +681,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, goto out; if (ia_valid & ATTR_SIZE) { - loff_t oldsize = inode->i_size; + loff_t newsize, oldsize; if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) { /* Should never be here, see ntfs_file_open(). */ @@ -787,16 +689,19 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, goto out; } inode_dio_wait(inode); + oldsize = inode->i_size; + newsize = attr->ia_size; - if (attr->ia_size <= oldsize) - err = ntfs_truncate(inode, attr->ia_size); - else if (attr->ia_size > oldsize) - err = ntfs_extend(inode, attr->ia_size, 0, NULL); + if (newsize <= oldsize) + err = ntfs_truncate(inode, newsize); + else + err = ntfs_extend(inode, newsize, 0, NULL); if (err) goto out; ni->ni_flags |= NI_FLAG_UPDATE_PARENT; + inode->i_size = newsize; } setattr_copy(mnt_userns, inode, attr); @@ -946,8 +851,8 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) frame_vbo = valid & ~(frame_size - 1); off = valid & (frame_size - 1); - err = attr_data_get_block(ni, frame << NTFS_LZNT_CUNIT, 0, &lcn, - &clen, NULL); + err = attr_data_get_block(ni, frame << NTFS_LZNT_CUNIT, 1, &lcn, + &clen, NULL, false); if (err) goto out; diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index b1b476fb7229..f1df52dfab74 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -557,7 +557,7 @@ static int ni_repack(struct ntfs_inode *ni) } if (!mi_p) { - /* Do not try if not enogh free space. */ + /* Do not try if not enough free space. 
*/ if (le32_to_cpu(mi->mrec->used) + 8 >= rs) continue; @@ -1645,6 +1645,7 @@ struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni, { struct ATTRIB *attr = NULL; struct ATTR_FILE_NAME *fname; + struct le_str *fns; if (le) *le = NULL; @@ -1668,8 +1669,8 @@ struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni, if (uni->len != fname->name_len) goto next; - if (ntfs_cmp_names_cpu(uni, (struct le_str *)&fname->name_len, NULL, - false)) + fns = (struct le_str *)&fname->name_len; + if (ntfs_cmp_names_cpu(uni, fns, NULL, false)) goto next; return fname; @@ -2223,7 +2224,7 @@ int ni_decompress_file(struct ntfs_inode *ni) for (vcn = vbo >> sbi->cluster_bits; vcn < end; vcn += clen) { err = attr_data_get_block(ni, vcn, cend - vcn, &lcn, - &clen, &new); + &clen, &new, false); if (err) goto out; } @@ -3011,6 +3012,7 @@ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, struct NTFS_DE *de) { int err; + struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTRIB *attr; struct ATTR_LIST_ENTRY *le; struct mft_inode *mi; @@ -3018,6 +3020,19 @@ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, struct ATTR_FILE_NAME *de_name = (struct ATTR_FILE_NAME *)(de + 1); u16 de_key_size = le16_to_cpu(de->key_size); + if (sbi->options->windows_names && + !valid_windows_name(sbi, (struct le_str *)&de_name->name_len)) + return -EINVAL; + + /* If option "hide_dot_files" then set hidden attribute for dot files. */ + if (ni->mi.sbi->options->hide_dot_files) { + if (de_name->name_len > 0 && + le16_to_cpu(de_name->name[0]) == '.') + ni->std_fa |= FILE_ATTRIBUTE_HIDDEN; + else + ni->std_fa &= ~FILE_ATTRIBUTE_HIDDEN; + } + mi_get_ref(&ni->mi, &de->ref); mi_get_ref(&dir_ni->mi, &de_name->home); @@ -3036,7 +3051,7 @@ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de_name, de_key_size); /* Insert new name into directory. */ - err = indx_insert_entry(&dir_ni->dir, dir_ni, de, ni->mi.sbi, NULL, 0); + err = indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 0); if (err) ni_remove_attr_le(ni, attr, mi, le); @@ -3279,6 +3294,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) modified = true; } + /* std attribute is always in primary MFT record. 
*/ if (modified) ni->mi.dirty = true; diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index c662d2a51907..c6eb371a3695 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -3029,7 +3029,7 @@ static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe, struct NEW_ATTRIBUTE_SIZES *new_sz; struct ATTR_FILE_NAME *fname; struct OpenAttr *oa, *oa2; - u32 nsize, t32, asize, used, esize, bmp_off, bmp_bits; + u32 nsize, t32, asize, used, esize, off, bits; u16 id, id2; u32 record_size = sbi->record_size; u64 t64; @@ -3616,30 +3616,28 @@ static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe, break; case SetBitsInNonresidentBitMap: - bmp_off = - le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off); - bmp_bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits); + off = le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off); + bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits); - if (cbo + (bmp_off + 7) / 8 > lco || - cbo + ((bmp_off + bmp_bits + 7) / 8) > lco) { + if (cbo + (off + 7) / 8 > lco || + cbo + ((off + bits + 7) / 8) > lco) { goto dirty_vol; } - __bitmap_set(Add2Ptr(buffer_le, roff), bmp_off, bmp_bits); + ntfs_bitmap_set_le(Add2Ptr(buffer_le, roff), off, bits); a_dirty = true; break; case ClearBitsInNonresidentBitMap: - bmp_off = - le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off); - bmp_bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits); + off = le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off); + bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits); - if (cbo + (bmp_off + 7) / 8 > lco || - cbo + ((bmp_off + bmp_bits + 7) / 8) > lco) { + if (cbo + (off + 7) / 8 > lco || + cbo + ((off + bits + 7) / 8) > lco) { goto dirty_vol; } - __bitmap_clear(Add2Ptr(buffer_le, roff), bmp_off, bmp_bits); + ntfs_bitmap_clear_le(Add2Ptr(buffer_le, roff), off, bits); a_dirty = true; break; @@ -4826,8 +4824,7 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) goto out; } attr = oa->attr; - t64 = le64_to_cpu(attr->nres.alloc_size); - if (size > t64) { + if (size > le64_to_cpu(attr->nres.alloc_size)) { attr->nres.valid_size = attr->nres.data_size = attr->nres.alloc_size = cpu_to_le64(size); } diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index b6e22bcb929b..567563771bf8 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -98,6 +98,30 @@ const __le16 WOF_NAME[17] = { }; #endif +static const __le16 CON_NAME[3] = { + cpu_to_le16('C'), cpu_to_le16('O'), cpu_to_le16('N'), +}; + +static const __le16 NUL_NAME[3] = { + cpu_to_le16('N'), cpu_to_le16('U'), cpu_to_le16('L'), +}; + +static const __le16 AUX_NAME[3] = { + cpu_to_le16('A'), cpu_to_le16('U'), cpu_to_le16('X'), +}; + +static const __le16 PRN_NAME[3] = { + cpu_to_le16('P'), cpu_to_le16('R'), cpu_to_le16('N'), +}; + +static const __le16 COM_NAME[3] = { + cpu_to_le16('C'), cpu_to_le16('O'), cpu_to_le16('M'), +}; + +static const __le16 LPT_NAME[3] = { + cpu_to_le16('L'), cpu_to_le16('P'), cpu_to_le16('T'), +}; + // clang-format on /* @@ -321,35 +345,6 @@ int ntfs_loadlog_and_replay(struct ntfs_inode *ni, struct ntfs_sb_info *sbi) return err; } -/* - * ntfs_query_def - * - * Return: Current ATTR_DEF_ENTRY for given attribute type. 
- */ -const struct ATTR_DEF_ENTRY *ntfs_query_def(struct ntfs_sb_info *sbi, - enum ATTR_TYPE type) -{ - int type_in = le32_to_cpu(type); - size_t min_idx = 0; - size_t max_idx = sbi->def_entries - 1; - - while (min_idx <= max_idx) { - size_t i = min_idx + ((max_idx - min_idx) >> 1); - const struct ATTR_DEF_ENTRY *entry = sbi->def_table + i; - int diff = le32_to_cpu(entry->type) - type_in; - - if (!diff) - return entry; - if (diff < 0) - min_idx = i + 1; - else if (i) - max_idx = i - 1; - else - return NULL; - } - return NULL; -} - /* * ntfs_look_for_free_space - Look for a free space in bitmap. */ @@ -448,6 +443,39 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, return err; } +/* + * ntfs_check_for_free_space + * + * Check if it is possible to allocate 'clen' clusters and 'mlen' Mft records + */ +bool ntfs_check_for_free_space(struct ntfs_sb_info *sbi, CLST clen, CLST mlen) +{ + size_t free, zlen, avail; + struct wnd_bitmap *wnd; + + wnd = &sbi->used.bitmap; + down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); + free = wnd_zeroes(wnd); + zlen = min_t(size_t, NTFS_MIN_MFT_ZONE, wnd_zone_len(wnd)); + up_read(&wnd->rw_lock); + + if (free < zlen + clen) + return false; + + avail = free - (zlen + clen); + + wnd = &sbi->mft.bitmap; + down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_MFT); + free = wnd_zeroes(wnd); + zlen = wnd_zone_len(wnd); + up_read(&wnd->rw_lock); + + if (free >= zlen + mlen) + return true; + + return avail >= bytes_to_cluster(sbi, mlen << sbi->record_bits); +} + /* * ntfs_extend_mft - Allocate additional MFT records. * @@ -475,7 +503,7 @@ static int ntfs_extend_mft(struct ntfs_sb_info *sbi) struct ATTRIB *attr; struct wnd_bitmap *wnd = &sbi->mft.bitmap; - new_mft_total = (wnd->nbits + MFT_INCREASE_CHUNK + 127) & (CLST)~127; + new_mft_total = ALIGN(wnd->nbits + NTFS_MFT_INCREASE_STEP, 128); new_mft_bytes = (u64)new_mft_total << sbi->record_bits; /* Step 1: Resize $MFT::DATA. */ @@ -618,13 +646,13 @@ int ntfs_look_free_mft(struct ntfs_sb_info *sbi, CLST *rno, bool mft, NULL, 0, NULL, NULL)) goto next; - __clear_bit(ir - MFT_REC_RESERVED, + __clear_bit_le(ir - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); } } /* Scan 5 bits for zero. 
Bit 0 == MFT_REC_RESERVED */ - zbit = find_next_zero_bit(&sbi->mft.reserved_bitmap, + zbit = find_next_zero_bit_le(&sbi->mft.reserved_bitmap, MFT_REC_FREE, MFT_REC_RESERVED); if (zbit >= MFT_REC_FREE) { sbi->mft.next_reserved = MFT_REC_FREE; @@ -692,7 +720,7 @@ int ntfs_look_free_mft(struct ntfs_sb_info *sbi, CLST *rno, bool mft, if (*rno >= MFT_REC_FREE) wnd_set_used(wnd, *rno, 1); else if (*rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited) - __set_bit(*rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); + __set_bit_le(*rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); out: if (!mft) @@ -720,7 +748,7 @@ void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft) else wnd_set_free(wnd, rno, 1); } else if (rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited) { - __clear_bit(rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); + __clear_bit_le(rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); } if (rno < wnd_zone_bit(wnd)) @@ -830,7 +858,6 @@ void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait) if (!(sbi->flags & NTFS_FLAGS_MFTMIRR)) return; - err = 0; bytes = sbi->mft.recs_mirr << sbi->record_bits; block1 = sbi->mft.lbo >> sb->s_blocksize_bits; block2 = sbi->mft.lbo2 >> sb->s_blocksize_bits; @@ -860,8 +887,7 @@ void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait) put_bh(bh1); bh1 = NULL; - if (wait) - err = sync_dirty_buffer(bh2); + err = wait ? sync_dirty_buffer(bh2) : 0; put_bh(bh2); if (err) @@ -2504,3 +2530,83 @@ int run_deallocate(struct ntfs_sb_info *sbi, struct runs_tree *run, bool trim) return 0; } + +static inline bool name_has_forbidden_chars(const struct le_str *fname) +{ + int i, ch; + + /* check for forbidden chars */ + for (i = 0; i < fname->len; ++i) { + ch = le16_to_cpu(fname->name[i]); + + /* control chars */ + if (ch < 0x20) + return true; + + switch (ch) { + /* disallowed by Windows */ + case '\\': + case '/': + case ':': + case '*': + case '?': + case '<': + case '>': + case '|': + case '\"': + return true; + + default: + /* allowed char */ + break; + } + } + + /* file names cannot end with space or . */ + if (fname->len > 0) { + ch = le16_to_cpu(fname->name[fname->len - 1]); + if (ch == ' ' || ch == '.') + return true; + } + + return false; +} + +static inline bool is_reserved_name(struct ntfs_sb_info *sbi, + const struct le_str *fname) +{ + int port_digit; + const __le16 *name = fname->name; + int len = fname->len; + u16 *upcase = sbi->upcase; + + /* check for 3 chars reserved names (device names) */ + /* name by itself or with any extension is forbidden */ + if (len == 3 || (len > 3 && le16_to_cpu(name[3]) == '.')) + if (!ntfs_cmp_names(name, 3, CON_NAME, 3, upcase, false) || + !ntfs_cmp_names(name, 3, NUL_NAME, 3, upcase, false) || + !ntfs_cmp_names(name, 3, AUX_NAME, 3, upcase, false) || + !ntfs_cmp_names(name, 3, PRN_NAME, 3, upcase, false)) + return true; + + /* check for 4 chars reserved names (port name followed by 1..9) */ + /* name by itself or with any extension is forbidden */ + if (len == 4 || (len > 4 && le16_to_cpu(name[4]) == '.')) { + port_digit = le16_to_cpu(name[3]); + if (port_digit >= '1' && port_digit <= '9') + if (!ntfs_cmp_names(name, 3, COM_NAME, 3, upcase, false) || + !ntfs_cmp_names(name, 3, LPT_NAME, 3, upcase, false)) + return true; + } + + return false; +} + +/* + * valid_windows_name - Check if a file name is valid in Windows. 
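The reserved-name tables match the classic DOS device names, bare or with any extension. A rough ASCII-only model of the rule (the real check above runs on UTF-16 with the volume's upcase table; reserved() and the test names are hypothetical):

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <strings.h>

static bool reserved(const char *n)
{
	static const char *dev3[] = { "CON", "NUL", "AUX", "PRN" };
	size_t i, len = strlen(n);

	if (len == 3 || (len > 3 && n[3] == '.'))
		for (i = 0; i < 4; i++)
			if (!strncasecmp(n, dev3[i], 3))
				return true;
	if ((len == 4 || (len > 4 && n[4] == '.')) &&
	    n[3] >= '1' && n[3] <= '9')
		return !strncasecmp(n, "COM", 3) || !strncasecmp(n, "LPT", 3);
	return false;
}

int main(void)
{
	assert(reserved("con") && reserved("NUL.txt"));
	assert(reserved("COM1") && reserved("lpt9.log"));
	assert(!reserved("CONS") && !reserved("COM0") && !reserved("LPT"));
	return 0;
}

With this series applied and the volume mounted with -o windows_names, both ni_add_name() and ntfs_create_inode() reject such names with -EINVAL.
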
+ */ +bool valid_windows_name(struct ntfs_sb_info *sbi, const struct le_str *fname) +{ + return !name_has_forbidden_chars(fname) && + !is_reserved_name(sbi, fname); +} diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index c27b4fe57513..51ab75954640 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -47,7 +47,7 @@ static int cmp_fnames(const void *key1, size_t l1, const void *key2, size_t l2, if (l2 < fsize2) return -1; - both_case = f2->type != FILE_NAME_DOS /*&& !sbi->options.nocase*/; + both_case = f2->type != FILE_NAME_DOS && !sbi->options->nocase; if (!l1) { const struct le_str *s2 = (struct le_str *)&f2->name_len; @@ -323,7 +323,7 @@ static int indx_mark_used(struct ntfs_index *indx, struct ntfs_inode *ni, if (err) return err; - __set_bit(bit - bbuf.bit, bbuf.buf); + __set_bit_le(bit - bbuf.bit, bbuf.buf); bmp_buf_put(&bbuf, true); @@ -343,7 +343,7 @@ static int indx_mark_free(struct ntfs_index *indx, struct ntfs_inode *ni, if (err) return err; - __clear_bit(bit - bbuf.bit, bbuf.buf); + __clear_bit_le(bit - bbuf.bit, bbuf.buf); bmp_buf_put(&bbuf, true); @@ -457,7 +457,7 @@ static int scan_nres_bitmap(struct ntfs_inode *ni, struct ATTRIB *bitmap, static bool scan_for_free(const ulong *buf, u32 bit, u32 bits, size_t *ret) { - size_t pos = find_next_zero_bit(buf, bits, bit); + size_t pos = find_next_zero_bit_le(buf, bits, bit); if (pos >= bits) return false; @@ -489,7 +489,7 @@ static int indx_find_free(struct ntfs_index *indx, struct ntfs_inode *ni, if (!b->non_res) { u32 nbits = 8 * le32_to_cpu(b->res.data_size); - size_t pos = find_next_zero_bit(resident_data(b), nbits, 0); + size_t pos = find_next_zero_bit_le(resident_data(b), nbits, 0); if (pos < nbits) *bit = pos; @@ -505,7 +505,7 @@ static int indx_find_free(struct ntfs_index *indx, struct ntfs_inode *ni, static bool scan_for_used(const ulong *buf, u32 bit, u32 bits, size_t *ret) { - size_t pos = find_next_bit(buf, bits, bit); + size_t pos = find_next_bit_le(buf, bits, bit); if (pos >= bits) return false; @@ -536,7 +536,7 @@ int indx_used_bit(struct ntfs_index *indx, struct ntfs_inode *ni, size_t *bit) if (!b->non_res) { u32 nbits = le32_to_cpu(b->res.data_size) * 8; - size_t pos = find_next_bit(resident_data(b), nbits, from); + size_t pos = find_next_bit_le(resident_data(b), nbits, from); if (pos < nbits) *bit = pos; @@ -605,11 +605,58 @@ static const struct NTFS_DE *hdr_insert_head(struct INDEX_HDR *hdr, return e; } +/* + * index_hdr_check + * + * return true if INDEX_HDR is valid + */ +static bool index_hdr_check(const struct INDEX_HDR *hdr, u32 bytes) +{ + u32 end = le32_to_cpu(hdr->used); + u32 tot = le32_to_cpu(hdr->total); + u32 off = le32_to_cpu(hdr->de_off); + + if (!IS_ALIGNED(off, 8) || tot > bytes || end > tot || + off + sizeof(struct NTFS_DE) > end) { + /* incorrect index buffer. */ + return false; + } + + return true; +} + +/* + * index_buf_check + * + * return true if INDEX_BUFFER seems is valid + */ +static bool index_buf_check(const struct INDEX_BUFFER *ib, u32 bytes, + const CLST *vbn) +{ + const struct NTFS_RECORD_HEADER *rhdr = &ib->rhdr; + u16 fo = le16_to_cpu(rhdr->fix_off); + u16 fn = le16_to_cpu(rhdr->fix_num); + + if (bytes <= offsetof(struct INDEX_BUFFER, ihdr) || + rhdr->sign != NTFS_INDX_SIGNATURE || + fo < sizeof(struct INDEX_BUFFER) + /* Check index buffer vbn. */ + || (vbn && *vbn != le64_to_cpu(ib->vbn)) || (fo % sizeof(short)) || + fo + fn * sizeof(short) >= bytes || + fn != ((bytes >> SECTOR_SHIFT) + 1)) { + /* incorrect index buffer. 
*/ + return false; + } + + return index_hdr_check(&ib->ihdr, + bytes - offsetof(struct INDEX_BUFFER, ihdr)); +} + void fnd_clear(struct ntfs_fnd *fnd) { int i; - for (i = 0; i < fnd->level; i++) { + for (i = fnd->level - 1; i >= 0; i--) { struct indx_node *n = fnd->nodes[i]; if (!n) @@ -625,9 +672,8 @@ void fnd_clear(struct ntfs_fnd *fnd) static int fnd_push(struct ntfs_fnd *fnd, struct indx_node *n, struct NTFS_DE *e) { - int i; + int i = fnd->level; - i = fnd->level; if (i < 0 || i >= ARRAY_SIZE(fnd->nodes)) return -EINVAL; fnd->nodes[i] = n; @@ -820,9 +866,16 @@ int indx_init(struct ntfs_index *indx, struct ntfs_sb_info *sbi, u32 t32; const struct INDEX_ROOT *root = resident_data(attr); + t32 = le32_to_cpu(attr->res.data_size); + if (t32 <= offsetof(struct INDEX_ROOT, ihdr) || + !index_hdr_check(&root->ihdr, + t32 - offsetof(struct INDEX_ROOT, ihdr))) { + goto out; + } + /* Check root fields. */ if (!root->index_block_clst) - return -EINVAL; + goto out; indx->type = type; indx->idx2vbn_bits = __ffs(root->index_block_clst); @@ -834,19 +887,19 @@ int indx_init(struct ntfs_index *indx, struct ntfs_sb_info *sbi, if (t32 < sbi->cluster_size) { /* Index record is smaller than a cluster, use 512 blocks. */ if (t32 != root->index_block_clst * SECTOR_SIZE) - return -EINVAL; + goto out; /* Check alignment to a cluster. */ if ((sbi->cluster_size >> SECTOR_SHIFT) & (root->index_block_clst - 1)) { - return -EINVAL; + goto out; } indx->vbn2vbo_bits = SECTOR_SHIFT; } else { /* Index record must be a multiple of cluster size. */ if (t32 != root->index_block_clst << sbi->cluster_bits) - return -EINVAL; + goto out; indx->vbn2vbo_bits = sbi->cluster_bits; } @@ -854,7 +907,14 @@ int indx_init(struct ntfs_index *indx, struct ntfs_sb_info *sbi, init_rwsem(&indx->run_lock); indx->cmp = get_cmp_func(root); - return indx->cmp ? 0 : -EINVAL; + if (!indx->cmp) + goto out; + + return 0; + +out: + ntfs_set_state(sbi, NTFS_DIRTY_DIRTY); + return -EINVAL; } static struct indx_node *indx_new(struct ntfs_index *indx, @@ -1012,6 +1072,13 @@ int indx_read(struct ntfs_index *indx, struct ntfs_inode *ni, CLST vbn, goto out; ok: + if (!index_buf_check(ib, bytes, &vbn)) { + ntfs_inode_err(&ni->vfs_inode, "directory corrupted"); + ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR); + err = -EINVAL; + goto out; + } + if (err == -E_NTFS_FIXUP) { ntfs_write_bh(ni->mi.sbi, &ib->rhdr, &in->nb, 0); err = 0; @@ -1347,8 +1414,8 @@ static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, run_init(&run); - err = attr_allocate_clusters(sbi, &run, 0, 0, len, NULL, 0, &alen, 0, - NULL); + err = attr_allocate_clusters(sbi, &run, 0, 0, len, NULL, ALLOCATE_DEF, + &alen, 0, NULL, NULL); if (err) goto out; @@ -1446,6 +1513,9 @@ static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, goto out1; } + if (in->name == I30_NAME) + ni->vfs_inode.i_size = data_size; + *vbn = bit << indx->idx2vbn_bits; return 0; @@ -1599,9 +1669,9 @@ static int indx_insert_into_root(struct ntfs_index *indx, struct ntfs_inode *ni, if (err) { /* Restore root. */ - if (mi_resize_attr(mi, attr, -ds_root)) + if (mi_resize_attr(mi, attr, -ds_root)) { memcpy(attr, a_root, asize); - else { + } else { /* Bug? 
*/ ntfs_set_state(sbi, NTFS_DIRTY_ERROR); } @@ -1953,7 +2023,7 @@ static int indx_shrink(struct ntfs_index *indx, struct ntfs_inode *ni, if (bit >= nbits) return 0; - pos = find_next_bit(bm, nbits, bit); + pos = find_next_bit_le(bm, nbits, bit); if (pos < nbits) return 0; } else { @@ -1979,6 +2049,9 @@ static int indx_shrink(struct ntfs_index *indx, struct ntfs_inode *ni, if (err) return err; + if (in->name == I30_NAME) + ni->vfs_inode.i_size = new_data; + bpb = bitmap_size(bit); if (bpb * 8 == nbits) return 0; @@ -2121,9 +2194,10 @@ static int indx_get_entry_to_replace(struct ntfs_index *indx, fnd->de[level] = e; indx_write(indx, ni, n, 0); - /* Check to see if this action created an empty leaf. */ - if (ib_is_leaf(ib) && ib_is_empty(ib)) + if (ib_is_leaf(ib) && ib_is_empty(ib)) { + /* An empty leaf. */ return 0; + } out: fnd_clear(fnd); @@ -2461,6 +2535,9 @@ int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni, err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len, &indx->alloc_run, 0, NULL, false, NULL); + if (in->name == I30_NAME) + ni->vfs_inode.i_size = 0; + err = ni_remove_attr(ni, ATTR_ALLOC, in->name, in->name_len, false, NULL); run_close(&indx->alloc_run); diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 22152300e60c..31bc94f87940 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -81,7 +81,7 @@ static struct inode *ntfs_read_mft(struct inode *inode, le16_to_cpu(ref->seq), le16_to_cpu(rec->seq)); goto out; } else if (!is_rec_inuse(rec)) { - err = -EINVAL; + err = -ESTALE; ntfs_err(sb, "Inode r=%x is not in use!", (u32)ino); goto out; } @@ -92,8 +92,10 @@ static struct inode *ntfs_read_mft(struct inode *inode, goto out; } - if (!is_rec_base(rec)) - goto Ok; + if (!is_rec_base(rec)) { + err = -EINVAL; + goto out; + } /* Record should contain $I30 root. */ is_dir = rec->flags & RECORD_FLAG_DIR; @@ -466,7 +468,6 @@ static struct inode *ntfs_read_mft(struct inode *inode, inode->i_flags |= S_NOSEC; } -Ok: if (ino == MFT_REC_MFT && !sb->s_root) sbi->mft.ni = NULL; @@ -520,6 +521,9 @@ struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref, _ntfs_bad_inode(inode); } + if (IS_ERR(inode) && name) + ntfs_set_state(sb->s_fs_info, NTFS_DIRTY_ERROR); + return inode; } @@ -551,17 +555,6 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, clear_buffer_new(bh); clear_buffer_uptodate(bh); - /* Direct write uses 'create=0'. */ - if (!create && vbo >= ni->i_valid) { - /* Out of valid. */ - return 0; - } - - if (vbo >= inode->i_size) { - /* Out of size. */ - return 0; - } - if (is_resident(ni)) { ni_lock(ni); err = attr_data_read_resident(ni, page); @@ -577,7 +570,8 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, off = vbo & sbi->cluster_mask; new = false; - err = attr_data_get_block(ni, vcn, 1, &lcn, &len, create ? &new : NULL); + err = attr_data_get_block(ni, vcn, 1, &lcn, &len, create ? &new : NULL, + create && sbi->cluster_size > PAGE_SIZE); if (err) goto out; @@ -595,11 +589,8 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, WARN_ON(1); } - if (new) { + if (new) set_buffer_new(bh); - if ((len << cluster_bits) > block_size) - ntfs_sparse_cluster(inode, page, vcn, len); - } lbo = ((u64)lcn << cluster_bits) + off; @@ -627,7 +618,6 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, } } else if (vbo >= valid) { /* Read out of valid data. */ - /* Should never be here 'cause already checked. 
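The new zero argument is true only for create when a cluster spans more than one page: the write that triggered the allocation dirties at most the pages it touches, so the rest of the freshly allocated cluster would otherwise expose stale on-disk data. Rough arithmetic, with hypothetical sizes:

#include <assert.h>

int main(void)
{
	unsigned int cluster = 64 * 1024, page = 4096;

	// 16 pages per cluster: one page written, 15 needing explicit zeroing;
	// with cluster <= page, normal page-level zeroing already suffices
	assert(cluster / page == 16);
	return 0;
}
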
*/ clear_buffer_mapped(bh); } else if (vbo + bytes <= valid) { /* Normal read. */ @@ -646,9 +636,17 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, bh->b_size = block_size; off = vbo & (PAGE_SIZE - 1); set_bh_page(bh, page, off); - err = bh_read(bh, 0); - if (err < 0) + + lock_buffer(bh); + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(REQ_OP_READ, bh); + + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + err = -EIO; goto out; + } zero_user_segment(page, off + voff, off + block_size); } } @@ -969,6 +967,11 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, dirty = true; } + if (pos + err > inode->i_size) { + inode->i_size = pos + err; + dirty = true; + } + if (dirty) mark_inode_dirty(inode); } @@ -1178,6 +1181,18 @@ ntfs_create_reparse_buffer(struct ntfs_sb_info *sbi, const char *symname, return ERR_PTR(err); } +/* + * ntfs_create_inode + * + * Helper function for: + * - ntfs_create + * - ntfs_mknod + * - ntfs_symlink + * - ntfs_mkdir + * - ntfs_atomic_open + * + * NOTE: if fnd != NULL (ntfs_atomic_open) then @dir is locked + */ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, const struct cpu_str *uni, umode_t mode, @@ -1207,7 +1222,8 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, struct REPARSE_DATA_BUFFER *rp = NULL; bool rp_inserted = false; - ni_lock_dir(dir_ni); + if (!fnd) + ni_lock_dir(dir_ni); dir_root = indx_get_root(&dir_ni->dir, dir_ni, NULL, NULL); if (!dir_root) { @@ -1270,6 +1286,10 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, fa = FILE_ATTRIBUTE_ARCHIVE; } + /* If option "hide_dot_files" then set hidden attribute for dot files. */ + if (sbi->options->hide_dot_files && name->name[0] == '.') + fa |= FILE_ATTRIBUTE_HIDDEN; + if (!(mode & 0222)) fa |= FILE_ATTRIBUTE_READONLY; @@ -1355,6 +1375,13 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, mi_get_ref(&ni->mi, &new_de->ref); fname = (struct ATTR_FILE_NAME *)(new_de + 1); + + if (sbi->options->windows_names && + !valid_windows_name(sbi, (struct le_str *)&fname->name_len)) { + err = -EINVAL; + goto out4; + } + mi_get_ref(&dir_ni->mi, &fname->home); fname->dup.cr_time = fname->dup.m_time = fname->dup.c_time = fname->dup.a_time = std5->cr_time; @@ -1518,8 +1545,8 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, cpu_to_le64(ntfs_up_cluster(sbi, nsize)); err = attr_allocate_clusters(sbi, &ni->file.run, 0, 0, - clst, NULL, 0, &alen, 0, - NULL); + clst, NULL, ALLOCATE_DEF, + &alen, 0, NULL, NULL); if (err) goto out5; @@ -1566,7 +1593,8 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, goto out6; /* Unlock parent directory before ntfs_init_acl. */ - ni_unlock(dir_ni); + if (!fnd) + ni_unlock(dir_ni); inode->i_generation = le16_to_cpu(rec->seq); @@ -1626,7 +1654,8 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, out7: /* Undo 'indx_insert_entry'. */ - ni_lock_dir(dir_ni); + if (!fnd) + ni_lock_dir(dir_ni); indx_delete_entry(&dir_ni->dir, dir_ni, new_de + 1, le16_to_cpu(new_de->key_size), sbi); /* ni_unlock(dir_ni); will be called later. 
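All of the if (!fnd) guards implement one contract: a non-NULL fnd means ntfs_atomic_open() (later in this patch) already holds the directory lock, so ntfs_create_inode() must neither take nor drop it. The shape, as a hedged sketch with a hypothetical helper name:

static int create_locked(struct ntfs_inode *dir_ni, struct ntfs_fnd *fnd)
{
	if (!fnd)			// plain create/mkdir/symlink path
		ni_lock_dir(dir_ni);

	// ... allocate the MFT record, insert the name, set up ACLs ...

	if (!fnd)			// atomic_open keeps holding the lock
		ni_unlock(dir_ni);
	return 0;
}
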
*/ @@ -1635,10 +1664,8 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, ntfs_remove_reparse(sbi, IO_REPARSE_TAG_SYMLINK, &new_de->ref); out5: - if (S_ISDIR(mode) || run_is_empty(&ni->file.run)) - goto out4; - - run_deallocate(sbi, &ni->file.run, false); + if (!S_ISDIR(mode)) + run_deallocate(sbi, &ni->file.run, false); out4: clear_rec_inuse(rec); @@ -1654,7 +1681,8 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, out1: if (err) { - ni_unlock(dir_ni); + if (!fnd) + ni_unlock(dir_ni); return ERR_PTR(err); } @@ -1762,7 +1790,103 @@ void ntfs_evict_inode(struct inode *inode) ni_clear(ntfs_i(inode)); } -static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer, +/* + * ntfs_translate_junction + * + * Translate a Windows junction target to the Linux equivalent. + * On junctions, targets are always absolute (they include the drive + * letter). We have no way of knowing if the target is for the current + * mounted device or not so we just assume it is. + */ +static int ntfs_translate_junction(const struct super_block *sb, + const struct dentry *link_de, char *target, + int target_len, int target_max) +{ + int tl_len, err = target_len; + char *link_path_buffer = NULL, *link_path; + char *translated = NULL; + char *target_start; + int copy_len; + + link_path_buffer = kmalloc(PATH_MAX, GFP_NOFS); + if (!link_path_buffer) { + err = -ENOMEM; + goto out; + } + /* Get link path, relative to mount point */ + link_path = dentry_path_raw(link_de, link_path_buffer, PATH_MAX); + if (IS_ERR(link_path)) { + ntfs_err(sb, "Error getting link path"); + err = -EINVAL; + goto out; + } + + translated = kmalloc(PATH_MAX, GFP_NOFS); + if (!translated) { + err = -ENOMEM; + goto out; + } + + /* Make translated path a relative path to mount point */ + strcpy(translated, "./"); + ++link_path; /* Skip leading / */ + for (tl_len = sizeof("./") - 1; *link_path; ++link_path) { + if (*link_path == '/') { + if (PATH_MAX - tl_len < sizeof("../")) { + ntfs_err(sb, + "Link path %s has too many components", + link_path); + err = -EINVAL; + goto out; + } + strcpy(translated + tl_len, "../"); + tl_len += sizeof("../") - 1; + } + } + + /* Skip drive letter */ + target_start = target; + while (*target_start && *target_start != ':') + ++target_start; + + if (!*target_start) { + ntfs_err(sb, "Link target (%s) missing drive separator", + target); + err = -EINVAL; + goto out; + } + + /* Skip drive separator and leading /, if exists */ + target_start += 1 + (target_start[1] == '/'); + copy_len = target_len - (target_start - target); + + if (PATH_MAX - tl_len <= copy_len) { + ntfs_err(sb, "Link target %s too large for buffer (%d <= %d)", + target_start, PATH_MAX - tl_len, copy_len); + err = -EINVAL; + goto out; + } + + /* translated path has a trailing / and target_start does not */ + strcpy(translated + tl_len, target_start); + tl_len += copy_len; + if (target_max <= tl_len) { + ntfs_err(sb, "Target path %s too large for buffer (%d <= %d)", + translated, target_max, tl_len); + err = -EINVAL; + goto out; + } + strcpy(target, translated); + err = tl_len; + +out: + kfree(link_path_buffer); + kfree(translated); + return err; +} + +static noinline int ntfs_readlink_hlp(const struct dentry *link_de, + struct inode *inode, char *buffer, int buflen) { int i, err = -EINVAL; @@ -1905,6 +2029,11 @@ static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer, /* Always set last zero. */ buffer[err] = 0; + + /* If this is a junction, translate the link target. 
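A worked example of the translation above: for a junction at dir/sub/link (relative to the mount point) whose on-disk target is C:/other/place, the prefix gains one "../" per path separator and the drive letter is dropped. A standalone userspace re-run of the math (hypothetical paths, bounds checks omitted):

#include <assert.h>
#include <string.h>

int main(void)
{
	const char *link_path = "dir/sub/link";	// leading '/' already skipped
	const char *target = "C:/other/place";
	const char *p, *t;
	char out[64] = "./";

	for (p = link_path; *p; p++)		// one "../" per parent dir
		if (*p == '/')
			strcat(out, "../");

	t = strchr(target, ':');		// find the drive letter,
	assert(t);
	t += 1 + (t[1] == '/');			// skip ':' and a leading '/'

	strcat(out, t);
	assert(!strcmp(out, "./../../other/place"));
	return 0;
}
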
*/
+	if (rp->ReparseTag == IO_REPARSE_TAG_MOUNT_POINT)
+		err = ntfs_translate_junction(sb, link_de, buffer, err, buflen);
+
 out:
 	kfree(to_free);
 	return err;
@@ -1923,7 +2052,7 @@ static const char *ntfs_get_link(struct dentry *de, struct inode *inode,
 	if (!ret)
 		return ERR_PTR(-ENOMEM);
 
-	err = ntfs_readlink_hlp(inode, ret, PAGE_SIZE);
+	err = ntfs_readlink_hlp(de, inode, ret, PAGE_SIZE);
 	if (err < 0) {
 		kfree(ret);
 		return ERR_PTR(err);
@@ -1940,6 +2069,8 @@ const struct inode_operations ntfs_link_inode_operations = {
 	.setattr	= ntfs3_setattr,
 	.listxattr	= ntfs_listxattr,
 	.permission	= ntfs_permission,
+	.get_acl	= ntfs_get_acl,
+	.set_acl	= ntfs_set_acl,
 };
 
 const struct address_space_operations ntfs_aops = {
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index bc22cc321a74..0e72d2067804 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -7,6 +7,8 @@
 #include <linux/fs.h>
 #include <linux/nls.h>
+#include <linux/ctype.h>
+#include <linux/posix_acl.h>
 
 #include "debug.h"
 #include "ntfs.h"
@@ -303,6 +305,8 @@ static int ntfs_rename(struct user_namespace *mnt_userns, struct inode *dir,
 
 	ni_lock_dir(dir_ni);
 	ni_lock(ni);
+	if (dir_ni != new_dir_ni)
+		ni_lock_dir2(new_dir_ni);
 
 	is_bad = false;
 	err = ni_rename(dir_ni, new_dir_ni, ni, de, new_de, &is_bad);
@@ -326,6 +330,8 @@ static int ntfs_rename(struct user_namespace *mnt_userns, struct inode *dir,
 		ntfs_sync_inode(inode);
 	}
 
+	if (dir_ni != new_dir_ni)
+		ni_unlock(new_dir_ni);
 	ni_unlock(ni);
 	ni_unlock(dir_ni);
 out:
@@ -333,6 +339,104 @@ static int ntfs_rename(struct user_namespace *mnt_userns, struct inode *dir,
 	return err;
 }
 
+/*
+ * ntfs_atomic_open
+ *
+ * inode_operations::atomic_open
+ */
+static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry,
+			    struct file *file, u32 flags, umode_t mode)
+{
+	int err;
+	struct inode *inode;
+	struct ntfs_fnd *fnd = NULL;
+	struct ntfs_inode *ni = ntfs_i(dir);
+	struct dentry *d = NULL;
+	struct cpu_str *uni = __getname();
+	bool locked = false;
+
+	if (!uni)
+		return -ENOMEM;
+
+	err = ntfs_nls_to_utf16(ni->mi.sbi, dentry->d_name.name,
+				dentry->d_name.len, uni, NTFS_NAME_LEN,
+				UTF16_HOST_ENDIAN);
+	if (err < 0)
+		goto out;
+
+#ifdef CONFIG_NTFS3_FS_POSIX_ACL
+	if (IS_POSIXACL(dir)) {
+		/*
+		 * Load in cache current acl to avoid ni_lock(dir):
+		 * ntfs_create_inode -> ntfs_init_acl -> posix_acl_create ->
+		 * ntfs_get_acl -> ntfs_get_acl_ex -> ni_lock
+		 */
+		struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT);
+
+		if (IS_ERR(p)) {
+			err = PTR_ERR(p);
+			goto out;
+		}
+		posix_acl_release(p);
+	}
+#endif
+
+	if (d_in_lookup(dentry)) {
+		ni_lock_dir(ni);
+		locked = true;
+		fnd = fnd_get();
+		if (!fnd) {
+			err = -ENOMEM;
+			goto out1;
+		}
+
+		d = d_splice_alias(dir_search_u(dir, uni, fnd), dentry);
+		if (IS_ERR(d)) {
+			err = PTR_ERR(d);
+			d = NULL;
+			goto out2;
+		}
+
+		if (d)
+			dentry = d;
+	}
+
+	if (!(flags & O_CREAT) || d_really_is_positive(dentry)) {
+		err = finish_no_open(file, d);
+		goto out2;
+	}
+
+	file->f_mode |= FMODE_CREATED;
+
+	/*
+	 * fnd contains tree's path to insert to.
+	 * If fnd is not NULL then dir is locked.
+	 */
+
+	/*
+	 * Unfortunately there is no way to get the correct
+	 * 'struct nameidata *nd' or 'struct user_namespace *mnt_userns' here.
+	 * See atomic_open in fs/namei.c.
+	 * This is why xfstest/633 failed.
+	 * Looks like ntfs_atomic_open must accept
+	 * 'struct user_namespace *mnt_userns' as an argument.
+	 */
+
+	inode = ntfs_create_inode(&init_user_ns, dir, dentry, uni, mode, 0,
+				  NULL, 0, fnd);
+	err = IS_ERR(inode) ?
PTR_ERR(inode) + : finish_open(file, dentry, ntfs_file_open); + dput(d); + +out2: + fnd_put(fnd); +out1: + if (locked) + ni_unlock(ni); +out: + __putname(uni); + return err; +} + struct dentry *ntfs3_get_parent(struct dentry *child) { struct inode *inode = d_inode(child); @@ -355,6 +459,133 @@ struct dentry *ntfs3_get_parent(struct dentry *child) return ERR_PTR(-ENOENT); } +/* + * dentry_operations::d_hash + */ +static int ntfs_d_hash(const struct dentry *dentry, struct qstr *name) +{ + struct ntfs_sb_info *sbi; + const char *n = name->name; + unsigned int len = name->len; + unsigned long hash; + struct cpu_str *uni; + unsigned int c; + int err; + + /* First try fast implementation. */ + hash = init_name_hash(dentry); + + for (;;) { + if (!len--) { + name->hash = end_name_hash(hash); + return 0; + } + + c = *n++; + if (c >= 0x80) + break; + + hash = partial_name_hash(toupper(c), hash); + } + + /* + * Try slow way with current upcase table + */ + uni = __getname(); + if (!uni) + return -ENOMEM; + + sbi = dentry->d_sb->s_fs_info; + + err = ntfs_nls_to_utf16(sbi, name->name, name->len, uni, NTFS_NAME_LEN, + UTF16_HOST_ENDIAN); + if (err < 0) + goto out; + + if (!err) { + err = -EINVAL; + goto out; + } + + hash = ntfs_names_hash(uni->name, uni->len, sbi->upcase, + init_name_hash(dentry)); + name->hash = end_name_hash(hash); + err = 0; + +out: + __putname(uni); + return err; +} + +/* + * dentry_operations::d_compare + */ +static int ntfs_d_compare(const struct dentry *dentry, unsigned int len1, + const char *str, const struct qstr *name) +{ + struct ntfs_sb_info *sbi; + int ret; + const char *n1 = str; + const char *n2 = name->name; + unsigned int len2 = name->len; + unsigned int lm = min(len1, len2); + unsigned char c1, c2; + struct cpu_str *uni1; + struct le_str *uni2; + + /* First try fast implementation. */ + for (;;) { + if (!lm--) + return len1 != len2; + + if ((c1 = *n1++) == (c2 = *n2++)) + continue; + + if (c1 >= 0x80 || c2 >= 0x80) + break; + + if (toupper(c1) != toupper(c2)) + return 1; + } + + /* + * Try slow way with current upcase table + */ + sbi = dentry->d_sb->s_fs_info; + uni1 = __getname(); + if (!uni1) + return -ENOMEM; + + ret = ntfs_nls_to_utf16(sbi, str, len1, uni1, NTFS_NAME_LEN, + UTF16_HOST_ENDIAN); + if (ret < 0) + goto out; + + if (!ret) { + ret = -EINVAL; + goto out; + } + + uni2 = Add2Ptr(uni1, 2048); + + ret = ntfs_nls_to_utf16(sbi, name->name, name->len, + (struct cpu_str *)uni2, NTFS_NAME_LEN, + UTF16_LITTLE_ENDIAN); + if (ret < 0) + goto out; + + if (!ret) { + ret = -EINVAL; + goto out; + } + + ret = !ntfs_cmp_names_cpu(uni1, uni2, sbi->upcase, false) ? 
0 : 1;
+
+out:
+	__putname(uni1);
+	return ret;
+}
+
 // clang-format off
 const struct inode_operations ntfs_dir_inode_operations = {
 	.lookup		= ntfs_lookup,
@@ -372,6 +603,7 @@ const struct inode_operations ntfs_dir_inode_operations = {
 	.setattr	= ntfs3_setattr,
 	.getattr	= ntfs_getattr,
 	.listxattr	= ntfs_listxattr,
+	.atomic_open	= ntfs_atomic_open,
 	.fiemap		= ntfs_fiemap,
 };
 
@@ -382,4 +614,10 @@ const struct inode_operations ntfs_special_inode_operations = {
 	.get_acl	= ntfs_get_acl,
 	.set_acl	= ntfs_set_acl,
 };
+
+const struct dentry_operations ntfs_dentry_ops = {
+	.d_hash		= ntfs_d_hash,
+	.d_compare	= ntfs_d_compare,
+};
+
 // clang-format on
diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h
index 9cc396b117bf..86ea1826d099 100644
--- a/fs/ntfs3/ntfs.h
+++ b/fs/ntfs3/ntfs.h
@@ -84,7 +84,6 @@ typedef u32 CLST;
 
 #define COMPRESSION_UNIT     4
 #define COMPRESS_MAX_CLUSTER 0x1000
-#define MFT_INCREASE_CHUNK   1024
 
 enum RECORD_NUM {
 	MFT_REC_MFT	= 0,
@@ -715,12 +714,13 @@ static inline struct NTFS_DE *hdr_first_de(const struct INDEX_HDR *hdr)
 {
 	u32 de_off = le32_to_cpu(hdr->de_off);
 	u32 used = le32_to_cpu(hdr->used);
-	struct NTFS_DE *e = Add2Ptr(hdr, de_off);
+	struct NTFS_DE *e;
 	u16 esize;
 
-	if (de_off >= used || de_off >= le32_to_cpu(hdr->total))
+	if (de_off >= used || de_off + sizeof(struct NTFS_DE) > used)
 		return NULL;
 
+	e = Add2Ptr(hdr, de_off);
 	esize = le16_to_cpu(e->size);
 	if (esize < sizeof(struct NTFS_DE) || de_off + esize > used)
 		return NULL;
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 2c791222c4e2..5cefcfa52118 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -97,9 +97,12 @@ struct ntfs_mount_options {
 	unsigned sparse : 1; /* Create sparse files. */
 	unsigned showmeta : 1; /* Show meta files. */
 	unsigned nohidden : 1; /* Do not show hidden files. */
+	unsigned hide_dot_files : 1; /* Set hidden flag on dot files. */
+	unsigned windows_names : 1; /* Disallow names forbidden by Windows. */
 	unsigned force : 1; /* RW mount dirty volume. */
 	unsigned noacsrules : 1; /* Exclude acs rules. */
 	unsigned prealloc : 1; /* Preallocate space when file is growing. */
+	unsigned nocase : 1; /* Case insensitive. */
 };
 
 /* Special value to unpack and deallocate. */
@@ -124,6 +127,7 @@ struct ntfs_buffers {
 
 enum ALLOCATE_OPT {
 	ALLOCATE_DEF = 0, // Allocate all clusters.
 	ALLOCATE_MFT = 1, // Allocate for MFT.
+	ALLOCATE_ZERO = 2, // Zero out newly allocated clusters.
 };
 
 enum bitmap_mutex_classes {
@@ -195,6 +199,8 @@ struct ntfs_index {
 
 /* Minimum MFT zone. */
 #define NTFS_MIN_MFT_ZONE 100
+/* Step to increase the MFT. */
+#define NTFS_MFT_INCREASE_STEP 1024
 
 /* Ntfs file system in-core superblock data.
*/ struct ntfs_sb_info { @@ -330,6 +336,7 @@ enum ntfs_inode_mutex_lock_class { NTFS_INODE_MUTEX_REPARSE, NTFS_INODE_MUTEX_NORMAL, NTFS_INODE_MUTEX_PARENT, + NTFS_INODE_MUTEX_PARENT2, }; /* @@ -412,7 +419,7 @@ enum REPARSE_SIGN { int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run, CLST vcn, CLST lcn, CLST len, CLST *pre_alloc, enum ALLOCATE_OPT opt, CLST *alen, const size_t fr, - CLST *new_lcn); + CLST *new_lcn, CLST *new_len); int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr, struct ATTR_LIST_ENTRY *le, struct mft_inode *mi, u64 new_size, struct runs_tree *run, @@ -422,7 +429,7 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, u64 new_size, const u64 *new_valid, bool keep_prealloc, struct ATTRIB **ret); int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, - CLST *len, bool *new); + CLST *len, bool *new, bool zero); int attr_data_read_resident(struct ntfs_inode *ni, struct page *page); int attr_data_write_resident(struct ntfs_inode *ni, struct page *page); int attr_load_runs_vcn(struct ntfs_inode *ni, enum ATTR_TYPE type, @@ -469,9 +476,9 @@ static inline size_t al_aligned(size_t size) } /* Globals from bitfunc.c */ -bool are_bits_clear(const ulong *map, size_t bit, size_t nbits); -bool are_bits_set(const ulong *map, size_t bit, size_t nbits); -size_t get_set_bits_ex(const ulong *map, size_t bit, size_t nbits); +bool are_bits_clear(const void *map, size_t bit, size_t nbits); +bool are_bits_set(const void *map, size_t bit, size_t nbits); +size_t get_set_bits_ex(const void *map, size_t bit, size_t nbits); /* Globals from dir.c */ int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len, @@ -487,8 +494,6 @@ extern const struct file_operations ntfs_dir_operations; /* Globals from file.c */ int ntfs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, u32 flags); -void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn, - CLST len); int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *attr); int ntfs_file_open(struct inode *inode, struct file *file); @@ -582,11 +587,10 @@ int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes, bool simple); int ntfs_extend_init(struct ntfs_sb_info *sbi); int ntfs_loadlog_and_replay(struct ntfs_inode *ni, struct ntfs_sb_info *sbi); -const struct ATTR_DEF_ENTRY *ntfs_query_def(struct ntfs_sb_info *sbi, - enum ATTR_TYPE Type); int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, CLST *new_lcn, CLST *new_len, enum ALLOCATE_OPT opt); +bool ntfs_check_for_free_space(struct ntfs_sb_info *sbi, CLST clen, CLST mlen); int ntfs_look_free_mft(struct ntfs_sb_info *sbi, CLST *rno, bool mft, struct ntfs_inode *ni, struct mft_inode **mi); void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft); @@ -643,6 +647,7 @@ int ntfs_remove_reparse(struct ntfs_sb_info *sbi, __le32 rtag, const struct MFT_REF *ref); void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim); int run_deallocate(struct ntfs_sb_info *sbi, struct runs_tree *run, bool trim); +bool valid_windows_name(struct ntfs_sb_info *sbi, const struct le_str *name); /* Globals from index.c */ int indx_used_bit(struct ntfs_index *indx, struct ntfs_inode *ni, size_t *bit); @@ -720,6 +725,7 @@ struct dentry *ntfs3_get_parent(struct dentry *child); extern const struct inode_operations ntfs_dir_inode_operations; extern const struct inode_operations 
ntfs_special_inode_operations; +extern const struct dentry_operations ntfs_dentry_ops; /* Globals from record.c */ int mi_get(struct ntfs_sb_info *sbi, CLST rno, struct mft_inode **mi); @@ -793,12 +799,12 @@ int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf, u32 run_buf_size, CLST *packed_vcns); int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf, - u32 run_buf_size); + int run_buf_size); #ifdef NTFS3_CHECK_FREE_CLST int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf, - u32 run_buf_size); + int run_buf_size); #else #define run_unpack_ex run_unpack #endif @@ -822,6 +828,8 @@ static inline size_t wnd_zeroes(const struct wnd_bitmap *wnd) int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits); int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits); int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits); +int wnd_set_used_safe(struct wnd_bitmap *wnd, size_t bit, size_t bits, + size_t *done); bool wnd_is_free(struct wnd_bitmap *wnd, size_t bit, size_t bits); bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits); @@ -834,11 +842,17 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits); void wnd_zone_set(struct wnd_bitmap *wnd, size_t Lcn, size_t Len); int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range); +void ntfs_bitmap_set_le(void *map, unsigned int start, int len); +void ntfs_bitmap_clear_le(void *map, unsigned int start, int len); +unsigned int ntfs_bitmap_weight_le(const void *bitmap, int bits); + /* Globals from upcase.c */ int ntfs_cmp_names(const __le16 *s1, size_t l1, const __le16 *s2, size_t l2, const u16 *upcase, bool bothcase); int ntfs_cmp_names_cpu(const struct cpu_str *uni1, const struct le_str *uni2, const u16 *upcase, bool bothcase); +unsigned long ntfs_names_hash(const u16 *name, size_t len, const u16 *upcase, + unsigned long hash); /* globals from xattr.c */ #ifdef CONFIG_NTFS3_FS_POSIX_ACL @@ -1113,6 +1127,11 @@ static inline void ni_lock_dir(struct ntfs_inode *ni) mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_PARENT); } +static inline void ni_lock_dir2(struct ntfs_inode *ni) +{ + mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_PARENT2); +} + static inline void ni_unlock(struct ntfs_inode *ni) { mutex_unlock(&ni->ni_lock); diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index af1e4b364ea8..a45096dba6bc 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -225,6 +225,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) return NULL; } + if (off + asize < off) { + /* overflow check */ + return NULL; + } + attr = Add2Ptr(attr, asize); off += asize; } @@ -265,10 +270,9 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) if (t16 + t32 > asize) return NULL; - if (attr->name_len && - le16_to_cpu(attr->name_off) + sizeof(short) * attr->name_len > t16) { + t32 = sizeof(short) * attr->name_len; + if (t32 && le16_to_cpu(attr->name_off) + t32 > t16) return NULL; - } return attr; } @@ -547,6 +551,10 @@ bool mi_resize_attr(struct mft_inode *mi, struct ATTRIB *attr, int bytes) return true; } +/* + * Pack runs in MFT record. + * If failed record is not changed. 
+ */ int mi_pack_runs(struct mft_inode *mi, struct ATTRIB *attr, struct runs_tree *run, CLST len) { diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c index aaaa0d3d35a2..a5af71cd8d14 100644 --- a/fs/ntfs3/run.c +++ b/fs/ntfs3/run.c @@ -919,12 +919,15 @@ int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf, */ int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf, - u32 run_buf_size) + int run_buf_size) { u64 prev_lcn, vcn64, lcn, next_vcn; const u8 *run_last, *run_0; bool is_mft = ino == MFT_REC_MFT; + if (run_buf_size < 0) + return -EINVAL; + /* Check for empty. */ if (evcn + 1 == svcn) return 0; @@ -1046,7 +1049,7 @@ int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, */ int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf, - u32 run_buf_size) + int run_buf_size) { int ret, err; CLST next_vcn, lcn, len; @@ -1093,25 +1096,8 @@ int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, if (down_write_trylock(&wnd->rw_lock)) { /* Mark all zero bits as used in range [lcn, lcn+len). */ - CLST i, lcn_f = 0, len_f = 0; - - err = 0; - for (i = 0; i < len; i++) { - if (wnd_is_free(wnd, lcn + i, 1)) { - if (!len_f) - lcn_f = lcn + i; - len_f += 1; - } else if (len_f) { - err = wnd_set_used(wnd, lcn_f, len_f); - len_f = 0; - if (err) - break; - } - } - - if (len_f) - err = wnd_set_used(wnd, lcn_f, len_f); - + size_t done; + err = wnd_set_used_safe(wnd, lcn, len, &done); up_write(&wnd->rw_lock); if (err) return err; diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 8e2fe0f69203..ef4ea3f21905 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -21,6 +21,30 @@ * https://docs.microsoft.com/en-us/windows/wsl/file-permissions * It stores uid/gid/mode/dev in xattr * + * ntfs allows up to 2^64 clusters per volume. + * It means you should use 64 bits lcn to operate with ntfs. + * Implementation of ntfs.sys uses only 32 bits lcn. + * Default ntfs3 uses 32 bits lcn too. + * ntfs3 built with CONFIG_NTFS3_64BIT_CLUSTER (ntfs3_64) uses 64 bits per lcn. 
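+ *
+ * An illustrative sanity check of the limits tabulated below (these figures
+ * follow directly from the cluster-count and cluster-size bounds stated
+ * above; nothing ntfs3-specific is assumed):
+ *
+ *	max volume size = max clusters * cluster size
+ *	4K clusters: 2^32 * 2^12 = 2^44 bytes = 16T
+ *	2M clusters: 2^32 * 2^21 = 2^53 bytes = 8P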
+ *
+ *
+ *     ntfs limits, cluster size is 4K (2^12)
+ *     -----------------------------------------------------------------------------
+ *     | Volume size | Clusters | ntfs.sys | ntfs3 | ntfs3_64 | mkntfs | chkdsk |
+ *     -----------------------------------------------------------------------------
+ *     | < 16T, 2^44 |  < 2^32  |   yes    |  yes  |   yes    |  yes   |  yes   |
+ *     | > 16T, 2^44 |  > 2^32  |   no     |  no   |   yes    |  yes   |  yes   |
+ *     -----------------------------------------------------------------------------
+ *
+ * To mount large volumes as ntfs, one should use a large cluster size (up to 2M).
+ * The maximum volume size in this case is 2^32 * 2^21 = 2^53 = 8P.
+ *
+ *     ntfs limits, cluster size is 2M (2^21)
+ *     -----------------------------------------------------------------------------
+ *     | < 8P, 2^53  |  < 2^32  |   yes    |  yes  |   yes    |  yes   |  yes   |
+ *     | > 8P, 2^53  |  > 2^32  |   no     |  no   |   yes    |  yes   |  yes   |
+ *     -----------------------------------------------------------------------------
+ *
 */

 #include

@@ -223,11 +247,14 @@ enum Opt {
 	Opt_force,
 	Opt_sparse,
 	Opt_nohidden,
+	Opt_hide_dot_files,
+	Opt_windows_names,
 	Opt_showmeta,
 	Opt_acl,
 	Opt_iocharset,
 	Opt_prealloc,
 	Opt_noacsrules,
+	Opt_nocase,
 	Opt_err,
 };
 
@@ -242,10 +269,13 @@ static const struct fs_parameter_spec ntfs_fs_parameters[] = {
 	fsparam_flag_no("force", Opt_force),
 	fsparam_flag_no("sparse", Opt_sparse),
 	fsparam_flag_no("hidden", Opt_nohidden),
+	fsparam_flag_no("hide_dot_files", Opt_hide_dot_files),
+	fsparam_flag_no("windows_names", Opt_windows_names),
 	fsparam_flag_no("acl", Opt_acl),
 	fsparam_flag_no("showmeta", Opt_showmeta),
 	fsparam_flag_no("prealloc", Opt_prealloc),
 	fsparam_flag_no("acsrules", Opt_noacsrules),
+	fsparam_flag_no("nocase", Opt_nocase),
 	fsparam_string("iocharset", Opt_iocharset),
 	{}
 };
@@ -330,6 +360,12 @@ static int ntfs_fs_parse_param(struct fs_context *fc,
 	case Opt_nohidden:
 		opts->nohidden = result.negated ? 1 : 0;
 		break;
+	case Opt_hide_dot_files:
+		opts->hide_dot_files = result.negated ? 0 : 1;
+		break;
+	case Opt_windows_names:
+		opts->windows_names = result.negated ? 0 : 1;
+		break;
 	case Opt_acl:
 		if (!result.negated)
 #ifdef CONFIG_NTFS3_FS_POSIX_ACL
@@ -354,6 +390,9 @@ static int ntfs_fs_parse_param(struct fs_context *fc,
 	case Opt_noacsrules:
 		opts->noacsrules = result.negated ? 1 : 0;
 		break;
+	case Opt_nocase:
+		opts->nocase = result.negated ? 1 : 0;
+		break;
 	default:
 		/* Should not be here unless we forgot to add a case.
*/ return -EINVAL; @@ -406,27 +445,18 @@ static struct inode *ntfs_alloc_inode(struct super_block *sb) return NULL; memset(ni, 0, offsetof(struct ntfs_inode, vfs_inode)); - mutex_init(&ni->ni_lock); - return &ni->vfs_inode; } -static void ntfs_i_callback(struct rcu_head *head) +static void ntfs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); struct ntfs_inode *ni = ntfs_i(inode); mutex_destroy(&ni->ni_lock); - kmem_cache_free(ntfs_inode_cachep, ni); } -static void ntfs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, ntfs_i_callback); -} - static void init_once(void *foo) { struct ntfs_inode *ni = foo; @@ -519,9 +549,9 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",gid=%u", from_kgid_munged(user_ns, opts->fs_gid)); if (opts->fmask) - seq_printf(m, ",fmask=%04o", ~opts->fs_fmask_inv); + seq_printf(m, ",fmask=%04o", opts->fs_fmask_inv ^ 0xffff); if (opts->dmask) - seq_printf(m, ",dmask=%04o", ~opts->fs_dmask_inv); + seq_printf(m, ",dmask=%04o", opts->fs_dmask_inv ^ 0xffff); if (opts->nls) seq_printf(m, ",iocharset=%s", opts->nls->charset); else @@ -536,6 +566,10 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",showmeta"); if (opts->nohidden) seq_puts(m, ",nohidden"); + if (opts->windows_names) + seq_puts(m, ",windows_names"); + if (opts->hide_dot_files) + seq_puts(m, ",hide_dot_files"); if (opts->force) seq_puts(m, ",force"); if (opts->noacsrules) @@ -592,7 +626,7 @@ static int ntfs_sync_fs(struct super_block *sb, int wait) static const struct super_operations ntfs_sops = { .alloc_inode = ntfs_alloc_inode, - .destroy_inode = ntfs_destroy_inode, + .free_inode = ntfs_free_inode, .evict_inode = ntfs_evict_inode, .put_super = ntfs_put_super, .statfs = ntfs_statfs, @@ -896,7 +930,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) struct block_device *bdev = sb->s_bdev; struct inode *inode; struct ntfs_inode *ni; - size_t i, tt; + size_t i, tt, bad_len, bad_frags; CLST vcn, lcn, len; struct ATTRIB *attr; const struct VOLUME_INFO *info; @@ -916,6 +950,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_export_op = &ntfs_export_ops; sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec sb->s_xattr = ntfs_xattr_handlers; + sb->s_d_op = sbi->options->nocase ? &ntfs_dentry_ops : NULL; sbi->options->nls = ntfs_load_nls(sbi->options->nls_name); if (IS_ERR(sbi->options->nls)) { @@ -1065,30 +1100,6 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) sbi->mft.ni = ni; - /* Load $BadClus. */ - ref.low = cpu_to_le32(MFT_REC_BADCLUST); - ref.seq = cpu_to_le16(MFT_REC_BADCLUST); - inode = ntfs_iget5(sb, &ref, &NAME_BADCLUS); - if (IS_ERR(inode)) { - ntfs_err(sb, "Failed to load $BadClus."); - err = PTR_ERR(inode); - goto out; - } - - ni = ntfs_i(inode); - - for (i = 0; run_get_entry(&ni->file.run, i, &vcn, &lcn, &len); i++) { - if (lcn == SPARSE_LCN) - continue; - - if (!sbi->bad_clusters) - ntfs_notice(sb, "Volume contains bad blocks"); - - sbi->bad_clusters += len; - } - - iput(inode); - /* Load $Bitmap. */ ref.low = cpu_to_le32(MFT_REC_BITMAP); ref.seq = cpu_to_le16(MFT_REC_BITMAP); @@ -1126,6 +1137,44 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) if (err) goto out; + /* Load $BadClus. 
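+	 *
+	 * Rough sketch of what the loop below does: if $BadClus maps a run
+	 * [lcn=100, len=8] (numbers invented for illustration), bits
+	 * 100..107 of the used-clusters bitmap are set on a read-write
+	 * mount, so the allocator can never hand those clusters out.
+	 * Finding any of them still free means $Bitmap disagrees with
+	 * $BadClus, and the volume is flagged NTFS_DIRTY_ERROR.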
*/ + ref.low = cpu_to_le32(MFT_REC_BADCLUST); + ref.seq = cpu_to_le16(MFT_REC_BADCLUST); + inode = ntfs_iget5(sb, &ref, &NAME_BADCLUS); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + ntfs_err(sb, "Failed to load $BadClus (%d).", err); + goto out; + } + + ni = ntfs_i(inode); + bad_len = bad_frags = 0; + for (i = 0; run_get_entry(&ni->file.run, i, &vcn, &lcn, &len); i++) { + if (lcn == SPARSE_LCN) + continue; + + bad_len += len; + bad_frags += 1; + if (sb_rdonly(sb)) + continue; + + if (wnd_set_used_safe(&sbi->used.bitmap, lcn, len, &tt) || tt) { + /* Bad blocks marked as free in bitmap. */ + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + } + } + if (bad_len) { + /* + * Notice about bad blocks. + * In normal cases these blocks are marked as used in bitmap. + * And we never allocate space in it. + */ + ntfs_notice(sb, + "Volume contains %zu bad blocks in %zu fragments.", + bad_len, bad_frags); + } + iput(inode); + /* Load $AttrDef. */ ref.low = cpu_to_le32(MFT_REC_ATTR); ref.seq = cpu_to_le16(MFT_REC_ATTR); @@ -1489,11 +1538,8 @@ static int __init init_ntfs_fs(void) static void __exit exit_ntfs_fs(void) { - if (ntfs_inode_cachep) { - rcu_barrier(); - kmem_cache_destroy(ntfs_inode_cachep); - } - + rcu_barrier(); + kmem_cache_destroy(ntfs_inode_cachep); unregister_filesystem(&ntfs_fs_type); ntfs3_exit_bitmap(); } diff --git a/fs/ntfs3/upcase.c b/fs/ntfs3/upcase.c index b5e8256fd710..7681eefacb4b 100644 --- a/fs/ntfs3/upcase.c +++ b/fs/ntfs3/upcase.c @@ -102,3 +102,15 @@ int ntfs_cmp_names_cpu(const struct cpu_str *uni1, const struct le_str *uni2, diff2 = l1 - l2; return diff2 ? diff2 : diff1; } + +/* Helper function for ntfs_d_hash. */ +unsigned long ntfs_names_hash(const u16 *name, size_t len, const u16 *upcase, + unsigned long hash) +{ + while (len--) { + unsigned int c = upcase_unicode_char(upcase, *name++); + hash = partial_name_hash(c, hash); + } + + return hash; +} diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index ea582b4fe1d9..cfd59bb0f9de 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -15,9 +15,10 @@ #include "ntfs_fs.h" // clang-format off -#define SYSTEM_DOS_ATTRIB "system.dos_attrib" -#define SYSTEM_NTFS_ATTRIB "system.ntfs_attrib" -#define SYSTEM_NTFS_SECURITY "system.ntfs_security" +#define SYSTEM_DOS_ATTRIB "system.dos_attrib" +#define SYSTEM_NTFS_ATTRIB "system.ntfs_attrib" +#define SYSTEM_NTFS_ATTRIB_BE "system.ntfs_attrib_be" +#define SYSTEM_NTFS_SECURITY "system.ntfs_security" // clang-format on static inline size_t unpacked_ea_size(const struct EA_FULL *ea) @@ -42,28 +43,26 @@ static inline size_t packed_ea_size(const struct EA_FULL *ea) * Assume there is at least one xattr in the list. 
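 *
 * Illustrative walk of the packed list (offsets invented): each EA_FULL
 * record occupies unpacked_ea_size() bytes, so the scan below is simply
 *
 *	off = 0:  "user.foo" -> no match, advance by its unpacked size
 *	off = 40: "user.bar" -> match: *off = 40, *ea_sz = record size
 *
 * and iteration stops once *off reaches 'bytes'.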
*/ static inline bool find_ea(const struct EA_FULL *ea_all, u32 bytes, - const char *name, u8 name_len, u32 *off) + const char *name, u8 name_len, u32 *off, u32 *ea_sz) { - *off = 0; + u32 ea_size; - if (!ea_all || !bytes) + *off = 0; + if (!ea_all) return false; - for (;;) { + for (; *off < bytes; *off += ea_size) { const struct EA_FULL *ea = Add2Ptr(ea_all, *off); - u32 next_off = *off + unpacked_ea_size(ea); - - if (next_off > bytes) - return false; - + ea_size = unpacked_ea_size(ea); if (ea->name_len == name_len && - !memcmp(ea->name, name, name_len)) + !memcmp(ea->name, name, name_len)) { + if (ea_sz) + *ea_sz = ea_size; return true; - - *off = next_off; - if (next_off >= bytes) - return false; + } } + + return false; } /* @@ -74,12 +73,12 @@ static inline bool find_ea(const struct EA_FULL *ea_all, u32 bytes, static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea, size_t add_bytes, const struct EA_INFO **info) { - int err; + int err = -EINVAL; struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTR_LIST_ENTRY *le = NULL; struct ATTRIB *attr_info, *attr_ea; void *ea_p; - u32 size; + u32 size, off, ea_size; static_assert(le32_to_cpu(ATTR_EA_INFO) < le32_to_cpu(ATTR_EA)); @@ -96,24 +95,31 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea, *info = resident_data_ex(attr_info, sizeof(struct EA_INFO)); if (!*info) - return -EINVAL; + goto out; /* Check Ea limit. */ size = le32_to_cpu((*info)->size); - if (size > sbi->ea_max_size) - return -EFBIG; + if (size > sbi->ea_max_size) { + err = -EFBIG; + goto out; + } - if (attr_size(attr_ea) > sbi->ea_max_size) - return -EFBIG; + if (attr_size(attr_ea) > sbi->ea_max_size) { + err = -EFBIG; + goto out; + } + + if (!size) { + /* EA info persists, but xattr is empty. Looks like EA problem. */ + goto out; + } /* Allocate memory for packed Ea. */ ea_p = kmalloc(size_add(size, add_bytes), GFP_NOFS); if (!ea_p) return -ENOMEM; - if (!size) { - /* EA info persists, but xattr is empty. Looks like EA problem. */ - } else if (attr_ea->non_res) { + if (attr_ea->non_res) { struct runs_tree run; run_init(&run); @@ -124,24 +130,52 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea, run_close(&run); if (err) - goto out; + goto out1; } else { void *p = resident_data_ex(attr_ea, size); - if (!p) { - err = -EINVAL; - goto out; - } + if (!p) + goto out1; memcpy(ea_p, p, size); } memset(Add2Ptr(ea_p, size), 0, add_bytes); + + /* Check all attributes for consistency. */ + for (off = 0; off < size; off += ea_size) { + const struct EA_FULL *ef = Add2Ptr(ea_p, off); + u32 bytes = size - off; + + /* Check if we can use field ea->size. */ + if (bytes < sizeof(ef->size)) + goto out1; + + if (ef->size) { + ea_size = le32_to_cpu(ef->size); + if (ea_size > bytes) + goto out1; + continue; + } + + /* Check if we can use fields ef->name_len and ef->elength. */ + if (bytes < offsetof(struct EA_FULL, name)) + goto out1; + + ea_size = ALIGN(struct_size(ef, name, + 1 + ef->name_len + + le16_to_cpu(ef->elength)), + 4); + if (ea_size > bytes) + goto out1; + } + *ea = ea_p; return 0; -out: +out1: kfree(ea_p); - *ea = NULL; +out: + ntfs_set_state(sbi, NTFS_DIRTY_DIRTY); return err; } @@ -163,6 +197,7 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer, const struct EA_FULL *ea; u32 off, size; int err; + int ea_size; size_t ret; err = ntfs_read_ea(ni, &ea_all, 0, &info); @@ -175,8 +210,9 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer, size = le32_to_cpu(info->size); /* Enumerate all xattrs. 
*/ - for (ret = 0, off = 0; off < size; off += unpacked_ea_size(ea)) { + for (ret = 0, off = 0; off < size; off += ea_size) { ea = Add2Ptr(ea_all, off); + ea_size = unpacked_ea_size(ea); if (buffer) { if (ret + ea->name_len + 1 > bytes_per_buffer) { @@ -227,7 +263,8 @@ static int ntfs_get_ea(struct inode *inode, const char *name, size_t name_len, goto out; /* Enumerate all xattrs. */ - if (!find_ea(ea_all, le32_to_cpu(info->size), name, name_len, &off)) { + if (!find_ea(ea_all, le32_to_cpu(info->size), name, name_len, &off, + NULL)) { err = -ENODATA; goto out; } @@ -269,7 +306,7 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name, struct EA_FULL *new_ea; struct EA_FULL *ea_all = NULL; size_t add, new_pack; - u32 off, size; + u32 off, size, ea_sz; __le16 size_pack; struct ATTRIB *attr; struct ATTR_LIST_ENTRY *le; @@ -304,9 +341,8 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name, size_pack = ea_info.size_pack; } - if (info && find_ea(ea_all, size, name, name_len, &off)) { + if (info && find_ea(ea_all, size, name, name_len, &off, &ea_sz)) { struct EA_FULL *ea; - size_t ea_sz; if (flags & XATTR_CREATE) { err = -EEXIST; @@ -329,8 +365,6 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name, if (ea->flags & FILE_NEED_EA) le16_add_cpu(&ea_info.count, -1); - ea_sz = unpacked_ea_size(ea); - le16_add_cpu(&ea_info.size_pack, 0 - packed_ea_size(ea)); memmove(ea, Add2Ptr(ea, ea_sz), size - off - ea_sz); @@ -604,10 +638,9 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, err = 0; /* Removing non existed xattr. */ if (!err) { set_cached_acl(inode, type, acl); - if (inode->i_mode != mode) { - inode->i_mode = mode; - mark_inode_dirty(inode); - } + inode->i_mode = mode; + inode->i_ctime = current_time(inode); + mark_inode_dirty(inode); } out: @@ -625,6 +658,67 @@ int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, return ntfs_set_acl_ex(mnt_userns, inode, acl, type, false); } +static int ntfs_xattr_get_acl(struct user_namespace *mnt_userns, + struct inode *inode, int type, void *buffer, + size_t size) +{ + struct posix_acl *acl; + int err; + + if (!(inode->i_sb->s_flags & SB_POSIXACL)) { + ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); + return -EOPNOTSUPP; + } + + acl = ntfs_get_acl(inode, type, false); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (!acl) + return -ENODATA; + + err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + posix_acl_release(acl); + + return err; +} + +static int ntfs_xattr_set_acl(struct user_namespace *mnt_userns, + struct inode *inode, int type, const void *value, + size_t size) +{ + struct posix_acl *acl; + int err; + + if (!(inode->i_sb->s_flags & SB_POSIXACL)) { + ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); + return -EOPNOTSUPP; + } + + if (!inode_owner_or_capable(mnt_userns, inode)) + return -EPERM; + + if (!value) { + acl = NULL; + } else { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (acl) { + err = posix_acl_valid(&init_user_ns, acl); + if (err) + goto release_and_out; + } + } + + err = ntfs_set_acl(mnt_userns, inode, acl, type); + +release_and_out: + posix_acl_release(acl); + return err; +} + /* * ntfs_init_acl - Initialize the ACLs of a new inode. * @@ -720,11 +814,9 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, { int err; struct ntfs_inode *ni = ntfs_i(inode); - size_t name_len = strlen(name); /* Dispatch request. 
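 	 *
 	 * For reference, the synthetic attributes handled here can be
 	 * exercised from userspace roughly like this (illustrative
 	 * commands, not part of this patch):
 	 *
 	 *	getfattr -n system.ntfs_attrib    file   # host-endian u32
 	 *	getfattr -n system.ntfs_attrib_be file   # big-endian u32
 	 *	getfattr -n system.dos_attrib     file   # u8
 	 *
 	 * POSIX ACL names and all other names fall through to the NTFS
 	 * extended attribute path at the end of this function.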
*/ - if (name_len == sizeof(SYSTEM_DOS_ATTRIB) - 1 && - !memcmp(name, SYSTEM_DOS_ATTRIB, sizeof(SYSTEM_DOS_ATTRIB))) { + if (!strcmp(name, SYSTEM_DOS_ATTRIB)) { /* system.dos_attrib */ if (!buffer) { err = sizeof(u8); @@ -737,8 +829,8 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, goto out; } - if (name_len == sizeof(SYSTEM_NTFS_ATTRIB) - 1 && - !memcmp(name, SYSTEM_NTFS_ATTRIB, sizeof(SYSTEM_NTFS_ATTRIB))) { + if (!strcmp(name, SYSTEM_NTFS_ATTRIB) || + !strcmp(name, SYSTEM_NTFS_ATTRIB_BE)) { /* system.ntfs_attrib */ if (!buffer) { err = sizeof(u32); @@ -747,12 +839,13 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, } else { err = sizeof(u32); *(u32 *)buffer = le32_to_cpu(ni->std_fa); + if (!strcmp(name, SYSTEM_NTFS_ATTRIB_BE)) + *(u32 *)buffer = cpu_to_be32(*(u32 *)buffer); } goto out; } - if (name_len == sizeof(SYSTEM_NTFS_SECURITY) - 1 && - !memcmp(name, SYSTEM_NTFS_SECURITY, sizeof(SYSTEM_NTFS_SECURITY))) { + if (!strcmp(name, SYSTEM_NTFS_SECURITY)) { /* system.ntfs_security*/ struct SECURITY_DESCRIPTOR_RELATIVE *sd = NULL; size_t sd_size = 0; @@ -791,8 +884,21 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, goto out; } +#ifdef CONFIG_NTFS3_FS_POSIX_ACL + if (!strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) || + !strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT)) { + /* TODO: init_user_ns? */ + err = ntfs_xattr_get_acl( + &init_user_ns, inode, + strlen(name) == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 + ? ACL_TYPE_ACCESS + : ACL_TYPE_DEFAULT, + buffer, size); + goto out; + } +#endif /* Deal with NTFS extended attribute. */ - err = ntfs_get_ea(inode, name, name_len, buffer, size, NULL); + err = ntfs_get_ea(inode, name, strlen(name), buffer, size, NULL); out: return err; @@ -809,23 +915,24 @@ static noinline int ntfs_setxattr(const struct xattr_handler *handler, { int err = -EINVAL; struct ntfs_inode *ni = ntfs_i(inode); - size_t name_len = strlen(name); enum FILE_ATTRIBUTE new_fa; /* Dispatch request. */ - if (name_len == sizeof(SYSTEM_DOS_ATTRIB) - 1 && - !memcmp(name, SYSTEM_DOS_ATTRIB, sizeof(SYSTEM_DOS_ATTRIB))) { + if (!strcmp(name, SYSTEM_DOS_ATTRIB)) { if (sizeof(u8) != size) goto out; new_fa = cpu_to_le32(*(u8 *)value); goto set_new_fa; } - if (name_len == sizeof(SYSTEM_NTFS_ATTRIB) - 1 && - !memcmp(name, SYSTEM_NTFS_ATTRIB, sizeof(SYSTEM_NTFS_ATTRIB))) { + if (!strcmp(name, SYSTEM_NTFS_ATTRIB) || + !strcmp(name, SYSTEM_NTFS_ATTRIB_BE)) { if (size != sizeof(u32)) goto out; - new_fa = cpu_to_le32(*(u32 *)value); + if (!strcmp(name, SYSTEM_NTFS_ATTRIB_BE)) + new_fa = cpu_to_le32(be32_to_cpu(*(u32 *)value)); + else + new_fa = cpu_to_le32(*(u32 *)value); if (S_ISREG(inode->i_mode)) { /* Process compressed/sparsed in special way. */ @@ -860,8 +967,7 @@ static noinline int ntfs_setxattr(const struct xattr_handler *handler, goto out; } - if (name_len == sizeof(SYSTEM_NTFS_SECURITY) - 1 && - !memcmp(name, SYSTEM_NTFS_SECURITY, sizeof(SYSTEM_NTFS_SECURITY))) { + if (!strcmp(name, SYSTEM_NTFS_SECURITY)) { /* system.ntfs_security*/ __le32 security_id; bool inserted; @@ -903,8 +1009,20 @@ static noinline int ntfs_setxattr(const struct xattr_handler *handler, goto out; } +#ifdef CONFIG_NTFS3_FS_POSIX_ACL + if (!strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) || + !strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT)) { + err = ntfs_xattr_set_acl( + mnt_userns, inode, + strlen(name) == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 + ? 
ACL_TYPE_ACCESS + : ACL_TYPE_DEFAULT, + value, size); + goto out; + } +#endif /* Deal with NTFS extended attribute. */ - err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0); + err = ntfs_set_ea(inode, name, strlen(name), value, size, flags, 0); out: inode->i_ctime = current_time(inode); @@ -992,7 +1110,7 @@ static bool ntfs_xattr_user_list(struct dentry *dentry) } // clang-format off -static const struct xattr_handler ntfs_other_xattr_handler = { +static const struct xattr_handler ntfs_xattr_handler = { .prefix = "", .get = ntfs_getxattr, .set = ntfs_setxattr, @@ -1000,11 +1118,7 @@ static const struct xattr_handler ntfs_other_xattr_handler = { }; const struct xattr_handler *ntfs_xattr_handlers[] = { -#ifdef CONFIG_NTFS3_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif - &ntfs_other_xattr_handler, + &ntfs_xattr_handler, NULL, }; // clang-format on diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 49d0d4ea63fc..0d56a8d862e8 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4058,7 +4058,7 @@ xfs_bmap_alloc_userdata( * the busy list. */ bma->datatype = XFS_ALLOC_NOBUSY; - if (whichfork == XFS_DATA_FORK) { + if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) { bma->datatype |= XFS_ALLOC_USERDATA; if (bma->offset == 0) bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; @@ -4551,7 +4551,8 @@ xfs_bmapi_convert_delalloc( * the extent. Just return the real extent at this offset. */ if (!isnullstartblock(bma.got.br_startblock)) { - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, + xfs_iomap_inode_sequence(ip, flags)); *seq = READ_ONCE(ifp->if_seq); goto out_trans_cancel; } @@ -4599,7 +4600,8 @@ xfs_bmapi_convert_delalloc( XFS_STATS_INC(mp, xs_xstrat_quick); ASSERT(!isnullstartblock(bma.got.br_startblock)); - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, + xfs_iomap_inode_sequence(ip, flags)); *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 4c16c8c31fcb..35f574421670 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4666,7 +4666,12 @@ xfs_btree_space_to_height( const unsigned int *limits, unsigned long long leaf_blocks) { - unsigned long long node_blocks = limits[1]; + /* + * The root btree block can have fewer than minrecs pointers in it + * because the tree might not be big enough to require that amount of + * fanout. Hence it has a minimum size of 2 pointers, not limits[1]. 
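+	 *
+	 * Illustration (fanout numbers invented): with limits[1] = 50, a
+	 * 2-level tree whose root holds only 2 pointers is still valid, so
+	 * seeding the estimate with node_blocks = 50 would let the loop
+	 * below terminate too early and report a height that is too small
+	 * for the given block count.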
+ */ + unsigned long long node_blocks = 2; unsigned long long blocks_left = leaf_blocks - 1; unsigned int height = 1; diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index eef27858a013..29c4b4ccb909 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -556,7 +556,6 @@ xfs_btree_islastblock( struct xfs_buf *bp; block = xfs_btree_get_block(cur, level, &bp); - ASSERT(block && xfs_btree_check_block(cur, block, level, bp) == 0); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK); diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 5362908164b0..01a9e86b3037 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -40,13 +40,12 @@ #define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25 #define XFS_ERRTAG_BMAP_FINISH_ONE 26 #define XFS_ERRTAG_AG_RESV_CRITICAL 27 + /* - * DEBUG mode instrumentation to test and/or trigger delayed allocation - * block killing in the event of failed writes. When enabled, all - * buffered writes are silenty dropped and handled as if they failed. - * All delalloc blocks in the range of the write (including pre-existing - * delalloc blocks!) are tossed as part of the write failure error - * handling sequence. + * Drop-writes support removed because write error handling cannot trash + * pre-existing delalloc extents in any useful way anymore. We retain the + * definition so that we can reject it as an invalid value in + * xfs_errortag_valid(). */ #define XFS_ERRTAG_DROP_WRITES 28 #define XFS_ERRTAG_LOG_BAD_CRC 29 @@ -62,7 +61,9 @@ #define XFS_ERRTAG_LARP 39 #define XFS_ERRTAG_DA_LEAF_SPLIT 40 #define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41 -#define XFS_ERRTAG_MAX 42 +#define XFS_ERRTAG_WB_DELAY_MS 42 +#define XFS_ERRTAG_WRITE_DELAY_MS 43 +#define XFS_ERRTAG_MAX 44 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -95,7 +96,6 @@ #define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 #define XFS_RANDOM_BMAP_FINISH_ONE 1 #define XFS_RANDOM_AG_RESV_CRITICAL 4 -#define XFS_RANDOM_DROP_WRITES 1 #define XFS_RANDOM_LOG_BAD_CRC 1 #define XFS_RANDOM_LOG_ITEM_PIN 1 #define XFS_RANDOM_BUF_LRU_REF 2 @@ -109,5 +109,7 @@ #define XFS_RANDOM_LARP 1 #define XFS_RANDOM_DA_LEAF_SPLIT 1 #define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 +#define XFS_RANDOM_WB_DELAY_MS 3000 +#define XFS_RANDOM_WRITE_DELAY_MS 3000 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 3f34bafe18dd..6f7ed9288fe4 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -815,11 +815,136 @@ xfs_refcount_find_right_extents( /* Is this extent valid? 
*/ static inline bool xfs_refc_valid( - struct xfs_refcount_irec *rc) + const struct xfs_refcount_irec *rc) { return rc->rc_startblock != NULLAGBLOCK; } +static inline xfs_nlink_t +xfs_refc_merge_refcount( + const struct xfs_refcount_irec *irec, + enum xfs_refc_adjust_op adjust) +{ + /* Once a record hits MAXREFCOUNT, it is pinned there forever */ + if (irec->rc_refcount == MAXREFCOUNT) + return MAXREFCOUNT; + return irec->rc_refcount + adjust; +} + +static inline bool +xfs_refc_want_merge_center( + const struct xfs_refcount_irec *left, + const struct xfs_refcount_irec *cleft, + const struct xfs_refcount_irec *cright, + const struct xfs_refcount_irec *right, + bool cleft_is_cright, + enum xfs_refc_adjust_op adjust, + unsigned long long *ulenp) +{ + unsigned long long ulen = left->rc_blockcount; + xfs_nlink_t new_refcount; + + /* + * To merge with a center record, both shoulder records must be + * adjacent to the record we want to adjust. This is only true if + * find_left and find_right made all four records valid. + */ + if (!xfs_refc_valid(left) || !xfs_refc_valid(right) || + !xfs_refc_valid(cleft) || !xfs_refc_valid(cright)) + return false; + + /* There must only be one record for the entire range. */ + if (!cleft_is_cright) + return false; + + /* The shoulder record refcounts must match the new refcount. */ + new_refcount = xfs_refc_merge_refcount(cleft, adjust); + if (left->rc_refcount != new_refcount) + return false; + if (right->rc_refcount != new_refcount) + return false; + + /* + * The new record cannot exceed the max length. ulen is a ULL as the + * individual record block counts can be up to (u32 - 1) in length + * hence we need to catch u32 addition overflows here. + */ + ulen += cleft->rc_blockcount + right->rc_blockcount; + if (ulen >= MAXREFCEXTLEN) + return false; + + *ulenp = ulen; + return true; +} + +static inline bool +xfs_refc_want_merge_left( + const struct xfs_refcount_irec *left, + const struct xfs_refcount_irec *cleft, + enum xfs_refc_adjust_op adjust) +{ + unsigned long long ulen = left->rc_blockcount; + xfs_nlink_t new_refcount; + + /* + * For a left merge, the left shoulder record must be adjacent to the + * start of the range. If this is true, find_left made left and cleft + * contain valid contents. + */ + if (!xfs_refc_valid(left) || !xfs_refc_valid(cleft)) + return false; + + /* Left shoulder record refcount must match the new refcount. */ + new_refcount = xfs_refc_merge_refcount(cleft, adjust); + if (left->rc_refcount != new_refcount) + return false; + + /* + * The new record cannot exceed the max length. ulen is a ULL as the + * individual record block counts can be up to (u32 - 1) in length + * hence we need to catch u32 addition overflows here. + */ + ulen += cleft->rc_blockcount; + if (ulen >= MAXREFCEXTLEN) + return false; + + return true; +} + +static inline bool +xfs_refc_want_merge_right( + const struct xfs_refcount_irec *cright, + const struct xfs_refcount_irec *right, + enum xfs_refc_adjust_op adjust) +{ + unsigned long long ulen = right->rc_blockcount; + xfs_nlink_t new_refcount; + + /* + * For a right merge, the right shoulder record must be adjacent to the + * end of the range. If this is true, find_right made cright and right + * contain valid contents. + */ + if (!xfs_refc_valid(right) || !xfs_refc_valid(cright)) + return false; + + /* Right shoulder record refcount must match the new refcount. 
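+	 *
+	 * Worked example (records invented): with cright = {48, 16, rc=2},
+	 * right = {64, 16, rc=3} and adjust = +1, the merged refcount is
+	 * 2 + 1 = 3, which matches right, so the pair can collapse into a
+	 * single record {48, 32, rc=3} as long as the combined length stays
+	 * under MAXREFCEXTLEN.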
*/ + new_refcount = xfs_refc_merge_refcount(cright, adjust); + if (right->rc_refcount != new_refcount) + return false; + + /* + * The new record cannot exceed the max length. ulen is a ULL as the + * individual record block counts can be up to (u32 - 1) in length + * hence we need to catch u32 addition overflows here. + */ + ulen += cright->rc_blockcount; + if (ulen >= MAXREFCEXTLEN) + return false; + + return true; +} + /* * Try to merge with any extents on the boundaries of the adjustment range. */ @@ -861,23 +986,15 @@ xfs_refcount_merge_extents( (cleft.rc_blockcount == cright.rc_blockcount); /* Try to merge left, cleft, and right. cleft must == cright. */ - ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount + - right.rc_blockcount; - if (xfs_refc_valid(&left) && xfs_refc_valid(&right) && - xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal && - left.rc_refcount == cleft.rc_refcount + adjust && - right.rc_refcount == cleft.rc_refcount + adjust && - ulen < MAXREFCEXTLEN) { + if (xfs_refc_want_merge_center(&left, &cleft, &cright, &right, cequal, + adjust, &ulen)) { *shape_changed = true; return xfs_refcount_merge_center_extents(cur, &left, &cleft, &right, ulen, aglen); } /* Try to merge left and cleft. */ - ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount; - if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) && - left.rc_refcount == cleft.rc_refcount + adjust && - ulen < MAXREFCEXTLEN) { + if (xfs_refc_want_merge_left(&left, &cleft, adjust)) { *shape_changed = true; error = xfs_refcount_merge_left_extent(cur, &left, &cleft, agbno, aglen); @@ -893,10 +1010,7 @@ xfs_refcount_merge_extents( } /* Try to merge cright and right. */ - ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount; - if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) && - right.rc_refcount == cright.rc_refcount + adjust && - ulen < MAXREFCEXTLEN) { + if (xfs_refc_want_merge_right(&cright, &right, adjust)) { *shape_changed = true; return xfs_refcount_merge_right_extent(cur, &right, &cright, aglen); diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a20cade590e9..1eeecf2eb2a7 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -972,7 +972,9 @@ xfs_log_sb( */ if (xfs_has_lazysbcount(mp)) { mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); - mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); + mp->m_sb.sb_ifree = min_t(uint64_t, + percpu_counter_sum(&mp->m_ifree), + mp->m_sb.sb_icount); mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); } diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index b7b838bd4ba4..4dd52b15f09c 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -609,9 +609,16 @@ xchk_agf( /* AGFL */ struct xchk_agfl_info { - unsigned int sz_entries; + /* Number of AGFL entries that the AGF claims are in use. */ + unsigned int agflcount; + + /* Number of AGFL entries that we found. */ unsigned int nr_entries; + + /* Buffer to hold AGFL entries for extent checking. 
*/ xfs_agblock_t *entries; + + struct xfs_buf *agfl_bp; struct xfs_scrub *sc; }; @@ -641,10 +648,10 @@ xchk_agfl_block( struct xfs_scrub *sc = sai->sc; if (xfs_verify_agbno(sc->sa.pag, agbno) && - sai->nr_entries < sai->sz_entries) + sai->nr_entries < sai->agflcount) sai->entries[sai->nr_entries++] = agbno; else - xchk_block_set_corrupt(sc, sc->sa.agfl_bp); + xchk_block_set_corrupt(sc, sai->agfl_bp); xchk_agfl_block_xref(sc, agbno); @@ -696,19 +703,26 @@ int xchk_agfl( struct xfs_scrub *sc) { - struct xchk_agfl_info sai; + struct xchk_agfl_info sai = { + .sc = sc, + }; struct xfs_agf *agf; xfs_agnumber_t agno = sc->sm->sm_agno; - unsigned int agflcount; unsigned int i; int error; + /* Lock the AGF and AGI so that nobody can touch this AG. */ error = xchk_ag_read_headers(sc, agno, &sc->sa); if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error)) - goto out; + return error; if (!sc->sa.agf_bp) return -EFSCORRUPTED; - xchk_buffer_recheck(sc, sc->sa.agfl_bp); + + /* Try to read the AGFL, and verify its structure if we get it. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &sai.agfl_bp); + if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error)) + return error; + xchk_buffer_recheck(sc, sai.agfl_bp); xchk_agfl_xref(sc); @@ -717,24 +731,21 @@ xchk_agfl( /* Allocate buffer to ensure uniqueness of AGFL entries. */ agf = sc->sa.agf_bp->b_addr; - agflcount = be32_to_cpu(agf->agf_flcount); - if (agflcount > xfs_agfl_size(sc->mp)) { + sai.agflcount = be32_to_cpu(agf->agf_flcount); + if (sai.agflcount > xfs_agfl_size(sc->mp)) { xchk_block_set_corrupt(sc, sc->sa.agf_bp); goto out; } - memset(&sai, 0, sizeof(sai)); - sai.sc = sc; - sai.sz_entries = agflcount; - sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, - KM_MAYFAIL); + sai.entries = kvcalloc(sai.agflcount, sizeof(xfs_agblock_t), + XCHK_GFP_FLAGS); if (!sai.entries) { error = -ENOMEM; goto out; } /* Check the blocks in the AGFL. */ - error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr, - sc->sa.agfl_bp, xchk_agfl_block, &sai); + error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr, sai.agfl_bp, + xchk_agfl_block, &sai); if (error == -ECANCELED) { error = 0; goto out_free; @@ -742,7 +753,7 @@ xchk_agfl( if (error) goto out_free; - if (agflcount != sai.nr_entries) { + if (sai.agflcount != sai.nr_entries) { xchk_block_set_corrupt(sc, sc->sa.agf_bp); goto out_free; } @@ -758,7 +769,7 @@ xchk_agfl( } out_free: - kmem_free(sai.entries); + kvfree(sai.entries); out: return error; } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 1b0b4e243f77..d75d82151eeb 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -442,12 +442,18 @@ xrep_agf( /* AGFL */ struct xrep_agfl { + /* Bitmap of alleged AGFL blocks that we're not going to add. */ + struct xbitmap crossed; + /* Bitmap of other OWN_AG metadata blocks. */ struct xbitmap agmetablocks; /* Bitmap of free space. */ struct xbitmap *freesp; + /* rmapbt cursor for finding crosslinked blocks */ + struct xfs_btree_cur *rmap_cur; + struct xfs_scrub *sc; }; @@ -477,6 +483,41 @@ xrep_agfl_walk_rmap( return xbitmap_set_btcur_path(&ra->agmetablocks, cur); } +/* Strike out the blocks that are cross-linked according to the rmapbt. 
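+ *
+ * For example (block numbers invented), if the rmapbt shows that candidate
+ * AGFL block 27 also has an owner other than OWN_AG, the block is recorded
+ * in ra->crossed and later subtracted from the candidate set: putting a
+ * cross-linked block back on the free list would let it be allocated twice.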
*/ +STATIC int +xrep_agfl_check_extent( + struct xrep_agfl *ra, + uint64_t start, + uint64_t len) +{ + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(ra->sc->mp, start); + xfs_agblock_t last_agbno = agbno + len - 1; + int error; + + ASSERT(XFS_FSB_TO_AGNO(ra->sc->mp, start) == ra->sc->sa.pag->pag_agno); + + while (agbno <= last_agbno) { + bool other_owners; + + error = xfs_rmap_has_other_keys(ra->rmap_cur, agbno, 1, + &XFS_RMAP_OINFO_AG, &other_owners); + if (error) + return error; + + if (other_owners) { + error = xbitmap_set(&ra->crossed, agbno, 1); + if (error) + return error; + } + + if (xchk_should_terminate(ra->sc, &error)) + return error; + agbno++; + } + + return 0; +} + /* * Map out all the non-AGFL OWN_AG space in this AG so that we can deduce * which blocks belong to the AGFL. @@ -496,44 +537,58 @@ xrep_agfl_collect_blocks( struct xrep_agfl ra; struct xfs_mount *mp = sc->mp; struct xfs_btree_cur *cur; + struct xbitmap_range *br, *n; int error; ra.sc = sc; ra.freesp = agfl_extents; xbitmap_init(&ra.agmetablocks); + xbitmap_init(&ra.crossed); /* Find all space used by the free space btrees & rmapbt. */ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xfs_rmap_query_all(cur, xrep_agfl_walk_rmap, &ra); - if (error) - goto err; xfs_btree_del_cursor(cur, error); + if (error) + goto out_bmp; /* Find all blocks currently being used by the bnobt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag, XFS_BTNUM_BNO); error = xbitmap_set_btblocks(&ra.agmetablocks, cur); - if (error) - goto err; xfs_btree_del_cursor(cur, error); + if (error) + goto out_bmp; /* Find all blocks currently being used by the cntbt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag, XFS_BTNUM_CNT); error = xbitmap_set_btblocks(&ra.agmetablocks, cur); - if (error) - goto err; - xfs_btree_del_cursor(cur, error); + if (error) + goto out_bmp; /* * Drop the freesp meta blocks that are in use by btrees. * The remaining blocks /should/ be AGFL blocks. */ error = xbitmap_disunion(agfl_extents, &ra.agmetablocks); - xbitmap_destroy(&ra.agmetablocks); if (error) - return error; + goto out_bmp; + + /* Strike out the blocks that are cross-linked. */ + ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); + for_each_xbitmap_extent(br, n, agfl_extents) { + error = xrep_agfl_check_extent(&ra, br->start, br->len); + if (error) + break; + } + xfs_btree_del_cursor(ra.rmap_cur, error); + if (error) + goto out_bmp; + error = xbitmap_disunion(agfl_extents, &ra.crossed); + if (error) + goto out_bmp; /* * Calculate the new AGFL size. If we found more blocks than fit in @@ -541,11 +596,10 @@ xrep_agfl_collect_blocks( */ *flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents), xfs_agfl_size(mp)); - return 0; -err: +out_bmp: + xbitmap_destroy(&ra.crossed); xbitmap_destroy(&ra.agmetablocks); - xfs_btree_del_cursor(cur, error); return error; } @@ -631,7 +685,7 @@ xrep_agfl_init_header( if (br->len) break; list_del(&br->list); - kmem_free(br); + kfree(br); } /* Write new AGFL to disk. */ @@ -697,7 +751,6 @@ xrep_agfl( * freespace overflow to the freespace btrees. 
*/ sc->sa.agf_bp = agf_bp; - sc->sa.agfl_bp = agfl_bp; error = xrep_roll_ag_trans(sc); if (error) goto err; diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index b6f0c9f3f124..31529b9bf389 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -49,7 +49,7 @@ xchk_setup_xattr_buf( if (ab) { if (sz <= ab->sz) return 0; - kmem_free(ab); + kvfree(ab); sc->buf = NULL; } @@ -79,7 +79,8 @@ xchk_setup_xattr( * without the inode lock held, which means we can sleep. */ if (sc->flags & XCHK_TRY_HARDER) { - error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, GFP_KERNEL); + error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, + XCHK_GFP_FLAGS); if (error) return error; } @@ -138,8 +139,7 @@ xchk_xattr_listent( * doesn't work, we overload the seen_enough variable to convey * the error message back to the main scrub function. */ - error = xchk_setup_xattr_buf(sx->sc, valuelen, - GFP_KERNEL | __GFP_RETRY_MAYFAIL); + error = xchk_setup_xattr_buf(sx->sc, valuelen, XCHK_GFP_FLAGS); if (error == -ENOMEM) error = -EDEADLOCK; if (error) { @@ -324,8 +324,7 @@ xchk_xattr_block( return 0; /* Allocate memory for block usage checking. */ - error = xchk_setup_xattr_buf(ds->sc, 0, - GFP_KERNEL | __GFP_RETRY_MAYFAIL); + error = xchk_setup_xattr_buf(ds->sc, 0, XCHK_GFP_FLAGS); if (error == -ENOMEM) return -EDEADLOCK; if (error) diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index b89bf9de9b1c..a255f09e9f0a 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -10,6 +10,7 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_btree.h" +#include "scrub/scrub.h" #include "scrub/bitmap.h" /* @@ -25,7 +26,7 @@ xbitmap_set( { struct xbitmap_range *bmr; - bmr = kmem_alloc(sizeof(struct xbitmap_range), KM_MAYFAIL); + bmr = kmalloc(sizeof(struct xbitmap_range), XCHK_GFP_FLAGS); if (!bmr) return -ENOMEM; @@ -47,7 +48,7 @@ xbitmap_destroy( for_each_xbitmap_extent(bmr, n, bitmap) { list_del(&bmr->list); - kmem_free(bmr); + kfree(bmr); } } @@ -174,15 +175,15 @@ xbitmap_disunion( /* Total overlap, just delete ex. */ lp = lp->next; list_del(&br->list); - kmem_free(br); + kfree(br); break; case 0: /* * Deleting from the middle: add the new right extent * and then shrink the left extent. 
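 			 *
 			 * For example, subtracting [15, 20) from [10, 30)
 			 * allocates a new right extent [20, 30) and then
 			 * shrinks the left one to [10, 15).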
*/ - new_br = kmem_alloc(sizeof(struct xbitmap_range), - KM_MAYFAIL); + new_br = kmalloc(sizeof(struct xbitmap_range), + XCHK_GFP_FLAGS); if (!new_br) { error = -ENOMEM; goto out; diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index f0b9cb6506fd..d50d0eab196a 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -90,6 +90,7 @@ xchk_setup_inode_bmap( struct xchk_bmap_info { struct xfs_scrub *sc; + struct xfs_iext_cursor icur; xfs_fileoff_t lastoff; bool is_rt; bool is_shared; @@ -146,6 +147,48 @@ xchk_bmap_get_rmap( return has_rmap; } +static inline bool +xchk_bmap_has_prev( + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_bmbt_irec got; + struct xfs_ifork *ifp; + + ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); + + if (!xfs_iext_peek_prev_extent(ifp, &info->icur, &got)) + return false; + if (got.br_startoff + got.br_blockcount != irec->br_startoff) + return false; + if (got.br_startblock + got.br_blockcount != irec->br_startblock) + return false; + if (got.br_state != irec->br_state) + return false; + return true; +} + +static inline bool +xchk_bmap_has_next( + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_bmbt_irec got; + struct xfs_ifork *ifp; + + ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); + + if (!xfs_iext_peek_next_extent(ifp, &info->icur, &got)) + return false; + if (irec->br_startoff + irec->br_blockcount != got.br_startoff) + return false; + if (irec->br_startblock + irec->br_blockcount != got.br_startblock) + return false; + if (got.br_state != irec->br_state) + return false; + return true; +} + /* Make sure that we have rmapbt records for this extent. */ STATIC void xchk_bmap_xref_rmap( @@ -214,6 +257,34 @@ xchk_bmap_xref_rmap( if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); + + /* + * If the rmap starts before this bmbt record, make sure there's a bmbt + * record for the previous offset that is contiguous with this mapping. + * Skip this for CoW fork extents because the refcount btree (and not + * the inode) is the ondisk owner for those extents. + */ + if (info->whichfork != XFS_COW_FORK && rmap.rm_startblock < agbno && + !xchk_bmap_has_prev(info, irec)) { + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + return; + } + + /* + * If the rmap ends after this bmbt record, make sure there's a bmbt + * record for the next offset that is contiguous with this mapping. + * Skip this for CoW fork extents because the refcount btree (and not + * the inode) is the ondisk owner for those extents. + */ + rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; + if (info->whichfork != XFS_COW_FORK && + rmap_end > agbno + irec->br_blockcount && + !xchk_bmap_has_next(info, irec)) { + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + return; + } } /* Cross-reference a single rtdev extent record. */ @@ -264,6 +335,8 @@ xchk_bmap_iextent_xref( case XFS_COW_FORK: xchk_xref_is_cow_staging(info->sc, agbno, irec->br_blockcount); + xchk_xref_is_not_shared(info->sc, agbno, + irec->br_blockcount); break; } @@ -297,14 +370,13 @@ xchk_bmap_dirattr_extent( } /* Scrub a single extent record. */ -STATIC int +STATIC void xchk_bmap_iextent( struct xfs_inode *ip, struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { struct xfs_mount *mp = info->sc->mp; - int error = 0; /* * Check for out-of-order extents. 
This record could have come @@ -325,14 +397,6 @@ xchk_bmap_iextent( xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); - /* - * Check for delalloc extents. We never iterate the ones in the - * in-core extent scan, and we should never see these in the bmbt. - */ - if (isnullstartblock(irec->br_startblock)) - xchk_fblock_set_corrupt(info->sc, info->whichfork, - irec->br_startoff); - /* Make sure the extent points to a valid place. */ if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) xchk_fblock_set_corrupt(info->sc, info->whichfork, @@ -353,15 +417,12 @@ xchk_bmap_iextent( irec->br_startoff); if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return 0; + return; if (info->is_rt) xchk_bmap_rt_iextent_xref(ip, info, irec); else xchk_bmap_iextent_xref(ip, info, irec); - - info->lastoff = irec->br_startoff + irec->br_blockcount; - return error; } /* Scrub a bmbt record. */ @@ -599,14 +660,41 @@ xchk_bmap_check_rmaps( for_each_perag(sc->mp, agno, pag) { error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag); - if (error) - break; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - break; + if (error || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { + xfs_perag_put(pag); + return error; + } } - if (pag) - xfs_perag_put(pag); - return error; + + return 0; +} + +/* Scrub a delalloc reservation from the incore extent map tree. */ +STATIC void +xchk_bmap_iextent_delalloc( + struct xfs_inode *ip, + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = info->sc->mp; + + /* + * Check for out-of-order extents. This record could have come + * from the incore list, for which there is no ordering check. + */ + if (irec->br_startoff < info->lastoff) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* Make sure the extent points to a valid place. */ + if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); } /* @@ -626,7 +714,6 @@ xchk_bmap( struct xfs_inode *ip = sc->ip; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_fileoff_t endoff; - struct xfs_iext_cursor icur; int error = 0; /* Non-existent forks can be ignored. */ @@ -661,6 +748,8 @@ xchk_bmap( case XFS_DINODE_FMT_DEV: case XFS_DINODE_FMT_LOCAL: /* No mappings to check. */ + if (whichfork == XFS_COW_FORK) + xchk_fblock_set_corrupt(sc, whichfork, 0); goto out; case XFS_DINODE_FMT_EXTENTS: break; @@ -690,20 +779,22 @@ xchk_bmap( /* Scrub extent records. 
*/ info.lastoff = 0; ifp = xfs_ifork_ptr(ip, whichfork); - for_each_xfs_iext(ifp, &icur, &irec) { + for_each_xfs_iext(ifp, &info.icur, &irec) { if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) goto out; - if (isnullstartblock(irec.br_startblock)) - continue; + if (irec.br_startoff >= endoff) { xchk_fblock_set_corrupt(sc, whichfork, irec.br_startoff); goto out; } - error = xchk_bmap_iextent(ip, &info, &irec); - if (error) - goto out; + + if (isnullstartblock(irec.br_startblock)) + xchk_bmap_iextent_delalloc(ip, &info, &irec); + else + xchk_bmap_iextent(ip, &info, &irec); + info.lastoff = irec.br_startoff + irec.br_blockcount; } error = xchk_bmap_check_rmaps(sc, whichfork); diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 2f4519590dc1..0fd36d5b4646 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -408,7 +408,6 @@ xchk_btree_check_owner( struct xfs_buf *bp) { struct xfs_btree_cur *cur = bs->cur; - struct check_owner *co; /* * In theory, xfs_btree_get_block should only give us a null buffer @@ -431,10 +430,13 @@ xchk_btree_check_owner( * later scanning. */ if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) { - co = kmem_alloc(sizeof(struct check_owner), - KM_MAYFAIL); + struct check_owner *co; + + co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS); if (!co) return -ENOMEM; + + INIT_LIST_HEAD(&co->list); co->level = level; co->daddr = xfs_buf_daddr(bp); list_add_tail(&co->list, &bs->to_check); @@ -649,7 +651,7 @@ xchk_btree( xchk_btree_set_corrupt(sc, cur, 0); return 0; } - bs = kmem_zalloc(cur_sz, KM_NOFS | KM_MAYFAIL); + bs = kzalloc(cur_sz, XCHK_GFP_FLAGS); if (!bs) return -ENOMEM; bs->cur = cur; @@ -740,9 +742,9 @@ xchk_btree( error = xchk_btree_check_block_owner(bs, co->level, co->daddr); list_del(&co->list); - kmem_free(co); + kfree(co); } - kmem_free(bs); + kfree(bs); return error; } diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 9bbbf20f401b..613260b04a3d 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -424,10 +424,6 @@ xchk_ag_read_headers( if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF)) return error; - error = xfs_alloc_read_agfl(sa->pag, sc->tp, &sa->agfl_bp); - if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL)) - return error; - return 0; } @@ -515,10 +511,6 @@ xchk_ag_free( struct xchk_ag *sa) { xchk_ag_btcur_free(sa); - if (sa->agfl_bp) { - xfs_trans_brelse(sc->tp, sa->agfl_bp); - sa->agfl_bp = NULL; - } if (sa->agf_bp) { xfs_trans_brelse(sc->tp, sa->agf_bp); sa->agf_bp = NULL; @@ -789,6 +781,33 @@ xchk_buffer_recheck( trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa); } +static inline int +xchk_metadata_inode_subtype( + struct xfs_scrub *sc, + unsigned int scrub_type) +{ + __u32 smtype = sc->sm->sm_type; + int error; + + sc->sm->sm_type = scrub_type; + + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xchk_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xchk_bmap_data(sc); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + break; + } + + sc->sm->sm_type = smtype; + return error; +} + /* * Scrub the attr/data forks of a metadata inode. The metadata inode must be * pointed to by sc->ip and the ILOCK must be held. @@ -797,13 +816,17 @@ int xchk_metadata_inode_forks( struct xfs_scrub *sc) { - __u32 smtype; bool shared; int error; if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return 0; + /* Check the inode record. 
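
xchk_metadata_inode_subtype(), invoked just below, is a save/dispatch/restore helper: it retargets sc->sm->sm_type so that corruption flags raised by the sub-scrubber are attributed to the right scrub type, then puts the caller's type back. A minimal standalone sketch of the same pattern, with invented names:

    #include <stdio.h>

    enum { TYPE_INODE = 1, TYPE_BMBTD = 2 };

    struct scrub {                  /* toy context; sm_type tags the results */
            int sm_type;
    };

    static int check_inode(struct scrub *sc)
    {
            printf("inode checks run as type %d\n", sc->sm_type);
            return 0;
    }

    static int check_bmap_data(struct scrub *sc)
    {
            printf("bmap checks run as type %d\n", sc->sm_type);
            return 0;
    }

    /* Run a sub-checker under its own type, then restore the caller's. */
    static int scrub_subtype(struct scrub *sc, int type)
    {
            int saved = sc->sm_type;
            int error;

            sc->sm_type = type;
            switch (type) {
            case TYPE_INODE:
                    error = check_inode(sc);
                    break;
            case TYPE_BMBTD:
                    error = check_bmap_data(sc);
                    break;
            default:
                    error = -1;     /* models ASSERT + -EFSCORRUPTED */
                    break;
            }
            sc->sm_type = saved;
            return error;
    }

    int main(void)
    {
            struct scrub sc = { .sm_type = 99 };

            scrub_subtype(&sc, TYPE_INODE);
            printf("caller's type restored: %d\n", sc.sm_type);
            return 0;
    }
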
*/ + error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + /* Metadata inodes don't live on the rt device. */ if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); @@ -823,10 +846,7 @@ xchk_metadata_inode_forks( } /* Invoke the data fork scrubber. */ - smtype = sc->sm->sm_type; - sc->sm->sm_type = XFS_SCRUB_TYPE_BMBTD; - error = xchk_bmap_data(sc); - sc->sm->sm_type = smtype; + error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; @@ -841,7 +861,7 @@ xchk_metadata_inode_forks( xchk_ino_set_corrupt(sc, sc->ip->i_ino); } - return error; + return 0; } /* diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 454145db10e7..b73648d81d23 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -25,7 +25,7 @@ xchk_should_terminate( if (fatal_signal_pending(current)) { if (*error == 0) - *error = -EAGAIN; + *error = -EINTR; return true; } return false; diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 84fe3d33d699..d17cee177085 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -486,7 +486,7 @@ xchk_da_btree( return 0; /* Set up initial da state. */ - ds = kmem_zalloc(sizeof(struct xchk_da_btree), KM_NOFS | KM_MAYFAIL); + ds = kzalloc(sizeof(struct xchk_da_btree), XCHK_GFP_FLAGS); if (!ds) return -ENOMEM; ds->dargs.dp = sc->ip; @@ -591,6 +591,6 @@ xchk_da_btree( out_state: xfs_da_state_free(ds->state); - kmem_free(ds); + kfree(ds); return error; } diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 5c87800ab223..d1b0f23c2c59 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -666,7 +666,12 @@ xchk_directory_blocks( struct xfs_scrub *sc) { struct xfs_bmbt_irec got; - struct xfs_da_args args; + struct xfs_da_args args = { + .dp = sc ->ip, + .whichfork = XFS_DATA_FORK, + .geo = sc->mp->m_dir_geo, + .trans = sc->tp, + }; struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); struct xfs_mount *mp = sc->mp; xfs_fileoff_t leaf_lblk; @@ -689,9 +694,6 @@ xchk_directory_blocks( free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET); /* Is this a block dir? */ - args.dp = sc->ip; - args.geo = mp->m_dir_geo; - args.trans = sc->tp; error = xfs_dir2_isblock(&args, &is_block); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 6a6f8fe7f87c..4777e7b89fdc 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -14,6 +14,8 @@ #include "xfs_health.h" #include "xfs_btree.h" #include "xfs_ag.h" +#include "xfs_rtalloc.h" +#include "xfs_inode.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -43,6 +45,16 @@ * our tolerance for mismatch between expected and actual counter values. */ +struct xchk_fscounters { + struct xfs_scrub *sc; + uint64_t icount; + uint64_t ifree; + uint64_t fdblocks; + uint64_t frextents; + unsigned long long icount_min; + unsigned long long icount_max; +}; + /* * Since the expected value computation is lockless but only browses incore * values, the percpu counters should be fairly close to each other. 
However, @@ -116,10 +128,11 @@ xchk_setup_fscounters( struct xchk_fscounters *fsc; int error; - sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0); + sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; fsc = sc->buf; + fsc->sc = sc; xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max); @@ -138,6 +151,18 @@ xchk_setup_fscounters( return xchk_trans_alloc(sc, 0); } +/* + * Part 1: Collecting filesystem summary counts. For each AG, we add its + * summary counts (total inodes, free inodes, free data blocks) to an incore + * copy of the overall filesystem summary counts. + * + * To avoid false corruption reports in part 2, any failure in this part must + * set the INCOMPLETE flag even when a negative errno is returned. This care + * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, + * ECANCELED) that are absorbed into a scrub state flag update by + * xchk_*_process_error. + */ + /* Count free space btree blocks manually for pre-lazysbcount filesystems. */ static int xchk_fscount_btreeblks( @@ -225,8 +250,10 @@ xchk_fscount_aggregate_agcounts( } if (pag) xfs_perag_put(pag); - if (error) + if (error) { + xchk_set_incomplete(sc); return error; + } /* * The global incore space reservation is taken from the incore @@ -267,6 +294,64 @@ xchk_fscount_aggregate_agcounts( return 0; } +#ifdef CONFIG_XFS_RT +STATIC int +xchk_fscount_add_frextent( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xchk_fscounters *fsc = priv; + int error = 0; + + fsc->frextents += rec->ar_extcount; + + xchk_should_terminate(fsc->sc, &error); + return error; +} + +/* Calculate the number of free realtime extents from the realtime bitmap. */ +STATIC int +xchk_fscount_count_frextents( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc) +{ + struct xfs_mount *mp = sc->mp; + int error; + + fsc->frextents = 0; + if (!xfs_has_realtime(mp)) + return 0; + + xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + error = xfs_rtalloc_query_all(sc->mp, sc->tp, + xchk_fscount_add_frextent, fsc); + if (error) { + xchk_set_incomplete(sc); + goto out_unlock; + } + +out_unlock: + xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + return error; +} +#else +STATIC int +xchk_fscount_count_frextents( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc) +{ + fsc->frextents = 0; + return 0; +} +#endif /* CONFIG_XFS_RT */ + +/* + * Part 2: Comparing filesystem summary counters. All we have to do here is + * sum the percpu counters and compare them to what we've observed. + */ + /* * Is the @counter reasonably close to the @expected value? * @@ -333,16 +418,17 @@ xchk_fscounters( { struct xfs_mount *mp = sc->mp; struct xchk_fscounters *fsc = sc->buf; - int64_t icount, ifree, fdblocks; + int64_t icount, ifree, fdblocks, frextents; int error; /* Snapshot the percpu counters. */ icount = percpu_counter_sum(&mp->m_icount); ifree = percpu_counter_sum(&mp->m_ifree); fdblocks = percpu_counter_sum(&mp->m_fdblocks); + frextents = percpu_counter_sum(&mp->m_frextents); /* No negative values, please! */ - if (icount < 0 || ifree < 0 || fdblocks < 0) + if (icount < 0 || ifree < 0 || fdblocks < 0 || frextents < 0) xchk_set_corrupt(sc); /* See if icount is obviously wrong. */ @@ -353,6 +439,10 @@ xchk_fscounters( if (fdblocks > mp->m_sb.sb_dblocks) xchk_set_corrupt(sc); + /* See if frextents is obviously wrong. 
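
The comparison step is conceptually simple: snapshot the lockless percpu sums first, recompute the expected values from metadata, then compare with some slack for concurrent activity. A self-contained model of that tolerance check, using toy types rather than the kernel percpu counter API (the kernel's real variance logic is more involved):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define NR_CPUS 4

    struct pcp_counter {            /* toy percpu counter */
            int64_t cpu[NR_CPUS];
    };

    static int64_t pcp_sum(const struct pcp_counter *c)
    {
            int64_t sum = 0;
            int i;

            for (i = 0; i < NR_CPUS; i++)
                    sum += c->cpu[i];
            return sum;
    }

    /*
     * Lockless snapshot vs. freshly computed value: tolerate a small
     * difference, since other CPUs may move their deltas while we count.
     */
    static int within_range(int64_t snapshot, int64_t computed, int64_t slack)
    {
            return llabs((long long)(snapshot - computed)) <= slack;
    }

    int main(void)
    {
            struct pcp_counter fdblocks = { { 100, -3, 40, 7 } };
            int64_t snap = pcp_sum(&fdblocks);

            fdblocks.cpu[2] += 2;   /* concurrent activity after snapshot */
            printf("within range: %d\n",
                   within_range(snap, pcp_sum(&fdblocks), 8));
            return 0;
    }
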
*/ + if (frextents > mp->m_sb.sb_rextents) + xchk_set_corrupt(sc); + /* * If ifree exceeds icount by more than the minimum variance then * something's probably wrong with the counters. @@ -367,6 +457,13 @@ xchk_fscounters( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) return 0; + /* Count the free extents counter for rt volumes. */ + error = xchk_fscount_count_frextents(sc, fsc); + if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) + return error; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) + return 0; + /* Compare the in-core counters with whatever we counted. */ if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount)) xchk_set_corrupt(sc); @@ -378,5 +475,9 @@ xchk_fscounters( fsc->fdblocks)) xchk_set_corrupt(sc); + if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, + fsc->frextents)) + xchk_set_corrupt(sc); + return 0; } diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 51820b40ab1c..7a2f38e5202c 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -365,7 +365,7 @@ xchk_dinode( * pagecache can't cache all the blocks in this file due to * overly large offsets, flag the inode for admin review. */ - if (isize >= mp->m_super->s_maxbytes) + if (isize > mp->m_super->s_maxbytes) xchk_ino_set_warning(sc, ino); /* di_nblocks */ diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 21b4c9006859..9eeac8565394 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -14,6 +14,7 @@ #include "xfs_inode.h" #include "xfs_quota.h" #include "xfs_qm.h" +#include "xfs_bmap.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -84,7 +85,7 @@ xchk_quota_item( int error = 0; if (xchk_should_terminate(sc, &error)) - return -ECANCELED; + return error; /* * Except for the root dquot, the actual dquot we got must either have @@ -189,11 +190,12 @@ xchk_quota_data_fork( for_each_xfs_iext(ifp, &icur, &irec) { if (xchk_should_terminate(sc, &error)) break; + /* - * delalloc extents or blocks mapped above the highest + * delalloc/unwritten extents or blocks mapped above the highest * quota id shouldn't happen. */ - if (isnullstartblock(irec.br_startblock) || + if (!xfs_bmap_is_written_extent(&irec) || irec.br_startoff > max_dqid_off || irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index a26ee0f24ef2..d9c1b3cea4a5 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -127,8 +127,8 @@ xchk_refcountbt_rmap_check( * is healthy each rmap_irec we see will be in agbno order * so we don't need insertion sort here. */ - frag = kmem_alloc(sizeof(struct xchk_refcnt_frag), - KM_MAYFAIL); + frag = kmalloc(sizeof(struct xchk_refcnt_frag), + XCHK_GFP_FLAGS); if (!frag) return -ENOMEM; memcpy(&frag->rm, rec, sizeof(frag->rm)); @@ -215,7 +215,7 @@ xchk_refcountbt_process_rmap_fragments( continue; } list_del(&frag->list); - kmem_free(frag); + kfree(frag); nr++; } @@ -257,11 +257,11 @@ xchk_refcountbt_process_rmap_fragments( /* Delete fragments and work list. 
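
The fragments being freed here were used to prove that the rmap records overlapping a refcount record really stack up to the recorded reference count. Stripped of the worklist machinery, the check is interval coverage; a brute-force userspace rendering with invented types (the kernel walks fragments in agbno order instead of rescanning):

    #include <stdio.h>

    struct frag {                   /* toy rmap fragment, [start, start+len) */
            unsigned long start;
            unsigned long len;
    };

    /* How many fragments cover a given block? The worklist above computes
     * the same depth incrementally as it sweeps forward. */
    static unsigned int coverage_at(const struct frag *f, int n,
                                    unsigned long bno)
    {
            unsigned int depth = 0;
            int i;

            for (i = 0; i < n; i++)
                    if (f[i].start <= bno && bno < f[i].start + f[i].len)
                            depth++;
            return depth;
    }

    int main(void)
    {
            /* Three owners map block 10, so its refcount should be 3. */
            struct frag frags[] = { { 8, 4 }, { 10, 6 }, { 5, 8 } };

            printf("refcount at block 10 = %u\n", coverage_at(frags, 3, 10));
            return 0;
    }
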
*/ list_for_each_entry_safe(frag, n, &worklist, list) { list_del(&frag->list); - kmem_free(frag); + kfree(frag); } list_for_each_entry_safe(frag, n, &refchk->fragments, list) { list_del(&frag->list); - kmem_free(frag); + kfree(frag); } } @@ -306,7 +306,7 @@ xchk_refcountbt_xref_rmap( out_free: list_for_each_entry_safe(frag, n, &refchk.fragments, list) { list_del(&frag->list); - kmem_free(frag); + kfree(frag); } } diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index c18bd039fce9..4b92f9253ccd 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -61,7 +61,6 @@ xrep_attempt( sc->flags |= XREP_ALREADY_FIXED; return -EAGAIN; case -EDEADLOCK: - case -EAGAIN: /* Tell the caller to try again having grabbed all the locks. */ if (!(sc->flags & XCHK_TRY_HARDER)) { sc->flags |= XCHK_TRY_HARDER; @@ -70,10 +69,15 @@ xrep_attempt( /* * We tried harder but still couldn't grab all the resources * we needed to fix it. The corruption has not been fixed, - * so report back to userspace. + * so exit to userspace with the scan's output flags unchanged. */ - return -EFSCORRUPTED; + return 0; default: + /* + * EAGAIN tells the caller to re-scrub, so we cannot return + * that here. + */ + ASSERT(error != -EAGAIN); return error; } } @@ -121,32 +125,40 @@ xrep_roll_ag_trans( { int error; - /* Keep the AG header buffers locked so we can keep going. */ - if (sc->sa.agi_bp) + /* + * Keep the AG header buffers locked while we roll the transaction. + * Ensure that both AG buffers are dirty and held when we roll the + * transaction so that they move forward in the log without losing the + * bli (and hence the bli type) when the transaction commits. + * + * Normal code would never hold clean buffers across a roll, but repair + * needs both buffers to maintain a total lock on the AG. + */ + if (sc->sa.agi_bp) { + xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM); xfs_trans_bhold(sc->tp, sc->sa.agi_bp); - if (sc->sa.agf_bp) + } + + if (sc->sa.agf_bp) { + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM); xfs_trans_bhold(sc->tp, sc->sa.agf_bp); - if (sc->sa.agfl_bp) - xfs_trans_bhold(sc->tp, sc->sa.agfl_bp); + } /* - * Roll the transaction. We still own the buffer and the buffer lock - * regardless of whether or not the roll succeeds. If the roll fails, - * the buffers will be released during teardown on our way out of the - * kernel. If it succeeds, we join them to the new transaction and - * move on. + * Roll the transaction. We still hold the AG header buffers locked + * regardless of whether or not that succeeds. On failure, the buffers + * will be released during teardown on our way out of the kernel. If + * successful, join the buffers to the new transaction and move on. */ error = xfs_trans_roll(&sc->tp); if (error) return error; - /* Join AG headers to the new transaction. */ + /* Join the AG headers to the new transaction. */ if (sc->sa.agi_bp) xfs_trans_bjoin(sc->tp, sc->sa.agi_bp); if (sc->sa.agf_bp) xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); - if (sc->sa.agfl_bp) - xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp); return 0; } @@ -498,6 +510,7 @@ xrep_put_freelist( struct xfs_scrub *sc, xfs_agblock_t agbno) { + struct xfs_buf *agfl_bp; int error; /* Make sure there's space on the freelist. */ @@ -516,8 +529,12 @@ xrep_put_freelist( return error; /* Put the block on the AGFL. 
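
The bhold/log/roll/bjoin choreography above is easy to get wrong, so here it is restated as a toy userspace model (struct trans and struct buf are stand-ins, not kernel types). The point the hunk makes is that the buffer must be dirtied before the roll so its log item, and hence its bli type, survives the commit:

    #include <stdio.h>

    struct buf {                    /* stand-in for xfs_buf + its bli */
            const char *name;
            int locked;
            int dirty;
    };

    struct trans {                  /* stand-in for xfs_trans */
            int id;
    };

    static void trans_log_buf(struct trans *tp, struct buf *bp)
    {
            bp->dirty = 1;          /* bli now moves forward with the log */
            printf("tx %d: logged %s\n", tp->id, bp->name);
    }

    static void trans_roll(struct trans *tp)
    {
            printf("tx %d: committed; continuing as tx %d\n", tp->id,
                   tp->id + 1);
            tp->id++;               /* held buffers stay locked across this */
    }

    static void trans_join_buf(struct trans *tp, struct buf *bp)
    {
            printf("tx %d: rejoined %s (locked=%d, dirty=%d)\n", tp->id,
                   bp->name, bp->locked, bp->dirty);
    }

    int main(void)
    {
            struct trans tp = { .id = 1 };
            struct buf agf = { .name = "AGF", .locked = 1, .dirty = 0 };

            trans_log_buf(&tp, &agf);       /* dirty it first */
            /* "bhold": keep the buffer locked over the roll */
            trans_roll(&tp);
            trans_join_buf(&tp, &agf);      /* attach to the new transaction */
            return 0;
    }
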
*/ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + return error; + error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp, - sc->sa.agfl_bp, agbno, 0); + agfl_bp, agbno, 0); if (error) return error; xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1, diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 2e8e400f10a9..07a7a75f987f 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -174,7 +174,7 @@ xchk_teardown( if (sc->flags & XCHK_REAPING_DISABLED) xchk_start_reaping(sc); if (sc->buf) { - kmem_free(sc->buf); + kvfree(sc->buf); sc->buf = NULL; } return error; @@ -467,7 +467,7 @@ xfs_scrub_metadata( xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB, "EXPERIMENTAL online scrub feature in use. Use at your own risk!"); - sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL); + sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS); if (!sc) { error = -ENOMEM; goto out; @@ -557,7 +557,7 @@ xfs_scrub_metadata( out_teardown: error = xchk_teardown(sc, error); out_sc: - kmem_free(sc); + kfree(sc); out: trace_xchk_done(XFS_I(file_inode(file)), sm, error); if (error == -EFSCORRUPTED || error == -EFSBADCRC) { diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 3de5287e98d8..b4d391b4c938 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -8,6 +8,15 @@ struct xfs_scrub; +/* + * Standard flags for allocating memory within scrub. NOFS context is + * configured by the process allocation scope. Scrub and repair must be able + * to back out gracefully if there isn't enough memory. Force-cast to avoid + * complaints from static checkers. + */ +#define XCHK_GFP_FLAGS ((__force gfp_t)(GFP_KERNEL | __GFP_NOWARN | \ + __GFP_RETRY_MAYFAIL)) + /* Type info and names for the scrub types. */ enum xchk_type { ST_NONE = 1, /* disabled */ @@ -39,7 +48,6 @@ struct xchk_ag { /* AG btree roots */ struct xfs_buf *agf_bp; - struct xfs_buf *agfl_bp; struct xfs_buf *agi_bp; /* AG btrees */ @@ -161,12 +169,4 @@ void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno, # define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0) #endif -struct xchk_fscounters { - uint64_t icount; - uint64_t ifree; - uint64_t fdblocks; - unsigned long long icount_min; - unsigned long long icount_max; -}; - #endif /* __XFS_SCRUB_SCRUB_H__ */ diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 75311f8daeeb..c1c99ffe7408 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -21,7 +21,7 @@ xchk_setup_symlink( struct xfs_scrub *sc) { /* Allocate the buffer without the inode lock held. */ - sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, GFP_KERNEL); + sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 5d1a995b15f8..41734202796f 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -17,6 +17,8 @@ #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_reflink.h" +#include "xfs_errortag.h" +#include "xfs_error.h" struct xfs_writepage_ctx { struct iomap_writepage_ctx ctx; @@ -114,9 +116,8 @@ xfs_end_ioend( if (unlikely(error)) { if (ioend->io_flags & IOMAP_F_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); - xfs_bmap_punch_delalloc_range(ip, - XFS_B_TO_FSBT(mp, offset), - XFS_B_TO_FSB(mp, size)); + xfs_bmap_punch_delalloc_range(ip, offset, + offset + size); } goto done; } @@ -218,11 +219,17 @@ xfs_imap_valid( * checked (and found nothing at this offset) could have added * overlapping blocks. 
*/ - if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) + if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) { + trace_xfs_wb_data_iomap_invalid(ip, &wpc->iomap, + XFS_WPC(wpc)->data_seq, XFS_DATA_FORK); return false; + } if (xfs_inode_has_cow_data(ip) && - XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) + XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) { + trace_xfs_wb_cow_iomap_invalid(ip, &wpc->iomap, + XFS_WPC(wpc)->cow_seq, XFS_COW_FORK); return false; + } return true; } @@ -286,6 +293,8 @@ xfs_map_blocks( if (xfs_is_shutdown(mp)) return -EIO; + XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS); + /* * COW fork blocks can overlap data fork blocks even if the blocks * aren't shared. COW I/O always takes precedent, so we must always @@ -373,7 +382,7 @@ xfs_map_blocks( isnullstartblock(imap.br_startblock)) goto allocate_blocks; - xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0); + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq); trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); return 0; allocate_blocks: @@ -455,12 +464,8 @@ xfs_discard_folio( struct folio *folio, loff_t pos) { - struct inode *inode = folio->mapping->host; - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(folio->mapping->host); struct xfs_mount *mp = ip->i_mount; - size_t offset = offset_in_folio(folio, pos); - xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, pos); - xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, offset); int error; if (xfs_is_shutdown(mp)) @@ -470,8 +475,9 @@ xfs_discard_folio( "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.", folio, ip->i_ino, pos); - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - i_blocks_per_folio(inode, folio) - pageoff_fsb); + error = xfs_bmap_punch_delalloc_range(ip, pos, + round_up(pos, folio_size(folio))); + if (error && !xfs_is_shutdown(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 04d0c2bff67c..867645b74d88 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -590,11 +590,13 @@ xfs_getbmap( int xfs_bmap_punch_delalloc_range( struct xfs_inode *ip, - xfs_fileoff_t start_fsb, - xfs_fileoff_t length) + xfs_off_t start_byte, + xfs_off_t end_byte) { + struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = &ip->i_df; - xfs_fileoff_t end_fsb = start_fsb + length; + xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; int error = 0; @@ -607,7 +609,7 @@ xfs_bmap_punch_delalloc_range( while (got.br_startoff + got.br_blockcount > start_fsb) { del = got; - xfs_trim_extent(&del, start_fsb, length); + xfs_trim_extent(&del, start_fsb, end_fsb - start_fsb); /* * A delete can push the cursor forward. 
Step back to the diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 24b37d211f1d..6888078f5c31 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -31,7 +31,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap) #endif /* CONFIG_XFS_RT */ int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, - xfs_fileoff_t start_fsb, xfs_fileoff_t length); + xfs_off_t start_byte, xfs_off_t end_byte); struct kgetbmap { __s64 bmv_offset; /* file offset of segment in blocks */ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index dde346450952..54c774af6e1c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1945,6 +1945,7 @@ xfs_free_buftarg( list_lru_destroy(&btp->bt_lru); blkdev_issue_flush(btp->bt_bdev); + invalidate_bdev(btp->bt_bdev); fs_put_dax(btp->bt_daxdev, btp->bt_mount); kmem_free(btp); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 522d450a94b1..df7322ed73fa 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1018,6 +1018,8 @@ xfs_buf_item_relse( trace_xfs_buf_item_relse(bp, _RET_IP_); ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); + if (atomic_read(&bip->bli_refcount)) + return; bp->b_log_item = NULL; xfs_buf_rele(bp); xfs_buf_item_free(bip); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index c6b2aabd6f18..713341d246d1 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -46,7 +46,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_REFCOUNT_FINISH_ONE, XFS_RANDOM_BMAP_FINISH_ONE, XFS_RANDOM_AG_RESV_CRITICAL, - XFS_RANDOM_DROP_WRITES, + 0, /* XFS_RANDOM_DROP_WRITES has been removed */ XFS_RANDOM_LOG_BAD_CRC, XFS_RANDOM_LOG_ITEM_PIN, XFS_RANDOM_BUF_LRU_REF, @@ -60,6 +60,8 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_LARP, XFS_RANDOM_DA_LEAF_SPLIT, XFS_RANDOM_ATTR_LEAF_TO_NODE, + XFS_RANDOM_WB_DELAY_MS, + XFS_RANDOM_WRITE_DELAY_MS, }; struct xfs_errortag_attr { @@ -162,7 +164,6 @@ XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDA XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE); XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE); XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); -XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES); XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); @@ -176,6 +177,8 @@ XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL); XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); +XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS); +XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -206,7 +209,6 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(refcount_finish_one), XFS_ERRORTAG_ATTR_LIST(bmap_finish_one), XFS_ERRORTAG_ATTR_LIST(ag_resv_critical), - XFS_ERRORTAG_ATTR_LIST(drop_writes), XFS_ERRORTAG_ATTR_LIST(log_bad_crc), XFS_ERRORTAG_ATTR_LIST(log_item_pin), XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), @@ -220,6 +222,8 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(larp), XFS_ERRORTAG_ATTR_LIST(da_leaf_split), XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), + XFS_ERRORTAG_ATTR_LIST(wb_delay_ms), + XFS_ERRORTAG_ATTR_LIST(write_delay_ms), NULL, 
}; ATTRIBUTE_GROUPS(xfs_errortag); @@ -256,6 +260,32 @@ xfs_errortag_del( kmem_free(mp->m_errortag); } +static bool +xfs_errortag_valid( + unsigned int error_tag) +{ + if (error_tag >= XFS_ERRTAG_MAX) + return false; + + /* Error out removed injection types */ + if (error_tag == XFS_ERRTAG_DROP_WRITES) + return false; + return true; +} + +bool +xfs_errortag_enabled( + struct xfs_mount *mp, + unsigned int tag) +{ + if (!mp->m_errortag) + return false; + if (!xfs_errortag_valid(tag)) + return false; + + return mp->m_errortag[tag] != 0; +} + bool xfs_errortag_test( struct xfs_mount *mp, @@ -277,7 +307,9 @@ xfs_errortag_test( if (!mp->m_errortag) return false; - ASSERT(error_tag < XFS_ERRTAG_MAX); + if (!xfs_errortag_valid(error_tag)) + return false; + randfactor = mp->m_errortag[error_tag]; if (!randfactor || prandom_u32_max(randfactor)) return false; @@ -293,7 +325,7 @@ xfs_errortag_get( struct xfs_mount *mp, unsigned int error_tag) { - if (error_tag >= XFS_ERRTAG_MAX) + if (!xfs_errortag_valid(error_tag)) return -EINVAL; return mp->m_errortag[error_tag]; @@ -305,7 +337,7 @@ xfs_errortag_set( unsigned int error_tag, unsigned int tag_value) { - if (error_tag >= XFS_ERRTAG_MAX) + if (!xfs_errortag_valid(error_tag)) return -EINVAL; mp->m_errortag[error_tag] = tag_value; @@ -319,7 +351,7 @@ xfs_errortag_add( { BUILD_BUG_ON(ARRAY_SIZE(xfs_errortag_random_default) != XFS_ERRTAG_MAX); - if (error_tag >= XFS_ERRTAG_MAX) + if (!xfs_errortag_valid(error_tag)) return -EINVAL; return xfs_errortag_set(mp, error_tag, diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 5191e9145e55..dbe6c37dc697 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -45,6 +45,18 @@ extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression, const char *file, int line, unsigned int error_tag); #define XFS_TEST_ERROR(expr, mp, tag) \ ((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag))) +bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag); +#define XFS_ERRORTAG_DELAY(mp, tag) \ + do { \ + might_sleep(); \ + if (!xfs_errortag_enabled((mp), (tag))) \ + break; \ + xfs_warn_ratelimited((mp), \ +"Injecting %ums delay at file %s, line %d, on filesystem \"%s\"", \ + (mp)->m_errortag[(tag)], __FILE__, __LINE__, \ + (mp)->m_super->s_id); \ + mdelay((mp)->m_errortag[(tag)]); \ + } while (0) extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag); extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag, @@ -55,6 +67,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #define xfs_errortag_init(mp) (0) #define xfs_errortag_del(mp) #define XFS_TEST_ERROR(expr, mp, tag) (expr) +#define XFS_ERRORTAG_DELAY(mp, tag) ((void)0) #define xfs_errortag_set(mp, tag, val) (ENOSYS) #define xfs_errortag_add(mp, tag) (ENOSYS) #define xfs_errortag_clearall(mp) (ENOSYS) diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index ad22a003f959..f3d328e4a440 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -236,6 +236,7 @@ xfs_extent_busy_update_extent( * */ busyp->bno = fend; + busyp->length = bend - fend; } else if (bbno < fbno) { /* * Case 8: diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e462d39c840e..595a5bcf46b9 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1325,7 +1325,7 @@ __xfs_filemap_fault( if (write_fault) { xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = iomap_page_mkwrite(vmf, - &xfs_buffered_write_iomap_ops); + &xfs_page_mkwrite_iomap_ops); xfs_iunlock(XFS_I(inode), 
XFS_MMAPLOCK_SHARED); } else { ret = filemap_fault(vmf); diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index d8337274c74d..88a88506ffff 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -524,7 +524,7 @@ xfs_getfsmap_rtdev_rtbitmap_query( struct xfs_mount *mp = tp->t_mountp; int error; - xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED); + xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); /* * Set up query parameters to return free rtextents covering the range @@ -551,7 +551,7 @@ xfs_getfsmap_rtdev_rtbitmap_query( if (error) goto err; err: - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED); + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); return error; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index eae7427062cf..ddeaccc04aec 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -342,6 +342,9 @@ xfs_iget_recycle( trace_xfs_iget_recycle(ip); + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + return -EAGAIN; + /* * We need to make it look like the inode is being reclaimed to prevent * the actual reclaim workers from stomping over us while we recycle @@ -355,6 +358,7 @@ xfs_iget_recycle( ASSERT(!rwsem_is_locked(&inode->i_rwsem)); error = xfs_reinit_inode(mp, inode); + xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) { /* * Re-initializing the inode failed, and we are in deep @@ -518,6 +522,8 @@ xfs_iget_cache_hit( if (ip->i_flags & XFS_IRECLAIMABLE) { /* Drops i_flags_lock and RCU read lock. */ error = xfs_iget_recycle(pag, ip); + if (error == -EAGAIN) + goto out_skip; if (error) return error; } else { @@ -1847,12 +1853,20 @@ xfs_inodegc_worker( struct xfs_inodegc, work); struct llist_node *node = llist_del_all(&gc->list); struct xfs_inode *ip, *n; + unsigned int nofs_flag; WRITE_ONCE(gc->items, 0); if (!node) return; + /* + * We can allocate memory here while doing writeback on behalf of + * memory reclaim. To avoid memory allocation deadlocks set the + * task-wide nofs context for the following operations. 
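
For reference, the scoped-allocation bracket being added here follows the usual shape: every allocation between save and restore implicitly behaves as GFP_NOFS, so direct reclaim cannot recurse back into the filesystem. A kernel-context sketch (this restates the hunk, builds only in-tree, and introduces no new API):

    #include <linux/sched/mm.h>

    /* Any allocation inside the bracket is implicitly GFP_NOFS. */
    static void inactivate_under_nofs(void)
    {
            unsigned int nofs_flag = memalloc_nofs_save();

            /* ... allocate and inactivate inodes here ... */

            memalloc_nofs_restore(nofs_flag);
    }
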
+ */ + nofs_flag = memalloc_nofs_save(); + ip = llist_entry(node, struct xfs_inode, i_gclist); trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits)); @@ -1861,6 +1875,8 @@ xfs_inodegc_worker( xfs_iflags_set(ip, XFS_INACTIVATING); xfs_inodegc_inactivate(ip); } + + memalloc_nofs_restore(nofs_flag); } /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index aa303be11576..d354ea2b74f9 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2479,7 +2479,7 @@ xfs_remove( error = xfs_dir_replace(tp, ip, &xfs_name_dotdot, tp->t_mountp->m_sb.sb_rootino, 0); if (error) - return error; + goto out_trans_cancel; } } else { /* diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 1f783e979629..85fbb3b71d1c 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -754,7 +754,7 @@ xfs_bulkstat_fmt( static int xfs_bulk_ireq_setup( struct xfs_mount *mp, - struct xfs_bulk_ireq *hdr, + const struct xfs_bulk_ireq *hdr, struct xfs_ibulk *breq, void __user *ubuffer) { @@ -780,7 +780,7 @@ xfs_bulk_ireq_setup( switch (hdr->ino) { case XFS_BULK_IREQ_SPECIAL_ROOT: - hdr->ino = mp->m_sb.sb_rootino; + breq->startino = mp->m_sb.sb_rootino; break; default: return -EINVAL; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 07da03976ec1..43f447199c08 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -48,13 +48,53 @@ xfs_alert_fsblock_zero( return -EFSCORRUPTED; } +u64 +xfs_iomap_inode_sequence( + struct xfs_inode *ip, + u16 iomap_flags) +{ + u64 cookie = 0; + + if (iomap_flags & IOMAP_F_XATTR) + return READ_ONCE(ip->i_af.if_seq); + if ((iomap_flags & IOMAP_F_SHARED) && ip->i_cowfp) + cookie = (u64)READ_ONCE(ip->i_cowfp->if_seq) << 32; + return cookie | READ_ONCE(ip->i_df.if_seq); +} + +/* + * Check that the iomap passed to us is still valid for the given offset and + * length. 
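
xfs_iomap_inode_sequence() above packs two 32-bit fork sequence counters into one 64-bit cookie, so a change to either fork invalidates a cached mapping. A small standalone demonstration of the packing (make_cookie is an invented name):

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Cookie layout: COW fork sequence in the high 32 bits (shared
     * mappings only), data fork sequence in the low 32 bits.
     */
    static uint64_t make_cookie(uint32_t data_seq, uint32_t cow_seq,
                                int shared)
    {
            uint64_t cookie = 0;

            if (shared)
                    cookie = (uint64_t)cow_seq << 32;
            return cookie | data_seq;
    }

    int main(void)
    {
            uint64_t c = make_cookie(7, 3, 1);

            printf("cookie = 0x%llx\n", (unsigned long long)c);
            printf("stale after cow bump:  %d\n", c != make_cookie(7, 4, 1));
            printf("stale after data bump: %d\n", c != make_cookie(8, 3, 1));
            return 0;
    }
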
+ */ +static bool +xfs_iomap_valid( + struct inode *inode, + const struct iomap *iomap) +{ + struct xfs_inode *ip = XFS_I(inode); + + if (iomap->validity_cookie != + xfs_iomap_inode_sequence(ip, iomap->flags)) { + trace_xfs_iomap_invalid(ip, iomap); + return false; + } + + XFS_ERRORTAG_DELAY(ip->i_mount, XFS_ERRTAG_WRITE_DELAY_MS); + return true; +} + +const struct iomap_page_ops xfs_iomap_page_ops = { + .iomap_valid = xfs_iomap_valid, +}; + int xfs_bmbt_to_iomap( struct xfs_inode *ip, struct iomap *iomap, struct xfs_bmbt_irec *imap, unsigned int mapping_flags, - u16 iomap_flags) + u16 iomap_flags, + u64 sequence_cookie) { struct xfs_mount *mp = ip->i_mount; struct xfs_buftarg *target = xfs_inode_buftarg(ip); @@ -91,6 +131,9 @@ xfs_bmbt_to_iomap( if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) iomap->flags |= IOMAP_F_DIRTY; + + iomap->validity_cookie = sequence_cookie; + iomap->page_ops = &xfs_iomap_page_ops; return 0; } @@ -195,7 +238,8 @@ xfs_iomap_write_direct( xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, unsigned int flags, - struct xfs_bmbt_irec *imap) + struct xfs_bmbt_irec *imap, + u64 *seq) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; @@ -285,6 +329,7 @@ xfs_iomap_write_direct( error = xfs_alert_fsblock_zero(ip, imap); out_unlock: + *seq = xfs_iomap_inode_sequence(ip, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -743,6 +788,7 @@ xfs_direct_write_iomap_begin( bool shared = false; u16 iomap_flags = 0; unsigned int lockmode = XFS_ILOCK_SHARED; + u64 seq; ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); @@ -811,9 +857,10 @@ xfs_direct_write_iomap_begin( goto out_unlock; } + seq = xfs_iomap_inode_sequence(ip, iomap_flags); xfs_iunlock(ip, lockmode); trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq); allocate_blocks: error = -EAGAIN; @@ -839,24 +886,26 @@ xfs_direct_write_iomap_begin( xfs_iunlock(ip, lockmode); error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb, - flags, &imap); + flags, &imap, &seq); if (error) return error; trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, - iomap_flags | IOMAP_F_NEW); + iomap_flags | IOMAP_F_NEW, seq); out_found_cow: - xfs_iunlock(ip, lockmode); length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); if (imap.br_startblock != HOLESTARTBLOCK) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); + seq = xfs_iomap_inode_sequence(ip, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); if (error) - return error; + goto out_unlock; } - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED); + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + xfs_iunlock(ip, lockmode); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); out_unlock: if (lockmode) @@ -915,6 +964,7 @@ xfs_buffered_write_iomap_begin( int allocfork = XFS_DATA_FORK; int error = 0; unsigned int lockmode = XFS_ILOCK_EXCL; + u64 seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -926,6 +976,10 @@ xfs_buffered_write_iomap_begin( ASSERT(!XFS_IS_REALTIME_INODE(ip)); + error = xfs_qm_dqattach(ip); + if (error) + return error; + error = xfs_ilock_for_iomap(ip, flags, &lockmode); if (error) return error; @@ -1029,10 +1083,6 @@ xfs_buffered_write_iomap_begin( allocfork = XFS_COW_FORK; } - 
error = xfs_qm_dqattach_locked(ip, false); - if (error) - goto out_unlock; - if (eof && offset + count > XFS_ISIZE(ip)) { /* * Determine the initial size of the preallocation. @@ -1094,32 +1144,47 @@ xfs_buffered_write_iomap_begin( * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch * them out if the write happens to fail. */ + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW); xfs_iunlock(ip, XFS_ILOCK_EXCL); trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq); found_imap: + seq = xfs_iomap_inode_sequence(ip, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); found_cow: - xfs_iunlock(ip, XFS_ILOCK_EXCL); + seq = xfs_iomap_inode_sequence(ip, 0); if (imap.br_startoff <= offset_fsb) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); if (error) - return error; + goto out_unlock; + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, - IOMAP_F_SHARED); + IOMAP_F_SHARED, seq); } xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } +static int +xfs_buffered_write_delalloc_punch( + struct inode *inode, + loff_t offset, + loff_t length) +{ + return xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, + offset + length); +} + static int xfs_buffered_write_iomap_end( struct inode *inode, @@ -1129,56 +1194,17 @@ xfs_buffered_write_iomap_end( unsigned flags, struct iomap *iomap) { - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t start_fsb; - xfs_fileoff_t end_fsb; - int error = 0; - - if (iomap->type != IOMAP_DELALLOC) - return 0; - - /* - * Behave as if the write failed if drop writes is enabled. Set the NEW - * flag to force delalloc cleanup. - */ - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DROP_WRITES)) { - iomap->flags |= IOMAP_F_NEW; - written = 0; - } - /* - * start_fsb refers to the first unused block after a short write. If - * nothing was written, round offset down to point at the first block in - * the range. - */ - if (unlikely(!written)) - start_fsb = XFS_B_TO_FSBT(mp, offset); - else - start_fsb = XFS_B_TO_FSB(mp, offset + written); - end_fsb = XFS_B_TO_FSB(mp, offset + length); + struct xfs_mount *mp = XFS_M(inode->i_sb); + int error; - /* - * Trim delalloc blocks if they were allocated by this write and we - * didn't manage to write the whole range. - * - * We don't need to care about racing delalloc as we hold i_mutex - * across the reserve/allocate/unreserve calls. If there are delalloc - * blocks in the range, they are ours. 
- */ - if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) { - truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), - XFS_FSB_TO_B(mp, end_fsb) - 1); - - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - end_fsb - start_fsb); - if (error && !xfs_is_shutdown(mp)) { - xfs_alert(mp, "%s: unable to clean up ino %lld", - __func__, ip->i_ino); - return error; - } + error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset, + length, written, &xfs_buffered_write_delalloc_punch); + if (error && !xfs_is_shutdown(mp)) { + xfs_alert(mp, "%s: unable to clean up ino 0x%llx", + __func__, XFS_I(inode)->i_ino); + return error; } - return 0; } @@ -1187,6 +1213,15 @@ const struct iomap_ops xfs_buffered_write_iomap_ops = { .iomap_end = xfs_buffered_write_iomap_end, }; +/* + * iomap_page_mkwrite() will never fail in a way that requires delalloc extents + * that it allocated to be revoked. Hence we do not need an .iomap_end method + * for this operation. + */ +const struct iomap_ops xfs_page_mkwrite_iomap_ops = { + .iomap_begin = xfs_buffered_write_iomap_begin, +}; + static int xfs_read_iomap_begin( struct inode *inode, @@ -1204,6 +1239,7 @@ xfs_read_iomap_begin( int nimaps = 1, error = 0; bool shared = false; unsigned int lockmode = XFS_ILOCK_SHARED; + u64 seq; ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); @@ -1217,13 +1253,14 @@ xfs_read_iomap_begin( &nimaps, 0); if (!error && (flags & IOMAP_REPORT)) error = xfs_reflink_trim_around_shared(ip, &imap, &shared); + seq = xfs_iomap_inode_sequence(ip, shared ? IOMAP_F_SHARED : 0); xfs_iunlock(ip, lockmode); if (error) return error; trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, - shared ? IOMAP_F_SHARED : 0); + shared ? IOMAP_F_SHARED : 0, seq); } const struct iomap_ops xfs_read_iomap_ops = { @@ -1248,6 +1285,7 @@ xfs_seek_iomap_begin( struct xfs_bmbt_irec imap, cmap; int error = 0; unsigned lockmode; + u64 seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -1282,8 +1320,9 @@ xfs_seek_iomap_begin( if (data_fsb < cow_fsb + cmap.br_blockcount) end_fsb = min(end_fsb, data_fsb); xfs_trim_extent(&cmap, offset_fsb, end_fsb); + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, - IOMAP_F_SHARED); + IOMAP_F_SHARED, seq); /* * This is a COW extent, so we must probe the page cache * because there could be dirty page cache being backed @@ -1304,8 +1343,9 @@ xfs_seek_iomap_begin( imap.br_startblock = HOLESTARTBLOCK; imap.br_state = XFS_EXT_NORM; done: + seq = xfs_iomap_inode_sequence(ip, 0); xfs_trim_extent(&imap, offset_fsb, end_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); out_unlock: xfs_iunlock(ip, lockmode); return error; @@ -1331,6 +1371,7 @@ xfs_xattr_iomap_begin( struct xfs_bmbt_irec imap; int nimaps = 1, error = 0; unsigned lockmode; + int seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -1347,12 +1388,14 @@ xfs_xattr_iomap_begin( error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, XFS_BMAPI_ATTRFORK); out_unlock: + + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_XATTR); xfs_iunlock(ip, lockmode); if (error) return error; ASSERT(nimaps); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_XATTR, seq); } const struct iomap_ops xfs_xattr_iomap_ops = { diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index c782e8c0479c..4da13440bae9 100644 --- 
a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -13,14 +13,15 @@ struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, unsigned int flags, - struct xfs_bmbt_irec *imap); + struct xfs_bmbt_irec *imap, u64 *sequence); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip, xfs_fileoff_t end_fsb); +u64 xfs_iomap_inode_sequence(struct xfs_inode *ip, u16 iomap_flags); int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap, struct xfs_bmbt_irec *imap, unsigned int mapping_flags, - u16 iomap_flags); + u16 iomap_flags, u64 sequence_cookie); int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len, bool *did_zero); @@ -47,6 +48,7 @@ xfs_aligned_fsb_count( } extern const struct iomap_ops xfs_buffered_write_iomap_ops; +extern const struct iomap_ops xfs_page_mkwrite_iomap_ops; extern const struct iomap_ops xfs_direct_write_iomap_ops; extern const struct iomap_ops xfs_read_iomap_ops; extern const struct iomap_ops xfs_seek_iomap_ops; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f02a0dd522b3..fc61cc024023 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -644,12 +644,14 @@ xfs_log_mount( int min_logfsbs; if (!xfs_has_norecovery(mp)) { - xfs_notice(mp, "Mounting V%d Filesystem", - XFS_SB_VERSION_NUM(&mp->m_sb)); + xfs_notice(mp, "Mounting V%d Filesystem %pU", + XFS_SB_VERSION_NUM(&mp->m_sb), + &mp->m_sb.sb_uuid); } else { xfs_notice(mp, -"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", - XFS_SB_VERSION_NUM(&mp->m_sb)); +"Mounting V%d filesystem %pU in no-recovery mode. Filesystem will be inconsistent.", + XFS_SB_VERSION_NUM(&mp->m_sb), + &mp->m_sb.sb_uuid); ASSERT(xfs_is_readonly(mp)); } @@ -886,6 +888,23 @@ xlog_force_iclog( return xlog_state_release_iclog(iclog->ic_log, iclog, NULL); } +/* + * Cycle all the iclogbuf locks to make sure all log IO completion + * is done before we tear down these buffers. + */ +static void +xlog_wait_iclog_completion(struct xlog *log) +{ + int i; + struct xlog_in_core *iclog = log->l_iclog; + + for (i = 0; i < log->l_iclog_bufs; i++) { + down(&iclog->ic_sema); + up(&iclog->ic_sema); + iclog = iclog->ic_next; + } +} + /* * Wait for the iclog and all prior iclogs to be written disk as required by the * log force state machine. Waiting on ic_force_wait ensures iclog completions @@ -1111,6 +1130,14 @@ xfs_log_unmount( { xfs_log_clean(mp); + /* + * If shutdown has come from iclog IO context, the log + * cleaning will have been skipped and so we need to wait + * for the iclog to complete shutdown processing before we + * tear anything down. + */ + xlog_wait_iclog_completion(mp->m_log); + xfs_buftarg_drain(mp->m_ddev_targp); xfs_trans_ail_destroy(mp); @@ -2113,17 +2140,6 @@ xlog_dealloc_log( xlog_in_core_t *iclog, *next_iclog; int i; - /* - * Cycle all the iclogbuf locks to make sure all log IO completion - * is done before we tear down these buffers. 
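
The lock-cycling trick described in this comment deserves a miniature: locking and immediately unlocking every buffer cannot finish until each in-flight completion has dropped its lock. A userspace analogue with pthread mutexes (invented names; compile with -pthread):

    #include <pthread.h>
    #include <stdio.h>

    #define NBUFS 3

    /* Each in-core log buffer's lock is held while its completion runs. */
    static pthread_mutex_t ic_lock[NBUFS] = {
            PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
            PTHREAD_MUTEX_INITIALIZER,
    };

    /*
     * Lock/unlock every buffer in turn: this cannot return until every
     * in-flight completion has released its lock, so it acts as a cheap
     * barrier before the buffers are torn down.
     */
    static void wait_iclog_completion(void)
    {
            int i;

            for (i = 0; i < NBUFS; i++) {
                    pthread_mutex_lock(&ic_lock[i]);
                    pthread_mutex_unlock(&ic_lock[i]);
            }
    }

    int main(void)
    {
            wait_iclog_completion();
            printf("all completions drained\n");
            return 0;
    }
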
- */ - iclog = log->l_iclog; - for (i = 0; i < log->l_iclog_bufs; i++) { - down(&iclog->ic_sema); - up(&iclog->ic_sema); - iclog = iclog->ic_next; - } - /* * Destroy the CIL after waiting for iclog IO completion because an * iclog EIO error will try to shut down the log, which accesses the diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index e8bb3c2e847e..fb87ffb48f7f 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -538,6 +538,20 @@ xfs_check_summary_counts( return 0; } +static void +xfs_unmount_check( + struct xfs_mount *mp) +{ + if (xfs_is_shutdown(mp)) + return; + + if (percpu_counter_sum(&mp->m_ifree) > + percpu_counter_sum(&mp->m_icount)) { + xfs_alert(mp, "ifree/icount mismatch at unmount"); + xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); + } +} + /* * Flush and reclaim dirty inodes in preparation for unmount. Inodes and * internal inode structures can be sitting in the CIL and AIL at this point, @@ -1077,6 +1091,7 @@ xfs_unmountfs( if (error) xfs_warn(mp, "Unable to free reserved block pool. " "Freespace may not be correct on next mount."); + xfs_unmount_check(mp); xfs_log_unmount(mp); xfs_da_unmount(mp); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 37a24f0f7cd4..38d23f0e703a 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -125,6 +125,7 @@ xfs_fs_map_blocks( int nimaps = 1; uint lock_flags; int error = 0; + u64 seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -176,6 +177,7 @@ xfs_fs_map_blocks( lock_flags = xfs_ilock_data_map_shared(ip); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, bmapi_flags); + seq = xfs_iomap_inode_sequence(ip, 0); ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK); @@ -189,7 +191,7 @@ xfs_fs_map_blocks( xfs_iunlock(ip, lock_flags); error = xfs_iomap_write_direct(ip, offset_fsb, - end_fsb - offset_fsb, 0, &imap); + end_fsb - offset_fsb, 0, &imap, &seq); if (error) goto out_unlock; @@ -209,7 +211,7 @@ xfs_fs_map_blocks( } xfs_iunlock(ip, XFS_IOLOCK_EXCL); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq); *device_generation = mp->m_generation; return error; out_unlock: diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 18bb4ec4d7c9..ff53d40a2dae 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -422,6 +422,14 @@ xfs_qm_dquot_isolate( if (!xfs_dqlock_nowait(dqp)) goto out_miss_busy; + /* + * If something else is freeing this dquot and hasn't yet removed it + * from the LRU, leave it for the freeing task to complete the freeing + * process rather than risk it being free from under us here. + */ + if (dqp->q_flags & XFS_DQFLAG_FREEING) + goto out_miss_unlock; + /* * This dquot has acquired a reference in the meantime remove it from * the freelist and try again. @@ -441,10 +449,8 @@ xfs_qm_dquot_isolate( * skip it so there is time for the IO to complete before we try to * reclaim it again on the next LRU pass. 
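
The isolate rework just below follows the standard shrinker rule: never block, and never touch an object another task is already freeing. A compilable toy version of that trylock-or-skip logic (all types invented; compile with -pthread):

    #include <pthread.h>
    #include <stdio.h>

    enum lru_status { LRU_REMOVED, LRU_SKIP };  /* subset, invented here */

    struct dq {                         /* toy dquot */
            pthread_mutex_t lock;
            int freeing;                /* models XFS_DQFLAG_FREEING */
            int refs;
    };

    /* Shrinker-style isolate: never block, never race a concurrent freer. */
    static enum lru_status isolate(struct dq *dqp)
    {
            if (pthread_mutex_trylock(&dqp->lock) != 0)
                    return LRU_SKIP;    /* busy: leave it for the next pass */
            if (dqp->freeing || dqp->refs) {
                    pthread_mutex_unlock(&dqp->lock);
                    return LRU_SKIP;    /* another task owns its fate */
            }
            /* ... flush if dirty, then free ... */
            pthread_mutex_unlock(&dqp->lock);
            return LRU_REMOVED;
    }

    int main(void)
    {
            struct dq d = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

            printf("status=%d\n", isolate(&d));
            return 0;
    }
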
*/ - if (!xfs_dqflock_nowait(dqp)) { - xfs_dqunlock(dqp); - goto out_miss_busy; - } + if (!xfs_dqflock_nowait(dqp)) + goto out_miss_unlock; if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; @@ -478,6 +484,8 @@ xfs_qm_dquot_isolate( XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims); return LRU_REMOVED; +out_miss_unlock: + xfs_dqunlock(dqp); out_miss_busy: trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 292d5e54a92c..16534e9873f6 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1311,10 +1311,10 @@ xfs_rtalloc_reinit_frextents( uint64_t val = 0; int error; - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent, &val); - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); if (error) return error; @@ -1325,6 +1325,41 @@ xfs_rtalloc_reinit_frextents( return 0; } +/* + * Read in the bmbt of an rt metadata inode so that we never have to load them + * at runtime. This enables the use of shared ILOCKs for rtbitmap scans. Use + * an empty transaction to avoid deadlocking on loops in the bmbt. + */ +static inline int +xfs_rtmount_iread_extents( + struct xfs_inode *ip, + unsigned int lock_class) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc_empty(ip->i_mount, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL | lock_class); + + error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + + if (xfs_inode_has_attr_fork(ip)) { + error = xfs_iread_extents(tp, ip, XFS_ATTR_FORK); + if (error) + goto out_unlock; + } + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | lock_class); + xfs_trans_cancel(tp); + return error; +} + /* * Get the bitmap and summary inodes and the summary cache into the mount * structure at mount time. 
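
The hunk that follows rewrites xfs_rtmount_inodes() so that each failure point unwinds exactly what was acquired before it, in reverse order. The resulting goto ladder, reduced to a standalone toy where grab() and rele() are placeholders for xfs_iget()/xfs_irele() and the extent preloading:

    #include <stdio.h>
    #include <string.h>

    static int grab(const char *name)
    {
            if (!strcmp(name, "rsumip extents"))
                    return -5;          /* simulate an I/O error */
            printf("grab %s\n", name);
            return 0;
    }

    static void rele(const char *name)
    {
            printf("rele %s\n", name);
    }

    /* Shape of the reworked error paths: unwind in reverse order. */
    static int mount_rt_inodes(void)
    {
            int error;

            error = grab("rbmip");
            if (error)
                    return error;
            error = grab("rbmip extents");
            if (error)
                    goto out_rele_bitmap;
            error = grab("rsumip");
            if (error)
                    goto out_rele_bitmap;
            error = grab("rsumip extents");
            if (error)
                    goto out_rele_summary;
            return 0;

    out_rele_summary:
            rele("rsumip");
    out_rele_bitmap:
            rele("rbmip");
            return error;
    }

    int main(void)
    {
            return mount_rt_inodes() ? 1 : 0;
    }
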
@@ -1342,14 +1377,27 @@ xfs_rtmount_inodes( return error; ASSERT(mp->m_rbmip != NULL); + error = xfs_rtmount_iread_extents(mp->m_rbmip, XFS_ILOCK_RTBITMAP); + if (error) + goto out_rele_bitmap; + error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip); - if (error) { - xfs_irele(mp->m_rbmip); - return error; - } + if (error) + goto out_rele_bitmap; ASSERT(mp->m_rsumip != NULL); + + error = xfs_rtmount_iread_extents(mp->m_rsumip, XFS_ILOCK_RTSUM); + if (error) + goto out_rele_summary; + xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks); return 0; + +out_rele_summary: + xfs_irele(mp->m_rsumip); +out_rele_bitmap: + xfs_irele(mp->m_rbmip); + return error; } void diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ee4b429a2f2c..0c4b73e9b29d 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1110,7 +1110,7 @@ xfs_fs_put_super( if (!sb->s_fs_info) return; - xfs_notice(mp, "Unmounting Filesystem"); + xfs_notice(mp, "Unmounting Filesystem %pU", &mp->m_sb.sb_uuid); xfs_filestream_unmount(mp); xfs_unmountfs(mp); diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index d269ef57ff01..8a5dc1538aa8 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -34,6 +34,8 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_error.h" +#include +#include "xfs_iomap.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 372d871bccc5..421d1e504ac4 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3352,6 +3352,92 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \ TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), \ TP_ARGS(ip, irec)) +/* inode iomap invalidation events */ +DECLARE_EVENT_CLASS(xfs_wb_invalid_class, + TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap, unsigned int wpcseq, int whichfork), + TP_ARGS(ip, iomap, wpcseq, whichfork), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u64, addr) + __field(loff_t, pos) + __field(u64, len) + __field(u16, type) + __field(u16, flags) + __field(u32, wpcseq) + __field(u32, forkseq) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->addr = iomap->addr; + __entry->pos = iomap->offset; + __entry->len = iomap->length; + __entry->type = iomap->type; + __entry->flags = iomap->flags; + __entry->wpcseq = wpcseq; + __entry->forkseq = READ_ONCE(xfs_ifork_ptr(ip, whichfork)->if_seq); + ), + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x wpcseq 0x%x forkseq 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pos, + __entry->addr, + __entry->len, + __entry->type, + __entry->flags, + __entry->wpcseq, + __entry->forkseq) +); +#define DEFINE_WB_INVALID_EVENT(name) \ +DEFINE_EVENT(xfs_wb_invalid_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap, unsigned int wpcseq, int whichfork), \ + TP_ARGS(ip, iomap, wpcseq, whichfork)) +DEFINE_WB_INVALID_EVENT(xfs_wb_cow_iomap_invalid); +DEFINE_WB_INVALID_EVENT(xfs_wb_data_iomap_invalid); + +DECLARE_EVENT_CLASS(xfs_iomap_invalid_class, + TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap), + TP_ARGS(ip, iomap), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u64, addr) + __field(loff_t, pos) + __field(u64, len) + __field(u64, validity_cookie) + __field(u64, inodeseq) + __field(u16, type) + __field(u16, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = 
ip->i_ino; + __entry->addr = iomap->addr; + __entry->pos = iomap->offset; + __entry->len = iomap->length; + __entry->validity_cookie = iomap->validity_cookie; + __entry->type = iomap->type; + __entry->flags = iomap->flags; + __entry->inodeseq = xfs_iomap_inode_sequence(ip, iomap->flags); + ), + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x validity_cookie 0x%llx inodeseq 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pos, + __entry->addr, + __entry->len, + __entry->type, + __entry->flags, + __entry->validity_cookie, + __entry->inodeseq) +); +#define DEFINE_IOMAP_INVALID_EVENT(name) \ +DEFINE_EVENT(xfs_iomap_invalid_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap), \ + TP_ARGS(ip, iomap)) +DEFINE_IOMAP_INVALID_EVENT(xfs_iomap_invalid); + /* refcount/reflink tracepoint definitions */ /* reflink tracepoints */ diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index f51df7d94ef7..7d4109af193e 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -422,7 +422,7 @@ xfsaild_push( struct xfs_ail_cursor cur; struct xfs_log_item *lip; xfs_lsn_t lsn; - xfs_lsn_t target; + xfs_lsn_t target = NULLCOMMITLSN; long tout; int stuck = 0; int flushing = 0; @@ -472,6 +472,8 @@ xfsaild_push( XFS_STATS_INC(mp, xs_push_ail); + ASSERT(target != NULLCOMMITLSN); + lsn = lip->li_lsn; while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { int lock_result; diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index c325a28b89a8..10aa1fd39d2b 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -210,7 +210,7 @@ __xfs_xattr_put_listent( return; } offset = context->buffer + context->count; - strncpy(offset, prefix, prefix_len); + memcpy(offset, prefix, prefix_len); offset += prefix_len; strncpy(offset, (char *)name, namelen); /* real name */ offset += namelen; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 238a03087e17..0983dfc9a203 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -49,26 +49,35 @@ struct vm_fault; * * IOMAP_F_BUFFER_HEAD indicates that the file system requires the use of * buffer heads for this mapping. + * + * IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent + * rather than a file data extent. */ -#define IOMAP_F_NEW 0x01 -#define IOMAP_F_DIRTY 0x02 -#define IOMAP_F_SHARED 0x04 -#define IOMAP_F_MERGED 0x08 -#define IOMAP_F_BUFFER_HEAD 0x10 -#define IOMAP_F_ZONE_APPEND 0x20 +#define IOMAP_F_NEW (1U << 0) +#define IOMAP_F_DIRTY (1U << 1) +#define IOMAP_F_SHARED (1U << 2) +#define IOMAP_F_MERGED (1U << 3) +#define IOMAP_F_BUFFER_HEAD (1U << 4) +#define IOMAP_F_ZONE_APPEND (1U << 5) +#define IOMAP_F_XATTR (1U << 6) /* * Flags set by the core iomap code during operations: * * IOMAP_F_SIZE_CHANGED indicates to the iomap_end method that the file size * has changed as the result of this write operation. + * + * IOMAP_F_STALE indicates that the iomap is not valid any longer and the file + * range it covers needs to be remapped by the high level before the operation + * can proceed. 
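
The IOMAP_F_STALE protocol described here is a retry loop: build a mapping with a validity cookie, revalidate under the folio lock, and remap if the cookie no longer matches. A self-contained model of that loop; only the flag value mirrors the header change below, everything else is invented:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define IOMAP_F_STALE (1U << 9)     /* same bit as the header below */

    struct toy_iomap {
            uint64_t validity_cookie;
            unsigned int flags;
    };

    static uint64_t fork_seq = 1;       /* bumped when the extent map changes */

    static void get_mapping(struct toy_iomap *m)
    {
            m->validity_cookie = fork_seq;
            m->flags = 0;
    }

    static bool toy_iomap_valid(const struct toy_iomap *m)
    {
            return m->validity_cookie == fork_seq;
    }

    int main(void)
    {
            struct toy_iomap map;
            int pass;

            /* Write-loop shape: revalidate under the folio lock, remap
             * (loop again) if the mapping went stale in the meantime. */
            for (pass = 0; pass < 2; pass++) {
                    get_mapping(&map);
                    if (pass == 0)
                            fork_seq++; /* a racing extent map change */
                    if (!toy_iomap_valid(&map)) {
                            map.flags |= IOMAP_F_STALE;
                            printf("pass %d: stale, remapping\n", pass);
                            continue;
                    }
                    printf("pass %d: mapping valid, writing\n", pass);
            }
            return 0;
    }
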
*/ -#define IOMAP_F_SIZE_CHANGED 0x100 +#define IOMAP_F_SIZE_CHANGED (1U << 8) +#define IOMAP_F_STALE (1U << 9) /* * Flags from 0x1000 up are for file system specific usage: */ -#define IOMAP_F_PRIVATE 0x1000 +#define IOMAP_F_PRIVATE (1U << 12) /* @@ -89,6 +98,7 @@ struct iomap { void *inline_data; void *private; /* filesystem private */ const struct iomap_page_ops *page_ops; + u64 validity_cookie; /* used with .iomap_valid() */ }; static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos) @@ -128,6 +138,23 @@ struct iomap_page_ops { int (*page_prepare)(struct inode *inode, loff_t pos, unsigned len); void (*page_done)(struct inode *inode, loff_t pos, unsigned copied, struct page *page); + + /* + * Check that the cached iomap still maps correctly to the filesystem's + * internal extent map. FS internal extent maps can change while iomap + * is iterating a cached iomap, so this hook allows iomap to detect that + * the iomap needs to be refreshed during a long running write + * operation. + * + * The filesystem can store internal state (e.g. a sequence number) in + * iomap->validity_cookie when the iomap is first mapped to be able to + * detect changes between mapping time and whenever .iomap_valid() is + * called. + * + * This is called with the folio over the specified file position held + * locked by the iomap code. + */ + bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap); }; /* @@ -226,6 +253,10 @@ static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i) ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops); +int iomap_file_buffered_write_punch_delalloc(struct inode *inode, + struct iomap *iomap, loff_t pos, loff_t length, ssize_t written, + int (*punch)(struct inode *inode, loff_t pos, loff_t length)); + int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); -- 2.40.0 From 4c93d4c44759855228ea9b96e57a88fe097f6b77 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Fri, 10 Mar 2023 19:32:13 +0100 Subject: [PATCH 08/17] Implement amd-pstate-epp and amd-pstate-guided Signed-off-by: Peter Jung --- .../admin-guide/kernel-parameters.txt | 33 +- Documentation/admin-guide/pm/amd-pstate.rst | 95 ++- drivers/acpi/cppc_acpi.c | 188 ++++- drivers/cpufreq/amd-pstate.c | 794 +++++++++++++++++- drivers/cpufreq/brcmstb-avs-cpufreq.c | 5 +- drivers/cpufreq/cpufreq.c | 8 +- drivers/cpufreq/mediatek-cpufreq-hw.c | 4 +- drivers/cpufreq/omap-cpufreq.c | 4 +- drivers/cpufreq/qcom-cpufreq-hw.c | 4 +- include/acpi/cppc_acpi.h | 23 + include/linux/amd-pstate.h | 34 + include/linux/cpufreq.h | 2 +- 12 files changed, 1136 insertions(+), 58 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0cef8c4c1f28..f2a13c7e73a1 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -339,6 +339,29 @@ This mode requires kvm-amd.avic=1. (Default when IOMMU HW support is present.) + amd_pstate= [X86] + disable + Do not enable amd_pstate as the default + scaling driver for the supported processors + passive + Use amd_pstate with passive mode as a scaling driver. + In this mode autonomous selection is disabled. 
+			  Driver requests a desired performance level and the
+			  platform tries to match it, provided the request can
+			  be satisfied within the guaranteed performance level.
+			active
+			  Use the amd_pstate_epp driver instance as the scaling
+			  driver. The driver provides a hint to the CPPC
+			  firmware indicating whether software wants to bias
+			  toward performance (0x0) or energy efficiency (0xff).
+			  The CPPC power algorithm then evaluates the runtime
+			  workload and adjusts the core frequencies in real time.
+			guided
+			  Activate guided autonomous mode. The driver requests
+			  minimum and maximum performance levels and the platform
+			  autonomously selects a performance level in this range
+			  that is appropriate for the current workload.
+
 	amijoy.map=	[HW,JOY] Amiga joystick support
 			Map of devices attached to JOY0DAT and JOY1DAT
 			Format: <a>,<b>
 
@@ -6977,13 +7000,3 @@
 			xmon commands.
 			off	xmon is disabled.
 
-	amd_pstate=	[X86]
-			disable
-			  Do not enable amd_pstate as the default
-			  scaling driver for the supported processors
-			passive
-			  Use amd_pstate as a scaling driver, driver requests a
-			  desired performance on this abstract scale and the power
-			  management firmware translates the requests into actual
-			  hardware states (core frequency, data fabric and memory
-			  clocks etc.)
diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst
index 06e23538f79c..2272bdb9d55b 100644
--- a/Documentation/admin-guide/pm/amd-pstate.rst
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
@@ -230,8 +230,8 @@ with :c:macro:`MSR_AMD_CPPC_ENABLE` or ``cppc_set_enable``, it will respond
 to the request from AMD P-States.
 
 
-User Space Interface in ``sysfs``
-==================================
+User Space Interface in ``sysfs`` - Per-policy control
+======================================================
 
 ``amd-pstate`` exposes several global attributes (files) in ``sysfs`` to
 control its functionality at the system level. They are located in the
@@ -262,6 +262,25 @@ lowest non-linear performance in `AMD CPPC Performance Capability
 <perf_cap_>`_.) This attribute is read-only.
 
+``energy_performance_available_preferences``
+
+A list of all the supported EPP preferences that can be used for
+``energy_performance_preference`` on this system.
+These profiles represent different hints that are provided to the low-level
+firmware about the user's desired tradeoff between performance and energy
+efficiency. ``default`` means the EPP value is left as set by the platform
+firmware. This attribute is read-only.
+
+``energy_performance_preference``
+
+The current energy performance preference can be read from this attribute,
+and the user can change it according to energy or performance needs. The list
+of supported profiles can be read from the
+``energy_performance_available_preferences`` attribute. When the EPP feature
+is enabled by the platform firmware, the profiles map to integer values
+between 0 and 255; if the EPP feature is disabled, the driver ignores the
+written value. This attribute is read-write.
+
 Other performance and frequency values can be read back from
 ``/sys/devices/system/cpu/cpuX/acpi_cppc/``, see :ref:`cppc_sysfs`.
 
@@ -280,8 +299,35 @@ module which supports the new AMD P-States mechanism on most of the future AMD
 platforms. The AMD P-States mechanism is the more performance and energy
 efficiency frequency management method on AMD processors.
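As a concrete illustration of the per-policy interface documented above, the following user-space sketch (not part of the patch) reads the current EPP profile for CPU 0 and then selects ``balance_power``. It assumes the EPP driver is active and the attribute path is as documented:

	/* Illustrative only: drive the documented per-policy
	 * energy_performance_preference attribute with plain file I/O.
	 */
	#include <stdio.h>

	int main(void)
	{
		const char *path =
			"/sys/devices/system/cpu/cpu0/cpufreq/energy_performance_preference";
		char cur[64] = "";
		FILE *f = fopen(path, "r");

		if (!f)
			return 1;
		if (fgets(cur, sizeof(cur), f))
			printf("current preference: %s", cur);
		fclose(f);

		f = fopen(path, "w");
		if (!f)
			return 1;
		/* must be one of energy_performance_available_preferences */
		fputs("balance_power", f);
		return fclose(f) ? 1 : 0;
	}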
-Kernel Module Options for ``amd-pstate``
-=========================================
+
+AMD Pstate Driver Operation Modes
+=================================
+
+``amd_pstate`` CPPC has three operation modes: autonomous (active) mode,
+non-autonomous (passive) mode and guided autonomous (guided) mode.
+The active/passive/guided mode is chosen with the corresponding kernel
+parameter.
+
+- In autonomous mode, the platform ignores the desired performance level
+  request and takes into account only the values set in the minimum, maximum
+  and energy performance preference registers.
+- In non-autonomous mode, the platform gets the desired performance level
+  from the OS directly through the Desired Performance Register.
+- In guided-autonomous mode, the platform sets the operating performance level
+  autonomously according to the current workload and within the limits set by
+  the OS through the min and max performance registers.
+
+Active Mode
+------------
+
+``amd_pstate=active``
+
+This is the low-level firmware control mode, implemented by the
+``amd_pstate_epp`` driver when ``amd_pstate=active`` is passed on the kernel
+command line. In this mode, the ``amd_pstate_epp`` driver provides a hint to
+the CPPC firmware indicating whether software wants to bias toward performance
+(0x0) or energy efficiency (0xff). The CPPC power algorithm then evaluates the
+runtime workload and adjusts the core frequencies in real time according to
+the power supply, thermal headroom, core voltage and other hardware
+conditions.
 
 Passive Mode
 ------------
@@ -297,6 +343,47 @@ to the Performance Reduction Tolerance register. Above the nominal performance l
 processor must provide at least nominal performance requested and go higher if current
 operating conditions allow.
 
+Guided Mode
+-----------
+
+``amd_pstate=guided``
+
+If ``amd_pstate=guided`` is passed on the kernel command line, this mode is
+activated. In this mode, the driver requests minimum and maximum performance
+levels and the platform autonomously selects a performance level in this range
+that is appropriate for the current workload.
+
+User Space Interface in ``sysfs`` - General
+===========================================
+
+Global Attributes
+-----------------
+
+``amd-pstate`` exposes several global attributes (files) in ``sysfs`` to
+control its functionality at the system level. They are located in the
+``/sys/devices/system/cpu/amd-pstate/`` directory and affect all CPUs.
+
+``status``
+	Operation mode of the driver: "active", "passive", "guided" or
+	"disable".
+
+	"active"
+		The driver is functional and in the ``active mode``
+
+	"passive"
+		The driver is functional and in the ``passive mode``
+
+	"guided"
+		The driver is functional and in the ``guided mode``
+
+	"disable"
+		The driver is unregistered and not functional.
+
+	This attribute can be written to in order to change the driver's
+	operation mode or to unregister it. The string written to it must be
+	one of the possible values listed above and, if successful, writing
+	one of these values to the sysfs file will cause the driver to switch
+	over to the operation mode represented by that string - or to be
+	unregistered in the "disable" case.
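The global ``status`` attribute can be driven the same way. A minimal sketch (again not part of the patch, assuming the driver is loaded) that switches the driver into guided mode could look like the following; note that, per store_status() later in this patch, a trailing newline is accepted but not required:

	/* Illustrative only: request a runtime switch to guided mode. */
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/sys/devices/system/cpu/amd-pstate/status",
			      O_WRONLY);

		if (fd < 0)
			return 1;
		/* mode_state_machine[] in amd-pstate.c performs the transition */
		if (write(fd, "guided", 6) != 6) {
			close(fd);
			return 1;
		}
		return close(fd) ? 1 : 0;
	}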
``cpupower`` tool support for ``amd-pstate`` =============================================== diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 093675b1a1ff..38c1051a4520 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1153,6 +1153,19 @@ int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) return cppc_get_perf(cpunum, NOMINAL_PERF, nominal_perf); } +/** + * cppc_get_epp_perf - Get the epp register value. + * @cpunum: CPU from which to get epp preference value. + * @epp_perf: Return address. + * + * Return: 0 for success, -EIO otherwise. + */ +int cppc_get_epp_perf(int cpunum, u64 *epp_perf) +{ + return cppc_get_perf(cpunum, ENERGY_PERF, epp_perf); +} +EXPORT_SYMBOL_GPL(cppc_get_epp_perf); + /** * cppc_get_perf_caps - Get a CPU's performance capabilities. * @cpunum: CPU from which to get capabilities info. @@ -1365,6 +1378,157 @@ int cppc_get_perf_ctrs(int cpunum, struct cppc_perf_fb_ctrs *perf_fb_ctrs) } EXPORT_SYMBOL_GPL(cppc_get_perf_ctrs); +/* + * Set Energy Performance Preference Register value through + * Performance Controls Interface + */ +int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) +{ + int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); + struct cpc_register_resource *epp_set_reg; + struct cpc_register_resource *auto_sel_reg; + struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); + struct cppc_pcc_data *pcc_ss_data = NULL; + int ret; + + if (!cpc_desc) { + pr_debug("No CPC descriptor for CPU:%d\n", cpu); + return -ENODEV; + } + + auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE]; + epp_set_reg = &cpc_desc->cpc_regs[ENERGY_PERF]; + + if (CPC_IN_PCC(epp_set_reg) || CPC_IN_PCC(auto_sel_reg)) { + if (pcc_ss_id < 0) { + pr_debug("Invalid pcc_ss_id for CPU:%d\n", cpu); + return -ENODEV; + } + + if (CPC_SUPPORTED(auto_sel_reg)) { + ret = cpc_write(cpu, auto_sel_reg, enable); + if (ret) + return ret; + } + + if (CPC_SUPPORTED(epp_set_reg)) { + ret = cpc_write(cpu, epp_set_reg, perf_ctrls->energy_perf); + if (ret) + return ret; + } + + pcc_ss_data = pcc_data[pcc_ss_id]; + + down_write(&pcc_ss_data->pcc_lock); + /* after writing CPC, transfer the ownership of PCC to platform */ + ret = send_pcc_cmd(pcc_ss_id, CMD_WRITE); + up_write(&pcc_ss_data->pcc_lock); + } else { + ret = -ENOTSUPP; + pr_debug("_CPC in PCC is not supported\n"); + } + + return ret; +} +EXPORT_SYMBOL_GPL(cppc_set_epp_perf); + +/* + * cppc_get_auto_sel_caps - Read autonomous selection register. + * @cpunum : CPU from which to read register. + * @perf_caps : struct where autonomous selection register value is updated. 
+ */
+int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps)
+{
+	struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpunum);
+	struct cpc_register_resource *auto_sel_reg;
+	u64 auto_sel;
+
+	if (!cpc_desc) {
+		pr_debug("No CPC descriptor for CPU:%d\n", cpunum);
+		return -ENODEV;
+	}
+
+	auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE];
+
+	if (!CPC_SUPPORTED(auto_sel_reg))
+		pr_warn_once("Autonomous mode is not supported!\n");
+
+	if (CPC_IN_PCC(auto_sel_reg)) {
+		int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpunum);
+		struct cppc_pcc_data *pcc_ss_data = NULL;
+		int ret = 0;
+
+		if (pcc_ss_id < 0)
+			return -ENODEV;
+
+		pcc_ss_data = pcc_data[pcc_ss_id];
+
+		down_write(&pcc_ss_data->pcc_lock);
+
+		if (send_pcc_cmd(pcc_ss_id, CMD_READ) >= 0) {
+			cpc_read(cpunum, auto_sel_reg, &auto_sel);
+			perf_caps->auto_sel = (bool)auto_sel;
+		} else {
+			ret = -EIO;
+		}
+
+		up_write(&pcc_ss_data->pcc_lock);
+
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cppc_get_auto_sel_caps);
+
+/*
+ * cppc_set_auto_sel - Write autonomous selection register.
+ * @cpu    : CPU to which to write register.
+ * @enable : the desired value of the autonomous selection register to be updated.
+ */
+int cppc_set_auto_sel(int cpu, bool enable)
+{
+	int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu);
+	struct cpc_register_resource *auto_sel_reg;
+	struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu);
+	struct cppc_pcc_data *pcc_ss_data = NULL;
+	int ret = -EINVAL;
+
+	if (!cpc_desc) {
+		pr_debug("No CPC descriptor for CPU:%d\n", cpu);
+		return -ENODEV;
+	}
+
+	auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE];
+
+	if (CPC_IN_PCC(auto_sel_reg)) {
+		if (pcc_ss_id < 0) {
+			pr_debug("Invalid pcc_ss_id\n");
+			return -ENODEV;
+		}
+
+		if (CPC_SUPPORTED(auto_sel_reg)) {
+			ret = cpc_write(cpu, auto_sel_reg, enable);
+			if (ret)
+				return ret;
+		}
+
+		pcc_ss_data = pcc_data[pcc_ss_id];
+
+		down_write(&pcc_ss_data->pcc_lock);
+		/* after writing CPC, transfer the ownership of PCC to platform */
+		ret = send_pcc_cmd(pcc_ss_id, CMD_WRITE);
+		up_write(&pcc_ss_data->pcc_lock);
+	} else {
+		ret = -ENOTSUPP;
+		pr_debug("_CPC in PCC is not supported\n");
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cppc_set_auto_sel);
+
+
 /**
  * cppc_set_enable - Set to enable CPPC on the processor by writing the
  * Continuous Performance Control package EnableRegister field.
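Taken together, the two helpers above give callers a read-modify-write pattern for the platform's autonomous-selection bit. A hedged kernel-side sketch follows; the function name is hypothetical, and it loosely mirrors how cppc_init_perf() in amd-pstate.c uses these helpers later in this patch:

	#include <acpi/cppc_acpi.h>

	/* Hypothetical example: sync the platform's autonomous-selection
	 * bit with the requested mode, skipping the write when the
	 * platform already reports the desired state.
	 */
	static int example_sync_auto_sel(int cpu, bool autonomous)
	{
		struct cppc_perf_caps caps;
		int ret;

		ret = cppc_get_auto_sel_caps(cpu, &caps);
		if (ret)
			return ret;

		if (caps.auto_sel == autonomous)
			return 0;

		return cppc_set_auto_sel(cpu, autonomous);
	}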
@@ -1420,7 +1584,7 @@ EXPORT_SYMBOL_GPL(cppc_set_enable); int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) { struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); - struct cpc_register_resource *desired_reg; + struct cpc_register_resource *desired_reg, *min_perf_reg, *max_perf_reg; int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); struct cppc_pcc_data *pcc_ss_data = NULL; int ret = 0; @@ -1431,6 +1595,8 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) } desired_reg = &cpc_desc->cpc_regs[DESIRED_PERF]; + min_perf_reg = &cpc_desc->cpc_regs[MIN_PERF]; + max_perf_reg = &cpc_desc->cpc_regs[MAX_PERF]; /* * This is Phase-I where we want to write to CPC registers @@ -1439,7 +1605,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) * Since read_lock can be acquired by multiple CPUs simultaneously we * achieve that goal here */ - if (CPC_IN_PCC(desired_reg)) { + if (CPC_IN_PCC(desired_reg) || CPC_IN_PCC(min_perf_reg) || CPC_IN_PCC(max_perf_reg)) { if (pcc_ss_id < 0) { pr_debug("Invalid pcc_ss_id\n"); return -ENODEV; @@ -1462,13 +1628,19 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) cpc_desc->write_cmd_status = 0; } - /* - * Skip writing MIN/MAX until Linux knows how to come up with - * useful values. - */ cpc_write(cpu, desired_reg, perf_ctrls->desired_perf); - if (CPC_IN_PCC(desired_reg)) + /** + * Only write if min_perf and max_perf not zero. Some drivers pass zero + * value to min and max perf, but they don't mean to set the zero value, + * they just don't want to write to those registers. + */ + if (perf_ctrls->min_perf) + cpc_write(cpu, min_perf_reg, perf_ctrls->min_perf); + if (perf_ctrls->max_perf) + cpc_write(cpu, max_perf_reg, perf_ctrls->max_perf); + + if (CPC_IN_PCC(desired_reg) || CPC_IN_PCC(min_perf_reg) || CPC_IN_PCC(max_perf_reg)) up_read(&pcc_ss_data->pcc_lock); /* END Phase-I */ /* * This is Phase-II where we transfer the ownership of PCC to Platform @@ -1516,7 +1688,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) * case during a CMD_READ and if there are pending writes it delivers * the write command before servicing the read command */ - if (CPC_IN_PCC(desired_reg)) { + if (CPC_IN_PCC(desired_reg) || CPC_IN_PCC(min_perf_reg) || CPC_IN_PCC(max_perf_reg)) { if (down_write_trylock(&pcc_ss_data->pcc_lock)) {/* BEGIN Phase-II */ /* Update only if there are pending write commands */ if (pcc_ss_data->pending_pcc_write_cmd) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index c17bd845f5fc..d4e60da7a544 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -59,8 +59,173 @@ * we disable it by default to go acpi-cpufreq on these processors and add a * module parameter to be able to enable it manually for debugging. */ +static struct cpufreq_driver *current_pstate_driver; static struct cpufreq_driver amd_pstate_driver; -static int cppc_load __initdata; +static struct cpufreq_driver amd_pstate_epp_driver; +static int cppc_state = AMD_PSTATE_DISABLE; +struct kobject *amd_pstate_kobj; + +/* + * AMD Energy Preference Performance (EPP) + * The EPP is used in the CCLK DPM controller to drive + * the frequency that a core is going to operate during + * short periods of activity. 
EPP values will be utilized for + * different OS profiles (balanced, performance, power savings) + * display strings corresponding to EPP index in the + * energy_perf_strings[] + * index String + *------------------------------------- + * 0 default + * 1 performance + * 2 balance_performance + * 3 balance_power + * 4 power + */ +enum energy_perf_value_index { + EPP_INDEX_DEFAULT = 0, + EPP_INDEX_PERFORMANCE, + EPP_INDEX_BALANCE_PERFORMANCE, + EPP_INDEX_BALANCE_POWERSAVE, + EPP_INDEX_POWERSAVE, +}; + +static const char * const energy_perf_strings[] = { + [EPP_INDEX_DEFAULT] = "default", + [EPP_INDEX_PERFORMANCE] = "performance", + [EPP_INDEX_BALANCE_PERFORMANCE] = "balance_performance", + [EPP_INDEX_BALANCE_POWERSAVE] = "balance_power", + [EPP_INDEX_POWERSAVE] = "power", + NULL +}; + +static unsigned int epp_values[] = { + [EPP_INDEX_DEFAULT] = 0, + [EPP_INDEX_PERFORMANCE] = AMD_CPPC_EPP_PERFORMANCE, + [EPP_INDEX_BALANCE_PERFORMANCE] = AMD_CPPC_EPP_BALANCE_PERFORMANCE, + [EPP_INDEX_BALANCE_POWERSAVE] = AMD_CPPC_EPP_BALANCE_POWERSAVE, + [EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE, + }; + +typedef int (*cppc_mode_transition_fn)(int); + +static inline int get_mode_idx_from_str(const char *str, size_t size) +{ + int i; + + for (i=0; i < AMD_PSTATE_MAX; i++) { + if (!strncmp(str, amd_pstate_mode_string[i], size)) + return i; + } + return -EINVAL; +} + +static DEFINE_MUTEX(amd_pstate_limits_lock); +static DEFINE_MUTEX(amd_pstate_driver_lock); + +static s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached) +{ + u64 epp; + int ret; + + if (boot_cpu_has(X86_FEATURE_CPPC)) { + if (!cppc_req_cached) { + epp = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, + &cppc_req_cached); + if (epp) + return epp; + } + epp = (cppc_req_cached >> 24) & 0xFF; + } else { + ret = cppc_get_epp_perf(cpudata->cpu, &epp); + if (ret < 0) { + pr_debug("Could not retrieve energy perf value (%d)\n", ret); + return -EIO; + } + } + + return (s16)(epp & 0xff); +} + +static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata) +{ + s16 epp; + int index = -EINVAL; + + epp = amd_pstate_get_epp(cpudata, 0); + if (epp < 0) + return epp; + + switch (epp) { + case AMD_CPPC_EPP_PERFORMANCE: + index = EPP_INDEX_PERFORMANCE; + break; + case AMD_CPPC_EPP_BALANCE_PERFORMANCE: + index = EPP_INDEX_BALANCE_PERFORMANCE; + break; + case AMD_CPPC_EPP_BALANCE_POWERSAVE: + index = EPP_INDEX_BALANCE_POWERSAVE; + break; + case AMD_CPPC_EPP_POWERSAVE: + index = EPP_INDEX_POWERSAVE; + break; + default: + break; + } + + return index; +} + +static int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp) +{ + int ret; + struct cppc_perf_ctrls perf_ctrls; + + if (boot_cpu_has(X86_FEATURE_CPPC)) { + u64 value = READ_ONCE(cpudata->cppc_req_cached); + + value &= ~GENMASK_ULL(31, 24); + value |= (u64)epp << 24; + WRITE_ONCE(cpudata->cppc_req_cached, value); + + ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); + if (!ret) + cpudata->epp_cached = epp; + } else { + perf_ctrls.energy_perf = epp; + ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); + if (ret) { + pr_debug("failed to set energy perf value (%d)\n", ret); + return ret; + } + cpudata->epp_cached = epp; + } + + return ret; +} + +static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, + int pref_index) +{ + int epp = -EINVAL; + int ret; + + if (!pref_index) { + pr_debug("EPP pref_index is invalid\n"); + return -EINVAL; + } + + if (epp == -EINVAL) + epp = epp_values[pref_index]; + + if (epp > 0 && cpudata->policy == 
CPUFREQ_POLICY_PERFORMANCE) { + pr_debug("EPP cannot be set under performance policy\n"); + return -EBUSY; + } + + ret = amd_pstate_set_epp(cpudata, epp); + + return ret; +} static inline int pstate_enable(bool enable) { @@ -70,11 +235,21 @@ static inline int pstate_enable(bool enable) static int cppc_enable(bool enable) { int cpu, ret = 0; + struct cppc_perf_ctrls perf_ctrls; for_each_present_cpu(cpu) { ret = cppc_set_enable(cpu, enable); if (ret) return ret; + + /* Enable autonomous mode for EPP */ + if (cppc_state == AMD_PSTATE_ACTIVE) { + /* Set desired perf as zero to allow EPP firmware control */ + perf_ctrls.desired_perf = 0; + ret = cppc_set_perf(cpu, &perf_ctrls); + if (ret) + return ret; + } } return ret; @@ -135,7 +310,22 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) cppc_perf.lowest_nonlinear_perf); WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf); - return 0; + if (cppc_state == AMD_PSTATE_ACTIVE) + return 0; + + ret = cppc_get_auto_sel_caps(cpudata->cpu, &cppc_perf); + if (ret) { + pr_warn("failed to get auto_sel, ret: %d\n", ret); + return 0; + } + + ret = cppc_set_auto_sel(cpudata->cpu, + (cppc_state == AMD_PSTATE_PASSIVE) ? 0 : 1); + + if (ret) + pr_warn("failed to set auto_sel, ret: %d\n", ret); + + return ret; } DEFINE_STATIC_CALL(amd_pstate_init_perf, pstate_init_perf); @@ -212,12 +402,18 @@ static inline bool amd_pstate_sample(struct amd_cpudata *cpudata) } static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, - u32 des_perf, u32 max_perf, bool fast_switch) + u32 des_perf, u32 max_perf, bool fast_switch, int gov_flags) { u64 prev = READ_ONCE(cpudata->cppc_req_cached); u64 value = prev; des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf); + + if ((cppc_state == AMD_PSTATE_GUIDED) && (gov_flags & CPUFREQ_GOV_DYNAMIC_SWITCHING)) { + min_perf = des_perf; + des_perf = 0; + } + value &= ~AMD_CPPC_MIN_PERF(~0L); value |= AMD_CPPC_MIN_PERF(min_perf); @@ -272,7 +468,7 @@ static int amd_pstate_target(struct cpufreq_policy *policy, cpufreq_freq_transition_begin(policy, &freqs); amd_pstate_update(cpudata, min_perf, des_perf, - max_perf, false); + max_perf, false, policy->governor->flags); cpufreq_freq_transition_end(policy, &freqs, false); return 0; @@ -306,7 +502,8 @@ static void amd_pstate_adjust_perf(unsigned int cpu, if (max_perf < min_perf) max_perf = min_perf; - amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true); + amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true, + policy->governor->flags); cpufreq_cpu_put(policy); } @@ -418,7 +615,7 @@ static void amd_pstate_boost_init(struct amd_cpudata *cpudata) return; cpudata->boost_supported = true; - amd_pstate_driver.boost_enabled = true; + current_pstate_driver->boost_enabled = true; } static void amd_perf_ctl_reset(unsigned int cpu) @@ -501,6 +698,8 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) policy->driver_data = cpudata; amd_pstate_boost_init(cpudata); + if (!current_pstate_driver->adjust_perf) + current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; return 0; @@ -561,7 +760,7 @@ static ssize_t show_amd_pstate_max_freq(struct cpufreq_policy *policy, if (max_freq < 0) return max_freq; - return sprintf(&buf[0], "%u\n", max_freq); + return sysfs_emit(buf, "%u\n", max_freq); } static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *policy, @@ -574,7 +773,7 @@ static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *poli if (freq < 0) return freq; - return sprintf(&buf[0], "%u\n", freq); + 
return sysfs_emit(buf, "%u\n", freq); } /* @@ -589,13 +788,208 @@ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, perf = READ_ONCE(cpudata->highest_perf); - return sprintf(&buf[0], "%u\n", perf); + return sysfs_emit(buf, "%u\n", perf); +} + +static ssize_t show_energy_performance_available_preferences( + struct cpufreq_policy *policy, char *buf) +{ + int i = 0; + int offset = 0; + + while (energy_perf_strings[i] != NULL) + offset += sysfs_emit_at(buf, offset, "%s ", energy_perf_strings[i++]); + + sysfs_emit_at(buf, offset, "\n"); + + return offset; +} + +static ssize_t store_energy_performance_preference( + struct cpufreq_policy *policy, const char *buf, size_t count) +{ + struct amd_cpudata *cpudata = policy->driver_data; + char str_preference[21]; + ssize_t ret; + + ret = sscanf(buf, "%20s", str_preference); + if (ret != 1) + return -EINVAL; + + ret = match_string(energy_perf_strings, -1, str_preference); + if (ret < 0) + return -EINVAL; + + mutex_lock(&amd_pstate_limits_lock); + ret = amd_pstate_set_energy_pref_index(cpudata, ret); + mutex_unlock(&amd_pstate_limits_lock); + + return ret ?: count; +} + +static ssize_t show_energy_performance_preference( + struct cpufreq_policy *policy, char *buf) +{ + struct amd_cpudata *cpudata = policy->driver_data; + int preference; + + preference = amd_pstate_get_energy_pref_index(cpudata); + if (preference < 0) + return preference; + + return sysfs_emit(buf, "%s\n", energy_perf_strings[preference]); +} + +static void amd_pstate_driver_cleanup(void) +{ + amd_pstate_enable(false); + cppc_state = AMD_PSTATE_DISABLE; + current_pstate_driver = NULL; +} + +static int amd_pstate_register_driver(int mode) +{ + int ret; + + if (mode == AMD_PSTATE_PASSIVE || mode == AMD_PSTATE_GUIDED) + current_pstate_driver = &amd_pstate_driver; + else if (mode == AMD_PSTATE_ACTIVE) + current_pstate_driver = &amd_pstate_epp_driver; + else + return -EINVAL; + + cppc_state = mode; + ret = cpufreq_register_driver(current_pstate_driver); + if (ret) { + amd_pstate_driver_cleanup(); + return ret; + } + return 0; +} + +static int amd_pstate_unregister_driver(int dummy) +{ + cpufreq_unregister_driver(current_pstate_driver); + amd_pstate_driver_cleanup(); + return 0; +} + +static int amd_pstate_change_mode_without_dvr_change(int mode) +{ + int cpu = 0; + + cppc_state = mode; + + if (boot_cpu_has(X86_FEATURE_CPPC) || cppc_state == AMD_PSTATE_ACTIVE) + return 0; + + for_each_present_cpu(cpu) { + cppc_set_auto_sel(cpu, (cppc_state == AMD_PSTATE_PASSIVE) ? 
0 : 1); + } + + return 0; +} + +static int amd_pstate_change_driver_mode(int mode) +{ + int ret; + + ret = amd_pstate_unregister_driver(0); + if (ret) + return ret; + + ret = amd_pstate_register_driver(mode); + if (ret) + return ret; + + return 0; +} + +cppc_mode_transition_fn mode_state_machine[AMD_PSTATE_MAX][AMD_PSTATE_MAX] = { + [AMD_PSTATE_DISABLE] = { + [AMD_PSTATE_DISABLE] = NULL, + [AMD_PSTATE_PASSIVE] = amd_pstate_register_driver, + [AMD_PSTATE_ACTIVE] = amd_pstate_register_driver, + [AMD_PSTATE_GUIDED] = amd_pstate_register_driver, + }, + [AMD_PSTATE_PASSIVE] = { + [AMD_PSTATE_DISABLE] = amd_pstate_unregister_driver, + [AMD_PSTATE_PASSIVE] = NULL, + [AMD_PSTATE_ACTIVE] = amd_pstate_change_driver_mode, + [AMD_PSTATE_GUIDED] = amd_pstate_change_mode_without_dvr_change, + }, + [AMD_PSTATE_ACTIVE] = { + [AMD_PSTATE_DISABLE] = amd_pstate_unregister_driver, + [AMD_PSTATE_PASSIVE] = amd_pstate_change_driver_mode, + [AMD_PSTATE_ACTIVE] = NULL, + [AMD_PSTATE_GUIDED] = amd_pstate_change_driver_mode, + }, + [AMD_PSTATE_GUIDED] = { + [AMD_PSTATE_DISABLE] = amd_pstate_unregister_driver, + [AMD_PSTATE_PASSIVE] = amd_pstate_change_mode_without_dvr_change, + [AMD_PSTATE_ACTIVE] = amd_pstate_change_driver_mode, + [AMD_PSTATE_GUIDED] = NULL, + }, +}; + +static ssize_t amd_pstate_show_status(char *buf) +{ + if (!current_pstate_driver) + return sysfs_emit(buf, "disable\n"); + + return sysfs_emit(buf, "%s\n", amd_pstate_mode_string[cppc_state]); +} + +static int amd_pstate_update_status(const char *buf, size_t size) +{ + int mode_idx; + + if (size > strlen("passive") || size < strlen("active")) + return -EINVAL; + + mode_idx = get_mode_idx_from_str(buf, size); + + if (mode_idx < 0 || mode_idx >= AMD_PSTATE_MAX) + return -EINVAL; + + if (mode_state_machine[cppc_state][mode_idx]) + return mode_state_machine[cppc_state][mode_idx](mode_idx); + + return 0; +} + +static ssize_t show_status(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + ssize_t ret; + + mutex_lock(&amd_pstate_driver_lock); + ret = amd_pstate_show_status(buf); + mutex_unlock(&amd_pstate_driver_lock); + + return ret; +} + +static ssize_t store_status(struct kobject *a, struct kobj_attribute *b, + const char *buf, size_t count) +{ + char *p = memchr(buf, '\n', count); + int ret; + + mutex_lock(&amd_pstate_driver_lock); + ret = amd_pstate_update_status(buf, p ? p - buf : count); + mutex_unlock(&amd_pstate_driver_lock); + + return ret < 0 ? 
ret : count; } cpufreq_freq_attr_ro(amd_pstate_max_freq); cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); cpufreq_freq_attr_ro(amd_pstate_highest_perf); +cpufreq_freq_attr_rw(energy_performance_preference); +cpufreq_freq_attr_ro(energy_performance_available_preferences); +define_one_global_rw(status); static struct freq_attr *amd_pstate_attr[] = { &amd_pstate_max_freq, @@ -604,6 +998,313 @@ static struct freq_attr *amd_pstate_attr[] = { NULL, }; +static struct freq_attr *amd_pstate_epp_attr[] = { + &amd_pstate_max_freq, + &amd_pstate_lowest_nonlinear_freq, + &amd_pstate_highest_perf, + &energy_performance_preference, + &energy_performance_available_preferences, + NULL, +}; + +static struct attribute *pstate_global_attributes[] = { + &status.attr, + NULL +}; + +static const struct attribute_group amd_pstate_global_attr_group = { + .attrs = pstate_global_attributes, +}; + +static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) +{ + int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; + struct amd_cpudata *cpudata; + struct device *dev; + u64 value; + + /* + * Resetting PERF_CTL_MSR will put the CPU in P0 frequency, + * which is ideal for initialization process. + */ + amd_perf_ctl_reset(policy->cpu); + dev = get_cpu_device(policy->cpu); + if (!dev) + return -ENODEV; + + cpudata = kzalloc(sizeof(*cpudata), GFP_KERNEL); + if (!cpudata) + return -ENOMEM; + + cpudata->cpu = policy->cpu; + cpudata->epp_policy = 0; + + ret = amd_pstate_init_perf(cpudata); + if (ret) + goto free_cpudata1; + + min_freq = amd_get_min_freq(cpudata); + max_freq = amd_get_max_freq(cpudata); + nominal_freq = amd_get_nominal_freq(cpudata); + lowest_nonlinear_freq = amd_get_lowest_nonlinear_freq(cpudata); + if (min_freq < 0 || max_freq < 0 || min_freq > max_freq) { + dev_err(dev, "min_freq(%d) or max_freq(%d) value is incorrect\n", + min_freq, max_freq); + ret = -EINVAL; + goto free_cpudata1; + } + + policy->cpuinfo.min_freq = min_freq; + policy->cpuinfo.max_freq = max_freq; + /* It will be updated by governor */ + policy->cur = policy->cpuinfo.min_freq; + + /* Initial processor data capability frequencies */ + cpudata->max_freq = max_freq; + cpudata->min_freq = min_freq; + cpudata->nominal_freq = nominal_freq; + cpudata->lowest_nonlinear_freq = lowest_nonlinear_freq; + + policy->driver_data = cpudata; + + cpudata->epp_cached = amd_pstate_get_epp(cpudata, 0); + + policy->min = policy->cpuinfo.min_freq; + policy->max = policy->cpuinfo.max_freq; + + /* + * Set the policy to powersave to provide a valid fallback value in case + * the default cpufreq governor is neither powersave nor performance. 
+ */ + policy->policy = CPUFREQ_POLICY_POWERSAVE; + + if (boot_cpu_has(X86_FEATURE_CPPC)) { + policy->fast_switch_possible = true; + ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value); + if (ret) + return ret; + WRITE_ONCE(cpudata->cppc_req_cached, value); + + ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, &value); + if (ret) + return ret; + WRITE_ONCE(cpudata->cppc_cap1_cached, value); + } + amd_pstate_boost_init(cpudata); + + return 0; + +free_cpudata1: + kfree(cpudata); + return ret; +} + +static int amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) +{ + pr_debug("CPU %d exiting\n", policy->cpu); + policy->fast_switch_possible = false; + return 0; +} + +static void amd_pstate_epp_init(unsigned int cpu) +{ + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct amd_cpudata *cpudata = policy->driver_data; + u32 max_perf, min_perf; + u64 value; + s16 epp; + + max_perf = READ_ONCE(cpudata->highest_perf); + min_perf = READ_ONCE(cpudata->lowest_perf); + + value = READ_ONCE(cpudata->cppc_req_cached); + + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) + min_perf = max_perf; + + /* Initial min/max values for CPPC Performance Controls Register */ + value &= ~AMD_CPPC_MIN_PERF(~0L); + value |= AMD_CPPC_MIN_PERF(min_perf); + + value &= ~AMD_CPPC_MAX_PERF(~0L); + value |= AMD_CPPC_MAX_PERF(max_perf); + + /* CPPC EPP feature require to set zero to the desire perf bit */ + value &= ~AMD_CPPC_DES_PERF(~0L); + value |= AMD_CPPC_DES_PERF(0); + + if (cpudata->epp_policy == cpudata->policy) + goto skip_epp; + + cpudata->epp_policy = cpudata->policy; + + /* Get BIOS pre-defined epp value */ + epp = amd_pstate_get_epp(cpudata, value); + if (epp < 0) { + /** + * This return value can only be negative for shared_memory + * systems where EPP register read/write not supported. 
+ */ + goto skip_epp; + } + + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) + epp = 0; + + /* Set initial EPP value */ + if (boot_cpu_has(X86_FEATURE_CPPC)) { + value &= ~GENMASK_ULL(31, 24); + value |= (u64)epp << 24; + } + + WRITE_ONCE(cpudata->cppc_req_cached, value); + amd_pstate_set_epp(cpudata, epp); +skip_epp: + cpufreq_cpu_put(policy); +} + +static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + + if (!policy->cpuinfo.max_freq) + return -ENODEV; + + pr_debug("set_policy: cpuinfo.max %u policy->max %u\n", + policy->cpuinfo.max_freq, policy->max); + + cpudata->policy = policy->policy; + + amd_pstate_epp_init(policy->cpu); + + return 0; +} + +static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) +{ + struct cppc_perf_ctrls perf_ctrls; + u64 value, max_perf; + int ret; + + ret = amd_pstate_enable(true); + if (ret) + pr_err("failed to enable amd pstate during resume, return %d\n", ret); + + value = READ_ONCE(cpudata->cppc_req_cached); + max_perf = READ_ONCE(cpudata->highest_perf); + + if (boot_cpu_has(X86_FEATURE_CPPC)) { + wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); + } else { + perf_ctrls.max_perf = max_perf; + perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(cpudata->epp_cached); + cppc_set_perf(cpudata->cpu, &perf_ctrls); + } +} + +static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + + pr_debug("AMD CPU Core %d going online\n", cpudata->cpu); + + if (cppc_state == AMD_PSTATE_ACTIVE) { + amd_pstate_epp_reenable(cpudata); + cpudata->suspended = false; + } + + return 0; +} + +static void amd_pstate_epp_offline(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + struct cppc_perf_ctrls perf_ctrls; + int min_perf; + u64 value; + + min_perf = READ_ONCE(cpudata->lowest_perf); + value = READ_ONCE(cpudata->cppc_req_cached); + + mutex_lock(&amd_pstate_limits_lock); + if (boot_cpu_has(X86_FEATURE_CPPC)) { + cpudata->epp_policy = CPUFREQ_POLICY_UNKNOWN; + + /* Set max perf same as min perf */ + value &= ~AMD_CPPC_MAX_PERF(~0L); + value |= AMD_CPPC_MAX_PERF(min_perf); + value &= ~AMD_CPPC_MIN_PERF(~0L); + value |= AMD_CPPC_MIN_PERF(min_perf); + wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); + } else { + perf_ctrls.desired_perf = 0; + perf_ctrls.max_perf = min_perf; + perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(HWP_EPP_BALANCE_POWERSAVE); + cppc_set_perf(cpudata->cpu, &perf_ctrls); + } + mutex_unlock(&amd_pstate_limits_lock); +} + +static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + + pr_debug("AMD CPU Core %d going offline\n", cpudata->cpu); + + if (cpudata->suspended) + return 0; + + if (cppc_state == AMD_PSTATE_ACTIVE) + amd_pstate_epp_offline(policy); + + return 0; +} + +static int amd_pstate_epp_verify_policy(struct cpufreq_policy_data *policy) +{ + cpufreq_verify_within_cpu_limits(policy); + pr_debug("policy_max =%d, policy_min=%d\n", policy->max, policy->min); + return 0; +} + +static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + int ret; + + /* avoid suspending when EPP is not enabled */ + if (cppc_state != AMD_PSTATE_ACTIVE) + return 0; + + /* set this flag to avoid setting core offline*/ + cpudata->suspended = true; + + /* disable CPPC in lowlevel firmware */ + ret = amd_pstate_enable(false); + if (ret) + pr_err("failed to 
suspend, return %d\n", ret); + + return 0; +} + +static int amd_pstate_epp_resume(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + + if (cpudata->suspended) { + mutex_lock(&amd_pstate_limits_lock); + + /* enable amd pstate from suspend state*/ + amd_pstate_epp_reenable(cpudata); + + mutex_unlock(&amd_pstate_limits_lock); + + cpudata->suspended = false; + } + + return 0; +} + static struct cpufreq_driver amd_pstate_driver = { .flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS, .verify = amd_pstate_verify, @@ -617,6 +1318,20 @@ static struct cpufreq_driver amd_pstate_driver = { .attr = amd_pstate_attr, }; +static struct cpufreq_driver amd_pstate_epp_driver = { + .flags = CPUFREQ_CONST_LOOPS, + .verify = amd_pstate_epp_verify_policy, + .setpolicy = amd_pstate_epp_set_policy, + .init = amd_pstate_epp_cpu_init, + .exit = amd_pstate_epp_cpu_exit, + .offline = amd_pstate_epp_cpu_offline, + .online = amd_pstate_epp_cpu_online, + .suspend = amd_pstate_epp_suspend, + .resume = amd_pstate_epp_resume, + .name = "amd_pstate_epp", + .attr = amd_pstate_epp_attr, +}; + static int __init amd_pstate_init(void) { int ret; @@ -626,10 +1341,10 @@ static int __init amd_pstate_init(void) /* * by default the pstate driver is disabled to load * enable the amd_pstate passive mode driver explicitly - * with amd_pstate=passive in kernel command line + * with amd_pstate=passive or other modes in kernel command line */ - if (!cppc_load) { - pr_debug("driver load is disabled, boot with amd_pstate=passive to enable this\n"); + if (cppc_state == AMD_PSTATE_DISABLE) { + pr_debug("driver load is disabled, boot with specific mode to enable this\n"); return -ENODEV; } @@ -645,7 +1360,8 @@ static int __init amd_pstate_init(void) /* capability check */ if (boot_cpu_has(X86_FEATURE_CPPC)) { pr_debug("AMD CPPC MSR based functionality is supported\n"); - amd_pstate_driver.adjust_perf = amd_pstate_adjust_perf; + if (cppc_state != AMD_PSTATE_ACTIVE) + current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; } else { pr_debug("AMD CPPC shared memory based functionality is supported\n"); static_call_update(amd_pstate_enable, cppc_enable); @@ -656,31 +1372,63 @@ static int __init amd_pstate_init(void) /* enable amd pstate feature */ ret = amd_pstate_enable(true); if (ret) { - pr_err("failed to enable amd-pstate with return %d\n", ret); + pr_err("failed to enable with return %d\n", ret); return ret; } - ret = cpufreq_register_driver(&amd_pstate_driver); + ret = cpufreq_register_driver(current_pstate_driver); if (ret) - pr_err("failed to register amd_pstate_driver with return %d\n", - ret); + pr_err("failed to register with return %d\n", ret); + + amd_pstate_kobj = kobject_create_and_add("amd_pstate", &cpu_subsys.dev_root->kobj); + if (!amd_pstate_kobj) { + ret = -EINVAL; + pr_err("global sysfs registration failed.\n"); + goto kobject_free; + } + ret = sysfs_create_group(amd_pstate_kobj, &amd_pstate_global_attr_group); + if (ret) { + pr_err("sysfs attribute export failed with error %d.\n", ret); + goto global_attr_free; + } + + return ret; + +global_attr_free: + kobject_put(amd_pstate_kobj); +kobject_free: + cpufreq_unregister_driver(current_pstate_driver); return ret; } device_initcall(amd_pstate_init); static int __init amd_pstate_param(char *str) { + size_t size; + int mode_idx; + if (!str) return -EINVAL; - if (!strcmp(str, "disable")) { - cppc_load = 0; - pr_info("driver is explicitly disabled\n"); - } else if (!strcmp(str, "passive")) - cppc_load = 1; + size = strlen(str); + 
mode_idx = get_mode_idx_from_str(str, size); - return 0; + if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) { + cppc_state = mode_idx; + if (cppc_state == AMD_PSTATE_DISABLE) + pr_info("driver is explicitly disabled\n"); + + if (cppc_state == AMD_PSTATE_ACTIVE) + current_pstate_driver = &amd_pstate_epp_driver; + + if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED) + current_pstate_driver = &amd_pstate_driver; + + return 0; + } + + return -EINVAL; } early_param("amd_pstate", amd_pstate_param); diff --git a/drivers/cpufreq/brcmstb-avs-cpufreq.c b/drivers/cpufreq/brcmstb-avs-cpufreq.c index 4153150e20db..ffea6402189d 100644 --- a/drivers/cpufreq/brcmstb-avs-cpufreq.c +++ b/drivers/cpufreq/brcmstb-avs-cpufreq.c @@ -751,10 +751,7 @@ static int brcm_avs_cpufreq_probe(struct platform_device *pdev) static int brcm_avs_cpufreq_remove(struct platform_device *pdev) { - int ret; - - ret = cpufreq_unregister_driver(&brcm_avs_driver); - WARN_ON(ret); + cpufreq_unregister_driver(&brcm_avs_driver); brcm_avs_prepare_uninit(pdev); diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 7e56a42750ea..85a0bea2dbf1 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2904,12 +2904,12 @@ EXPORT_SYMBOL_GPL(cpufreq_register_driver); * Returns zero if successful, and -EINVAL if the cpufreq_driver is * currently not initialised. */ -int cpufreq_unregister_driver(struct cpufreq_driver *driver) +void cpufreq_unregister_driver(struct cpufreq_driver *driver) { unsigned long flags; - if (!cpufreq_driver || (driver != cpufreq_driver)) - return -EINVAL; + if (WARN_ON(!cpufreq_driver || (driver != cpufreq_driver))) + return; pr_debug("unregistering driver %s\n", driver->name); @@ -2926,8 +2926,6 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver) write_unlock_irqrestore(&cpufreq_driver_lock, flags); cpus_read_unlock(); - - return 0; } EXPORT_SYMBOL_GPL(cpufreq_unregister_driver); diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c index f0e0a35c7f21..74eac0f1d2e5 100644 --- a/drivers/cpufreq/mediatek-cpufreq-hw.c +++ b/drivers/cpufreq/mediatek-cpufreq-hw.c @@ -313,7 +313,9 @@ static int mtk_cpufreq_hw_driver_probe(struct platform_device *pdev) static int mtk_cpufreq_hw_driver_remove(struct platform_device *pdev) { - return cpufreq_unregister_driver(&cpufreq_mtk_hw_driver); + cpufreq_unregister_driver(&cpufreq_mtk_hw_driver); + + return 0; } static const struct of_device_id mtk_cpufreq_hw_match[] = { diff --git a/drivers/cpufreq/omap-cpufreq.c b/drivers/cpufreq/omap-cpufreq.c index 1b50df06c6bc..81649a1969b6 100644 --- a/drivers/cpufreq/omap-cpufreq.c +++ b/drivers/cpufreq/omap-cpufreq.c @@ -184,7 +184,9 @@ static int omap_cpufreq_probe(struct platform_device *pdev) static int omap_cpufreq_remove(struct platform_device *pdev) { - return cpufreq_unregister_driver(&omap_driver); + cpufreq_unregister_driver(&omap_driver); + + return 0; } static struct platform_driver omap_cpufreq_platdrv = { diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index d10bf7635a0d..ab5c860a837f 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -719,7 +719,9 @@ static int qcom_cpufreq_hw_driver_probe(struct platform_device *pdev) static int qcom_cpufreq_hw_driver_remove(struct platform_device *pdev) { - return cpufreq_unregister_driver(&cpufreq_qcom_hw_driver); + cpufreq_unregister_driver(&cpufreq_qcom_hw_driver); + + return 0; } static struct 
platform_driver qcom_cpufreq_hw_driver = {
diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h
index c5614444031f..6126c977ece0 100644
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -108,12 +108,15 @@ struct cppc_perf_caps {
 	u32 lowest_nonlinear_perf;
 	u32 lowest_freq;
 	u32 nominal_freq;
+	u32 energy_perf;
+	bool auto_sel;
 };
 
 struct cppc_perf_ctrls {
 	u32 max_perf;
 	u32 min_perf;
 	u32 desired_perf;
+	u32 energy_perf;
 };
 
 struct cppc_perf_fb_ctrs {
@@ -149,6 +152,10 @@ extern bool cpc_ffh_supported(void);
 extern bool cpc_supported_by_cpu(void);
 extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val);
 extern int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val);
+extern int cppc_get_epp_perf(int cpunum, u64 *epp_perf);
+extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable);
+extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps);
+extern int cppc_set_auto_sel(int cpu, bool enable);
 #else /* !CONFIG_ACPI_CPPC_LIB */
 static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
 {
@@ -202,6 +209,22 @@ static inline int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
 {
 	return -ENOTSUPP;
 }
+static inline int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable)
+{
+	return -ENOTSUPP;
+}
+static inline int cppc_get_epp_perf(int cpunum, u64 *epp_perf)
+{
+	return -ENOTSUPP;
+}
+static inline int cppc_set_auto_sel(int cpu, bool enable)
+{
+	return -ENOTSUPP;
+}
+static inline int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps)
+{
+	return -ENOTSUPP;
+}
 #endif /* !CONFIG_ACPI_CPPC_LIB */
 
 #endif /* _CPPC_ACPI_H*/
diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h
index 1c4b8659f171..c10ebf8c42e6 100644
--- a/include/linux/amd-pstate.h
+++ b/include/linux/amd-pstate.h
@@ -12,6 +12,11 @@
 #include <linux/pm_qos.h>
 
+#define AMD_CPPC_EPP_PERFORMANCE		0x00
+#define AMD_CPPC_EPP_BALANCE_PERFORMANCE	0x80
+#define AMD_CPPC_EPP_BALANCE_POWERSAVE		0xBF
+#define AMD_CPPC_EPP_POWERSAVE			0xFF
+
 /*********************************************************************
  *                        AMD P-state INTERFACE                      *
  *********************************************************************/
@@ -47,6 +52,10 @@ struct amd_aperf_mperf {
  * @prev: Last Aperf/Mperf/tsc count value read from register
  * @freq: current cpu frequency value
  * @boost_supported: check whether the Processor or SBIOS supports boost mode
+ * @epp_policy: Last saved policy used to set energy-performance preference
+ * @epp_cached: Cached CPPC energy-performance preference value
+ * @policy: Cpufreq policy value
+ * @cppc_cap1_cached: Cached MSR_AMD_CPPC_CAP1 register value
  *
  * The amd_cpudata is key private data for each CPU thread in AMD P-State, and
  * represents all the attributes and goals that AMD P-State requests at runtime.
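Several functions in this patch open-code the same bit manipulation on the cached ``MSR_AMD_CPPC_REQ`` value, so a small hedged sketch may help make the layout concrete: the EPP hint lives in bits 31:24 of the request word, exactly as amd_pstate_set_epp() and amd_pstate_epp_init() handle it above. The helper name here is hypothetical:

	#include <linux/bits.h>
	#include <linux/types.h>

	/* Hypothetical helper: replace the EPP hint (bits 31:24) in a cached
	 * CPPC request word, e.g. with one of the AMD_CPPC_EPP_* values
	 * defined below.
	 */
	static inline u64 example_pack_epp(u64 cppc_req, u8 epp)
	{
		cppc_req &= ~GENMASK_ULL(31, 24);	/* clear the old hint */
		cppc_req |= (u64)epp << 24;		/* insert the new one */

		return cppc_req;
	}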
@@ -72,6 +81,31 @@ struct amd_cpudata { u64 freq; bool boost_supported; + + /* EPP feature related attributes*/ + s16 epp_policy; + s16 epp_cached; + u32 policy; + u64 cppc_cap1_cached; + bool suspended; }; +/* + * enum amd_pstate_mode - driver working mode of amd pstate + */ +enum amd_pstate_mode { + AMD_PSTATE_DISABLE = 0, + AMD_PSTATE_PASSIVE, + AMD_PSTATE_ACTIVE, + AMD_PSTATE_GUIDED, + AMD_PSTATE_MAX, +}; + +static const char * const amd_pstate_mode_string[] = { + [AMD_PSTATE_DISABLE] = "disable", + [AMD_PSTATE_PASSIVE] = "passive", + [AMD_PSTATE_ACTIVE] = "active", + [AMD_PSTATE_GUIDED] = "guided", + NULL, +}; #endif /* _LINUX_AMD_PSTATE_H */ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index d5595d57f4e5..89a6d76ce1cf 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -448,7 +448,7 @@ struct cpufreq_driver { #define CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING BIT(6) int cpufreq_register_driver(struct cpufreq_driver *driver_data); -int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); +void cpufreq_unregister_driver(struct cpufreq_driver *driver_data); bool cpufreq_driver_test_flags(u16 flags); const char *cpufreq_get_current_driver(void); -- 2.40.0 From 5ada46b5f648da47a486a079dacee032e3f83459 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Wed, 1 Feb 2023 10:30:17 +0100 Subject: [PATCH 09/17] kallsyms-modules Signed-off-by: Peter Jung --- include/linux/kallsyms.h | 9 + include/linux/module.h | 1 - init/Kconfig | 13 + kernel/Makefile | 1 + kernel/kallsyms.c | 116 ++++++++- kernel/kallsyms_internal.h | 1 + kernel/kallsyms_selftest.c | 485 +++++++++++++++++++++++++++++++++++++ kernel/kallsyms_selftest.h | 13 + kernel/livepatch/core.c | 31 ++- kernel/module/main.c | 3 - kernel/module/sysfs.c | 2 +- kernel/params.c | 23 +- scripts/kallsyms.c | 78 +++++- scripts/link-vmlinux.sh | 4 + 14 files changed, 744 insertions(+), 36 deletions(-) create mode 100644 kernel/kallsyms_selftest.c create mode 100644 kernel/kallsyms_selftest.h diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 649faac31ddb..0065209cc004 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -66,9 +66,12 @@ static inline void *dereference_symbol_descriptor(void *ptr) } #ifdef CONFIG_KALLSYMS +unsigned long kallsyms_sym_address(int idx); int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), void *data); +int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long), + const char *name, void *data); /* Lookup the address for a symbol. Returns 0 if not found. 
*/ unsigned long kallsyms_lookup_name(const char *name); @@ -168,6 +171,12 @@ static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct { return -EOPNOTSUPP; } + +static inline int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long), + const char *name, void *data) +{ + return -EOPNOTSUPP; +} #endif /*CONFIG_KALLSYMS*/ static inline void print_ip_sym(const char *loglvl, unsigned long ip) diff --git a/include/linux/module.h b/include/linux/module.h index ec61fb53979a..676614d56c25 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -827,7 +827,6 @@ void *dereference_module_function_descriptor(struct module *mod, void *ptr) #ifdef CONFIG_SYSFS extern struct kset *module_kset; extern struct kobj_type module_ktype; -extern int module_sysfs_initialized; #endif /* CONFIG_SYSFS */ #define symbol_request(x) try_then_request_module(symbol_get(x), "symbol:" #x) diff --git a/init/Kconfig b/init/Kconfig index b5ed2afd0d0a..9143fd8da992 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1766,6 +1766,19 @@ config KALLSYMS symbolic stack backtraces. This increases the size of the kernel somewhat, as all symbols have to be loaded into the kernel image. +config KALLSYMS_SELFTEST + bool "Test the basic functions and performance of kallsyms" + depends on KALLSYMS + default n + help + Test the basic functions and performance of some interfaces, such as + kallsyms_lookup_name. It also calculates the compression rate of the + kallsyms compression algorithm for the current symbol set. + + Start self-test automatically after system startup. Suggest executing + "dmesg | grep kallsyms_selftest" to collect test results. "finish" is + displayed in the last line, indicating that the test is complete. + config KALLSYMS_ALL bool "Include all symbols in kallsyms" depends on DEBUG_KERNEL && KALLSYMS diff --git a/kernel/Makefile b/kernel/Makefile index ebc692242b68..10ef068f598d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -66,6 +66,7 @@ endif obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o +obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_CRASH_CORE) += crash_core.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 60c20f301a6b..83f499182c9a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -146,7 +146,7 @@ static unsigned int get_symbol_offset(unsigned long pos) return name - kallsyms_names; } -static unsigned long kallsyms_sym_address(int idx) +unsigned long kallsyms_sym_address(int idx) { if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) return kallsyms_addresses[idx]; @@ -187,26 +187,100 @@ static bool cleanup_symbol_name(char *s) return false; } +static int compare_symbol_name(const char *name, char *namebuf) +{ + int ret; + + ret = strcmp(name, namebuf); + if (!ret) + return ret; + + if (cleanup_symbol_name(namebuf) && !strcmp(name, namebuf)) + return 0; + + return ret; +} + +static unsigned int get_symbol_seq(int index) +{ + unsigned int i, seq = 0; + + for (i = 0; i < 3; i++) + seq = (seq << 8) | kallsyms_seqs_of_names[3 * index + i]; + + return seq; +} + +static int kallsyms_lookup_names(const char *name, + unsigned int *start, + unsigned int *end) +{ + int ret; + int low, mid, high; + unsigned int seq, off; + char namebuf[KSYM_NAME_LEN]; + + low = 0; + high = kallsyms_num_syms - 1; + + while (low <= high) { + mid = low + (high - low) / 2; + seq = get_symbol_seq(mid); + 
off = get_symbol_offset(seq);
+		kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+		ret = compare_symbol_name(name, namebuf);
+		if (ret > 0)
+			low = mid + 1;
+		else if (ret < 0)
+			high = mid - 1;
+		else
+			break;
+	}
+
+	if (low > high)
+		return -ESRCH;
+
+	low = mid;
+	while (low) {
+		seq = get_symbol_seq(low - 1);
+		off = get_symbol_offset(seq);
+		kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+		if (compare_symbol_name(name, namebuf))
+			break;
+		low--;
+	}
+	*start = low;
+
+	if (end) {
+		high = mid;
+		while (high < kallsyms_num_syms - 1) {
+			seq = get_symbol_seq(high + 1);
+			off = get_symbol_offset(seq);
+			kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+			if (compare_symbol_name(name, namebuf))
+				break;
+			high++;
+		}
+		*end = high;
+	}
+
+	return 0;
+}
+
 /* Lookup the address for this symbol. Returns 0 if not found. */
 unsigned long kallsyms_lookup_name(const char *name)
 {
-	char namebuf[KSYM_NAME_LEN];
-	unsigned long i;
-	unsigned int off;
+	int ret;
+	unsigned int i;
 
 	/* Skip the search for empty string. */
 	if (!*name)
 		return 0;
 
-	for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
-		off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
-
-		if (strcmp(namebuf, name) == 0)
-			return kallsyms_sym_address(i);
+	ret = kallsyms_lookup_names(name, &i, NULL);
+	if (!ret)
+		return kallsyms_sym_address(get_symbol_seq(i));
 
-		if (cleanup_symbol_name(namebuf) && strcmp(namebuf, name) == 0)
-			return kallsyms_sym_address(i);
-	}
 	return module_kallsyms_lookup_name(name);
 }
 
@@ -233,6 +307,24 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
 	return 0;
 }
 
+int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long),
+				  const char *name, void *data)
+{
+	int ret;
+	unsigned int i, start, end;
+
+	ret = kallsyms_lookup_names(name, &start, &end);
+	if (ret)
+		return 0;
+
+	for (i = start; !ret && i <= end; i++) {
+		ret = fn(data, kallsyms_sym_address(get_symbol_seq(i)));
+		cond_resched();
+	}
+
+	return ret;
+}
+
 static unsigned long get_symbol_pos(unsigned long addr,
 				    unsigned long *symbolsize,
 				    unsigned long *offset)
diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h
index 2d0c6f2f0243..27fabdcc40f5 100644
--- a/kernel/kallsyms_internal.h
+++ b/kernel/kallsyms_internal.h
@@ -26,5 +26,6 @@ extern const char kallsyms_token_table[] __weak;
 extern const u16 kallsyms_token_index[] __weak;
 
 extern const unsigned int kallsyms_markers[] __weak;
+extern const u8 kallsyms_seqs_of_names[] __weak;
 
 #endif // LINUX_KALLSYMS_INTERNAL_H_
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
new file mode 100644
index 000000000000..f35d9cc1aab1
--- /dev/null
+++ b/kernel/kallsyms_selftest.c
@@ -0,0 +1,485 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Test the function and performance of kallsyms
+ *
+ * Copyright (C) Huawei Technologies Co., Ltd., 2022
+ *
+ * Authors: Zhen Lei <thunder.leizhen@huawei.com> Huawei
+ */
+
+#define pr_fmt(fmt) "kallsyms_selftest: " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/random.h>
+#include <linux/sched/clock.h>
+#include <linux/kthread.h>
+#include <linux/vmalloc.h>
+
+#include "kallsyms_internal.h"
+#include "kallsyms_selftest.h"
+
+
+#define MAX_NUM_OF_RECORDS	64
+
+struct test_stat {
+	int min;
+	int max;
+	int save_cnt;
+	int real_cnt;
+	int perf;
+	u64 sum;
+	char *name;
+	unsigned long addr;
+	unsigned long addrs[MAX_NUM_OF_RECORDS];
+};
+
+struct test_item {
+	char *name;
+	unsigned long addr;
+};
+
+#define ITEM_FUNC(s)				\
+	{					\
+		.name = #s,			\
+		.addr = (unsigned long)s,	\
+	}
+
+#define ITEM_DATA(s)				\
+	{					\
+		.name = #s,			\
+		.addr = (unsigned long)&s,	\
+
+	}
+
+
+static int kallsyms_test_var_bss_static;
+static int kallsyms_test_var_data_static = 1;
+int kallsyms_test_var_bss;
+int kallsyms_test_var_data = 1;
+
+static int kallsyms_test_func_static(void)
+{
+	kallsyms_test_var_bss_static++;
+	kallsyms_test_var_data_static++;
+
+	return 0;
+}
+
+int kallsyms_test_func(void)
+{
+	return kallsyms_test_func_static();
+}
+
+__weak int kallsyms_test_func_weak(void)
+{
+	kallsyms_test_var_bss++;
+	kallsyms_test_var_data++;
+	return 0;
+}
+
+static struct test_item test_items[] = {
+	ITEM_FUNC(kallsyms_test_func_static),
+	ITEM_FUNC(kallsyms_test_func),
+	ITEM_FUNC(kallsyms_test_func_weak),
+	ITEM_FUNC(vmalloc),
+	ITEM_FUNC(vfree),
+#ifdef CONFIG_KALLSYMS_ALL
+	ITEM_DATA(kallsyms_test_var_bss_static),
+	ITEM_DATA(kallsyms_test_var_data_static),
+	ITEM_DATA(kallsyms_test_var_bss),
+	ITEM_DATA(kallsyms_test_var_data),
+	ITEM_DATA(vmap_area_list),
+#endif
+};
+
+static char stub_name[KSYM_NAME_LEN];
+
+static int stat_symbol_len(void *data, const char *name, struct module *mod, unsigned long addr)
+{
+	*(u32 *)data += strlen(name);
+
+	return 0;
+}
+
+static void test_kallsyms_compression_ratio(void)
+{
+	u32 pos, off, len, num;
+	u32 ratio, total_size, total_len = 0;
+
+	kallsyms_on_each_symbol(stat_symbol_len, &total_len);
+
+	/*
+	 * A symbol name cannot start with a number. This stub name helps us
+	 * traverse the entire symbol table without finding a match. It's used
+	 * for subsequent performance tests, and its length is the average
+	 * length of all symbol names.
+	 */
+	memset(stub_name, '4', sizeof(stub_name));
+	pos = total_len / kallsyms_num_syms;
+	stub_name[pos] = 0;
+
+	pos = 0;
+	num = 0;
+	off = 0;
+	while (pos < kallsyms_num_syms) {
+		len = kallsyms_names[off];
+		num++;
+		off++;
+		pos++;
+		if ((len & 0x80) != 0) {
+			len = (len & 0x7f) | (kallsyms_names[off] << 7);
+			num++;
+			off++;
+		}
+		off += len;
+	}
+
+	/*
+	 * 1. The length fields are not counted
+	 * 2. The memory occupied by array kallsyms_token_table[] and
+	 *    kallsyms_token_index[] needs to be counted.
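+	 *
+	 * Worked example with made-up numbers: 100000 symbols whose
+	 * expanded names total 2000000 bytes and whose compressed form
+	 * (including both token arrays) totals 600000 bytes give
+	 * ratio = 10000 * 600000 / 2000000 = 3000, printed as "30.0".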
+ */ + total_size = off - num; + pos = kallsyms_token_index[0xff]; + total_size += pos + strlen(&kallsyms_token_table[pos]) + 1; + total_size += 0x100 * sizeof(u16); + + pr_info(" ---------------------------------------------------------\n"); + pr_info("| nr_symbols | compressed size | original size | ratio(%%) |\n"); + pr_info("|---------------------------------------------------------|\n"); + ratio = (u32)div_u64(10000ULL * total_size, total_len); + pr_info("| %10d | %10d | %10d | %2d.%-2d |\n", + kallsyms_num_syms, total_size, total_len, ratio / 100, ratio % 100); + pr_info(" ---------------------------------------------------------\n"); +} + +static int lookup_name(void *data, const char *name, struct module *mod, unsigned long addr) +{ + u64 t0, t1, t; + unsigned long flags; + struct test_stat *stat = (struct test_stat *)data; + + local_irq_save(flags); + t0 = sched_clock(); + (void)kallsyms_lookup_name(name); + t1 = sched_clock(); + local_irq_restore(flags); + + t = t1 - t0; + if (t < stat->min) + stat->min = t; + + if (t > stat->max) + stat->max = t; + + stat->real_cnt++; + stat->sum += t; + + return 0; +} + +static void test_perf_kallsyms_lookup_name(void) +{ + struct test_stat stat; + + memset(&stat, 0, sizeof(stat)); + stat.min = INT_MAX; + kallsyms_on_each_symbol(lookup_name, &stat); + pr_info("kallsyms_lookup_name() looked up %d symbols\n", stat.real_cnt); + pr_info("The time spent on each symbol is (ns): min=%d, max=%d, avg=%lld\n", + stat.min, stat.max, div_u64(stat.sum, stat.real_cnt)); +} + +static bool match_cleanup_name(const char *s, const char *name) +{ + char *p; + int len; + + if (!IS_ENABLED(CONFIG_LTO_CLANG)) + return false; + + p = strchr(s, '.'); + if (!p) + return false; + + len = strlen(name); + if (p - s != len) + return false; + + return !strncmp(s, name, len); +} + +static int find_symbol(void *data, const char *name, struct module *mod, unsigned long addr) +{ + struct test_stat *stat = (struct test_stat *)data; + + if (strcmp(name, stat->name) == 0 || + (!stat->perf && match_cleanup_name(name, stat->name))) { + stat->real_cnt++; + stat->addr = addr; + + if (stat->save_cnt < MAX_NUM_OF_RECORDS) { + stat->addrs[stat->save_cnt] = addr; + stat->save_cnt++; + } + + if (stat->real_cnt == stat->max) + return 1; + } + + return 0; +} + +static void test_perf_kallsyms_on_each_symbol(void) +{ + u64 t0, t1; + unsigned long flags; + struct test_stat stat; + + memset(&stat, 0, sizeof(stat)); + stat.max = INT_MAX; + stat.name = stub_name; + stat.perf = 1; + local_irq_save(flags); + t0 = sched_clock(); + kallsyms_on_each_symbol(find_symbol, &stat); + t1 = sched_clock(); + local_irq_restore(flags); + pr_info("kallsyms_on_each_symbol() traverse all: %lld ns\n", t1 - t0); +} + +static int match_symbol(void *data, unsigned long addr) +{ + struct test_stat *stat = (struct test_stat *)data; + + stat->real_cnt++; + stat->addr = addr; + + if (stat->save_cnt < MAX_NUM_OF_RECORDS) { + stat->addrs[stat->save_cnt] = addr; + stat->save_cnt++; + } + + if (stat->real_cnt == stat->max) + return 1; + + return 0; +} + +static void test_perf_kallsyms_on_each_match_symbol(void) +{ + u64 t0, t1; + unsigned long flags; + struct test_stat stat; + + memset(&stat, 0, sizeof(stat)); + stat.max = INT_MAX; + stat.name = stub_name; + local_irq_save(flags); + t0 = sched_clock(); + kallsyms_on_each_match_symbol(match_symbol, stat.name, &stat); + t1 = sched_clock(); + local_irq_restore(flags); + pr_info("kallsyms_on_each_match_symbol() traverse all: %lld ns\n", t1 - t0); +} + +static int 
test_kallsyms_basic_function(void) +{ + int i, j, ret; + int next = 0, nr_failed = 0; + char *prefix; + unsigned short rand; + unsigned long addr, lookup_addr; + char namebuf[KSYM_NAME_LEN]; + struct test_stat *stat, *stat2; + + stat = kmalloc(sizeof(*stat) * 2, GFP_KERNEL); + if (!stat) + return -ENOMEM; + stat2 = stat + 1; + + prefix = "kallsyms_lookup_name() for"; + for (i = 0; i < ARRAY_SIZE(test_items); i++) { + addr = kallsyms_lookup_name(test_items[i].name); + if (addr != test_items[i].addr) { + nr_failed++; + pr_info("%s %s failed: addr=%lx, expect %lx\n", + prefix, test_items[i].name, addr, test_items[i].addr); + } + } + + prefix = "kallsyms_on_each_symbol() for"; + for (i = 0; i < ARRAY_SIZE(test_items); i++) { + memset(stat, 0, sizeof(*stat)); + stat->max = INT_MAX; + stat->name = test_items[i].name; + kallsyms_on_each_symbol(find_symbol, stat); + if (stat->addr != test_items[i].addr || stat->real_cnt != 1) { + nr_failed++; + pr_info("%s %s failed: count=%d, addr=%lx, expect %lx\n", + prefix, test_items[i].name, + stat->real_cnt, stat->addr, test_items[i].addr); + } + } + + prefix = "kallsyms_on_each_match_symbol() for"; + for (i = 0; i < ARRAY_SIZE(test_items); i++) { + memset(stat, 0, sizeof(*stat)); + stat->max = INT_MAX; + stat->name = test_items[i].name; + kallsyms_on_each_match_symbol(match_symbol, test_items[i].name, stat); + if (stat->addr != test_items[i].addr || stat->real_cnt != 1) { + nr_failed++; + pr_info("%s %s failed: count=%d, addr=%lx, expect %lx\n", + prefix, test_items[i].name, + stat->real_cnt, stat->addr, test_items[i].addr); + } + } + + if (nr_failed) { + kfree(stat); + return -ESRCH; + } + + for (i = 0; i < kallsyms_num_syms; i++) { + addr = kallsyms_sym_address(i); + if (!is_ksym_addr(addr)) + continue; + + ret = lookup_symbol_name(addr, namebuf); + if (unlikely(ret)) { + namebuf[0] = 0; + goto failed; + } + + /* + * The first '.' may be the initial letter, in which case the + * entire symbol name will be truncated to an empty string in + * cleanup_symbol_name(). Do not test these symbols. + * + * For example: + * cat /proc/kallsyms | awk '{print $3}' | grep -E "^\." | head + * .E_read_words + * .E_leading_bytes + * .E_trailing_bytes + * .E_write_words + * .E_copy + * .str.292.llvm.12122243386960820698 + * .str.24.llvm.12122243386960820698 + * .str.29.llvm.12122243386960820698 + * .str.75.llvm.12122243386960820698 + * .str.99.llvm.12122243386960820698 + */ + if (IS_ENABLED(CONFIG_LTO_CLANG) && !namebuf[0]) + continue; + + lookup_addr = kallsyms_lookup_name(namebuf); + + memset(stat, 0, sizeof(*stat)); + stat->max = INT_MAX; + kallsyms_on_each_match_symbol(match_symbol, namebuf, stat); + + /* + * kallsyms_on_each_symbol() is too slow, randomly select some + * symbols for test. + */ + if (i >= next) { + memset(stat2, 0, sizeof(*stat2)); + stat2->max = INT_MAX; + stat2->name = namebuf; + kallsyms_on_each_symbol(find_symbol, stat2); + + /* + * kallsyms_on_each_symbol() and kallsyms_on_each_match_symbol() + * need to get the same traversal result. + */ + if (stat->addr != stat2->addr || + stat->real_cnt != stat2->real_cnt || + memcmp(stat->addrs, stat2->addrs, + stat->save_cnt * sizeof(stat->addrs[0]))) + goto failed; + + /* + * The average of random increments is 128, that is, one of + * them is tested every 128 symbols. 
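+			 * ((rand & 0xff) is uniform over [0, 255], so the
+			 * step (rand & 0xff) + 1 used below averages 128.5.)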
+			 */
+			get_random_bytes(&rand, sizeof(rand));
+			next = i + (rand & 0xff) + 1;
+		}
+
+		/* Need to be found at least once */
+		if (!stat->real_cnt)
+			goto failed;
+
+		/*
+		 * kallsyms_lookup_name() returns the address of the first
+		 * symbol found and cannot be NULL.
+		 */
+		if (!lookup_addr || lookup_addr != stat->addrs[0])
+			goto failed;
+
+		/*
+		 * If the addresses of all matching symbols are recorded, the
+		 * target address needs to be among them.
+		 */
+		if (stat->real_cnt <= MAX_NUM_OF_RECORDS) {
+			for (j = 0; j < stat->save_cnt; j++) {
+				if (stat->addrs[j] == addr)
+					break;
+			}
+
+			if (j == stat->save_cnt)
+				goto failed;
+		}
+	}
+
+	kfree(stat);
+
+	return 0;
+
+failed:
+	pr_info("Test for %dth symbol failed: (%s) addr=%lx\n", i, namebuf, addr);
+	kfree(stat);
+	return -ESRCH;
+}
+
+static int test_entry(void *p)
+{
+	int ret;
+
+	do {
+		schedule_timeout(5 * HZ);
+	} while (system_state != SYSTEM_RUNNING);
+
+	pr_info("start\n");
+	ret = test_kallsyms_basic_function();
+	if (ret) {
+		pr_info("abort\n");
+		return 0;
+	}
+
+	test_kallsyms_compression_ratio();
+	test_perf_kallsyms_lookup_name();
+	test_perf_kallsyms_on_each_symbol();
+	test_perf_kallsyms_on_each_match_symbol();
+	pr_info("finish\n");
+
+	return 0;
+}
+
+static int __init kallsyms_test_init(void)
+{
+	struct task_struct *t;
+
+	t = kthread_create(test_entry, NULL, "kallsyms_test");
+	if (IS_ERR(t)) {
+		pr_info("Create kallsyms selftest task failed\n");
+		return PTR_ERR(t);
+	}
+	kthread_bind(t, 0);
+	wake_up_process(t);
+
+	return 0;
+}
+late_initcall(kallsyms_test_init);
diff --git a/kernel/kallsyms_selftest.h b/kernel/kallsyms_selftest.h
new file mode 100644
index 000000000000..c0ca548e2a22
--- /dev/null
+++ b/kernel/kallsyms_selftest.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef LINUX_KALLSYMS_SELFTEST_H_
+#define LINUX_KALLSYMS_SELFTEST_H_
+
+#include <linux/types.h>
+
+extern int kallsyms_test_var_bss;
+extern int kallsyms_test_var_data;
+
+extern int kallsyms_test_func(void);
+extern int kallsyms_test_func_weak(void);
+
+#endif // LINUX_KALLSYMS_SELFTEST_H_
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 9ada0bc5247b..201f0c0482fb 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -125,20 +125,10 @@ struct klp_find_arg {
 	unsigned long pos;
 };
 
-static int klp_find_callback(void *data, const char *name,
-			     struct module *mod, unsigned long addr)
+static int klp_match_callback(void *data, unsigned long addr)
 {
 	struct klp_find_arg *args = data;
 
-	if ((mod && !args->objname) || (!mod && args->objname))
-		return 0;
-
-	if (strcmp(args->name, name))
-		return 0;
-
-	if (args->objname && strcmp(args->objname, mod->name))
-		return 0;
-
 	args->addr = addr;
 	args->count++;
 
@@ -153,6 +143,23 @@ static int klp_find_callback(void *data, const char *name,
 	return 0;
 }
 
+static int klp_find_callback(void *data, const char *name,
+			     struct module *mod, unsigned long addr)
+{
+	struct klp_find_arg *args = data;
+
+	if ((mod && !args->objname) || (!mod && args->objname))
+		return 0;
+
+	if (strcmp(args->name, name))
+		return 0;
+
+	if (args->objname && strcmp(args->objname, mod->name))
+		return 0;
+
+	return klp_match_callback(data, addr);
+}
+
 static int klp_find_object_symbol(const char *objname, const char *name,
 				  unsigned long sympos, unsigned long *addr)
 {
@@ -167,7 +174,7 @@ static int klp_find_object_symbol(const char *objname, const char *name,
 	if (objname)
 		module_kallsyms_on_each_symbol(klp_find_callback, &args);
 	else
-		kallsyms_on_each_symbol(klp_find_callback, &args);
+
kallsyms_on_each_match_symbol(klp_match_callback, name, &args); /* * Ensure an address was found. If sympos is 0, ensure symbol is unique; diff --git a/kernel/module/main.c b/kernel/module/main.c index 7a627345d4fd..e0e9e4482ae2 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -85,9 +85,6 @@ struct mod_tree_root mod_data_tree __cacheline_aligned = { }; #endif -#define module_addr_min mod_tree.addr_min -#define module_addr_max mod_tree.addr_max - struct symsearch { const struct kernel_symbol *start, *stop; const s32 *crcs; diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c index ce68f821dcd1..c921bf044050 100644 --- a/kernel/module/sysfs.c +++ b/kernel/module/sysfs.c @@ -340,7 +340,7 @@ static int mod_sysfs_init(struct module *mod) int err; struct kobject *kobj; - if (!module_sysfs_initialized) { + if (!module_kset) { pr_err("%s: module sysfs not initialized\n", mod->name); err = -EINVAL; goto out; diff --git a/kernel/params.c b/kernel/params.c index 5b92310425c5..a06f80c56f19 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -940,7 +940,6 @@ static const struct kset_uevent_ops module_uevent_ops = { }; struct kset *module_kset; -int module_sysfs_initialized; static void module_kobj_release(struct kobject *kobj) { @@ -954,7 +953,11 @@ struct kobj_type module_ktype = { }; /* - * param_sysfs_init - wrapper for built-in params support + * param_sysfs_init - create "module" kset + * + * This must be done before the initramfs is unpacked and + * request_module() thus becomes possible, because otherwise the + * module load would fail in mod_sysfs_init. */ static int __init param_sysfs_init(void) { @@ -964,13 +967,25 @@ static int __init param_sysfs_init(void) __FILE__, __LINE__); return -ENOMEM; } - module_sysfs_initialized = 1; + + return 0; +} +subsys_initcall(param_sysfs_init); + +/* + * param_sysfs_builtin_init - add sysfs version and parameter + * attributes for built-in modules + */ +static int __init param_sysfs_builtin_init(void) +{ + if (!module_kset) + return -ENOMEM; version_sysfs_builtin(); param_sysfs_builtin(); return 0; } -subsys_initcall(param_sysfs_init); +late_initcall(param_sysfs_builtin_init); #endif /* CONFIG_SYSFS */ diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index 03fa07ad45d9..04e04fbd9625 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -49,6 +49,7 @@ _Static_assert( struct sym_entry { unsigned long long addr; unsigned int len; + unsigned int seq; unsigned int start_pos; unsigned int percpu_absolute; unsigned char sym[]; @@ -77,6 +78,7 @@ static unsigned int table_size, table_cnt; static int all_symbols; static int absolute_percpu; static int base_relative; +static int lto_clang; static int token_profit[0x10000]; @@ -88,7 +90,7 @@ static unsigned char best_table_len[256]; static void usage(void) { fprintf(stderr, "Usage: kallsyms [--all-symbols] [--absolute-percpu] " - "[--base-relative] in.map > out.S\n"); + "[--base-relative] [--lto-clang] in.map > out.S\n"); exit(1); } @@ -410,6 +412,65 @@ static int symbol_absolute(const struct sym_entry *s) return s->percpu_absolute; } +static char * s_name(char *buf) +{ + /* Skip the symbol type */ + return buf + 1; +} + +static void cleanup_symbol_name(char *s) +{ + char *p; + + if (!lto_clang) + return; + + /* + * ASCII[.] = 2e + * ASCII[0-9] = 30,39 + * ASCII[A-Z] = 41,5a + * ASCII[_] = 5f + * ASCII[a-z] = 61,7a + * + * As above, replacing '.' with '\0' does not affect the main sorting, + * but it helps us with subsorting. 
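+	 *
+	 * For example, two Clang LTO symbols "foo.llvm.123" and "foo.llvm.456"
+	 * (illustrative names) both collapse to "foo" here, so compare_names()
+	 * sees them as equal and falls back to address order, and finally to
+	 * the original "seq" order, to subsort them. And since '.' (0x2e)
+	 * sorts below all of [0-9A-Z_a-z], the truncation cannot reorder two
+	 * distinct names.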
+ */ + p = strchr(s, '.'); + if (p) + *p = '\0'; +} + +static int compare_names(const void *a, const void *b) +{ + int ret; + char sa_namebuf[KSYM_NAME_LEN]; + char sb_namebuf[KSYM_NAME_LEN]; + const struct sym_entry *sa = *(const struct sym_entry **)a; + const struct sym_entry *sb = *(const struct sym_entry **)b; + + expand_symbol(sa->sym, sa->len, sa_namebuf); + expand_symbol(sb->sym, sb->len, sb_namebuf); + cleanup_symbol_name(s_name(sa_namebuf)); + cleanup_symbol_name(s_name(sb_namebuf)); + ret = strcmp(s_name(sa_namebuf), s_name(sb_namebuf)); + if (!ret) { + if (sa->addr > sb->addr) + return 1; + else if (sa->addr < sb->addr) + return -1; + + /* keep old order */ + return (int)(sa->seq - sb->seq); + } + + return ret; +} + +static void sort_symbols_by_name(void) +{ + qsort(table, table_cnt, sizeof(table[0]), compare_names); +} + static void write_src(void) { unsigned int i, k, off; @@ -495,6 +556,7 @@ static void write_src(void) for (i = 0; i < table_cnt; i++) { if ((i & 0xFF) == 0) markers[i >> 8] = off; + table[i]->seq = i; /* There cannot be any symbol of length zero. */ if (table[i]->len == 0) { @@ -535,6 +597,15 @@ static void write_src(void) free(markers); + sort_symbols_by_name(); + output_label("kallsyms_seqs_of_names"); + for (i = 0; i < table_cnt; i++) + printf("\t.byte 0x%02x, 0x%02x, 0x%02x\n", + (unsigned char)(table[i]->seq >> 16), + (unsigned char)(table[i]->seq >> 8), + (unsigned char)(table[i]->seq >> 0)); + printf("\n"); + output_label("kallsyms_token_table"); off = 0; for (i = 0; i < 256; i++) { @@ -573,7 +644,7 @@ static void forget_symbol(const unsigned char *symbol, int len) } /* do the initial token count */ -static void build_initial_tok_table(void) +static void build_initial_token_table(void) { unsigned int i; @@ -698,7 +769,7 @@ static void insert_real_symbols_in_table(void) static void optimize_token_table(void) { - build_initial_tok_table(); + build_initial_token_table(); insert_real_symbols_in_table(); @@ -818,6 +889,7 @@ int main(int argc, char **argv) {"all-symbols", no_argument, &all_symbols, 1}, {"absolute-percpu", no_argument, &absolute_percpu, 1}, {"base-relative", no_argument, &base_relative, 1}, + {"lto-clang", no_argument, <o_clang, 1}, {}, }; diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 918470d768e9..32e573943cf0 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -156,6 +156,10 @@ kallsyms() kallsymopt="${kallsymopt} --base-relative" fi + if is_enabled CONFIG_LTO_CLANG; then + kallsymopt="${kallsymopt} --lto-clang" + fi + info KSYMS ${2} scripts/kallsyms ${kallsymopt} ${1} > ${2} } -- 2.40.0 From e2e97c4a556e4b49f8a64100720a399c5a51a698 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 17 Oct 2022 21:09:17 +0200 Subject: [PATCH 10/17] ksm Signed-off-by: Peter Jung --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 + arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 
1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + include/linux/ksm.h | 4 + include/linux/syscalls.h | 1 + include/uapi/asm-generic/unistd.h | 5 +- kernel/sys_ni.c | 1 + mm/ksm.c | 88 +++++++++------ mm/madvise.c | 113 ++++++++++++++++++++ 24 files changed, 198 insertions(+), 34 deletions(-) diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 8ebacf37a8cf..c9d25f85d86d 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -490,3 +490,4 @@ 558 common process_mrelease sys_process_mrelease 559 common futex_waitv sys_futex_waitv 560 common set_mempolicy_home_node sys_ni_syscall +561 common pmadv_ksm sys_pmadv_ksm diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index ac964612d8b0..90933eabe115 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -464,3 +464,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 037feba03a51..64a514f90131 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -39,7 +39,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 451 +#define __NR_compat_syscalls 452 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 604a2053d006..91f2bb7199af 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -907,6 +907,8 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease) __SYSCALL(__NR_futex_waitv, sys_futex_waitv) #define __NR_set_mempolicy_home_node 450 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) +#define __NR_pmadv_ksm 451 +__SYSCALL(__NR_pmadv_ksm, sys_pmadv_ksm) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 72c929d9902b..0d5b1d14b2b5 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -371,3 +371,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index b1f3940bc298..5ccf925567da 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -450,3 +450,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 820145e47350..6b76208597f3 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -456,3 +456,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 253ff994ed2e..e4aeedb17c38 100644 --- 
a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -389,3 +389,4 @@ 448 n32 process_mrelease sys_process_mrelease 449 n32 futex_waitv sys_futex_waitv 450 n32 set_mempolicy_home_node sys_set_mempolicy_home_node +451 n32 pmadv_ksm sys_pmadv_ksm diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 3f1886ad9d80..fe88db51efa0 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -365,3 +365,4 @@ 448 n64 process_mrelease sys_process_mrelease 449 n64 futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 n64 pmadv_ksm sys_pmadv_ksm diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 8f243e35a7b2..674cb940bd15 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -438,3 +438,4 @@ 448 o32 process_mrelease sys_process_mrelease 449 o32 futex_waitv sys_futex_waitv 450 o32 set_mempolicy_home_node sys_set_mempolicy_home_node +451 o32 pmadv_ksm sys_pmadv_ksm diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 0e42fceb2d5e..5914aa460255 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -448,3 +448,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index a0be127475b1..347894da4eb6 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -537,3 +537,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 799147658dee..1cd523748bd2 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -453,3 +453,4 @@ 448 common process_mrelease sys_process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm sys_pmadv_ksm diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 2de85c977f54..cfc75fa43eae 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -453,3 +453,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 4398cc6fb68d..d2c0a6426f6b 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -496,3 +496,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 320480a8db4f..331aaf1a782f 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl 
+++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -455,3 +455,4 @@ 448 i386 process_mrelease sys_process_mrelease 449 i386 futex_waitv sys_futex_waitv 450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node +451 i386 pmadv_ksm sys_pmadv_ksm diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index c84d12608cd2..14902db4c01f 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -372,6 +372,7 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 52c94ab5c205..1518e261d882 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -421,3 +421,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 7e232ba59b86..632a1a792ebb 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -16,6 +16,10 @@ #include #ifdef CONFIG_KSM +int ksm_madvise_merge(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long *vm_flags); +int ksm_madvise_unmerge(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long *vm_flags); int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); int __ksm_enter(struct mm_struct *mm); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a34b0f9a9972..82afad91de9f 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -917,6 +917,7 @@ asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, size_t vlen, int behavior, unsigned int flags); asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); +asmlinkage long sys_pmadv_ksm(int pidfd, int behavior, unsigned int flags); asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 45fa180cc56a..40f7e6d04af0 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -886,8 +886,11 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv) #define __NR_set_mempolicy_home_node 450 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) +#define __NR_pmadv_ksm 451 +__SYSCALL(__NR_pmadv_ksm, sys_pmadv_ksm) + #undef __NR_syscalls -#define __NR_syscalls 451 +#define __NR_syscalls 452 /* * 32 bit systems traditionally used different diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 860b2dcf3ac4..810e1fcaff94 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -292,6 +292,7 @@ COND_SYSCALL(mincore); COND_SYSCALL(madvise); COND_SYSCALL(process_madvise); COND_SYSCALL(process_mrelease); +COND_SYSCALL(pmadv_ksm); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); COND_SYSCALL(get_mempolicy); diff --git a/mm/ksm.c b/mm/ksm.c index 7009cf42be76..f66137cb1f8c 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2443,54 +2443,78 @@ static int ksm_scan_thread(void *nothing) 
return 0; } -int ksm_madvise(struct vm_area_struct *vma, unsigned long start, - unsigned long end, int advice, unsigned long *vm_flags) +int ksm_madvise_merge(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long *vm_flags) { - struct mm_struct *mm = vma->vm_mm; int err; - switch (advice) { - case MADV_MERGEABLE: - /* - * Be somewhat over-protective for now! - */ - if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | - VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_HUGETLB | VM_MIXEDMAP)) - return 0; /* just ignore the advice */ + /* + * Be somewhat over-protective for now! + */ + if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_HUGETLB | VM_MIXEDMAP)) + return 0; /* just ignore the advice */ - if (vma_is_dax(vma)) - return 0; + if (vma_is_dax(vma)) + return 0; #ifdef VM_SAO if (*vm_flags & VM_SAO) return 0; #endif #ifdef VM_SPARC_ADI - if (*vm_flags & VM_SPARC_ADI) - return 0; + if (*vm_flags & VM_SPARC_ADI) + return 0; #endif - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { - err = __ksm_enter(mm); - if (err) - return err; - } + if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + err = __ksm_enter(mm); + if (err) + return err; + } - *vm_flags |= VM_MERGEABLE; - break; + *vm_flags |= VM_MERGEABLE; - case MADV_UNMERGEABLE: - if (!(*vm_flags & VM_MERGEABLE)) - return 0; /* just ignore the advice */ + return 0; +} - if (vma->anon_vma) { - err = unmerge_ksm_pages(vma, start, end); - if (err) - return err; - } +int ksm_madvise_unmerge(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long *vm_flags) +{ + int err; + + if (!(*vm_flags & VM_MERGEABLE)) + return 0; /* just ignore the advice */ + + if (vma->anon_vma) { + err = unmerge_ksm_pages(vma, start, end); + if (err) + return err; + } - *vm_flags &= ~VM_MERGEABLE; + *vm_flags &= ~VM_MERGEABLE; + + return 0; +} + +int ksm_madvise(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int advice, unsigned long *vm_flags) +{ + struct mm_struct *mm = vma->vm_mm; + int err; + + switch (advice) { + case MADV_MERGEABLE: + err = ksm_madvise_merge(mm, vma, vm_flags); + if (err) + return err; + break; + + case MADV_UNMERGEABLE: + err = ksm_madvise_unmerge(vma, start, end, vm_flags); + if (err) + return err; break; } diff --git a/mm/madvise.c b/mm/madvise.c index d03e149ffe6e..07da4f464e39 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1512,3 +1512,116 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, out: return ret; } + +SYSCALL_DEFINE3(pmadv_ksm, int, pidfd, int, behaviour, unsigned int, flags) +{ +#ifdef CONFIG_KSM + ssize_t ret; + struct pid *pid; + struct task_struct *task; + struct mm_struct *mm; + unsigned int f_flags; + struct vm_area_struct *vma; + struct vma_iterator vmi; + + if (flags != 0) { + ret = -EINVAL; + goto out; + } + + switch (behaviour) { + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: + break; + default: + ret = -EINVAL; + goto out; + break; + } + + pid = pidfd_get_pid(pidfd, &f_flags); + if (IS_ERR(pid)) { + ret = PTR_ERR(pid); + goto out; + } + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) { + ret = -ESRCH; + goto put_pid; + } + + /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); + if (IS_ERR_OR_NULL(mm)) { + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; + goto release_task; + } + + /* Require CAP_SYS_NICE for influencing process performance. 
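+	 * These are the same two checks that process_madvise() performs.
+	 *
+	 * A rough userspace sketch (helper and variable names hypothetical);
+	 * the syscall number can be read from /sys/kernel/pmadv/ksm, which
+	 * pmadv_sysfs_init() below registers:
+	 *
+	 *	int pidfd = syscall(SYS_pidfd_open, pid, 0);
+	 *	long err = syscall(pmadv_ksm_nr, pidfd, MADV_MERGEABLE, 0);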
*/ + if (!capable(CAP_SYS_NICE)) { + ret = -EPERM; + goto release_mm; + } + + if (mmap_write_lock_killable(mm)) { + ret = -EINTR; + goto release_mm; + } + + vma_iter_init(&vmi, mm, 0); + for_each_vma(vmi, vma) { + switch (behaviour) { + case MADV_MERGEABLE: + ret = ksm_madvise_merge(vma->vm_mm, vma, &vma->vm_flags); + break; + case MADV_UNMERGEABLE: + ret = ksm_madvise_unmerge(vma, vma->vm_start, vma->vm_end, &vma->vm_flags); + break; + default: + /* look, ma, no brain */ + break; + } + if (ret) + break; + } + + mmap_write_unlock(mm); + +release_mm: + mmput(mm); +release_task: + put_task_struct(task); +put_pid: + put_pid(pid); +out: + return ret; +#else /* CONFIG_KSM */ + return -ENOSYS; +#endif /* CONFIG_KSM */ +} + +#ifdef CONFIG_KSM +static ssize_t ksm_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", __NR_pmadv_ksm); +} +static struct kobj_attribute pmadv_ksm_attr = __ATTR_RO(ksm); + +static struct attribute *pmadv_sysfs_attrs[] = { + &pmadv_ksm_attr.attr, + NULL, +}; + +static const struct attribute_group pmadv_sysfs_attr_group = { + .attrs = pmadv_sysfs_attrs, + .name = "pmadv", +}; + +static int __init pmadv_sysfs_init(void) +{ + return sysfs_create_group(kernel_kobj, &pmadv_sysfs_attr_group); +} +subsys_initcall(pmadv_sysfs_init); +#endif /* CONFIG_KSM */ -- 2.40.0 From 87ea63fb15c03ebc6f28cd5b4d2e02debc9c4c2f Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Tue, 14 Feb 2023 20:41:05 +0100 Subject: [PATCH 11/17] maple-lru Signed-off-by: Peter Jung --- Documentation/mm/multigen_lru.rst | 128 +++- include/linux/fs.h | 2 + include/linux/maple_tree.h | 2 +- include/linux/memcontrol.h | 10 + include/linux/mm_inline.h | 19 +- include/linux/mmzone.h | 120 +++- lib/maple_tree.c | 104 +-- mm/fadvise.c | 5 +- mm/memcontrol.c | 12 + mm/memory.c | 7 +- mm/mmap.c | 16 +- mm/nommu.c | 8 +- mm/page_alloc.c | 1 + mm/rmap.c | 42 +- mm/vmscan.c | 1096 ++++++++++++++++++----------- mm/workingset.c | 4 +- tools/testing/radix-tree/maple.c | 50 +- 17 files changed, 1082 insertions(+), 544 deletions(-) diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst index d7062c6a8946..52ed5092022f 100644 --- a/Documentation/mm/multigen_lru.rst +++ b/Documentation/mm/multigen_lru.rst @@ -89,21 +89,22 @@ variables are monotonically increasing. Generation numbers are truncated into ``order_base_2(MAX_NR_GENS+1)`` bits in order to fit into the gen counter in ``folio->flags``. Each -truncated generation number is an index to ``lrugen->lists[]``. The +truncated generation number is an index to ``lrugen->folios[]``. The sliding window technique is used to track at least ``MIN_NR_GENS`` and at most ``MAX_NR_GENS`` generations. The gen counter stores a value within ``[1, MAX_NR_GENS]`` while a page is on one of -``lrugen->lists[]``; otherwise it stores zero. +``lrugen->folios[]``; otherwise it stores zero. Each generation is divided into multiple tiers. A page accessed ``N`` times through file descriptors is in tier ``order_base_2(N)``. Unlike -generations, tiers do not have dedicated ``lrugen->lists[]``. In +generations, tiers do not have dedicated ``lrugen->folios[]``. In contrast to moving across generations, which requires the LRU lock, moving across tiers only involves atomic operations on ``folio->flags`` and therefore has a negligible cost. A feedback loop modeled after the PID controller monitors refaults over all the tiers from anon and file types and decides which tiers from which types to -evict or protect. +evict or protect. 
The desired effect is to balance refault percentages +between anon and file types proportional to the swappiness level. There are two conceptually independent procedures: the aging and the eviction. They form a closed-loop system, i.e., the page reclaim. @@ -127,7 +128,7 @@ page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``. Eviction -------- The eviction consumes old generations. Given an ``lruvec``, it -increments ``min_seq`` when ``lrugen->lists[]`` indexed by +increments ``min_seq`` when ``lrugen->folios[]`` indexed by ``min_seq%MAX_NR_GENS`` becomes empty. To select a type and a tier to evict from, it first compares ``min_seq[]`` to select the older type. If both types are equally old, it selects the one whose first tier has @@ -141,15 +142,124 @@ loop has detected outlying refaults from the tier this page is in. To this end, the feedback loop uses the first tier as the baseline, for the reason stated earlier. +Working set protection +---------------------- +Each generation is timestamped at birth. If ``lru_gen_min_ttl`` is +set, an ``lruvec`` is protected from the eviction when its oldest +generation was born within ``lru_gen_min_ttl`` milliseconds. In other +words, it prevents the working set of ``lru_gen_min_ttl`` milliseconds +from getting evicted. The OOM killer is triggered if this working set +cannot be kept in memory. + +This time-based approach has the following advantages: + +1. It is easier to configure because it is agnostic to applications + and memory sizes. +2. It is more reliable because it is directly wired to the OOM killer. + +``mm_struct`` list +------------------ +An ``mm_struct`` list is maintained for each memcg, and an +``mm_struct`` follows its owner task to the new memcg when this task +is migrated. + +A page table walker iterates ``lruvec_memcg()->mm_list`` and calls +``walk_page_range()`` with each ``mm_struct`` on this list to scan +PTEs. When multiple page table walkers iterate the same list, each of +them gets a unique ``mm_struct``, and therefore they can run in +parallel. + +Page table walkers ignore any misplaced pages, e.g., if an +``mm_struct`` was migrated, pages left in the previous memcg will be +ignored when the current memcg is under reclaim. Similarly, page table +walkers will ignore pages from nodes other than the one under reclaim. + +This infrastructure also tracks the usage of ``mm_struct`` between +context switches so that page table walkers can skip processes that +have been sleeping since the last iteration. + +Rmap/PT walk feedback +--------------------- +Searching the rmap for PTEs mapping each page on an LRU list (to test +and clear the accessed bit) can be expensive because pages from +different VMAs (PA space) are not cache friendly to the rmap (VA +space). For workloads mostly using mapped pages, searching the rmap +can incur the highest CPU cost in the reclaim path. + +``lru_gen_look_around()`` exploits spatial locality to reduce the +trips into the rmap. It scans the adjacent PTEs of a young PTE and +promotes hot pages. If the scan was done cacheline efficiently, it +adds the PMD entry pointing to the PTE table to the Bloom filter. This +forms a feedback loop between the eviction and the aging. + +Bloom filters +------------- +Bloom filters are a space and memory efficient data structure for set +membership test, i.e., test if an element is not in the set or may be +in the set. 
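+
+Under the hood, a Bloom filter is a bit array indexed by hashes:
+recording an element sets its hashed bit positions, and a lookup
+reports "may be in the set" only if all of those positions are set.
+(As a general illustration; the filters used here derive two bit
+positions per item from a single hash.)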
+ +In the eviction path, specifically, in ``lru_gen_look_around()``, if a +PMD has a sufficient number of hot pages, its address is placed in the +filter. In the aging path, set membership means that the PTE range +will be scanned for young pages. + +Note that Bloom filters are probabilistic on set membership. If a test +is false positive, the cost is an additional scan of a range of PTEs, +which may yield hot pages anyway. Parameters of the filter itself can +control the false positive rate in the limit. + +PID controller +-------------- +A feedback loop modeled after the Proportional-Integral-Derivative +(PID) controller monitors refaults over anon and file types and +decides which type to evict when both types are available from the +same generation. + +The PID controller uses generations rather than the wall clock as the +time domain because a CPU can scan pages at different rates under +varying memory pressure. It calculates a moving average for each new +generation to avoid being permanently locked in a suboptimal state. + +Memcg LRU +--------- +An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs, +since each node and memcg combination has an LRU of folios (see +``mem_cgroup_lruvec()``). Its goal is to improve the scalability of +global reclaim, which is critical to system-wide memory overcommit in +data centers. Note that memcg LRU only applies to global reclaim. + +The basic structure of an memcg LRU can be understood by an analogy to +the active/inactive LRU (of folios): + +1. It has the young and the old (generations), i.e., the counterparts + to the active and the inactive; +2. The increment of ``max_seq`` triggers promotion, i.e., the + counterpart to activation; +3. Other events trigger similar operations, e.g., offlining an memcg + triggers demotion, i.e., the counterpart to deactivation. + +In terms of global reclaim, it has two distinct features: + +1. Sharding, which allows each thread to start at a random memcg (in + the old generation) and improves parallelism; +2. Eventual fairness, which allows direct reclaim to bail out at will + and reduces latency without affecting fairness over some time. + +In terms of traversing memcgs during global reclaim, it improves the +best-case complexity from O(n) to O(1) and does not affect the +worst-case complexity O(n). Therefore, on average, it has a sublinear +complexity. 
+ Summary ------- -The multi-gen LRU can be disassembled into the following parts: +The multi-gen LRU (of folios) can be disassembled into the following +parts: * Generations * Rmap walks -* Page table walks -* Bloom filters -* PID controller +* Page table walks via ``mm_struct`` list +* Bloom filters for rmap/PT walk feedback +* PID controller for refault feedback The aging and the eviction form a producer-consumer model; specifically, the latter drives the former by the sliding window over diff --git a/include/linux/fs.h b/include/linux/fs.h index f14ecbeab2a9..97f9c41c1a43 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -166,6 +166,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File supports DIRECT IO */ #define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000) +#define FMODE_NOREUSE ((__force fmode_t)0x800000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 815a27661517..a7bf58fd7cc6 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -455,7 +455,7 @@ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); void mas_store_prealloc(struct ma_state *mas, void *entry); void *mas_find(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); -int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); +int mas_preallocate(struct ma_state *mas, gfp_t gfp); bool mas_is_err(struct ma_state *mas); bool mas_nomem(struct ma_state *mas, gfp_t gfp); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e03976302956..82d28b052a9e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -790,6 +790,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg) percpu_ref_put(&objcg->refcnt); } +static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) +{ + return !memcg || css_tryget(&memcg->css); +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { if (memcg) @@ -1290,6 +1295,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg) { } +static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) +{ + return true; +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index e8ed225d8f7c..5567f4850243 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -178,7 +178,7 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); enum lru_list lru = type * LRU_INACTIVE_FILE; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); @@ -224,7 +224,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); @@ -256,9 +256,9 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ if (reclaiming) - list_add_tail(&folio->lru, 
&lrugen->lists[gen][type][zone]); + list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]); else - list_add(&folio->lru, &lrugen->lists[gen][type][zone]); + list_add(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } @@ -578,4 +578,15 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, #endif } +static inline bool vma_has_recency(struct vm_area_struct *vma) +{ + if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) + return false; + + if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE)) + return false; + + return true; +} + #endif diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5f74891556f3..1a273c0ff0b3 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -312,7 +313,7 @@ enum lruvec_flags { * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the * corresponding generation. The gen counter in folio->flags stores gen+1 while - * a page is on one of lrugen->lists[]. Otherwise it stores 0. + * a page is on one of lrugen->folios[]. Otherwise it stores 0. * * A page is added to the youngest generation on faulting. The aging needs to * check the accessed bit at least twice before handing this page over to the @@ -324,8 +325,8 @@ enum lruvec_flags { * rest of generations, if they exist, are considered inactive. See * lru_gen_is_active(). * - * PG_active is always cleared while a page is on one of lrugen->lists[] so that - * the aging needs not to worry about it. And it's set again when a page + * PG_active is always cleared while a page is on one of lrugen->folios[] so + * that the aging needs not to worry about it. And it's set again when a page * considered active is isolated for non-reclaiming purposes, e.g., migration. * See lru_gen_add_folio() and lru_gen_del_folio(). * @@ -404,7 +405,7 @@ enum { * The number of pages in each generation is eventually consistent and therefore * can be transiently negative when reset_batch_size() is pending. 
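+ * (Folios move between lists immediately while the counters are updated
+ * in batches, so a counter can lag the true list size, and even dip below
+ * zero, until reset_batch_size() applies the pending deltas.)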
 */
-struct lru_gen_struct {
+struct lru_gen_folio {
 	/* the aging increments the youngest generation number */
 	unsigned long max_seq;
 	/* the eviction increments the oldest generation numbers */
@@ -412,7 +413,7 @@ struct lru_gen_struct {
 	/* the birth time of each generation in jiffies */
 	unsigned long timestamps[MAX_NR_GENS];
 	/* the multi-gen LRU lists, lazily sorted on eviction */
-	struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+	struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
 	/* the multi-gen LRU sizes, eventually consistent */
 	long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
 	/* the exponential moving average of refaulted */
@@ -426,6 +427,14 @@ struct lru_gen_struct {
 	atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
 	/* whether the multi-gen LRU is enabled */
 	bool enabled;
+#ifdef CONFIG_MEMCG
+	/* the memcg generation this lru_gen_folio belongs to */
+	u8 gen;
+	/* the list segment this lru_gen_folio belongs to */
+	u8 seg;
+	/* per-node lru_gen_folio list for global reclaim */
+	struct hlist_nulls_node list;
+#endif
 };
 
 enum {
@@ -461,7 +470,7 @@ struct lru_gen_mm_state {
 struct lru_gen_mm_walk {
 	/* the lruvec under reclaim */
 	struct lruvec *lruvec;
-	/* unstable max_seq from lru_gen_struct */
+	/* unstable max_seq from lru_gen_folio */
 	unsigned long max_seq;
 	/* the next address within an mm to scan */
 	unsigned long next_addr;
@@ -479,12 +488,83 @@ void lru_gen_init_lruvec(struct lruvec *lruvec);
 void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
 
 #ifdef CONFIG_MEMCG
+
+/*
+ * For each node, memcgs are divided into two generations: the old and the
+ * young. For each generation, memcgs are randomly sharded into multiple bins
+ * to improve scalability. For each bin, the hlist_nulls is virtually divided
+ * into three segments: the head, the tail and the default.
+ *
+ * An onlining memcg is added to the tail of a random bin in the old generation.
+ * The eviction starts at the head of a random bin in the old generation. The
+ * per-node memcg generation counter, whose remainder (mod MEMCG_NR_GENS)
+ * indexes the old generation, is incremented when all its bins become empty.
+ *
+ * There are four operations:
+ * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its
+ *    current generation (old or young) and updates its "seg" to "head";
+ * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its
+ *    current generation (old or young) and updates its "seg" to "tail";
+ * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old
+ *    generation, updates its "gen" to "old" and resets its "seg" to "default";
+ * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the
+ *    young generation, updates its "gen" to "young" and resets its "seg" to
+ *    "default".
+ *
+ * The events that trigger the above operations are:
+ * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
+ * 2. The first attempt to reclaim an memcg below low, which triggers
+ *    MEMCG_LRU_TAIL;
+ * 3. The first attempt to reclaim an memcg below reclaimable size threshold,
+ *    which triggers MEMCG_LRU_TAIL;
+ * 4. The second attempt to reclaim an memcg below reclaimable size threshold,
+ *    which triggers MEMCG_LRU_YOUNG;
+ * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG;
+ * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
+ * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
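+ *
+ * For example: an memcg that keeps exceeding its soft limit is moved to
+ * the head of its bin (event 1, MEMCG_LRU_HEAD) and is therefore scanned
+ * sooner by global reclaim, while one found to be below its low watermark
+ * is moved to the tail (event 2, MEMCG_LRU_TAIL) and is revisited only
+ * after the other memcgs in its bin.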
+ */ +#define MEMCG_NR_GENS 2 +#define MEMCG_NR_BINS 8 + +struct lru_gen_memcg { + /* the per-node memcg generation counter */ + unsigned long seq; + /* each memcg has one lru_gen_folio per node */ + unsigned long nr_memcgs[MEMCG_NR_GENS]; + /* per-node lru_gen_folio list for global reclaim */ + struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS]; + /* protects the above */ + spinlock_t lock; +}; + +void lru_gen_init_pgdat(struct pglist_data *pgdat); + void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); -#endif +void lru_gen_online_memcg(struct mem_cgroup *memcg); +void lru_gen_offline_memcg(struct mem_cgroup *memcg); +void lru_gen_release_memcg(struct mem_cgroup *memcg); +void lru_gen_soft_reclaim(struct lruvec *lruvec); + +#else /* !CONFIG_MEMCG */ + +#define MEMCG_NR_GENS 1 + +struct lru_gen_memcg { +}; + +static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) +{ +} + +#endif /* CONFIG_MEMCG */ #else /* !CONFIG_LRU_GEN */ +static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) +{ +} + static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } @@ -494,6 +574,7 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) } #ifdef CONFIG_MEMCG + static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) { } @@ -501,7 +582,24 @@ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) { } -#endif + +static inline void lru_gen_online_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_soft_reclaim(struct lruvec *lruvec) +{ +} + +#endif /* CONFIG_MEMCG */ #endif /* CONFIG_LRU_GEN */ @@ -524,7 +622,7 @@ struct lruvec { unsigned long flags; #ifdef CONFIG_LRU_GEN /* evictable pages divided into generations */ - struct lru_gen_struct lrugen; + struct lru_gen_folio lrugen; /* to concurrently iterate lru_gen_mm_list */ struct lru_gen_mm_state mm_state; #endif @@ -1218,7 +1316,9 @@ typedef struct pglist_data { #ifdef CONFIG_LRU_GEN /* kswap mm walk data */ - struct lru_gen_mm_walk mm_walk; + struct lru_gen_mm_walk mm_walk; + /* lru_gen_folio list */ + struct lru_gen_memcg memcg_lru; #endif CACHELINE_PADDING(_pad2_); diff --git a/lib/maple_tree.c b/lib/maple_tree.c index c8cfa467b6b8..7efaa82fc367 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -146,16 +146,22 @@ struct maple_subtree_state { struct maple_big_node *bn; }; +#ifdef CONFIG_KASAN_STACK +/* Prevent mas_wr_bnode() from exceeding the stack frame limit */ +#define noinline_for_kasan noinline_for_stack +#else +#define noinline_for_kasan inline +#endif + /* Functions */ static inline struct maple_node *mt_alloc_one(gfp_t gfp) { - return kmem_cache_alloc(maple_node_cache, gfp | __GFP_ZERO); + return kmem_cache_alloc(maple_node_cache, gfp); } static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) { - return kmem_cache_alloc_bulk(maple_node_cache, gfp | __GFP_ZERO, size, - nodes); + return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); } static inline void mt_free_bulk(size_t size, void __rcu **nodes) @@ -1126,9 +1132,10 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) { struct maple_alloc *ret, *node = mas->alloc; unsigned long total = mas_allocated(mas); + unsigned int req = mas_alloc_req(mas); /* nothing or a request pending. 
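+	 * (mas->alloc with bit 0 set encodes an outstanding request count
+	 * rather than a pointer to allocated nodes; note the "& 0x1" checks
+	 * elsewhere in this file.)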
*/ - if (unlikely(!total)) + if (WARN_ON(!total)) return NULL; if (total == 1) { @@ -1138,27 +1145,25 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) goto single_node; } - if (!node->node_count) { + if (node->node_count == 1) { /* Single allocation in this node. */ mas->alloc = node->slot[0]; - node->slot[0] = NULL; mas->alloc->total = node->total - 1; ret = node; goto new_head; } - node->total--; - ret = node->slot[node->node_count]; - node->slot[node->node_count--] = NULL; + ret = node->slot[--node->node_count]; + node->slot[node->node_count] = NULL; single_node: new_head: - ret->total = 0; - ret->node_count = 0; - if (ret->request_count) { - mas_set_alloc_req(mas, ret->request_count + 1); - ret->request_count = 0; + if (req) { + req++; + mas_set_alloc_req(mas, req); } + + memset(ret, 0, sizeof(*ret)); return (struct maple_node *)ret; } @@ -1177,21 +1182,20 @@ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) unsigned long count; unsigned int requested = mas_alloc_req(mas); - memset(reuse, 0, sizeof(*reuse)); count = mas_allocated(mas); - if (count && (head->node_count < MAPLE_ALLOC_SLOTS - 1)) { - if (head->slot[0]) - head->node_count++; - head->slot[head->node_count] = reuse; + reuse->request_count = 0; + reuse->node_count = 0; + if (count && (head->node_count < MAPLE_ALLOC_SLOTS)) { + head->slot[head->node_count++] = reuse; head->total++; goto done; } reuse->total = 1; if ((head) && !((unsigned long)head & 0x1)) { - head->request_count = 0; reuse->slot[0] = head; + reuse->node_count = 1; reuse->total += head->total; } @@ -1210,7 +1214,6 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { struct maple_alloc *node; unsigned long allocated = mas_allocated(mas); - unsigned long success = allocated; unsigned int requested = mas_alloc_req(mas); unsigned int count; void **slots = NULL; @@ -1226,24 +1229,29 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) WARN_ON(!allocated); } - if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS - 1) { + if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) { node = (struct maple_alloc *)mt_alloc_one(gfp); if (!node) goto nomem_one; - if (allocated) + if (allocated) { node->slot[0] = mas->alloc; + node->node_count = 1; + } else { + node->node_count = 0; + } - success++; mas->alloc = node; + node->total = ++allocated; requested--; } node = mas->alloc; + node->request_count = 0; while (requested) { max_req = MAPLE_ALLOC_SLOTS; - if (node->slot[0]) { - unsigned int offset = node->node_count + 1; + if (node->node_count) { + unsigned int offset = node->node_count; slots = (void **)&node->slot[offset]; max_req -= offset; @@ -1257,15 +1265,13 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) goto nomem_bulk; node->node_count += count; - /* zero indexed. 
*/ - if (slots == (void **)&node->slot) - node->node_count--; - - success += count; + allocated += count; node = node->slot[0]; + node->node_count = 0; + node->request_count = 0; requested -= count; } - mas->alloc->total = success; + mas->alloc->total = allocated; return; nomem_bulk: @@ -1274,7 +1280,7 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) nomem_one: mas_set_alloc_req(mas, requested); if (mas->alloc && !(((unsigned long)mas->alloc & 0x1))) - mas->alloc->total = success; + mas->alloc->total = allocated; mas_set_err(mas, -ENOMEM); } @@ -1330,7 +1336,7 @@ static void mas_node_count(struct ma_state *mas, int count) * mas_start() - Sets up maple state for operations. * @mas: The maple state. * - * If mas->node == MAS_START, then set the min, max, depth, and offset to + * If mas->node == MAS_START, then set the min, max and depth to * defaults. * * Return: @@ -1344,22 +1350,22 @@ static inline struct maple_enode *mas_start(struct ma_state *mas) if (likely(mas_is_start(mas))) { struct maple_enode *root; - mas->node = MAS_NONE; mas->min = 0; mas->max = ULONG_MAX; mas->depth = 0; - mas->offset = 0; root = mas_root(mas); /* Tree with nodes */ if (likely(xa_is_node(root))) { mas->depth = 1; mas->node = mte_safe_root(root); + mas->offset = 0; return NULL; } /* empty tree */ if (unlikely(!root)) { + mas->node = MAS_NONE; mas->offset = MAPLE_NODE_SLOTS; return NULL; } @@ -2108,7 +2114,7 @@ static inline void mas_bulk_rebalance(struct ma_state *mas, unsigned char end, * * Return: The actual end of the data stored in @b_node */ -static inline void mas_store_b_node(struct ma_wr_state *wr_mas, +static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node, unsigned char offset_end) { unsigned char slot; @@ -3580,7 +3586,7 @@ static inline bool mas_reuse_node(struct ma_wr_state *wr_mas, * @b_node: The maple big node * @end: The end of the data. */ -static inline int mas_commit_b_node(struct ma_wr_state *wr_mas, +static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node, unsigned char end) { struct maple_node *node; @@ -5580,8 +5586,8 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, /* * mte_destroy_walk() - Free a tree or sub-tree. - * @enode - the encoded maple node (maple_enode) to start - * @mn - the tree to free - needed for node types. + * @enode: the encoded maple node (maple_enode) to start + * @mt: the tree to free - needed for node types. * * Must hold the write lock. */ @@ -5701,12 +5707,11 @@ EXPORT_SYMBOL_GPL(mas_store_prealloc); /** * mas_preallocate() - Preallocate enough nodes for a store operation * @mas: The maple state - * @entry: The entry that will be stored * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -ENOMEM if memory could not be allocated. 
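
[Editor's note] The noinline_for_kasan annotations above, applied to mas_store_b_node() and mas_commit_b_node(), exist because KASAN_STACK pads every local with redzones, so inlining several big-frame helpers into one caller can trip the frame-size limit. Here is a userspace analogue built on the __SANITIZE_ADDRESS__ macro that GCC and Clang define under -fsanitize=address; the names are illustrative, not the kernel macro itself.

#include <stdio.h>
#include <string.h>

#ifdef __SANITIZE_ADDRESS__
#define noinline_for_asan __attribute__((noinline))
#else
#define noinline_for_asan inline
#endif

/* ASan redzones inflate this frame; keep it out of line when instrumented */
static noinline_for_asan int big_frame_helper(const char *s)
{
	char buf[512];

	strncpy(buf, s, sizeof(buf) - 1);
	buf[sizeof(buf) - 1] = '\0';
	return (int)strlen(buf);
}

int main(void)
{
	printf("%d\n", big_frame_helper("maple"));
	return 0;
}
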
*/ -int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) +int mas_preallocate(struct ma_state *mas, gfp_t gfp) { int ret; @@ -5734,6 +5739,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) void mas_destroy(struct ma_state *mas) { struct maple_alloc *node; + unsigned long total; /* * When using mas_for_each() to insert an expected number of elements, @@ -5756,14 +5762,20 @@ void mas_destroy(struct ma_state *mas) } mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC); - while (mas->alloc && !((unsigned long)mas->alloc & 0x1)) { + total = mas_allocated(mas); + while (total) { node = mas->alloc; mas->alloc = node->slot[0]; - if (node->node_count > 0) - mt_free_bulk(node->node_count, - (void __rcu **)&node->slot[1]); + if (node->node_count > 1) { + size_t count = node->node_count - 1; + + mt_free_bulk(count, (void __rcu **)&node->slot[1]); + total -= count; + } kmem_cache_free(maple_node_cache, node); + total--; } + mas->alloc = NULL; } EXPORT_SYMBOL_GPL(mas_destroy); diff --git a/mm/fadvise.c b/mm/fadvise.c index c76ee665355a..2ba24d865bf5 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -80,7 +80,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) case POSIX_FADV_NORMAL: file->f_ra.ra_pages = bdi->ra_pages; spin_lock(&file->f_lock); - file->f_mode &= ~FMODE_RANDOM; + file->f_mode &= ~(FMODE_RANDOM | FMODE_NOREUSE); spin_unlock(&file->f_lock); break; case POSIX_FADV_RANDOM: @@ -107,6 +107,9 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) force_page_cache_readahead(mapping, file, start_index, nrpages); break; case POSIX_FADV_NOREUSE: + spin_lock(&file->f_lock); + file->f_mode |= FMODE_NOREUSE; + spin_unlock(&file->f_lock); break; case POSIX_FADV_DONTNEED: __filemap_fdatawrite_range(mapping, offset, endbyte, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3e8f1ad0fe9d..5397aeb43986 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -477,6 +477,12 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) struct mem_cgroup_per_node *mz; struct mem_cgroup_tree_per_node *mctz; + if (lru_gen_enabled()) { + if (soft_limit_excess(memcg)) + lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec); + return; + } + mctz = soft_limit_tree.rb_tree_per_node[nid]; if (!mctz) return; @@ -3522,6 +3528,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, struct mem_cgroup_tree_per_node *mctz; unsigned long excess; + if (lru_gen_enabled()) + return 0; + if (order > 0) return 0; @@ -5382,6 +5391,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) if (unlikely(mem_cgroup_is_root(memcg))) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); + lru_gen_online_memcg(memcg); return 0; offline_kmem: memcg_offline_kmem(memcg); @@ -5413,6 +5423,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_offline_kmem(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); + lru_gen_offline_memcg(memcg); drain_all_stock(memcg); @@ -5424,6 +5435,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); invalidate_reclaim_iterators(memcg); + lru_gen_release_memcg(memcg); } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) diff --git a/mm/memory.c b/mm/memory.c index f6f93e5b6b02..b27ed6a999b3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1435,8 +1435,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, force_flush = 1; 
set_page_dirty(page); } - if (pte_young(ptent) && - likely(!(vma->vm_flags & VM_SEQ_READ))) + if (pte_young(ptent) && likely(vma_has_recency(vma))) mark_page_accessed(page); } rss[mm_counter(page)]--; @@ -5156,8 +5155,8 @@ static inline void mm_account_fault(struct pt_regs *regs, #ifdef CONFIG_LRU_GEN static void lru_gen_enter_fault(struct vm_area_struct *vma) { - /* the LRU algorithm doesn't apply to sequential or random reads */ - current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)); + /* the LRU algorithm only applies to accesses with recency */ + current->in_lru_fault = vma_has_recency(vma); } static void lru_gen_exit_fault(void) diff --git a/mm/mmap.c b/mm/mmap.c index 177714886849..fa3c2d2041fd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -472,7 +472,7 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) MA_STATE(mas, &mm->mm_mt, 0, 0); struct address_space *mapping = NULL; - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) return -ENOMEM; if (vma->vm_file) { @@ -538,7 +538,7 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, /* Only handles expanding */ VM_BUG_ON(vma->vm_start < start || vma->vm_end > end); - if (mas_preallocate(mas, vma, GFP_KERNEL)) + if (mas_preallocate(mas, GFP_KERNEL)) goto nomem; vma_adjust_trans_huge(vma, start, end, 0); @@ -712,7 +712,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } } - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) return -ENOMEM; vma_adjust_trans_huge(orig_vma, start, end, adjust_next); @@ -1938,7 +1938,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* Check that both stack segments have the same anon_vma? */ } - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ @@ -2019,7 +2019,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) return -ENOMEM; } - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) return -ENOMEM; /* We must make sure the anon_vma is allocated. 
*/ @@ -2311,7 +2311,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN); mt_set_external_lock(&mt_detach, &mm->mmap_lock); - if (mas_preallocate(mas, vma, GFP_KERNEL)) + if (mas_preallocate(mas, GFP_KERNEL)) return -ENOMEM; mas->last = end - 1; @@ -2680,7 +2680,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, goto free_vma; } - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + if (mas_preallocate(&mas, GFP_KERNEL)) { error = -ENOMEM; if (file) goto close_and_free_vma; @@ -2953,7 +2953,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, can_vma_merge_after(vma, flags, NULL, NULL, addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) { mas_set_range(mas, vma->vm_start, addr + len - 1); - if (mas_preallocate(mas, vma, GFP_KERNEL)) + if (mas_preallocate(mas, GFP_KERNEL)) return -ENOMEM; vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); diff --git a/mm/nommu.c b/mm/nommu.c index 5b83938ecb67..aaa0efdb3e0e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -602,7 +602,7 @@ static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + if (mas_preallocate(&mas, GFP_KERNEL)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); return -ENOMEM; @@ -633,7 +633,7 @@ static int delete_vma_from_mm(struct vm_area_struct *vma) { MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + if (mas_preallocate(&mas, GFP_KERNEL)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); return -ENOMEM; @@ -1081,7 +1081,7 @@ unsigned long do_mmap(struct file *file, if (!vma) goto error_getting_vma; - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) goto error_maple_preallocate; region->vm_usage = 1; @@ -1359,7 +1359,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (!new) goto err_vma_dup; - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + if (mas_preallocate(&mas, GFP_KERNEL)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); goto err_mas_preallocate; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b2877a84ed19..cb6cad0aa833 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7940,6 +7940,7 @@ static void __init free_area_init_node(int nid) pgdat_set_deferred_range(pgdat); free_area_init_core(pgdat); + lru_gen_init_pgdat(pgdat); } static void __init free_area_init_memoryless_node(int nid) diff --git a/mm/rmap.c b/mm/rmap.c index 7da2d8d097d9..825dac3caa1e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -823,25 +823,14 @@ static bool folio_referenced_one(struct folio *folio, } if (pvmw.pte) { - if (lru_gen_enabled() && pte_young(*pvmw.pte) && - !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) { + if (lru_gen_enabled() && pte_young(*pvmw.pte)) { lru_gen_look_around(&pvmw); referenced++; } if (ptep_clear_flush_young_notify(vma, address, - pvmw.pte)) { - /* - * Don't treat a reference through - * a sequentially read mapping as such. - * If the folio has been used in another mapping, - * we will catch it; if this other mapping is - * already gone, the unmap path will have set - * the referenced flag or activated the folio. 
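
[Editor's note] With the FMODE_NOREUSE plumbing above, POSIX_FADV_NOREUSE stops being a no-op: the rmap hunks route every folio_referenced() walk through invalid_folio_referenced_vma(), which skips mappings without recency, including files carrying this advice. A minimal caller, assuming only the standard posix_fadvise(3) interface; note that it returns an error number rather than setting errno.

#define _POSIX_C_SOURCE 200112L

#include <fcntl.h>
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
	int fd, err;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* len == 0 covers the whole file; the kernel sets FMODE_NOREUSE */
	err = posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE);
	if (err) {
		fprintf(stderr, "posix_fadvise: %s\n", strerror(err));
		return 1;
	}

	/* ... stream through the file; its pages won't count as working set ... */
	return 0;
}
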
- */ - if (likely(!(vma->vm_flags & VM_SEQ_READ))) - referenced++; - } + pvmw.pte)) + referenced++; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (pmdp_clear_flush_young_notify(vma, address, pvmw.pmd)) @@ -875,7 +864,20 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) struct folio_referenced_arg *pra = arg; struct mem_cgroup *memcg = pra->memcg; - if (!mm_match_cgroup(vma->vm_mm, memcg)) + /* + * Ignore references from this mapping if it has no recency. If the + * folio has been used in another mapping, we will catch it; if this + * other mapping is already gone, the unmap path will have set the + * referenced flag or activated the folio in zap_pte_range(). + */ + if (!vma_has_recency(vma)) + return true; + + /* + * If we are reclaiming on behalf of a cgroup, skip counting on behalf + * of references from different cgroups. + */ + if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) return true; return false; @@ -906,6 +908,7 @@ int folio_referenced(struct folio *folio, int is_locked, .arg = (void *)&pra, .anon_lock = folio_lock_anon_vma_read, .try_lock = true, + .invalid_vma = invalid_folio_referenced_vma, }; *vm_flags = 0; @@ -921,15 +924,6 @@ int folio_referenced(struct folio *folio, int is_locked, return 1; } - /* - * If we are reclaiming on behalf of a cgroup, skip - * counting on behalf of references from different - * cgroups - */ - if (memcg) { - rwc.invalid_vma = invalid_folio_referenced_vma; - } - rmap_walk(folio, &rwc); *vm_flags = pra.vm_flags; diff --git a/mm/vmscan.c b/mm/vmscan.c index f50a427e15fc..3216e0b29f37 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -134,12 +135,6 @@ struct scan_control { /* Always discard instead of demoting to lower tier memory */ unsigned int no_demotion:1; -#ifdef CONFIG_LRU_GEN - /* help kswapd make better choices among multiple memcgs */ - unsigned int memcgs_need_aging:1; - unsigned long last_reclaimed; -#endif - /* Allocation order */ s8 order; @@ -452,6 +447,11 @@ static bool cgroup_reclaim(struct scan_control *sc) return sc->target_mem_cgroup; } +static bool global_reclaim(struct scan_control *sc) +{ + return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup); +} + /** * writeback_throttling_sane - is the usual dirty throttling mechanism available? * @sc: scan_control in question @@ -502,6 +502,11 @@ static bool cgroup_reclaim(struct scan_control *sc) return false; } +static bool global_reclaim(struct scan_control *sc) +{ + return true; +} + static bool writeback_throttling_sane(struct scan_control *sc) { return true; @@ -3145,6 +3150,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) +#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) +#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) + static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) { struct pglist_data *pgdat = NODE_DATA(nid); @@ -3153,7 +3161,7 @@ static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) if (memcg) { struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; - /* for hotadd_new_pgdat() */ + /* see the comment in mem_cgroup_lruvec() */ if (!lruvec->pgdat) lruvec->pgdat = pgdat; @@ -3162,7 +3170,7 @@ static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) #endif VM_WARN_ON_ONCE(!mem_cgroup_disabled()); - return pgdat ? 
&pgdat->__lruvec : NULL;
+	return &pgdat->__lruvec;
 }
 
 static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
@@ -3170,6 +3178,9 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
+	if (!sc->may_swap)
+		return 0;
+
 	if (!can_demote(pgdat->node_id, sc) &&
 	    mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
 		return 0;
@@ -3184,12 +3195,104 @@ static int get_nr_gens(struct lruvec *lruvec, int type)
 
 static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
 {
-	/* see the comment on lru_gen_struct */
+	/* see the comment on lru_gen_folio */
 	return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
 	       get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
 	       get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
 }
 
+/******************************************************************************
+ *                          Bloom filters
+ ******************************************************************************/
+
+/*
+ * Bloom filters with m=1<<15, k=2 and false positive rates of ~1/5 when
+ * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
+ * bits in a bitmap, k is the number of hash functions and n is the number of
+ * inserted items.
+ *
+ * Page table walkers use one of the two filters to reduce their search space.
+ * To get rid of non-leaf entries that no longer have enough leaf entries, the
+ * aging uses the double-buffering technique to flip to the other filter each
+ * time it produces a new generation. For non-leaf entries that have enough
+ * leaf entries, the aging carries them over to the next generation in
+ * walk_pmd_range(); the eviction also reports them when walking the rmap
+ * in lru_gen_look_around().
+ *
+ * For future optimizations:
+ * 1. It's not necessary to keep both filters all the time. The spare one can be
+ *    freed after the RCU grace period and reallocated if needed again.
+ * 2. And when reallocating, it's worth scaling its size according to the number
+ *    of inserted entries in the other filter, to reduce the memory overhead on
+ *    small systems and false positives on large systems.
+ * 3. Jenkins' hash function is an alternative to Knuth's. 
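
[Editor's note] A compact userspace rendering of the scheme this comment describes: one 1<<15-bit bitmap per generation parity, two bit indices derived from a single hash, and a reset that hands the next generation a clean spare. The multiplicative hash below stands in for hash_ptr(); the rest mirrors the filter_gen_from_seq()/get_item_key() logic above, as a sketch rather than the kernel implementation.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SHIFT		15		/* m = 1 << 15 bits, as above */
#define NBITS		(1UL << SHIFT)
#define NFILTERS	2		/* double buffering across generations */

static uint64_t filters[NFILTERS][NBITS / 64];

/* k = 2: derive both bit indices from one hash of the pointer */
static void item_keys(const void *item, uint32_t key[2])
{
	uint32_t hash = (uint32_t)((uintptr_t)item * 2654435761u);

	key[0] = hash & (NBITS - 1);
	key[1] = (hash >> SHIFT) & (NBITS - 1);
}

static void filter_add(unsigned long seq, const void *item)
{
	uint64_t *filter = filters[seq % NFILTERS];
	uint32_t key[2];

	item_keys(item, key);
	filter[key[0] / 64] |= 1ULL << (key[0] % 64);
	filter[key[1] / 64] |= 1ULL << (key[1] % 64);
}

static bool filter_test(unsigned long seq, const void *item)
{
	uint64_t *filter = filters[seq % NFILTERS];
	uint32_t key[2];

	item_keys(item, key);
	return (filter[key[0] / 64] >> (key[0] % 64) & 1) &&
	       (filter[key[1] / 64] >> (key[1] % 64) & 1);
}

/* flipping to a new generation starts from a cleared spare filter */
static void filter_reset(unsigned long seq)
{
	memset(filters[seq % NFILTERS], 0, sizeof(filters[0]));
}

int main(void)
{
	int x, y;

	filter_reset(1);
	filter_add(1, &x);
	printf("x: %d, y: %d\n", filter_test(1, &x), filter_test(1, &y));
	return 0;
}
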
+ */ +#define BLOOM_FILTER_SHIFT 15 + +static inline int filter_gen_from_seq(unsigned long seq) +{ + return seq % NR_BLOOM_FILTERS; +} + +static void get_item_key(void *item, int *key) +{ + u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); + + BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); + + key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); + key[1] = hash >> BLOOM_FILTER_SHIFT; +} + +static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(lruvec->mm_state.filters[gen]); + if (!filter) + return true; + + get_item_key(item, key); + + return test_bit(key[0], filter) && test_bit(key[1], filter); +} + +static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(lruvec->mm_state.filters[gen]); + if (!filter) + return; + + get_item_key(item, key); + + if (!test_bit(key[0], filter)) + set_bit(key[0], filter); + if (!test_bit(key[1], filter)) + set_bit(key[1], filter); +} + +static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) +{ + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = lruvec->mm_state.filters[gen]; + if (filter) { + bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); + return; + } + + filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + WRITE_ONCE(lruvec->mm_state.filters[gen], filter); +} + /****************************************************************************** * mm_struct list ******************************************************************************/ @@ -3226,9 +3329,6 @@ void lru_gen_add_mm(struct mm_struct *mm) for_each_node_state(nid, N_MEMORY) { struct lruvec *lruvec = get_lruvec(memcg, nid); - if (!lruvec) - continue; - /* the first addition since the last iteration */ if (lruvec->mm_state.tail == &mm_list->fifo) lruvec->mm_state.tail = &mm->lru_gen.list; @@ -3258,9 +3358,6 @@ void lru_gen_del_mm(struct mm_struct *mm) for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); - if (!lruvec) - continue; - /* where the last iteration ended (exclusive) */ if (lruvec->mm_state.tail == &mm->lru_gen.list) lruvec->mm_state.tail = lruvec->mm_state.tail->next; @@ -3315,94 +3412,6 @@ void lru_gen_migrate_mm(struct mm_struct *mm) } #endif -/* - * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when - * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of - * bits in a bitmap, k is the number of hash functions and n is the number of - * inserted items. - * - * Page table walkers use one of the two filters to reduce their search space. - * To get rid of non-leaf entries that no longer have enough leaf entries, the - * aging uses the double-buffering technique to flip to the other filter each - * time it produces a new generation. For non-leaf entries that have enough - * leaf entries, the aging carries them over to the next generation in - * walk_pmd_range(); the eviction also report them when walking the rmap - * in lru_gen_look_around(). - * - * For future optimizations: - * 1. It's not necessary to keep both filters all the time. The spare one can be - * freed after the RCU grace period and reallocated if needed again. - * 2. 
And when reallocating, it's worth scaling its size according to the number - * of inserted entries in the other filter, to reduce the memory overhead on - * small systems and false positives on large systems. - * 3. Jenkins' hash function is an alternative to Knuth's. - */ -#define BLOOM_FILTER_SHIFT 15 - -static inline int filter_gen_from_seq(unsigned long seq) -{ - return seq % NR_BLOOM_FILTERS; -} - -static void get_item_key(void *item, int *key) -{ - u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); - - BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); - - key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); - key[1] = hash >> BLOOM_FILTER_SHIFT; -} - -static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) -{ - unsigned long *filter; - int gen = filter_gen_from_seq(seq); - - filter = lruvec->mm_state.filters[gen]; - if (filter) { - bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); - return; - } - - filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), - __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); - WRITE_ONCE(lruvec->mm_state.filters[gen], filter); -} - -static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -{ - int key[2]; - unsigned long *filter; - int gen = filter_gen_from_seq(seq); - - filter = READ_ONCE(lruvec->mm_state.filters[gen]); - if (!filter) - return; - - get_item_key(item, key); - - if (!test_bit(key[0], filter)) - set_bit(key[0], filter); - if (!test_bit(key[1], filter)) - set_bit(key[1], filter); -} - -static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -{ - int key[2]; - unsigned long *filter; - int gen = filter_gen_from_seq(seq); - - filter = READ_ONCE(lruvec->mm_state.filters[gen]); - if (!filter) - return true; - - get_item_key(item, key); - - return test_bit(key[0], filter) && test_bit(key[1], filter); -} - static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) { int i; @@ -3559,7 +3568,7 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) } /****************************************************************************** - * refault feedback loop + * PID controller ******************************************************************************/ /* @@ -3590,7 +3599,7 @@ struct ctrl_pos { static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, struct ctrl_pos *pos) { - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; int hist = lru_hist_from_seq(lrugen->min_seq[type]); pos->refaulted = lrugen->avg_refaulted[type][tier] + @@ -3605,7 +3614,7 @@ static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) { int hist, tier; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; unsigned long seq = carryover ? 
lrugen->min_seq[type] : lrugen->max_seq + 1; @@ -3682,7 +3691,7 @@ static int folio_update_gen(struct folio *folio, int gen) static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { int type = folio_is_file_lru(folio); - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); unsigned long new_flags, old_flags = READ_ONCE(folio->flags); @@ -3727,7 +3736,7 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) { int gen, type, zone; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; walk->batched = 0; @@ -3760,7 +3769,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal if (is_vm_hugetlb_page(vma)) return true; - if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ)) + if (!vma_has_recency(vma)) + return true; + + if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) return true; if (vma == get_gate_vma(vma->vm_mm)) @@ -3955,8 +3967,8 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) -static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, - struct mm_walk *args, unsigned long *bitmap, unsigned long *start) +static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, + struct mm_walk *args, unsigned long *bitmap, unsigned long *first) { int i; pmd_t *pmd; @@ -3969,18 +3981,19 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area VM_WARN_ON_ONCE(pud_leaf(*pud)); /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ - if (*start == -1) { - *start = next; + if (*first == -1) { + *first = addr; + bitmap_zero(bitmap, MIN_LRU_BATCH); return; } - i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start); + i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first); if (i && i <= MIN_LRU_BATCH) { __set_bit(i - 1, bitmap); return; } - pmd = pmd_offset(pud, *start); + pmd = pmd_offset(pud, *first); ptl = pmd_lockptr(args->mm, pmd); if (!spin_trylock(ptl)) @@ -3991,15 +4004,16 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area do { unsigned long pfn; struct folio *folio; - unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start; + + /* don't round down the first address */ + addr = i ? 
(*first & PMD_MASK) + i * PMD_SIZE : *first; pfn = get_pmd_pfn(pmd[i], vma, addr); if (pfn == -1) goto next; if (!pmd_trans_huge(pmd[i])) { - if (arch_has_hw_nonleaf_pmd_young() && - get_cap(LRU_GEN_NONLEAF_YOUNG)) + if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) pmdp_test_and_clear_young(vma, addr, pmd + i); goto next; } @@ -4028,12 +4042,11 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area arch_leave_lazy_mmu_mode(); spin_unlock(ptl); done: - *start = -1; - bitmap_zero(bitmap, MIN_LRU_BATCH); + *first = -1; } #else -static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, - struct mm_walk *args, unsigned long *bitmap, unsigned long *start) +static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, + struct mm_walk *args, unsigned long *bitmap, unsigned long *first) { } #endif @@ -4046,9 +4059,9 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, unsigned long next; unsigned long addr; struct vm_area_struct *vma; - unsigned long pos = -1; + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; + unsigned long first = -1; struct lru_gen_mm_walk *walk = args->private; - unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; VM_WARN_ON_ONCE(pud_leaf(*pud)); @@ -4090,18 +4103,17 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) continue; - walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); continue; } #endif walk->mm_stats[MM_NONLEAF_TOTAL]++; - if (arch_has_hw_nonleaf_pmd_young() && - get_cap(LRU_GEN_NONLEAF_YOUNG)) { + if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) { if (!pmd_young(val)) continue; - walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); } if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) @@ -4118,7 +4130,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); } - walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos); + walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) goto restart; @@ -4208,7 +4220,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_ } while (err == -EAGAIN); } -static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) +static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc) { struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; @@ -4216,7 +4228,7 @@ static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) VM_WARN_ON_ONCE(walk); walk = &pgdat->mm_walk; - } else if (!pgdat && !walk) { + } else if (!walk && force_alloc) { VM_WARN_ON_ONCE(current_is_kswapd()); walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); @@ -4244,7 +4256,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) { int zone; int remaining = MAX_LRU_BATCH; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); if (type == LRU_GEN_ANON && !can_swap) @@ -4252,7 +4264,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int 
type, bool can_swap) /* prevent cold/hot inversion if force_scan is true */ for (zone = 0; zone < MAX_NR_ZONES; zone++) { - struct list_head *head = &lrugen->lists[old_gen][type][zone]; + struct list_head *head = &lrugen->folios[old_gen][type][zone]; while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); @@ -4263,7 +4275,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); new_gen = folio_inc_gen(lruvec, folio, false); - list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]); + list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); if (!--remaining) return false; @@ -4280,7 +4292,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) { int gen, type, zone; bool success = false; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); @@ -4291,7 +4303,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) gen = lru_gen_from_seq(min_seq[type]); for (zone = 0; zone < MAX_NR_ZONES; zone++) { - if (!list_empty(&lrugen->lists[gen][type][zone])) + if (!list_empty(&lrugen->folios[gen][type][zone])) goto next; } @@ -4301,7 +4313,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) ; } - /* see the comment on lru_gen_struct */ + /* see the comment on lru_gen_folio */ if (can_swap) { min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); @@ -4323,7 +4335,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) { int prev, next; int type, zone; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; spin_lock_irq(&lruvec->lru_lock); @@ -4381,7 +4393,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool success; struct lru_gen_mm_walk *walk; struct mm_struct *mm = NULL; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); @@ -4397,12 +4409,12 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, * handful of PTEs. Spreading the work out over a period of time usually * is less efficient, but it avoids bursty page faults. 
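
[Editor's note] walk_pmd_range_locked() above batches up to MIN_LRU_BATCH PMD entries by remembering the first address and setting relative offsets in a bitmap, then drains them under a single acquisition of the PMD lock. A stripped-down userspace model of that record-then-flush pattern follows; it assumes 64-bit longs, and BATCH, batch_add() and batch_flush() are illustrative names rather than kernel interfaces.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define BATCH 64			/* stand-in for MIN_LRU_BATCH */

static unsigned long first = -1UL;	/* -1 means "no batch open", as above */
static unsigned long bitmap[BATCH / 64];

/* record one index relative to @first; true means the caller must flush */
static bool batch_add(unsigned long idx)
{
	unsigned long i;

	if (first == -1UL) {		/* open a batch at the first index */
		first = idx;
		memset(bitmap, 0, sizeof(bitmap));
		return false;
	}
	i = idx - first;
	if (i && i <= BATCH) {
		bitmap[(i - 1) / 64] |= 1UL << ((i - 1) % 64);
		return false;
	}
	return true;			/* out of range for this batch */
}

/* visit @first plus every recorded offset under a single (notional) lock */
static void batch_flush(void (*visit)(unsigned long))
{
	unsigned long i;

	if (first == -1UL)
		return;
	visit(first);
	for (i = 0; i < BATCH; i++)
		if (bitmap[i / 64] >> (i % 64) & 1)
			visit(first + i + 1);
	first = -1UL;
}

static void show(unsigned long idx)
{
	printf("visit %lu\n", idx);
}

int main(void)
{
	unsigned long indices[] = { 100, 103, 110, 500 };
	unsigned long i;

	for (i = 0; i < sizeof(indices) / sizeof(indices[0]); i++) {
		if (batch_add(indices[i])) {
			batch_flush(show);	/* 500 overflows the window */
			batch_add(indices[i]);	/* reopen with the new index */
		}
	}
	batch_flush(show);
	return 0;
}
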
*/ - if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { + if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) { success = iterate_mm_list_nowalk(lruvec, max_seq); goto done; } - walk = set_mm_walk(NULL); + walk = set_mm_walk(NULL, true); if (!walk) { success = iterate_mm_list_nowalk(lruvec, max_seq); goto done; @@ -4425,8 +4437,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, if (sc->priority <= DEF_PRIORITY - 2) wait_event_killable(lruvec->mm_state.wait, max_seq < READ_ONCE(lrugen->max_seq)); - - return max_seq < READ_ONCE(lrugen->max_seq); + return false; } VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); @@ -4439,98 +4450,57 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, return true; } -static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, - struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) +/****************************************************************************** + * working set protection + ******************************************************************************/ + +static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) { int gen, type, zone; - unsigned long old = 0; - unsigned long young = 0; unsigned long total = 0; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + bool can_swap = get_swappiness(lruvec, sc); + struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); for (type = !can_swap; type < ANON_AND_FILE; type++) { unsigned long seq; for (seq = min_seq[type]; seq <= max_seq; seq++) { - unsigned long size = 0; - gen = lru_gen_from_seq(seq); for (zone = 0; zone < MAX_NR_ZONES; zone++) - size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); - - total += size; - if (seq == max_seq) - young += size; - else if (seq + MIN_NR_GENS == max_seq) - old += size; + total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); } } - /* try to scrape all its memory if this memcg was deleted */ - *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; - - /* - * The aging tries to be lazy to reduce the overhead, while the eviction - * stalls when the number of generations reaches MIN_NR_GENS. Hence, the - * ideal number of generations is MIN_NR_GENS+1. - */ - if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) - return true; - if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) - return false; - - /* - * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) - * of the total number of pages for each generation. A reasonable range - * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The - * aging cares about the upper bound of hot pages, while the eviction - * cares about the lower bound of cold pages. - */ - if (young * MIN_NR_GENS > total) - return true; - if (old * (MIN_NR_GENS + 2) < total) - return true; - - return false; + /* whether the size is big enough to be helpful */ + return mem_cgroup_online(memcg) ? 
(total >> sc->priority) : total; } -static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) +static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc, + unsigned long min_ttl) { - bool need_aging; - unsigned long nr_to_scan; - int swappiness = get_swappiness(lruvec, sc); + int gen; + unsigned long birth; struct mem_cgroup *memcg = lruvec_memcg(lruvec); - DEFINE_MAX_SEQ(lruvec); DEFINE_MIN_SEQ(lruvec); - VM_WARN_ON_ONCE(sc->memcg_low_reclaim); - - mem_cgroup_calculate_protection(NULL, memcg); + /* see the comment on lru_gen_folio */ + gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); + birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); - if (mem_cgroup_below_min(memcg)) + if (time_is_after_jiffies(birth + min_ttl)) return false; - need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); + if (!lruvec_is_sizable(lruvec, sc)) + return false; - if (min_ttl) { - int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); - unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); + mem_cgroup_calculate_protection(NULL, memcg); - if (time_is_after_jiffies(birth + min_ttl)) - return false; - - /* the size is likely too small to be helpful */ - if (!nr_to_scan && sc->priority != DEF_PRIORITY) - return false; - } - - if (need_aging) - try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false); - - return true; -} + return !mem_cgroup_below_min(memcg); +} /* to protect the working set of the last N jiffies */ #ifdef CONFIG_CACHY @@ -4542,46 +4512,30 @@ static unsigned long lru_gen_min_ttl __read_mostly; static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { struct mem_cgroup *memcg; - bool success = false; unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); VM_WARN_ON_ONCE(!current_is_kswapd()); - sc->last_reclaimed = sc->nr_reclaimed; - - /* - * To reduce the chance of going into the aging path, which can be - * costly, optimistically skip it if the flag below was cleared in the - * eviction path. This improves the overall performance when multiple - * memcgs are available. - */ - if (!sc->memcgs_need_aging) { - sc->memcgs_need_aging = true; + /* check the order to exclude compaction-induced reclaim */ + if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY) return; - } - - set_mm_walk(pgdat); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); - if (age_lruvec(lruvec, sc, min_ttl)) - success = true; + if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) { + mem_cgroup_iter_break(NULL, memcg); + return; + } cond_resched(); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); - clear_mm_walk(); - - /* check the order to exclude compaction-induced reclaim */ - if (success || !min_ttl || sc->order) - return; - /* * The main goal is to OOM kill if every generation from all memcgs is * younger than min_ttl. However, another possibility is all memcgs are - * either below min or empty. + * either too small or below min. */ if (mutex_trylock(&oom_lock)) { struct oom_control oc = { @@ -4594,6 +4548,10 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) } } +/****************************************************************************** + * rmap/PT walk feedback + ******************************************************************************/ + /* * This function exploits spatial locality when shrink_folio_list() walks the * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. 
If @@ -4604,13 +4562,12 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { int i; - pte_t *pte; unsigned long start; unsigned long end; - unsigned long addr; struct lru_gen_mm_walk *walk; int young = 0; - unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; + pte_t *pte = pvmw->pte; + unsigned long addr = pvmw->address; struct folio *folio = pfn_folio(pvmw->pfn); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); @@ -4627,25 +4584,28 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) /* avoid taking the LRU lock under the PTL when possible */ walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; - start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); - end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; + start = max(addr & PMD_MASK, pvmw->vma->vm_start); + end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { - if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) + if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) end = start + MIN_LRU_BATCH * PAGE_SIZE; - else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) + else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2) start = end - MIN_LRU_BATCH * PAGE_SIZE; else { - start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; - end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; + start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2; + end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2; } } - pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; + /* folio_update_gen() requires stable folio_memcg() */ + if (!mem_cgroup_trylock_pages(memcg)) + return; - rcu_read_lock(); arch_enter_lazy_mmu_mode(); + pte -= (addr - start) / PAGE_SIZE; + for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; @@ -4670,58 +4630,171 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) !folio_test_swapcache(folio))) folio_mark_dirty(folio); + if (walk) { + old_gen = folio_update_gen(folio, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(walk, folio, old_gen, new_gen); + + continue; + } + old_gen = folio_lru_gen(folio); if (old_gen < 0) folio_set_referenced(folio); else if (old_gen != new_gen) - __set_bit(i, bitmap); + folio_activate(folio); } arch_leave_lazy_mmu_mode(); - rcu_read_unlock(); + mem_cgroup_unlock_pages(); /* feedback from rmap walkers to page table walkers */ if (suitable_to_scan(i, young)) update_bloom_filter(lruvec, max_seq, pvmw->pmd); +} - if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { - for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { - folio = pfn_folio(pte_pfn(pte[i])); - folio_activate(folio); - } - return; +/****************************************************************************** + * memcg LRU + ******************************************************************************/ + +/* see the comment on MEMCG_NR_GENS */ +enum { + MEMCG_LRU_NOP, + MEMCG_LRU_HEAD, + MEMCG_LRU_TAIL, + MEMCG_LRU_OLD, + MEMCG_LRU_YOUNG, +}; + +#ifdef CONFIG_MEMCG + +static int lru_gen_memcg_seg(struct lruvec *lruvec) +{ + return READ_ONCE(lruvec->lrugen.seg); +} + +static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) +{ + int seg; + int old, new; + int bin = prandom_u32_max(MEMCG_NR_BINS); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + spin_lock(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); + 
+ seg = 0; + new = old = lruvec->lrugen.gen; + + /* see the comment on MEMCG_NR_GENS */ + if (op == MEMCG_LRU_HEAD) + seg = MEMCG_LRU_HEAD; + else if (op == MEMCG_LRU_TAIL) + seg = MEMCG_LRU_TAIL; + else if (op == MEMCG_LRU_OLD) + new = get_memcg_gen(pgdat->memcg_lru.seq); + else if (op == MEMCG_LRU_YOUNG) + new = get_memcg_gen(pgdat->memcg_lru.seq + 1); + else + VM_WARN_ON_ONCE(true); + + hlist_nulls_del_rcu(&lruvec->lrugen.list); + + if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) + hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); + else + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); + + pgdat->memcg_lru.nr_memcgs[old]--; + pgdat->memcg_lru.nr_memcgs[new]++; + + lruvec->lrugen.gen = new; + WRITE_ONCE(lruvec->lrugen.seg, seg); + + if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); + + spin_unlock(&pgdat->memcg_lru.lock); +} + +void lru_gen_online_memcg(struct mem_cgroup *memcg) +{ + int gen; + int nid; + int bin = prandom_u32_max(MEMCG_NR_BINS); + + for_each_node(nid) { + struct pglist_data *pgdat = NODE_DATA(nid); + struct lruvec *lruvec = get_lruvec(memcg, nid); + + spin_lock(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); + + gen = get_memcg_gen(pgdat->memcg_lru.seq); + + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); + pgdat->memcg_lru.nr_memcgs[gen]++; + + lruvec->lrugen.gen = gen; + + spin_unlock(&pgdat->memcg_lru.lock); } +} - /* folio_update_gen() requires stable folio_memcg() */ - if (!mem_cgroup_trylock_pages(memcg)) - return; +void lru_gen_offline_memcg(struct mem_cgroup *memcg) +{ + int nid; - if (!walk) { - spin_lock_irq(&lruvec->lru_lock); - new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); } +} - for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { - folio = pfn_folio(pte_pfn(pte[i])); - if (folio_memcg_rcu(folio) != memcg) - continue; +void lru_gen_release_memcg(struct mem_cgroup *memcg) +{ + int gen; + int nid; - old_gen = folio_update_gen(folio, new_gen); - if (old_gen < 0 || old_gen == new_gen) - continue; + for_each_node(nid) { + struct pglist_data *pgdat = NODE_DATA(nid); + struct lruvec *lruvec = get_lruvec(memcg, nid); - if (walk) - update_batch_size(walk, folio, old_gen, new_gen); - else - lru_gen_update_size(lruvec, folio, old_gen, new_gen); + spin_lock(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); + + gen = lruvec->lrugen.gen; + + hlist_nulls_del_rcu(&lruvec->lrugen.list); + pgdat->memcg_lru.nr_memcgs[gen]--; + + if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); + + spin_unlock(&pgdat->memcg_lru.lock); } +} - if (!walk) - spin_unlock_irq(&lruvec->lru_lock); +void lru_gen_soft_reclaim(struct lruvec *lruvec) +{ + /* see the comment on MEMCG_NR_GENS */ + if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); +} - mem_cgroup_unlock_pages(); +#else /* !CONFIG_MEMCG */ + +static int lru_gen_memcg_seg(struct lruvec *lruvec) +{ + return 0; } +#endif + /****************************************************************************** * the eviction ******************************************************************************/ @@ -4735,7 
+4808,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); int tier = lru_tier_from_refs(refs); - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); @@ -4760,7 +4833,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) /* promoted */ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { - list_move(&folio->lru, &lrugen->lists[gen][type][zone]); + list_move(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } @@ -4769,7 +4842,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) int hist = lru_hist_from_seq(lrugen->min_seq[type]); gen = folio_inc_gen(lruvec, folio, false); - list_move_tail(&folio->lru, &lrugen->lists[gen][type][zone]); + list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); WRITE_ONCE(lrugen->protected[hist][type][tier - 1], lrugen->protected[hist][type][tier - 1] + delta); @@ -4781,7 +4854,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) if (folio_test_locked(folio) || folio_test_writeback(folio) || (type == LRU_GEN_FILE && folio_test_dirty(folio))) { gen = folio_inc_gen(lruvec, folio, true); - list_move(&folio->lru, &lrugen->lists[gen][type][zone]); + list_move(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } @@ -4792,12 +4865,8 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca { bool success; - /* unmapping inhibited */ - if (!sc->may_unmap && folio_mapped(folio)) - return false; - /* swapping inhibited */ - if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && + if (!(sc->gfp_mask & __GFP_IO) && (folio_test_dirty(folio) || (folio_test_anon(folio) && !folio_test_swapcache(folio)))) return false; @@ -4835,7 +4904,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, int scanned = 0; int isolated = 0; int remaining = MAX_LRU_BATCH; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); VM_WARN_ON_ONCE(!list_empty(list)); @@ -4848,7 +4917,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, for (zone = sc->reclaim_idx; zone >= 0; zone--) { LIST_HEAD(moved); int skipped = 0; - struct list_head *head = &lrugen->lists[gen][type][zone]; + struct list_head *head = &lrugen->folios[gen][type][zone]; while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); @@ -4894,9 +4963,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, __count_vm_events(PGSCAN_ANON + type, isolated); /* - * There might not be eligible pages due to reclaim_idx, may_unmap and - * may_writepage. Check the remaining to prevent livelock if it's not - * making progress. + * There might not be eligible folios due to reclaim_idx. Check the + * remaining to prevent livelock if it's not making progress. */ return isolated || !remaining ? 
scanned : 0; } @@ -4991,8 +5059,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw return scanned; } -static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, - bool *need_swapping) +static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) { int type; int scanned; @@ -5081,152 +5148,348 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap goto retry; } - if (need_swapping && type == LRU_GEN_ANON) - *need_swapping = true; - return scanned; } +static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, + struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) +{ + int gen, type, zone; + unsigned long old = 0; + unsigned long young = 0; + unsigned long total = 0; + struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MIN_SEQ(lruvec); + + /* whether this lruvec is completely out of cold folios */ + if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { + *nr_to_scan = 0; + return true; + } + + for (type = !can_swap; type < ANON_AND_FILE; type++) { + unsigned long seq; + + for (seq = min_seq[type]; seq <= max_seq; seq++) { + unsigned long size = 0; + + gen = lru_gen_from_seq(seq); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) + size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + + total += size; + if (seq == max_seq) + young += size; + else if (seq + MIN_NR_GENS == max_seq) + old += size; + } + } + + /* try to scrape all its memory if this memcg was deleted */ + *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; + + /* + * The aging tries to be lazy to reduce the overhead, while the eviction + * stalls when the number of generations reaches MIN_NR_GENS. Hence, the + * ideal number of generations is MIN_NR_GENS+1. + */ + if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) + return false; + + /* + * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) + * of the total number of pages for each generation. A reasonable range + * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The + * aging cares about the upper bound of hot pages, while the eviction + * cares about the lower bound of cold pages. + */ + if (young * MIN_NR_GENS > total) + return true; + if (old * (MIN_NR_GENS + 2) < total) + return true; + + return false; +} + /* * For future optimizations: * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg * reclaim. */ -static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, - bool can_swap, bool *need_aging) +static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) { unsigned long nr_to_scan; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); - DEFINE_MIN_SEQ(lruvec); - if (mem_cgroup_below_min(memcg) || - (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) + if (mem_cgroup_below_min(memcg)) return 0; - *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); - if (!*need_aging) + if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan)) return nr_to_scan; /* skip the aging path at the default priority */ if (sc->priority == DEF_PRIORITY) - goto done; + return nr_to_scan; - /* leave the work to lru_gen_age_node() */ - if (current_is_kswapd()) - return 0; + /* skip this lruvec as it's low on cold folios */ + return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? 
-1 : 0; +} - if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false)) - return nr_to_scan; -done: - return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; +static unsigned long get_nr_to_reclaim(struct scan_control *sc) +{ + /* don't abort memcg reclaim to ensure fairness */ + if (!global_reclaim(sc)) + return -1; + + return max(sc->nr_to_reclaim, compact_gap(sc->order)); } -static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq, - struct scan_control *sc, bool need_swapping) +static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { - int i; - DEFINE_MAX_SEQ(lruvec); + long nr_to_scan; + unsigned long scanned = 0; + unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); + int swappiness = get_swappiness(lruvec, sc); - if (!current_is_kswapd()) { - /* age each memcg at most once to ensure fairness */ - if (max_seq - seq > 1) - return true; + /* clean file folios are more likely to exist */ + if (swappiness && !(sc->gfp_mask & __GFP_IO)) + swappiness = 1; - /* over-swapping can increase allocation latency */ - if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping) - return true; + while (true) { + int delta; - /* give this thread a chance to exit and free its memory */ - if (fatal_signal_pending(current)) { - sc->nr_reclaimed += MIN_LRU_BATCH; - return true; - } + nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); + if (nr_to_scan <= 0) + break; - if (cgroup_reclaim(sc)) - return false; - } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim) - return false; + delta = evict_folios(lruvec, sc, swappiness); + if (!delta) + break; - /* keep scanning at low priorities to ensure fairness */ - if (sc->priority > DEF_PRIORITY - 2) - return false; + scanned += delta; + if (scanned >= nr_to_scan) + break; - /* - * A minimum amount of work was done under global memory pressure. For - * kswapd, it may be overshooting. For direct reclaim, the allocation - * may succeed if all suitable zones are somewhat safe. In either case, - * it's better to stop now, and restart later if necessary. - */ - for (i = 0; i <= sc->reclaim_idx; i++) { - unsigned long wmark; - struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; + if (sc->nr_reclaimed >= nr_to_reclaim) + break; - if (!managed_zone(zone)) + cond_resched(); + } + + /* whether try_to_inc_max_seq() was successful */ + return nr_to_scan < 0; +} + +static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) +{ + bool success; + unsigned long scanned = sc->nr_scanned; + unsigned long reclaimed = sc->nr_reclaimed; + int seg = lru_gen_memcg_seg(lruvec); + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + /* see the comment on MEMCG_NR_GENS */ + if (!lruvec_is_sizable(lruvec, sc)) + return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; + + mem_cgroup_calculate_protection(NULL, memcg); + + if (mem_cgroup_below_min(memcg)) + return MEMCG_LRU_YOUNG; + + if (mem_cgroup_below_low(memcg)) { + /* see the comment on MEMCG_NR_GENS */ + if (seg != MEMCG_LRU_TAIL) + return MEMCG_LRU_TAIL; + + memcg_memory_event(memcg, MEMCG_LOW); + } + + success = try_to_shrink_lruvec(lruvec, sc); + + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); + + if (!sc->proactive) + vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); + + sc->nr_reclaimed += current->reclaim_state->reclaimed_slab; + current->reclaim_state->reclaimed_slab = 0; + + return success ? 
MEMCG_LRU_YOUNG : 0; +} + +#ifdef CONFIG_MEMCG + +static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) +{ + int op; + int gen; + int bin; + int first_bin; + struct lruvec *lruvec; + struct lru_gen_folio *lrugen; + struct mem_cgroup *memcg; + const struct hlist_nulls_node *pos; + unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); + + bin = first_bin = prandom_u32_max(MEMCG_NR_BINS); +restart: + op = 0; + memcg = NULL; + gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); + + rcu_read_lock(); + + hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) { + if (op) + lru_gen_rotate_memcg(lruvec, op); + + mem_cgroup_put(memcg); + + lruvec = container_of(lrugen, struct lruvec, lrugen); + memcg = lruvec_memcg(lruvec); + + if (!mem_cgroup_tryget(memcg)) { + op = 0; + memcg = NULL; continue; + } - wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone); - if (wmark > zone_page_state(zone, NR_FREE_PAGES)) - return false; + rcu_read_unlock(); + + op = shrink_one(lruvec, sc); + + rcu_read_lock(); + + if (sc->nr_reclaimed >= nr_to_reclaim) + break; } - sc->nr_reclaimed += MIN_LRU_BATCH; + rcu_read_unlock(); - return true; + if (op) + lru_gen_rotate_memcg(lruvec, op); + + mem_cgroup_put(memcg); + + if (sc->nr_reclaimed >= nr_to_reclaim) + return; + + /* restart if raced with lru_gen_rotate_memcg() */ + if (gen != get_nulls_value(pos)) + goto restart; + + /* try the rest of the bins of the current generation */ + bin = get_memcg_bin(bin + 1); + if (bin != first_bin) + goto restart; } static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { struct blk_plug plug; - bool need_aging = false; - bool need_swapping = false; - unsigned long scanned = 0; - unsigned long reclaimed = sc->nr_reclaimed; - DEFINE_MAX_SEQ(lruvec); + + VM_WARN_ON_ONCE(global_reclaim(sc)); + VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap); lru_add_drain(); blk_start_plug(&plug); - set_mm_walk(lruvec_pgdat(lruvec)); + set_mm_walk(NULL, sc->proactive); - while (true) { - int delta; - int swappiness; - unsigned long nr_to_scan; + if (try_to_shrink_lruvec(lruvec, sc)) + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG); - if (sc->may_swap) - swappiness = get_swappiness(lruvec, sc); - else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) - swappiness = 1; - else - swappiness = 0; + clear_mm_walk(); - nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging); - if (!nr_to_scan) - goto done; + blk_finish_plug(&plug); +} - delta = evict_folios(lruvec, sc, swappiness, &need_swapping); - if (!delta) - goto done; +#else /* !CONFIG_MEMCG */ - scanned += delta; - if (scanned >= nr_to_scan) - break; +static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) +{ + BUILD_BUG(); +} - if (should_abort_scan(lruvec, max_seq, sc, need_swapping)) - break; +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + BUILD_BUG(); +} - cond_resched(); - } +#endif + +static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) +{ + int priority; + unsigned long reclaimable; + struct lruvec *lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + + if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) + return; + /* + * Determine the initial priority based on ((total / MEMCG_NR_GENS) >> + * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the + * estimated reclaimed_to_scanned_ratio = inactive / total. 
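
[Editor's note] Concretely, the fls_long() arithmetic in set_initial_priority() above solves (reclaimable >> priority) ~= nr_to_reclaim for priority by comparing the positions of the two highest set bits. A worked example with hypothetical numbers, 262,144 inactive pages (1 GiB of 4 KiB pages) against an 8,192-page target, using portable stand-ins for fls_long() and clamp():

#include <stdio.h>

#define DEF_PRIORITY	12
#define MEMCG_NR_GENS	2

/* userspace stand-in for the kernel's fls_long(): 1-based highest set bit */
static int fls_long(unsigned long x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

static int clampi(int v, int lo, int hi)
{
	return v < lo ? lo : v > hi ? hi : v;
}

int main(void)
{
	/* hypothetical: 262,144 inactive pages, 8,192-page reclaim target */
	unsigned long reclaimable = 262144;
	unsigned long nr_to_reclaim = 8192;
	int priority;

	reclaimable /= MEMCG_NR_GENS;	/* 131072 == 1 << 17 */
	/* rounds reclaimable down and nr_to_reclaim up, as the comment says */
	priority = fls_long(reclaimable) - 1 - fls_long(nr_to_reclaim - 1);
	priority = clampi(priority, 0, DEF_PRIORITY);
	/* prints "priority 4 -> scans ~8192 pages per pass" */
	printf("priority %d -> scans ~%lu pages per pass\n",
	       priority, reclaimable >> priority);
	return 0;
}
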
+ */ + reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); + if (get_swappiness(lruvec, sc)) + reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); + + reclaimable /= MEMCG_NR_GENS; + + /* round down reclaimable and round up sc->nr_to_reclaim */ + priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); + + sc->priority = clamp(priority, 0, DEF_PRIORITY); +} + +static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) +{ + struct blk_plug plug; + unsigned long reclaimed = sc->nr_reclaimed; + + VM_WARN_ON_ONCE(!global_reclaim(sc)); + + /* + * Unmapped clean folios are already prioritized. Scanning for more of + * them is likely futile and can cause high reclaim latency when there + * is a large number of memcgs. + */ + if (!sc->may_writepage || !sc->may_unmap) + return; + + lru_add_drain(); + + blk_start_plug(&plug); + + set_mm_walk(pgdat, sc->proactive); + + set_initial_priority(pgdat, sc); + + if (current_is_kswapd()) + sc->nr_reclaimed = 0; + + if (mem_cgroup_disabled()) + shrink_one(&pgdat->__lruvec, sc); + else + shrink_many(pgdat, sc); + + if (current_is_kswapd()) + sc->nr_reclaimed += reclaimed; - /* see the comment in lru_gen_age_node() */ - if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging) - sc->memcgs_need_aging = false; -done: clear_mm_walk(); blk_finish_plug(&plug); + + /* kswapd should never fail */ + pgdat->kswapd_failures = 0; } /****************************************************************************** @@ -5235,7 +5498,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc static bool __maybe_unused state_is_valid(struct lruvec *lruvec) { - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; if (lrugen->enabled) { enum lru_list lru; @@ -5248,7 +5511,7 @@ static bool __maybe_unused state_is_valid(struct lruvec *lruvec) int gen, type, zone; for_each_gen_type_zone(gen, type, zone) { - if (!list_empty(&lrugen->lists[gen][type][zone])) + if (!list_empty(&lrugen->folios[gen][type][zone])) return false; } } @@ -5293,7 +5556,7 @@ static bool drain_evictable(struct lruvec *lruvec) int remaining = MAX_LRU_BATCH; for_each_gen_type_zone(gen, type, zone) { - struct list_head *head = &lruvec->lrugen.lists[gen][type][zone]; + struct list_head *head = &lruvec->lrugen.folios[gen][type][zone]; while (!list_empty(head)) { bool success; @@ -5342,9 +5605,6 @@ static void lru_gen_change_state(bool enabled) for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); - if (!lruvec) - continue; - spin_lock_irq(&lruvec->lru_lock); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); @@ -5374,14 +5634,14 @@ static void lru_gen_change_state(bool enabled) * sysfs interface ******************************************************************************/ -static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +static ssize_t min_ttl_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); + return sysfs_emit(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ -static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t len) +static ssize_t min_ttl_ms_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t len) { unsigned int msecs; @@ -5393,11 +5653,9 @@ static ssize_t store_min_ttl(struct 
kobject *kobj, struct kobj_attribute *attr, return len; } -static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR( - min_ttl_ms, 0644, show_min_ttl, store_min_ttl -); +static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR_RW(min_ttl_ms); -static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { unsigned int caps = 0; @@ -5414,7 +5672,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ -static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, +static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { int i; @@ -5441,9 +5699,7 @@ static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, return len; } -static struct kobj_attribute lru_gen_enabled_attr = __ATTR( - enabled, 0644, show_enabled, store_enabled -); +static struct kobj_attribute lru_gen_enabled_attr = __ATTR_RW(enabled); static struct attribute *lru_gen_attrs[] = { &lru_gen_min_ttl_attr.attr, @@ -5451,7 +5707,7 @@ static struct attribute *lru_gen_attrs[] = { NULL }; -static struct attribute_group lru_gen_attr_group = { +static const struct attribute_group lru_gen_attr_group = { .name = "lru_gen", .attrs = lru_gen_attrs, }; @@ -5517,7 +5773,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, int i; int type, tier; int hist = lru_hist_from_seq(seq); - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; for (tier = 0; tier < MAX_NR_TIERS; tier++) { seq_printf(m, " %10d", tier); @@ -5567,7 +5823,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) unsigned long seq; bool full = !debugfs_real_fops(m->file)->write; struct lruvec *lruvec = v; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; int nid = lruvec_pgdat(lruvec)->node_id; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); @@ -5664,7 +5920,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co if (sc->nr_reclaimed >= nr_to_reclaim) return 0; - if (!evict_folios(lruvec, sc, swappiness, NULL)) + if (!evict_folios(lruvec, sc, swappiness)) return 0; cond_resched(); @@ -5685,11 +5941,11 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, if (!mem_cgroup_disabled()) { rcu_read_lock(); + memcg = mem_cgroup_from_id(memcg_id); -#ifdef CONFIG_MEMCG - if (memcg && !css_tryget(&memcg->css)) + if (!mem_cgroup_tryget(memcg)) memcg = NULL; -#endif + rcu_read_unlock(); if (!memcg) @@ -5749,7 +6005,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, set_task_reclaim_state(current, &sc.reclaim_state); flags = memalloc_noreclaim_save(); blk_start_plug(&plug); - if (!set_mm_walk(NULL)) { + if (!set_mm_walk(NULL, true)) { err = -ENOMEM; goto done; } @@ -5821,7 +6077,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) { int i; int gen, type, zone; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; lrugen->max_seq = MIN_NR_GENS + 1; lrugen->enabled = lru_gen_enabled(); @@ -5830,13 +6086,26 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) lrugen->timestamps[i] = jiffies; for_each_gen_type_zone(gen, type, zone) - INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); + 
INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); lruvec->mm_state.seq = MIN_NR_GENS; init_waitqueue_head(&lruvec->mm_state.wait); } #ifdef CONFIG_MEMCG + +void lru_gen_init_pgdat(struct pglist_data *pgdat) +{ + int i, j; + + spin_lock_init(&pgdat->memcg_lru.lock); + + for (i = 0; i < MEMCG_NR_GENS; i++) { + for (j = 0; j < MEMCG_NR_BINS; j++) + INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); + } +} + void lru_gen_init_memcg(struct mem_cgroup *memcg) { INIT_LIST_HEAD(&memcg->mm_list.fifo); @@ -5848,19 +6117,25 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) int i; int nid; + VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo)); + for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); + VM_WARN_ON_ONCE(lruvec->mm_state.nr_walkers); VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, sizeof(lruvec->lrugen.nr_pages))); + lruvec->lrugen.list.next = LIST_POISON1; + for (i = 0; i < NR_BLOOM_FILTERS; i++) { bitmap_free(lruvec->mm_state.filters[i]); lruvec->mm_state.filters[i] = NULL; } } } -#endif + +#endif /* CONFIG_MEMCG */ static int __init init_lru_gen(void) { @@ -5887,6 +6162,10 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc { } +static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) +{ +} + #endif /* CONFIG_LRU_GEN */ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) @@ -5900,7 +6179,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) bool proportional_reclaim; struct blk_plug plug; - if (lru_gen_enabled()) { + if (lru_gen_enabled() && !global_reclaim(sc)) { lru_gen_shrink_lruvec(lruvec, sc); return; } @@ -6143,6 +6422,11 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) struct lruvec *target_lruvec; bool reclaimable = false; + if (lru_gen_enabled() && global_reclaim(sc)) { + lru_gen_shrink_node(pgdat, sc); + return; + } + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); again: diff --git a/mm/workingset.c b/mm/workingset.c index ae7e984b23c6..688aaa73f64e 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -223,7 +223,7 @@ static void *lru_gen_eviction(struct folio *folio) unsigned long token; unsigned long min_seq; struct lruvec *lruvec; - struct lru_gen_struct *lrugen; + struct lru_gen_folio *lrugen; int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); @@ -252,7 +252,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow) unsigned long token; unsigned long min_seq; struct lruvec *lruvec; - struct lru_gen_struct *lrugen; + struct lru_gen_folio *lrugen; struct mem_cgroup *memcg; struct pglist_data *pgdat; int type = folio_is_file_lru(folio); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 2e91973fbaa6..6cc958ba7480 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -172,11 +172,11 @@ static noinline void check_new_node(struct maple_tree *mt) if (!MAPLE_32BIT) { if (i >= 35) - e = i - 35; + e = i - 34; else if (i >= 5) - e = i - 5; + e = i - 4; else if (i >= 2) - e = i - 2; + e = i - 1; } else { if (i >= 4) e = i - 4; @@ -304,17 +304,17 @@ static noinline void check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); + MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); mn 
= mas_pop_node(&mas); /* get the next node. */ MT_BUG_ON(mt, mn == NULL); MT_BUG_ON(mt, not_empty(mn)); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 2); + MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); mas_push_node(&mas, mn); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); + MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); /* Check the limit of pop/push/pop */ mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 2); /* Request */ @@ -322,14 +322,14 @@ static noinline void check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); MT_BUG_ON(mt, mas_alloc_req(&mas)); - MT_BUG_ON(mt, mas.alloc->node_count); + MT_BUG_ON(mt, mas.alloc->node_count != 1); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); mn = mas_pop_node(&mas); MT_BUG_ON(mt, not_empty(mn)); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); + MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); mas_push_node(&mas, mn); - MT_BUG_ON(mt, mas.alloc->node_count); + MT_BUG_ON(mt, mas.alloc->node_count != 1); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); mn = mas_pop_node(&mas); MT_BUG_ON(mt, not_empty(mn)); @@ -35341,7 +35341,7 @@ static noinline void check_prealloc(struct maple_tree *mt) for (i = 0; i <= max; i++) mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35350,18 +35350,18 @@ static noinline void check_prealloc(struct maple_tree *mt) allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35369,25 +35369,25 @@ static noinline void check_prealloc(struct maple_tree *mt) mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); ma_free_rcu(mn); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); ma_free_rcu(mn); 
- MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35396,12 +35396,12 @@ static noinline void check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); mas_push_node(&mas, mn); MT_BUG_ON(mt, mas_allocated(&mas) != allocated); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35409,21 +35409,21 @@ static noinline void check_prealloc(struct maple_tree *mt) mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mas_store_prealloc(&mas, ptr); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35431,14 +35431,14 @@ static noinline void check_prealloc(struct maple_tree *mt) mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); mt_set_non_kernel(1); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL & GFP_NOWAIT) == 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated != 0); mas_destroy(&mas); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35446,7 +35446,7 @@ static noinline void check_prealloc(struct maple_tree *mt) mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); mt_set_non_kernel(1); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL & GFP_NOWAIT) == 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated != 0); -- 2.40.0 From 69f5c21cf8535018187d8e76ab856a1e839b41b7 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Sat, 7 Jan 2023 11:53:10 +0100 Subject: [PATCH 12/17] perf Signed-off-by: Peter Jung --- arch/arm64/kernel/perf_event.c | 18 +- arch/powerpc/perf/core-book3s.c | 8 +- arch/s390/kernel/perf_pai_crypto.c | 2 +- arch/s390/kernel/perf_pai_ext.c | 2 +- arch/x86/events/amd/brs.c | 2 +- arch/x86/events/amd/ibs.c | 4 +- arch/x86/events/amd/lbr.c | 6 +- arch/x86/events/core.c | 48 +- arch/x86/events/intel/core.c | 23 +- arch/x86/events/intel/ds.c | 4 +- arch/x86/events/intel/lbr.c | 30 +- arch/x86/events/perf_event.h | 31 +- drivers/perf/arm_pmu.c | 16 +- 
include/linux/perf/arm_pmu.h | 2 +- include/linux/perf_event.h | 125 +- include/linux/sched.h | 2 +- kernel/events/core.c | 2112 ++++++++++++++-------------- 17 files changed, 1273 insertions(+), 1162 deletions(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 7b0643fe2f13..54186542a969 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -806,10 +806,14 @@ static void armv8pmu_disable_event(struct perf_event *event) static void armv8pmu_start(struct arm_pmu *cpu_pmu) { - struct perf_event_context *task_ctx = - this_cpu_ptr(cpu_pmu->pmu.pmu_cpu_context)->task_ctx; + struct perf_event_context *ctx; + int nr_user = 0; - if (sysctl_perf_user_access && task_ctx && task_ctx->nr_user) + ctx = perf_cpu_task_ctx(); + if (ctx) + nr_user = ctx->nr_user; + + if (sysctl_perf_user_access && nr_user) armv8pmu_enable_user_access(cpu_pmu); else armv8pmu_disable_user_access(); @@ -1019,10 +1023,10 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event, return 0; } -static int armv8pmu_filter_match(struct perf_event *event) +static bool armv8pmu_filter(struct pmu *pmu, int cpu) { - unsigned long evtype = event->hw.config_base & ARMV8_PMU_EVTYPE_EVENT; - return evtype != ARMV8_PMUV3_PERFCTR_CHAIN; + struct arm_pmu *armpmu = to_arm_pmu(pmu); + return !cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus); } static void armv8pmu_reset(void *info) @@ -1253,7 +1257,7 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name, cpu_pmu->stop = armv8pmu_stop; cpu_pmu->reset = armv8pmu_reset; cpu_pmu->set_event_filter = armv8pmu_set_event_filter; - cpu_pmu->filter_match = armv8pmu_filter_match; + cpu_pmu->filter = armv8pmu_filter; cpu_pmu->pmu.event_idx = armv8pmu_user_event_idx; diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 942aa830e110..bf318dd9b709 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -132,7 +132,7 @@ static unsigned long ebb_switch_in(bool ebb, struct cpu_hw_events *cpuhw) static inline void power_pmu_bhrb_enable(struct perf_event *event) {} static inline void power_pmu_bhrb_disable(struct perf_event *event) {} -static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) {} +static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) {} static inline void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events *cpuhw) {} static void pmao_restore_workaround(bool ebb) { } #endif /* CONFIG_PPC32 */ @@ -424,7 +424,7 @@ static void power_pmu_bhrb_enable(struct perf_event *event) cpuhw->bhrb_context = event->ctx; } cpuhw->bhrb_users++; - perf_sched_cb_inc(event->ctx->pmu); + perf_sched_cb_inc(event->pmu); } static void power_pmu_bhrb_disable(struct perf_event *event) @@ -436,7 +436,7 @@ static void power_pmu_bhrb_disable(struct perf_event *event) WARN_ON_ONCE(!cpuhw->bhrb_users); cpuhw->bhrb_users--; - perf_sched_cb_dec(event->ctx->pmu); + perf_sched_cb_dec(event->pmu); if (!cpuhw->disabled && !cpuhw->bhrb_users) { /* BHRB cannot be turned off when other @@ -451,7 +451,7 @@ static void power_pmu_bhrb_disable(struct perf_event *event) /* Called from ctxsw to prevent one process's branch entries to * mingle with the other process's entries during context switch. 
*/ -static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) +static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { if (!ppmu->bhrb_nr) return; diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index 6826e2a69a21..f747137f39ae 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -379,7 +379,7 @@ static int paicrypt_push_sample(void) /* Called on schedule-in and schedule-out. No access to event structure, * but for sampling only event CRYPTO_ALL is allowed. */ -static void paicrypt_sched_task(struct perf_event_context *ctx, bool sched_in) +static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { /* We started with a clean page on event installation. So read out * results on schedule_out and if page was dirty, clear values. diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c index 74b53c531e0c..72e34160990c 100644 --- a/arch/s390/kernel/perf_pai_ext.c +++ b/arch/s390/kernel/perf_pai_ext.c @@ -472,7 +472,7 @@ static int paiext_push_sample(void) /* Called on schedule-in and schedule-out. No access to event structure, * but for sampling only event NNPA_ALL is allowed. */ -static void paiext_sched_task(struct perf_event_context *ctx, bool sched_in) +static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { /* We started with a clean page on event installation. So read out * results on schedule_out and if page was dirty, clear values. diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c index f1bff153d945..58461fa18b6f 100644 --- a/arch/x86/events/amd/brs.c +++ b/arch/x86/events/amd/brs.c @@ -384,7 +384,7 @@ static void amd_brs_poison_buffer(void) * On ctxswin, sched_in = true, called after the PMU has started * On ctxswout, sched_in = false, called before the PMU is stopped */ -void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 4cb710efbdd9..da3f5ebac4e1 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -631,7 +631,7 @@ static const struct attribute_group *op_attr_update[] = { static struct perf_ibs perf_ibs_fetch = { .pmu = { - .task_ctx_nr = perf_invalid_context, + .task_ctx_nr = perf_hw_context, .event_init = perf_ibs_init, .add = perf_ibs_add, @@ -655,7 +655,7 @@ static struct perf_ibs perf_ibs_fetch = { static struct perf_ibs perf_ibs_op = { .pmu = { - .task_ctx_nr = perf_invalid_context, + .task_ctx_nr = perf_hw_context, .event_init = perf_ibs_init, .add = perf_ibs_add, diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index 38a75216c12c..eb31f850841a 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -352,7 +352,7 @@ void amd_pmu_lbr_add(struct perf_event *event) cpuc->br_sel = reg->reg; } - perf_sched_cb_inc(event->ctx->pmu); + perf_sched_cb_inc(event->pmu); if (!cpuc->lbr_users++ && !event->total_time_running) amd_pmu_lbr_reset(); @@ -370,10 +370,10 @@ void amd_pmu_lbr_del(struct perf_event *event) cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); - perf_sched_cb_dec(event->ctx->pmu); + perf_sched_cb_dec(event->pmu); } -void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) +void amd_pmu_lbr_sched_task(struct perf_event_pmu_context 
*pmu_ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 30fb4931d387..d096b04bf80e 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -90,6 +90,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx); DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); +DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter); + /* * This one is magic, it will get called even when PMU init fails (because * there is no PMU), in which case it should simply return NULL. @@ -2031,6 +2033,7 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases); static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs); + static_call_update(x86_pmu_filter, x86_pmu.filter); } static void _x86_pmu_read(struct perf_event *event) @@ -2052,23 +2055,6 @@ void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, pr_info("... event mask: %016Lx\n", intel_ctrl); } -/* - * The generic code is not hybrid friendly. The hybrid_pmu->pmu - * of the first registered PMU is unconditionally assigned to - * each possible cpuctx->ctx.pmu. - * Update the correct hybrid PMU to the cpuctx->ctx.pmu. - */ -void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu) -{ - struct perf_cpu_context *cpuctx; - - if (!pmu->pmu_cpu_context) - return; - - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - cpuctx->ctx.pmu = pmu; -} - static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -2175,13 +2161,9 @@ static int __init init_hw_perf_events(void) if (err) goto out2; } else { - u8 cpu_type = get_this_hybrid_cpu_type(); struct x86_hybrid_pmu *hybrid_pmu; int i, j; - if (!cpu_type && x86_pmu.get_hybrid_cpu_type) - cpu_type = x86_pmu.get_hybrid_cpu_type(); - for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { hybrid_pmu = &x86_pmu.hybrid_pmu[i]; @@ -2195,9 +2177,6 @@ static int __init init_hw_perf_events(void) (hybrid_pmu->cpu_type == hybrid_big) ? 
PERF_TYPE_RAW : -1); if (err) break; - - if (cpu_type == hybrid_pmu->cpu_type) - x86_pmu_update_cpu_context(&hybrid_pmu->pmu, raw_smp_processor_id()); } if (i < x86_pmu.num_hybrid_pmus) { @@ -2646,15 +2625,15 @@ static const struct attribute_group *x86_pmu_attr_groups[] = { NULL, }; -static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) +static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { - static_call_cond(x86_pmu_sched_task)(ctx, sched_in); + static_call_cond(x86_pmu_sched_task)(pmu_ctx, sched_in); } -static void x86_pmu_swap_task_ctx(struct perf_event_context *prev, - struct perf_event_context *next) +static void x86_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc, + struct perf_event_pmu_context *next_epc) { - static_call_cond(x86_pmu_swap_task_ctx)(prev, next); + static_call_cond(x86_pmu_swap_task_ctx)(prev_epc, next_epc); } void perf_check_microcode(void) @@ -2689,12 +2668,13 @@ static int x86_pmu_aux_output_match(struct perf_event *event) return 0; } -static int x86_pmu_filter_match(struct perf_event *event) +static bool x86_pmu_filter(struct pmu *pmu, int cpu) { - if (x86_pmu.filter_match) - return x86_pmu.filter_match(event); + bool ret = false; - return 1; + static_call_cond(x86_pmu_filter)(pmu, cpu, &ret); + + return ret; } static struct pmu pmu = { @@ -2725,7 +2705,7 @@ static struct pmu pmu = { .aux_output_match = x86_pmu_aux_output_match, - .filter_match = x86_pmu_filter_match, + .filter = x86_pmu_filter, }; void arch_perf_update_userpage(struct perf_event *event, diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 5a1d0ea402e4..bafdc2be479a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4536,8 +4536,6 @@ static bool init_hybrid_pmu(int cpu) cpumask_set_cpu(cpu, &pmu->supported_cpus); cpuc->pmu = &pmu->pmu; - x86_pmu_update_cpu_context(&pmu->pmu, cpu); - return true; } @@ -4671,17 +4669,17 @@ static void intel_pmu_cpu_dead(int cpu) cpumask_clear_cpu(cpu, &hybrid_pmu(cpuc->pmu)->supported_cpus); } -static void intel_pmu_sched_task(struct perf_event_context *ctx, +static void intel_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { - intel_pmu_pebs_sched_task(ctx, sched_in); - intel_pmu_lbr_sched_task(ctx, sched_in); + intel_pmu_pebs_sched_task(pmu_ctx, sched_in); + intel_pmu_lbr_sched_task(pmu_ctx, sched_in); } -static void intel_pmu_swap_task_ctx(struct perf_event_context *prev, - struct perf_event_context *next) +static void intel_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc, + struct perf_event_pmu_context *next_epc) { - intel_pmu_lbr_swap_task_ctx(prev, next); + intel_pmu_lbr_swap_task_ctx(prev_epc, next_epc); } static int intel_pmu_check_period(struct perf_event *event, u64 value) @@ -4705,12 +4703,11 @@ static int intel_pmu_aux_output_match(struct perf_event *event) return is_intel_pt_event(event); } -static int intel_pmu_filter_match(struct perf_event *event) +static void intel_pmu_filter(struct pmu *pmu, int cpu, bool *ret) { - struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); - unsigned int cpu = smp_processor_id(); + struct x86_hybrid_pmu *hpmu = hybrid_pmu(pmu); - return cpumask_test_cpu(cpu, &pmu->supported_cpus); + *ret = !cpumask_test_cpu(cpu, &hpmu->supported_cpus); } PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); @@ -6414,7 +6411,7 @@ __init int intel_pmu_init(void) static_call_update(intel_pmu_set_topdown_event_period, &adl_set_topdown_event_period); - x86_pmu.filter_match = 
intel_pmu_filter_match; + x86_pmu.filter = intel_pmu_filter; x86_pmu.get_event_constraints = adl_get_event_constraints; x86_pmu.hw_config = adl_hw_config; x86_pmu.limit_period = spr_limit_period; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 3ff38e7409e3..91b214231e03 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1071,7 +1071,7 @@ static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc) return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs); } -void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in) +void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1179,7 +1179,7 @@ static void pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct perf_event *event, bool add) { - struct pmu *pmu = event->ctx->pmu; + struct pmu *pmu = event->pmu; /* * Make sure we get updated with the first PEBS * event. It will trigger also during removal, but diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 8259d725054d..017baba56b01 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -515,21 +515,21 @@ static void __intel_pmu_lbr_save(void *ctx) cpuc->last_log_id = ++task_context_opt(ctx)->log_id; } -void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, - struct perf_event_context *next) +void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc, + struct perf_event_pmu_context *next_epc) { void *prev_ctx_data, *next_ctx_data; - swap(prev->task_ctx_data, next->task_ctx_data); + swap(prev_epc->task_ctx_data, next_epc->task_ctx_data); /* - * Architecture specific synchronization makes sense in - * case both prev->task_ctx_data and next->task_ctx_data + * Architecture specific synchronization makes sense in case + * both prev_epc->task_ctx_data and next_epc->task_ctx_data * pointers are allocated. */ - prev_ctx_data = next->task_ctx_data; - next_ctx_data = prev->task_ctx_data; + prev_ctx_data = next_epc->task_ctx_data; + next_ctx_data = prev_epc->task_ctx_data; if (!prev_ctx_data || !next_ctx_data) return; @@ -538,7 +538,7 @@ void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, task_context_opt(next_ctx_data)->lbr_callstack_users); } -void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) +void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); void *task_ctx; @@ -551,7 +551,7 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) * the task was scheduled out, restore the stack. Otherwise flush * the LBR stack. */ - task_ctx = ctx ? ctx->task_ctx_data : NULL; + task_ctx = pmu_ctx ? 
pmu_ctx->task_ctx_data : NULL; if (task_ctx) { if (sched_in) __intel_pmu_lbr_restore(task_ctx); @@ -587,8 +587,8 @@ void intel_pmu_lbr_add(struct perf_event *event) cpuc->br_sel = event->hw.branch_reg.reg; - if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) - task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users++; + if (branch_user_callstack(cpuc->br_sel) && event->pmu_ctx->task_ctx_data) + task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users++; /* * Request pmu::sched_task() callback, which will fire inside the @@ -611,7 +611,7 @@ void intel_pmu_lbr_add(struct perf_event *event) */ if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0) cpuc->lbr_pebs_users++; - perf_sched_cb_inc(event->ctx->pmu); + perf_sched_cb_inc(event->pmu); if (!cpuc->lbr_users++ && !event->total_time_running) intel_pmu_lbr_reset(); } @@ -664,8 +664,8 @@ void intel_pmu_lbr_del(struct perf_event *event) return; if (branch_user_callstack(cpuc->br_sel) && - event->ctx->task_ctx_data) - task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users--; + event->pmu_ctx->task_ctx_data) + task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users--; if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT) cpuc->lbr_select = 0; @@ -675,7 +675,7 @@ void intel_pmu_lbr_del(struct perf_event *event) cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); WARN_ON_ONCE(cpuc->lbr_pebs_users < 0); - perf_sched_cb_dec(event->ctx->pmu); + perf_sched_cb_dec(event->pmu); } static inline bool vlbr_exclude_host(void) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 332d2e6d8ae4..0e849f28a5c1 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -811,7 +811,7 @@ struct x86_pmu { void (*cpu_dead)(int cpu); void (*check_microcode)(void); - void (*sched_task)(struct perf_event_context *ctx, + void (*sched_task)(struct perf_event_pmu_context *pmu_ctx, bool sched_in); /* @@ -894,12 +894,12 @@ struct x86_pmu { int num_topdown_events; /* - * perf task context (i.e. struct perf_event_context::task_ctx_data) + * perf task context (i.e. struct perf_event_pmu_context::task_ctx_data) * switch helper to bridge calls from perf/core to perf/x86. 
* See struct pmu::swap_task_ctx() usage for examples; */ - void (*swap_task_ctx)(struct perf_event_context *prev, - struct perf_event_context *next); + void (*swap_task_ctx)(struct perf_event_pmu_context *prev_epc, + struct perf_event_pmu_context *next_epc); /* * AMD bits @@ -925,7 +925,7 @@ struct x86_pmu { int (*aux_output_match) (struct perf_event *event); - int (*filter_match)(struct perf_event *event); + void (*filter)(struct pmu *pmu, int cpu, bool *ret); /* * Hybrid support * @@ -1180,8 +1180,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs); void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, u64 intel_ctrl); -void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu); - extern struct event_constraint emptyconstraint; extern struct event_constraint unconstrained; @@ -1306,7 +1304,7 @@ void amd_pmu_lbr_reset(void); void amd_pmu_lbr_read(void); void amd_pmu_lbr_add(struct perf_event *event); void amd_pmu_lbr_del(struct perf_event *event); -void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); +void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); void amd_pmu_lbr_enable_all(void); void amd_pmu_lbr_disable_all(void); int amd_pmu_lbr_hw_config(struct perf_event *event); @@ -1322,7 +1320,6 @@ void amd_brs_enable_all(void); void amd_brs_disable_all(void); void amd_brs_drain(void); void amd_brs_lopwr_init(void); -void amd_brs_disable_all(void); int amd_brs_hw_config(struct perf_event *event); void amd_brs_reset(void); @@ -1330,7 +1327,7 @@ static inline void amd_pmu_brs_add(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - perf_sched_cb_inc(event->ctx->pmu); + perf_sched_cb_inc(event->pmu); cpuc->lbr_users++; /* * No need to reset BRS because it is reset @@ -1345,10 +1342,10 @@ static inline void amd_pmu_brs_del(struct perf_event *event) cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); - perf_sched_cb_dec(event->ctx->pmu); + perf_sched_cb_dec(event->pmu); } -void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in); +void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); #else static inline int amd_brs_init(void) { @@ -1373,7 +1370,7 @@ static inline void amd_pmu_brs_del(struct perf_event *event) { } -static inline void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +static inline void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { } @@ -1533,7 +1530,7 @@ void intel_pmu_pebs_enable_all(void); void intel_pmu_pebs_disable_all(void); -void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); +void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); void intel_pmu_auto_reload_read(struct perf_event *event); @@ -1541,10 +1538,10 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); void intel_ds_init(void); -void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, - struct perf_event_context *next); +void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc, + struct perf_event_pmu_context *next_epc); -void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); +void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); u64 lbr_from_signext_quirk_wr(u64 val); diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 3f07df5a7e95..5ece3f132d80 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -550,15 +550,14 @@ static void 
armpmu_disable(struct pmu *pmu) * microarchitecture, and aren't suitable for another. Thus, only match CPUs of * the same microarchitecture. */ -static int armpmu_filter_match(struct perf_event *event) +static bool armpmu_filter(struct pmu *pmu, int cpu) { - struct arm_pmu *armpmu = to_arm_pmu(event->pmu); - unsigned int cpu = smp_processor_id(); - int ret; + struct arm_pmu *armpmu = to_arm_pmu(pmu); + bool ret; ret = cpumask_test_cpu(cpu, &armpmu->supported_cpus); - if (ret && armpmu->filter_match) - return armpmu->filter_match(event); + if (ret && armpmu->filter) + return armpmu->filter(pmu, cpu); return ret; } @@ -885,14 +884,13 @@ static struct arm_pmu *__armpmu_alloc(gfp_t flags) .start = armpmu_start, .stop = armpmu_stop, .read = armpmu_read, - .filter_match = armpmu_filter_match, + .filter = armpmu_filter, .attr_groups = pmu->attr_groups, /* * This is a CPU PMU potentially in a heterogeneous * configuration (e.g. big.LITTLE). This is not an uncore PMU, * and we have taken ctx sharing into account (e.g. with our - * pmu::filter_match callback and pmu::event_init group - * validation). + * pmu::filter callback and pmu::event_init group validation). */ .capabilities = PERF_PMU_CAP_HETEROGENEOUS_CPUS | PERF_PMU_CAP_EXTENDED_REGS, }; diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 0356cb6a215d..725968095ea9 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -100,7 +100,7 @@ struct arm_pmu { void (*stop)(struct arm_pmu *); void (*reset)(void *); int (*map_event)(struct perf_event *event); - int (*filter_match)(struct perf_event *event); + bool (*filter)(struct pmu *pmu, int cpu); int num_events; bool secure_access; /* 32-bit ARM only */ #define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0031f7b4d9ab..c6a3bac76966 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -266,6 +266,7 @@ struct hw_perf_event { }; struct perf_event; +struct perf_event_pmu_context; /* * Common implementation detail of pmu::{start,commit,cancel}_txn @@ -308,7 +309,7 @@ struct pmu { int capabilities; int __percpu *pmu_disable_count; - struct perf_cpu_context __percpu *pmu_cpu_context; + struct perf_cpu_pmu_context __percpu *cpu_pmu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ int task_ctx_nr; int hrtimer_interval_ms; @@ -443,7 +444,7 @@ struct pmu { /* * context-switches callback */ - void (*sched_task) (struct perf_event_context *ctx, + void (*sched_task) (struct perf_event_pmu_context *pmu_ctx, bool sched_in); /* @@ -457,8 +458,8 @@ struct pmu { * implementation and Perf core context switch handling callbacks for usage * examples. */ - void (*swap_task_ctx) (struct perf_event_context *prev, - struct perf_event_context *next); + void (*swap_task_ctx) (struct perf_event_pmu_context *prev_epc, + struct perf_event_pmu_context *next_epc); /* optional */ /* @@ -522,9 +523,10 @@ struct pmu { /* optional */ /* - * Filter events for PMU-specific reasons. + * Skip programming this PMU on the given CPU. Typically needed for + * big.LITTLE things. */ - int (*filter_match) (struct perf_event *event); /* optional */ + bool (*filter) (struct pmu *pmu, int cpu); /* optional */ /* * Check period value for PERF_EVENT_IOC_PERIOD ioctl. @@ -695,6 +697,11 @@ struct perf_event { int group_caps; struct perf_event *group_leader; + /* + * event->pmu will always point to pmu in which this event belongs. 
+ * Whereas event->pmu_ctx->pmu may point to another pmu when a group of
+ * events from different pmus is created.
+ */
 	struct pmu			*pmu;
 	void				*pmu_private;
 
@@ -720,6 +727,12 @@ struct perf_event {
 	struct hw_perf_event		hw;
 
 	struct perf_event_context	*ctx;
+	/*
+	 * event->pmu_ctx points to the perf_event_pmu_context in which the
+	 * event is added. This pmu_ctx can belong to another pmu for a sw
+	 * event when that sw event is part of a group with non-sw events.
+	 */
+	struct perf_event_pmu_context	*pmu_ctx;
 	atomic_long_t			refcount;
 
 	/*
@@ -812,19 +825,69 @@ struct perf_event {
 #endif /* CONFIG_PERF_EVENTS */
 };
 
+/*
+ *           ,-----------------------[1:n]----------------------.
+ *           V                                                   V
+ * perf_event_context <-[1:n]-> perf_event_pmu_context <--- perf_event
+ *           ^                      ^     |                  |
+ *           `--------[1:n]---------'     `-[n:1]-> pmu <-[1:n]-'
+ *
+ *
+ * struct perf_event_pmu_context lifetime is refcount based and RCU freed
+ * (similar to perf_event_context). Locking is as if it were a member of
+ * perf_event_context; specifically:
+ *
+ *   modification, both: ctx->mutex && ctx->lock
+ *   reading, either:    ctx->mutex || ctx->lock
+ *
+ * There is one exception to this; namely put_pmu_ctx() isn't always called
+ * with ctx->mutex held; this means that as long as we can guarantee the epc
+ * has events the above rules hold.
+ *
+ * Specifically, sys_perf_event_open()'s group_leader case depends on
+ * ctx->mutex pinning the configuration. Since we hold a reference on
+ * group_leader (through the filedesc) it can't go away, therefore its
+ * associated pmu_ctx must exist and cannot change due to ctx->mutex.
+ */
+struct perf_event_pmu_context {
+	struct pmu			*pmu;
+	struct perf_event_context	*ctx;
+
+	struct list_head		pmu_ctx_entry;
+
+	struct list_head		pinned_active;
+	struct list_head		flexible_active;
+
+	/* Used to avoid freeing per-cpu perf_event_pmu_context */
+	unsigned int			embedded : 1;
+
+	unsigned int			nr_events;
+
+	atomic_t			refcount; /* event <-> epc */
+	struct rcu_head			rcu_head;
+
+	void				*task_ctx_data; /* pmu specific data */
+	/*
+	 * Set when one or more (plausibly active) events can't be scheduled
+	 * due to pmu overcommit or pmu constraints, except tolerant to
+	 * events not necessary to be active due to scheduling constraints,
+	 * such as cgroups.
+	 */
+	int				rotate_necessary;
+};
 
 struct perf_event_groups {
 	struct rb_root	tree;
 	u64		index;
 };
 
+
 /**
  * struct perf_event_context - event context structure
  *
  * Used as a container for task events and CPU events as well:
  */
 struct perf_event_context {
-	struct pmu			*pmu;
 	/*
 	 * Protect the states of the events in the list,
 	 * nr_active, and the list:
@@ -837,27 +900,21 @@ struct perf_event_context {
 	 */
 	struct mutex			mutex;
 
-	struct list_head		active_ctx_list;
+	struct list_head		pmu_ctx_list;
 	struct perf_event_groups	pinned_groups;
 	struct perf_event_groups	flexible_groups;
 	struct list_head		event_list;
 
-	struct list_head		pinned_active;
-	struct list_head		flexible_active;
-
 	int				nr_events;
-	int				nr_active;
 	int				nr_user;
 	int				is_active;
+
+	int				nr_task_data;
 	int				nr_stat;
 	int				nr_freq;
 	int				rotate_disable;
-	/*
-	 * Set when nr_events != nr_active, except tolerant to events not
-	 * necessary to be active due to scheduling constraints, such as cgroups.
-	 */
-	int				rotate_necessary;
-	refcount_t			refcount;
+
+	refcount_t			refcount; /* event <-> ctx */
 
 	struct task_struct		*task;
 
 	/*
@@ -878,7 +935,6 @@ struct perf_event_context {
 #ifdef CONFIG_CGROUP_PERF
 	int				nr_cgroups;	 /* cgroup evts */
 #endif
-	void				*task_ctx_data; /* pmu specific data */
 	struct rcu_head			rcu_head;
 
 	/*
@@ -896,12 +952,13 @@
  */
 #define PERF_NR_CONTEXTS	4
 
-/**
- * struct perf_cpu_context - per cpu event context structure
- */
-struct perf_cpu_context {
-	struct perf_event_context	ctx;
-	struct perf_event_context	*task_ctx;
+struct perf_cpu_pmu_context {
+	struct perf_event_pmu_context	epc;
+	struct perf_event_pmu_context	*task_epc;
+
+	struct list_head		sched_cb_entry;
+	int				sched_cb_usage;
+
 	int				active_oncpu;
 	int				exclusive;
 
@@ -909,16 +966,20 @@
 	struct hrtimer			hrtimer;
 	ktime_t				hrtimer_interval;
 	unsigned int			hrtimer_active;
+};
+
+/**
+ * struct perf_cpu_context - per cpu event context structure
+ */
+struct perf_cpu_context {
+	struct perf_event_context	ctx;
+	struct perf_event_context	*task_ctx;
+
+	int				online;
 #ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp;
-	struct list_head		cgrp_cpuctx_entry;
 #endif
 
-	struct list_head		sched_cb_entry;
-	int				sched_cb_usage;
-
-	int				online;
 	/*
 	 * Per-CPU storage for iterators used in visit_groups_merge. The default
 	 * storage is of size 2 to hold the CPU and any CPU event iterators.
@@ -982,6 +1043,8 @@ perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx)
 
 #ifdef CONFIG_PERF_EVENTS
 
+extern struct perf_event_context *perf_cpu_task_ctx(void);
+
 extern void *perf_aux_output_begin(struct perf_output_handle *handle,
 				   struct perf_event *event);
 extern void perf_aux_output_end(struct perf_output_handle *handle,
@@ -1187,7 +1250,7 @@ static inline int is_software_event(struct perf_event *event)
  */
 static inline int in_software_context(struct perf_event *event)
 {
-	return event->ctx->pmu->task_ctx_nr == perf_sw_context;
+	return event->pmu_ctx->pmu->task_ctx_nr == perf_sw_context;
 }
 
 static inline int is_exclusive_pmu(struct pmu *pmu)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ffb6eb55cd13..4e03f1dcbe52 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1243,7 +1243,7 @@ struct task_struct {
 	unsigned int			futex_state;
 #endif
 #ifdef CONFIG_PERF_EVENTS
-	struct perf_event_context	*perf_event_ctxp[perf_nr_task_contexts];
+	struct perf_event_context	*perf_event_ctxp;
 	struct mutex			perf_event_mutex;
 	struct list_head		perf_event_list;
 #endif
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3b9e86108f43..8f428ccdf5bf 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -155,12 +155,6 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
 	return data.ret;
 }
 
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
-{
-	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
-}
-
 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
 			  struct perf_event_context *ctx)
 {
@@ -184,6 +178,14 @@ static bool is_kernel_event(struct perf_event *event)
 	return READ_ONCE(event->owner) == TASK_TOMBSTONE;
 }
 
+static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+
+struct perf_event_context *perf_cpu_task_ctx(void)
+{
+	lockdep_assert_irqs_disabled();
+	return this_cpu_ptr(&perf_cpu_context)->task_ctx;
+}
+
 /*
 * On task ctx scheduling...
* @@ -217,7 +219,7 @@ static int event_function(void *info) struct event_function_struct *efs = info; struct perf_event *event = efs->event; struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; int ret = 0; @@ -314,7 +316,7 @@ static void event_function_call(struct perf_event *event, event_f func, void *da static void event_function_local(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct task_struct *task = READ_ONCE(ctx->task); struct perf_event_context *task_ctx = NULL; @@ -388,7 +390,6 @@ static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); -static DEFINE_PER_CPU(int, perf_sched_cb_usages); static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; @@ -448,7 +449,7 @@ static void update_perf_cpu_limits(void) WRITE_ONCE(perf_sample_allowed_ns, tmp); } -static bool perf_rotate_context(struct perf_cpu_context *cpuctx); +static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc); int perf_proc_update_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -571,12 +572,6 @@ void perf_sample_event_took(u64 sample_len_ns) static atomic64_t perf_event_id; -static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, - enum event_type_t event_type); - -static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type); - static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); @@ -691,13 +686,31 @@ do { \ ___p; \ }) +static void perf_ctx_disable(struct perf_event_context *ctx) +{ + struct perf_event_pmu_context *pmu_ctx; + + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) + perf_pmu_disable(pmu_ctx->pmu); +} + +static void perf_ctx_enable(struct perf_event_context *ctx) +{ + struct perf_event_pmu_context *pmu_ctx; + + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) + perf_pmu_enable(pmu_ctx->pmu); +} + +static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type); +static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type); + #ifdef CONFIG_CGROUP_PERF static inline bool perf_cgroup_match(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); /* @event doesn't care about cgroup */ if (!event->cgrp) @@ -823,54 +836,39 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) } } -static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); - /* * reschedule events based on the cgroup constraint of task. */ static void perf_cgroup_switch(struct task_struct *task) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_cgroup *cgrp; - struct perf_cpu_context *cpuctx, *tmp; - struct list_head *list; - unsigned long flags; - - /* - * Disable interrupts and preemption to avoid this CPU's - * cgrp_cpuctx_entry to change under us. 
- */ - local_irq_save(flags); cgrp = perf_cgroup_from_task(task, NULL); - list = this_cpu_ptr(&cgrp_cpuctx_list); - list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) { - WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - if (READ_ONCE(cpuctx->cgrp) == cgrp) - continue; - - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); + WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); + if (READ_ONCE(cpuctx->cgrp) == cgrp) + return; - cpu_ctx_sched_out(cpuctx, EVENT_ALL); - /* - * must not be done before ctxswout due - * to update_cgrp_time_from_cpuctx() in - * ctx_sched_out() - */ - cpuctx->cgrp = cgrp; - /* - * set cgrp before ctxsw in to allow - * perf_cgroup_set_timestamp() in ctx_sched_in() - * to not have to pass task around - */ - cpu_ctx_sched_in(cpuctx, EVENT_ALL); + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_ctx_disable(&cpuctx->ctx); - perf_pmu_enable(cpuctx->ctx.pmu); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } + ctx_sched_out(&cpuctx->ctx, EVENT_ALL); + /* + * must not be done before ctxswout due + * to update_cgrp_time_from_cpuctx() in + * ctx_sched_out() + */ + cpuctx->cgrp = cgrp; + /* + * set cgrp before ctxsw in to allow + * perf_cgroup_set_timestamp() in ctx_sched_in() + * to not have to pass task around + */ + ctx_sched_in(&cpuctx->ctx, EVENT_ALL); - local_irq_restore(flags); + perf_ctx_enable(&cpuctx->ctx); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static int perf_cgroup_ensure_storage(struct perf_event *event, @@ -888,7 +886,7 @@ static int perf_cgroup_ensure_storage(struct perf_event *event, heap_size++; for_each_possible_cpu(cpu) { - cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu); + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); if (heap_size <= cpuctx->heap_size) continue; @@ -972,8 +970,6 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct return; cpuctx->cgrp = perf_cgroup_from_task(current, ctx); - list_add(&cpuctx->cgrp_cpuctx_entry, - per_cpu_ptr(&cgrp_cpuctx_list, event->cpu)); } static inline void @@ -994,7 +990,6 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c return; cpuctx->cgrp = NULL; - list_del(&cpuctx->cgrp_cpuctx_entry); } #else /* !CONFIG_CGROUP_PERF */ @@ -1069,34 +1064,30 @@ static void perf_cgroup_switch(struct task_struct *task) */ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { - struct perf_cpu_context *cpuctx; + struct perf_cpu_pmu_context *cpc; bool rotations; lockdep_assert_irqs_disabled(); - cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); - rotations = perf_rotate_context(cpuctx); + cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer); + rotations = perf_rotate_context(cpc); - raw_spin_lock(&cpuctx->hrtimer_lock); + raw_spin_lock(&cpc->hrtimer_lock); if (rotations) - hrtimer_forward_now(hr, cpuctx->hrtimer_interval); + hrtimer_forward_now(hr, cpc->hrtimer_interval); else - cpuctx->hrtimer_active = 0; - raw_spin_unlock(&cpuctx->hrtimer_lock); + cpc->hrtimer_active = 0; + raw_spin_unlock(&cpc->hrtimer_lock); return rotations ? 
HRTIMER_RESTART : HRTIMER_NORESTART; } -static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) +static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu) { - struct hrtimer *timer = &cpuctx->hrtimer; - struct pmu *pmu = cpuctx->ctx.pmu; + struct hrtimer *timer = &cpc->hrtimer; + struct pmu *pmu = cpc->epc.pmu; u64 interval; - /* no multiplexing needed for SW PMU */ - if (pmu->task_ctx_nr == perf_sw_context) - return; - /* * check default is sane, if not set then force to * default interval (1/tick) @@ -1105,34 +1096,34 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) if (interval < 1) interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; - cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); + cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); - raw_spin_lock_init(&cpuctx->hrtimer_lock); + raw_spin_lock_init(&cpc->hrtimer_lock); hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); timer->function = perf_mux_hrtimer_handler; } -static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx) +static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc) { - struct hrtimer *timer = &cpuctx->hrtimer; - struct pmu *pmu = cpuctx->ctx.pmu; + struct hrtimer *timer = &cpc->hrtimer; unsigned long flags; - /* not for SW PMU */ - if (pmu->task_ctx_nr == perf_sw_context) - return 0; - - raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags); - if (!cpuctx->hrtimer_active) { - cpuctx->hrtimer_active = 1; - hrtimer_forward_now(timer, cpuctx->hrtimer_interval); + raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags); + if (!cpc->hrtimer_active) { + cpc->hrtimer_active = 1; + hrtimer_forward_now(timer, cpc->hrtimer_interval); hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); } - raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags); + raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags); return 0; } +static int perf_mux_hrtimer_restart_ipi(void *arg) +{ + return perf_mux_hrtimer_restart(arg); +} + void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); @@ -1147,32 +1138,9 @@ void perf_pmu_enable(struct pmu *pmu) pmu->pmu_enable(pmu); } -static DEFINE_PER_CPU(struct list_head, active_ctx_list); - -/* - * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and - * perf_event_task_tick() are fully serialized because they're strictly cpu - * affine and perf_event_ctx{activate,deactivate} are called with IRQs - * disabled, while perf_event_task_tick is called from IRQ context. 
- */ -static void perf_event_ctx_activate(struct perf_event_context *ctx) -{ - struct list_head *head = this_cpu_ptr(&active_ctx_list); - - lockdep_assert_irqs_disabled(); - - WARN_ON(!list_empty(&ctx->active_ctx_list)); - - list_add(&ctx->active_ctx_list, head); -} - -static void perf_event_ctx_deactivate(struct perf_event_context *ctx) +static void perf_assert_pmu_disabled(struct pmu *pmu) { - lockdep_assert_irqs_disabled(); - - WARN_ON(list_empty(&ctx->active_ctx_list)); - - list_del_init(&ctx->active_ctx_list); + WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0); } static void get_ctx(struct perf_event_context *ctx) @@ -1199,7 +1167,6 @@ static void free_ctx(struct rcu_head *head) struct perf_event_context *ctx; ctx = container_of(head, struct perf_event_context, rcu_head); - free_task_ctx_data(ctx->pmu, ctx->task_ctx_data); kfree(ctx); } @@ -1384,7 +1351,7 @@ static u64 primary_event_id(struct perf_event *event) * the context could get moved to another task. */ static struct perf_event_context * -perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) +perf_lock_task_context(struct task_struct *task, unsigned long *flags) { struct perf_event_context *ctx; @@ -1400,7 +1367,7 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) */ local_irq_save(*flags); rcu_read_lock(); - ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); + ctx = rcu_dereference(task->perf_event_ctxp); if (ctx) { /* * If this context is a clone of another, it might @@ -1413,7 +1380,7 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) * can't get swapped on us any more. */ raw_spin_lock(&ctx->lock); - if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { + if (ctx != rcu_dereference(task->perf_event_ctxp)) { raw_spin_unlock(&ctx->lock); rcu_read_unlock(); local_irq_restore(*flags); @@ -1440,12 +1407,12 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) * reference count so that the context can't get freed. */ static struct perf_event_context * -perf_pin_task_context(struct task_struct *task, int ctxn) +perf_pin_task_context(struct task_struct *task) { struct perf_event_context *ctx; unsigned long flags; - ctx = perf_lock_task_context(task, ctxn, &flags); + ctx = perf_lock_task_context(task, &flags); if (ctx) { ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); @@ -1593,14 +1560,22 @@ static inline struct cgroup *event_cgroup(const struct perf_event *event) * which provides ordering when rotating groups for the same CPU. 
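 *
 * A condensed sketch of the resulting order (illustrative pseudo-C, not
 * part of this patch; the real comparison is perf_event_groups_cmp()
 * just below):
 *
 *	// key(e)  = { e->cpu, e->pmu_ctx->pmu, event_cgroup(e), e->group_index }
 *	// less(a, b): compare the key fields of a and b, in that order
 *
 * Folding the pmu pointer into the key means all events of one
 * {cpu, pmu, cgroup} triple form a contiguous subtree that can be walked
 * in group_index (i.e. insertion) order.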
*/ static __always_inline int -perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup, - const u64 left_group_index, const struct perf_event *right) +perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu, + const struct cgroup *left_cgroup, const u64 left_group_index, + const struct perf_event *right) { if (left_cpu < right->cpu) return -1; if (left_cpu > right->cpu) return 1; + if (left_pmu) { + if (left_pmu < right->pmu_ctx->pmu) + return -1; + if (left_pmu > right->pmu_ctx->pmu) + return 1; + } + #ifdef CONFIG_CGROUP_PERF { const struct cgroup *right_cgroup = event_cgroup(right); @@ -1643,12 +1618,13 @@ perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup, static inline bool __group_less(struct rb_node *a, const struct rb_node *b) { struct perf_event *e = __node_2_pe(a); - return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index, - __node_2_pe(b)) < 0; + return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e), + e->group_index, __node_2_pe(b)) < 0; } struct __group_key { int cpu; + struct pmu *pmu; struct cgroup *cgroup; }; @@ -1657,14 +1633,25 @@ static inline int __group_cmp(const void *key, const struct rb_node *node) const struct __group_key *a = key; const struct perf_event *b = __node_2_pe(node); - /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */ - return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b); + /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */ + return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b); +} + +static inline int +__group_cmp_ignore_cgroup(const void *key, const struct rb_node *node) +{ + const struct __group_key *a = key; + const struct perf_event *b = __node_2_pe(node); + + /* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */ + return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b), + b->group_index, b); } /* - * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for - * key (see perf_event_groups_less). This places it last inside the CPU - * subtree. + * Insert @event into @groups' tree; using + * {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index} + * as key. This places it last inside the {cpu,pmu,cgroup} subtree. */ static void perf_event_groups_insert(struct perf_event_groups *groups, @@ -1714,14 +1701,15 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) } /* - * Get the leftmost event in the cpu/cgroup subtree. + * Get the leftmost event in the {cpu,pmu,cgroup} subtree. */ static struct perf_event * perf_event_groups_first(struct perf_event_groups *groups, int cpu, - struct cgroup *cgrp) + struct pmu *pmu, struct cgroup *cgrp) { struct __group_key key = { .cpu = cpu, + .pmu = pmu, .cgroup = cgrp, }; struct rb_node *node; @@ -1733,14 +1721,12 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu, return NULL; } -/* - * Like rb_entry_next_safe() for the @cpu subtree. 
- */ static struct perf_event * -perf_event_groups_next(struct perf_event *event) +perf_event_groups_next(struct perf_event *event, struct pmu *pmu) { struct __group_key key = { .cpu = event->cpu, + .pmu = pmu, .cgroup = event_cgroup(event), }; struct rb_node *next; @@ -1752,6 +1738,10 @@ perf_event_groups_next(struct perf_event *event) return NULL; } +#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \ + for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \ + event; event = perf_event_groups_next(event, pmu)) + /* * Iterate through the whole groups tree. */ @@ -1796,6 +1786,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) perf_cgroup_event_enable(event, ctx); ctx->generation++; + event->pmu_ctx->nr_events++; } /* @@ -1941,7 +1932,8 @@ static void perf_group_attach(struct perf_event *event) lockdep_assert_held(&event->ctx->lock); /* - * We can have double attach due to group movement in perf_event_open. + * We can have double attach due to group movement (move_group) in + * perf_event_open(). */ if (event->attach_state & PERF_ATTACH_GROUP) return; @@ -2006,6 +1998,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) } ctx->generation++; + event->pmu_ctx->nr_events--; } static int @@ -2022,13 +2015,11 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) static void put_event(struct perf_event *event); static void event_sched_out(struct perf_event *event, - struct perf_cpu_context *cpuctx, struct perf_event_context *ctx); static void perf_put_aux_event(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event *iter; /* @@ -2057,7 +2048,7 @@ static void perf_put_aux_event(struct perf_event *event) * state so that we don't try to schedule it again. Note * that perf_event_enable() will clear the ERROR status. */ - event_sched_out(iter, cpuctx, ctx); + event_sched_out(iter, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } } @@ -2108,8 +2099,8 @@ static int perf_get_aux_event(struct perf_event *event, static inline struct list_head *get_event_list(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; - return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active; + return event->attr.pinned ? &event->pmu_ctx->pinned_active : + &event->pmu_ctx->flexible_active; } /* @@ -2120,10 +2111,7 @@ static inline struct list_head *get_event_list(struct perf_event *event) */ static inline void perf_remove_sibling_event(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - event_sched_out(event, cpuctx, ctx); + event_sched_out(event, event->ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } @@ -2212,53 +2200,22 @@ static bool is_orphaned_event(struct perf_event *event) return event->state == PERF_EVENT_STATE_DEAD; } -static inline int __pmu_filter_match(struct perf_event *event) -{ - struct pmu *pmu = event->pmu; - return pmu->filter_match ? pmu->filter_match(event) : 1; -} - -/* - * Check whether we should attempt to schedule an event group based on - * PMU-specific filtering. 
An event group can consist of HW and SW events, - * potentially with a SW leader, so we must check all the filters, to - * determine whether a group is schedulable: - */ -static inline int pmu_filter_match(struct perf_event *event) -{ - struct perf_event *sibling; - unsigned long flags; - int ret = 1; - - if (!__pmu_filter_match(event)) - return 0; - - local_irq_save(flags); - for_each_sibling_event(sibling, event) { - if (!__pmu_filter_match(sibling)) { - ret = 0; - break; - } - } - local_irq_restore(flags); - - return ret; -} - static inline int event_filter_match(struct perf_event *event) { return (event->cpu == -1 || event->cpu == smp_processor_id()) && - perf_cgroup_match(event) && pmu_filter_match(event); + perf_cgroup_match(event); } static void -event_sched_out(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) +event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { + struct perf_event_pmu_context *epc = event->pmu_ctx; + struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; + // XXX cpc serialization, probably per-cpu IRQ disabled + WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); @@ -2301,38 +2258,32 @@ event_sched_out(struct perf_event *event, perf_event_set_state(event, state); if (!is_software_event(event)) - cpuctx->active_oncpu--; - if (!--ctx->nr_active) - perf_event_ctx_deactivate(ctx); + cpc->active_oncpu--; if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq--; - if (event->attr.exclusive || !cpuctx->active_oncpu) - cpuctx->exclusive = 0; + if (event->attr.exclusive || !cpc->active_oncpu) + cpc->exclusive = 0; perf_pmu_enable(event->pmu); } static void -group_sched_out(struct perf_event *group_event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) +group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event; if (group_event->state != PERF_EVENT_STATE_ACTIVE) return; - perf_pmu_disable(ctx->pmu); + perf_assert_pmu_disabled(group_event->pmu_ctx->pmu); - event_sched_out(group_event, cpuctx, ctx); + event_sched_out(group_event, ctx); /* * Schedule out siblings (if any): */ for_each_sibling_event(event, group_event) - event_sched_out(event, cpuctx, ctx); - - perf_pmu_enable(ctx->pmu); + event_sched_out(event, ctx); } #define DETACH_GROUP 0x01UL @@ -2351,6 +2302,7 @@ __perf_remove_from_context(struct perf_event *event, struct perf_event_context *ctx, void *info) { + struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; unsigned long flags = (unsigned long)info; if (ctx->is_active & EVENT_TIME) { @@ -2364,7 +2316,7 @@ __perf_remove_from_context(struct perf_event *event, */ if (flags & DETACH_DEAD) event->pending_disable = 1; - event_sched_out(event, cpuctx, ctx); + event_sched_out(event, ctx); if (flags & DETACH_GROUP) perf_group_detach(event); if (flags & DETACH_CHILD) @@ -2373,12 +2325,23 @@ __perf_remove_from_context(struct perf_event *event, if (flags & DETACH_DEAD) event->state = PERF_EVENT_STATE_DEAD; + if (!pmu_ctx->nr_events) { + pmu_ctx->rotate_necessary = 0; + + if (ctx->task && ctx->is_active) { + struct perf_cpu_pmu_context *cpc; + + cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); + cpc->task_epc = NULL; + } + } + if (!ctx->nr_events && ctx->is_active) { if (ctx == &cpuctx->ctx) update_cgrp_time_from_cpuctx(cpuctx, true); ctx->is_active = 0; - ctx->rotate_necessary = 
0; if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); cpuctx->task_ctx = NULL; @@ -2408,12 +2371,8 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla * event_function_call() user. */ raw_spin_lock_irq(&ctx->lock); - /* - * Cgroup events are per-cpu events, and must IPI because of - * cgrp_cpuctx_list. - */ - if (!ctx->is_active && !is_cgroup_event(event)) { - __perf_remove_from_context(event, __get_cpu_context(ctx), + if (!ctx->is_active) { + __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context), ctx, (void *)flags); raw_spin_unlock_irq(&ctx->lock); return; @@ -2439,13 +2398,17 @@ static void __perf_event_disable(struct perf_event *event, update_cgrp_time_from_event(event); } + perf_pmu_disable(event->pmu_ctx->pmu); + if (event == event->group_leader) - group_sched_out(event, cpuctx, ctx); + group_sched_out(event, ctx); else - event_sched_out(event, cpuctx, ctx); + event_sched_out(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_OFF); perf_cgroup_event_disable(event, ctx); + + perf_pmu_enable(event->pmu_ctx->pmu); } /* @@ -2507,10 +2470,10 @@ static void perf_log_throttle(struct perf_event *event, int enable); static void perf_log_itrace_start(struct perf_event *event); static int -event_sched_in(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) +event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { + struct perf_event_pmu_context *epc = event->pmu_ctx; + struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); int ret = 0; WARN_ON_ONCE(event->ctx != ctx); @@ -2551,14 +2514,12 @@ event_sched_in(struct perf_event *event, } if (!is_software_event(event)) - cpuctx->active_oncpu++; - if (!ctx->nr_active++) - perf_event_ctx_activate(ctx); + cpc->active_oncpu++; if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq++; if (event->attr.exclusive) - cpuctx->exclusive = 1; + cpc->exclusive = 1; out: perf_pmu_enable(event->pmu); @@ -2567,26 +2528,24 @@ event_sched_in(struct perf_event *event, } static int -group_sched_in(struct perf_event *group_event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) +group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event, *partial_group = NULL; - struct pmu *pmu = ctx->pmu; + struct pmu *pmu = group_event->pmu_ctx->pmu; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; pmu->start_txn(pmu, PERF_PMU_TXN_ADD); - if (event_sched_in(group_event, cpuctx, ctx)) + if (event_sched_in(group_event, ctx)) goto error; /* * Schedule in siblings as one group (if any): */ for_each_sibling_event(event, group_event) { - if (event_sched_in(event, cpuctx, ctx)) { + if (event_sched_in(event, ctx)) { partial_group = event; goto group_error; } @@ -2605,9 +2564,9 @@ group_sched_in(struct perf_event *group_event, if (event == partial_group) break; - event_sched_out(event, cpuctx, ctx); + event_sched_out(event, ctx); } - event_sched_out(group_event, cpuctx, ctx); + event_sched_out(group_event, ctx); error: pmu->cancel_txn(pmu); @@ -2617,10 +2576,11 @@ group_sched_in(struct perf_event *group_event, /* * Work out whether we can put this event group on the CPU now. 
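 *
 * A sketch of the effect (hypothetical events, pseudo-C; the real checks
 * follow below):
 *
 *	A = hw group, attr.exclusive = 1;	// scheduled first
 *	B = hw group;				// group_can_go_on(B) == 0,
 *						// because cpc->exclusive is set
 *	C = software-only group;		// always allowed: software
 *						// events never compete for
 *						// hardware counters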
*/ -static int group_can_go_on(struct perf_event *event, - struct perf_cpu_context *cpuctx, - int can_add_hw) +static int group_can_go_on(struct perf_event *event, int can_add_hw) { + struct perf_event_pmu_context *epc = event->pmu_ctx; + struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + /* * Groups consisting entirely of software events can always go on. */ @@ -2630,7 +2590,7 @@ static int group_can_go_on(struct perf_event *event, * If an exclusive group is already on, no other hardware * events can go on. */ - if (cpuctx->exclusive) + if (cpc->exclusive) return 0; /* * If this group is exclusive and there are already @@ -2652,36 +2612,29 @@ static void add_event_to_ctx(struct perf_event *event, perf_group_attach(event); } -static void ctx_sched_out(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type); -static void -ctx_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type); - -static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, - enum event_type_t event_type) +static void task_ctx_sched_out(struct perf_event_context *ctx, + enum event_type_t event_type) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + if (!cpuctx->task_ctx) return; if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - ctx_sched_out(ctx, cpuctx, event_type); + ctx_sched_out(ctx, event_type); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - cpu_ctx_sched_in(cpuctx, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, EVENT_PINNED); if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_PINNED); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE); if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, EVENT_FLEXIBLE); } /* @@ -2699,11 +2652,15 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx, * event_type is a bit mask of the types of events involved. For CPU events, * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. */ +/* + * XXX: ctx_resched() reschedule entire perf_event_context while adding new + * event to the context or enabling existing event in the context. We can + * probably optimize it by rescheduling only affected pmu_ctx. + */ static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx, enum event_type_t event_type) { - enum event_type_t ctx_event_type; bool cpu_event = !!(event_type & EVENT_CPU); /* @@ -2713,11 +2670,13 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, if (event_type & EVENT_PINNED) event_type |= EVENT_FLEXIBLE; - ctx_event_type = event_type & EVENT_ALL; + event_type &= EVENT_ALL; - perf_pmu_disable(cpuctx->ctx.pmu); - if (task_ctx) - task_ctx_sched_out(cpuctx, task_ctx, event_type); + perf_ctx_disable(&cpuctx->ctx); + if (task_ctx) { + perf_ctx_disable(task_ctx); + task_ctx_sched_out(task_ctx, event_type); + } /* * Decide which cpu ctx groups to schedule out based on the types @@ -2727,17 +2686,20 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, * - otherwise, do nothing more. 
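 *
 * Worked example (a sketch of the flag arithmetic below): installing a
 * pinned task event enters with event_type == EVENT_PINNED, so:
 *
 *	event_type |= EVENT_FLEXIBLE;	// pinned may displace flexible
 *	event_type &= EVENT_ALL;	// == EVENT_PINNED | EVENT_FLEXIBLE
 *	// !(event_type & EVENT_CPU): only the CPU context's flexible
 *	// groups are scheduled out before perf_event_sched_in() re-adds
 *	// everything in priority order.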
*/ if (cpu_event) - cpu_ctx_sched_out(cpuctx, ctx_event_type); - else if (ctx_event_type & EVENT_PINNED) - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + ctx_sched_out(&cpuctx->ctx, event_type); + else if (event_type & EVENT_PINNED) + ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, task_ctx); - perf_pmu_enable(cpuctx->ctx.pmu); + + perf_ctx_enable(&cpuctx->ctx); + if (task_ctx) + perf_ctx_enable(task_ctx); } void perf_pmu_resched(struct pmu *pmu) { - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; perf_ctx_lock(cpuctx, task_ctx); @@ -2755,7 +2717,7 @@ static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; bool reprogram = true; int ret = 0; @@ -2797,7 +2759,7 @@ static int __perf_install_in_context(void *info) #endif if (reprogram) { - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_sched_out(ctx, EVENT_TIME); add_event_to_ctx(event, ctx); ctx_resched(cpuctx, task_ctx, get_event_type(event)); } else { @@ -2830,7 +2792,7 @@ perf_install_in_context(struct perf_event_context *ctx, WARN_ON_ONCE(!exclusive_event_installable(event, ctx)); if (event->cpu != -1) - event->cpu = cpu; + WARN_ON_ONCE(event->cpu != cpu); /* * Ensures that if we can observe event->ctx, both the event and ctx @@ -2842,8 +2804,6 @@ perf_install_in_context(struct perf_event_context *ctx, * perf_event_attr::disabled events will not run and can be initialized * without IPI. Except when this is the first event for the context, in * that case we need the magic of the IPI to set ctx->is_active. - * Similarly, cgroup events for the context also needs the IPI to - * manipulate the cgrp_cpuctx_list. * * The IOC_ENABLE that is sure to follow the creation of a disabled * event will issue the IPI and reprogram the hardware. @@ -2945,7 +2905,7 @@ static void __perf_event_enable(struct perf_event *event, return; if (ctx->is_active) - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_sched_out(ctx, EVENT_TIME); perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); perf_cgroup_event_enable(event, ctx); @@ -2954,7 +2914,7 @@ static void __perf_event_enable(struct perf_event *event, return; if (!event_filter_match(event)) { - ctx_sched_in(ctx, cpuctx, EVENT_TIME); + ctx_sched_in(ctx, EVENT_TIME); return; } @@ -2963,7 +2923,7 @@ static void __perf_event_enable(struct perf_event *event, * then don't put it on unless the group is on. 
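 *
 * The case being guarded against (a sketch using hypothetical file
 * descriptors from userspace):
 *
 *	leader = perf_event_open(&attr_disabled, pid, cpu, -1, 0);
 *	member = perf_event_open(&attr, pid, cpu, leader, 0);
 *	ioctl(member, PERF_EVENT_IOC_ENABLE, 0);
 *
 * The member becomes INACTIVE but must not be scheduled until the leader
 * itself is enabled, so only the context time is restarted here.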
 */
 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_in(ctx, EVENT_TIME);
 return;
 }
@@ -3232,11 +3192,52 @@ static int perf_event_modify_attr(struct perf_event *event,
 return err;
 }
-static void ctx_sched_out(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type)
 {
+ struct perf_event_context *ctx = pmu_ctx->ctx;
 struct perf_event *event, *tmp;
+ struct pmu *pmu = pmu_ctx->pmu;
+
+ if (ctx->task && !ctx->is_active) {
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = this_cpu_ptr(pmu->cpu_pmu_context);
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = NULL;
+ }
+
+ if (!event_type)
+ return;
+
+ perf_pmu_disable(pmu);
+ if (event_type & EVENT_PINNED) {
+ list_for_each_entry_safe(event, tmp,
+ &pmu_ctx->pinned_active,
+ active_list)
+ group_sched_out(event, ctx);
+ }
+
+ if (event_type & EVENT_FLEXIBLE) {
+ list_for_each_entry_safe(event, tmp,
+ &pmu_ctx->flexible_active,
+ active_list)
+ group_sched_out(event, ctx);
+ /*
+ * Since we cleared EVENT_FLEXIBLE, also clear
+ * rotate_necessary, it will be reset by
+ * ctx_flexible_sched_in() when needed.
+ */
+ pmu_ctx->rotate_necessary = 0;
+ }
+ perf_pmu_enable(pmu);
+}
+
+static void
+ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *pmu_ctx;
 int is_active = ctx->is_active;

 lockdep_assert_held(&ctx->lock);
@@ -3284,27 +3285,8 @@ static void ctx_sched_out(struct perf_event_context *ctx,

 is_active ^= ctx->is_active; /* changed bits */

- if (!ctx->nr_active || !(is_active & EVENT_ALL))
- return;
-
- perf_pmu_disable(ctx->pmu);
- if (is_active & EVENT_PINNED) {
- list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
- group_sched_out(event, cpuctx, ctx);
- }
-
- if (is_active & EVENT_FLEXIBLE) {
- list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
- group_sched_out(event, cpuctx, ctx);
-
- /*
- * Since we cleared EVENT_FLEXIBLE, also clear
- * rotate_necessary, is will be reset by
- * ctx_flexible_sched_in() when needed.
- */ - ctx->rotate_necessary = 0; - } - perf_pmu_enable(ctx->pmu); + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) + __pmu_ctx_sched_out(pmu_ctx, is_active); } /* @@ -3409,26 +3391,68 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, } } -static void perf_event_context_sched_out(struct task_struct *task, int ctxn, - struct task_struct *next) +#define double_list_for_each_entry(pos1, pos2, head1, head2, member) \ + for (pos1 = list_first_entry(head1, typeof(*pos1), member), \ + pos2 = list_first_entry(head2, typeof(*pos2), member); \ + !list_entry_is_head(pos1, head1, member) && \ + !list_entry_is_head(pos2, head2, member); \ + pos1 = list_next_entry(pos1, member), \ + pos2 = list_next_entry(pos2, member)) + +static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx, + struct perf_event_context *next_ctx) +{ + struct perf_event_pmu_context *prev_epc, *next_epc; + + if (!prev_ctx->nr_task_data) + return; + + double_list_for_each_entry(prev_epc, next_epc, + &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list, + pmu_ctx_entry) { + + if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu)) + continue; + + /* + * PMU specific parts of task perf context can require + * additional synchronization. As an example of such + * synchronization see implementation details of Intel + * LBR call stack data profiling; + */ + if (prev_epc->pmu->swap_task_ctx) + prev_epc->pmu->swap_task_ctx(prev_epc, next_epc); + else + swap(prev_epc->task_ctx_data, next_epc->task_ctx_data); + } +} + +static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in) +{ + struct perf_event_pmu_context *pmu_ctx; + struct perf_cpu_pmu_context *cpc; + + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + + if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task) + pmu_ctx->pmu->sched_task(pmu_ctx, sched_in); + } +} + +static void +perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) { - struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; + struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent, *next_parent; - struct perf_cpu_context *cpuctx; int do_switch = 1; - struct pmu *pmu; if (likely(!ctx)) return; - pmu = ctx->pmu; - cpuctx = __get_cpu_context(ctx); - if (!cpuctx->task_ctx) - return; - rcu_read_lock(); - next_ctx = next->perf_event_ctxp[ctxn]; + next_ctx = rcu_dereference(next->perf_event_ctxp); if (!next_ctx) goto unlock; @@ -3453,7 +3477,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - perf_pmu_disable(pmu); + perf_ctx_disable(ctx); /* PMIs are disabled; ctx->nr_pending is stable. */ if (local_read(&ctx->nr_pending) || @@ -3470,21 +3494,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); - if (cpuctx->sched_cb_usage && pmu->sched_task) - pmu->sched_task(ctx, false); - - /* - * PMU specific parts of task perf context can require - * additional synchronization. 
As an example of such - * synchronization see implementation details of Intel - * LBR call stack data profiling; - */ - if (pmu->swap_task_ctx) - pmu->swap_task_ctx(ctx, next_ctx); - else - swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + perf_ctx_sched_task_cb(ctx, false); + perf_event_swap_task_ctx_data(ctx, next_ctx); - perf_pmu_enable(pmu); + perf_ctx_enable(ctx); /* * RCU_INIT_POINTER here is safe because we've not @@ -3493,8 +3506,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, * since those values are always verified under * ctx->lock which we're now holding. */ - RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx); - RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx); + RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx); + RCU_INIT_POINTER(next->perf_event_ctxp, ctx); do_switch = 0; @@ -3508,38 +3521,40 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, if (do_switch) { raw_spin_lock(&ctx->lock); - perf_pmu_disable(pmu); + perf_ctx_disable(ctx); inside_switch: - if (cpuctx->sched_cb_usage && pmu->sched_task) - pmu->sched_task(ctx, false); - task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); + perf_ctx_sched_task_cb(ctx, false); + task_ctx_sched_out(ctx, EVENT_ALL); - perf_pmu_enable(pmu); + perf_ctx_enable(ctx); raw_spin_unlock(&ctx->lock); } } static DEFINE_PER_CPU(struct list_head, sched_cb_list); +static DEFINE_PER_CPU(int, perf_sched_cb_usages); void perf_sched_cb_dec(struct pmu *pmu) { - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); this_cpu_dec(perf_sched_cb_usages); + barrier(); - if (!--cpuctx->sched_cb_usage) - list_del(&cpuctx->sched_cb_entry); + if (!--cpc->sched_cb_usage) + list_del(&cpc->sched_cb_entry); } void perf_sched_cb_inc(struct pmu *pmu) { - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); - if (!cpuctx->sched_cb_usage++) - list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); + if (!cpc->sched_cb_usage++) + list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); + barrier(); this_cpu_inc(perf_sched_cb_usages); } @@ -3551,19 +3566,21 @@ void perf_sched_cb_inc(struct pmu *pmu) * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task. 
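 *
 * A PMU driver opts in and out per CPU with perf_sched_cb_inc()/_dec();
 * a minimal sketch (driver and helper names here are hypothetical, only
 * the perf_sched_cb_*() calls and the sched_task signature are real):
 *
 *	static void mydrv_add(struct perf_event *event, int flags)
 *	{
 *		if (mydrv_needs_ctx_flush(event))
 *			perf_sched_cb_inc(event->pmu);
 *	}
 *
 *	static void mydrv_sched_task(struct perf_event_pmu_context *pmu_ctx,
 *				     bool sched_in)
 *	{
 *		if (!sched_in)
 *			mydrv_drain_buffers();	// e.g. flush PEBS records
 *	}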
*/ -static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in) +static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu; - pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ + pmu = cpc->epc.pmu; + /* software PMUs will not have sched_task */ if (WARN_ON_ONCE(!pmu->sched_task)) return; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); - pmu->sched_task(cpuctx->task_ctx, sched_in); + pmu->sched_task(cpc->task_epc, sched_in); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -3573,26 +3590,20 @@ static void perf_pmu_sched_task(struct task_struct *prev, struct task_struct *next, bool sched_in) { - struct perf_cpu_context *cpuctx; + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_cpu_pmu_context *cpc; - if (prev == next) + /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */ + if (prev == next || cpuctx->task_ctx) return; - list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { - /* will be handled in perf_event_context_sched_in/out */ - if (cpuctx->task_ctx) - continue; - - __perf_pmu_sched_task(cpuctx, sched_in); - } + list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry) + __perf_pmu_sched_task(cpc, sched_in); } static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in); -#define for_each_task_context_nr(ctxn) \ - for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) - /* * Called from scheduler to remove the events of the current task, * with interrupts disabled. @@ -3607,16 +3618,13 @@ static void perf_event_switch(struct task_struct *task, void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) { - int ctxn; - if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(task, next, false); if (atomic_read(&nr_switch_events)) perf_event_switch(task, next, false); - for_each_task_context_nr(ctxn) - perf_event_context_sched_out(task, ctxn, next); + perf_event_context_sched_out(task, next); /* * if cgroup events exist on this CPU, then we need @@ -3627,15 +3635,6 @@ void __perf_event_task_sched_out(struct task_struct *task, perf_cgroup_switch(next); } -/* - * Called with IRQs disabled - */ -static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) -{ - ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); -} - static bool perf_less_group_idx(const void *l, const void *r) { const struct perf_event *le = *(const struct perf_event **)l; @@ -3667,21 +3666,39 @@ static void __heap_add(struct min_heap *heap, struct perf_event *event) } } -static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, +static void __link_epc(struct perf_event_pmu_context *pmu_ctx) +{ + struct perf_cpu_pmu_context *cpc; + + if (!pmu_ctx->ctx->task) + return; + + cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); + cpc->task_epc = pmu_ctx; +} + +static noinline int visit_groups_merge(struct perf_event_context *ctx, struct perf_event_groups *groups, int cpu, + struct pmu *pmu, int (*func)(struct perf_event *, void *), void *data) { #ifdef CONFIG_CGROUP_PERF struct cgroup_subsys_state *css = NULL; #endif + struct perf_cpu_context *cpuctx = NULL; /* Space for per CPU and/or any CPU event iterators. 
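 *
 * These iterators feed a min-heap that merges already-sorted subtrees:
 * {-1,pmu} and {cpu,pmu} for a task context, or {cpu,pmu} plus one
 * {cpu,pmu,cgroup} per cgroup ancestor for the CPU context. A sketch of
 * the merge loop implemented below (pseudo-C):
 *
 *	heap = { first(-1, pmu), first(cpu, pmu), first(cpu, pmu, cgrp) };
 *	while (heap not empty) {
 *		func(heap_min);			// smallest group_index wins
 *		heap_min = next(heap_min, pmu);	// advance that iterator
 *		re-heapify;
 *	}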
*/ struct perf_event *itrs[2]; struct min_heap event_heap; struct perf_event **evt; int ret; - if (cpuctx) { + if (pmu->filter && pmu->filter(pmu, cpu)) + return 0; + + if (!ctx->task) { + cpuctx = this_cpu_ptr(&perf_cpu_context); event_heap = (struct min_heap){ .data = cpuctx->heap, .nr = 0, @@ -3701,17 +3718,22 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, .size = ARRAY_SIZE(itrs), }; /* Events not within a CPU context may be on any CPU. */ - __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL)); + __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL)); } evt = event_heap.data; - __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL)); + __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL)); #ifdef CONFIG_CGROUP_PERF for (; css; css = css->parent) - __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup)); + __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup)); #endif + if (event_heap.nr) { + __link_epc((*evt)->pmu_ctx); + perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu); + } + min_heapify_all(&event_heap, &perf_min_heap); while (event_heap.nr) { @@ -3719,7 +3741,7 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, if (ret) return ret; - *evt = perf_event_groups_next(*evt); + *evt = perf_event_groups_next(*evt, pmu); if (*evt) min_heapify(&event_heap, 0, &perf_min_heap); else @@ -3761,7 +3783,6 @@ static inline void group_update_userpage(struct perf_event *group_event) static int merge_sched_in(struct perf_event *event, void *data) { struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int *can_add_hw = data; if (event->state <= PERF_EVENT_STATE_OFF) @@ -3770,8 +3791,8 @@ static int merge_sched_in(struct perf_event *event, void *data) if (!event_filter_match(event)) return 0; - if (group_can_go_on(event, cpuctx, *can_add_hw)) { - if (!group_sched_in(event, cpuctx, ctx)) + if (group_can_go_on(event, *can_add_hw)) { + if (!group_sched_in(event, ctx)) list_add_tail(&event->active_list, get_event_list(event)); } @@ -3781,8 +3802,11 @@ static int merge_sched_in(struct perf_event *event, void *data) perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } else { - ctx->rotate_necessary = 1; - perf_mux_hrtimer_restart(cpuctx); + struct perf_cpu_pmu_context *cpc; + + event->pmu_ctx->rotate_necessary = 1; + cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context); + perf_mux_hrtimer_restart(cpc); group_update_userpage(event); } } @@ -3790,39 +3814,53 @@ static int merge_sched_in(struct perf_event *event, void *data) return 0; } -static void -ctx_pinned_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu) { + struct perf_event_pmu_context *pmu_ctx; int can_add_hw = 1; - if (ctx != &cpuctx->ctx) - cpuctx = NULL; - - visit_groups_merge(cpuctx, &ctx->pinned_groups, - smp_processor_id(), - merge_sched_in, &can_add_hw); + if (pmu) { + visit_groups_merge(ctx, &ctx->pinned_groups, + smp_processor_id(), pmu, + merge_sched_in, &can_add_hw); + } else { + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + can_add_hw = 1; + visit_groups_merge(ctx, &ctx->pinned_groups, + smp_processor_id(), pmu_ctx->pmu, + merge_sched_in, &can_add_hw); + } + } } -static void -ctx_flexible_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) 
+static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu) { + struct perf_event_pmu_context *pmu_ctx; int can_add_hw = 1; - if (ctx != &cpuctx->ctx) - cpuctx = NULL; + if (pmu) { + visit_groups_merge(ctx, &ctx->flexible_groups, + smp_processor_id(), pmu, + merge_sched_in, &can_add_hw); + } else { + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + can_add_hw = 1; + visit_groups_merge(ctx, &ctx->flexible_groups, + smp_processor_id(), pmu_ctx->pmu, + merge_sched_in, &can_add_hw); + } + } +} - visit_groups_merge(cpuctx, &ctx->flexible_groups, - smp_processor_id(), - merge_sched_in, &can_add_hw); +static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu) +{ + ctx_flexible_sched_in(ctx, pmu); } static void -ctx_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type) +ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); int is_active = ctx->is_active; lockdep_assert_held(&ctx->lock); @@ -3856,39 +3894,32 @@ ctx_sched_in(struct perf_event_context *ctx, * in order to give them the best chance of going on. */ if (is_active & EVENT_PINNED) - ctx_pinned_sched_in(ctx, cpuctx); + ctx_pinned_sched_in(ctx, NULL); /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) - ctx_flexible_sched_in(ctx, cpuctx); + ctx_flexible_sched_in(ctx, NULL); } -static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) +static void perf_event_context_sched_in(struct task_struct *task) { - struct perf_event_context *ctx = &cpuctx->ctx; - - ctx_sched_in(ctx, cpuctx, event_type); -} + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_context *ctx; -static void perf_event_context_sched_in(struct perf_event_context *ctx, - struct task_struct *task) -{ - struct perf_cpu_context *cpuctx; - struct pmu *pmu; + rcu_read_lock(); + ctx = rcu_dereference(task->perf_event_ctxp); + if (!ctx) + goto rcu_unlock; - cpuctx = __get_cpu_context(ctx); + if (cpuctx->task_ctx == ctx) { + perf_ctx_lock(cpuctx, ctx); + perf_ctx_disable(ctx); - /* - * HACK: for HETEROGENEOUS the task context might have switched to a - * different PMU, force (re)set the context, - */ - pmu = ctx->pmu = cpuctx->ctx.pmu; + perf_ctx_sched_task_cb(ctx, true); - if (cpuctx->task_ctx == ctx) { - if (cpuctx->sched_cb_usage) - __perf_pmu_sched_task(cpuctx, true); - return; + perf_ctx_enable(ctx); + perf_ctx_unlock(cpuctx, ctx); + goto rcu_unlock; } perf_ctx_lock(cpuctx, ctx); @@ -3899,7 +3930,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, if (!ctx->nr_events) goto unlock; - perf_pmu_disable(pmu); + perf_ctx_disable(ctx); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -3908,17 +3939,24 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * However, if task's ctx is not carrying any pinned * events, no need to flip the cpuctx's events around. 
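 *
 * When the incoming task context does carry pinned events, the code
 * below boils down to this call sequence (a sketch of what follows):
 *
 *	ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);	// make room
 *	ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);	// cpu pinned
 *	ctx_sched_in(ctx, EVENT_PINNED);		// task pinned
 *	ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);	// cpu flexible
 *	ctx_sched_in(ctx, EVENT_FLEXIBLE);		// task flexible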
*/ - if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { + perf_ctx_disable(&cpuctx->ctx); + ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); + } + perf_event_sched_in(cpuctx, ctx); - if (cpuctx->sched_cb_usage && pmu->sched_task) - pmu->sched_task(cpuctx->task_ctx, true); + perf_ctx_sched_task_cb(cpuctx->task_ctx, true); - perf_pmu_enable(pmu); + if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) + perf_ctx_enable(&cpuctx->ctx); + + perf_ctx_enable(ctx); unlock: perf_ctx_unlock(cpuctx, ctx); +rcu_unlock: + rcu_read_unlock(); } /* @@ -3935,16 +3973,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { - struct perf_event_context *ctx; - int ctxn; - - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (likely(!ctx)) - continue; - - perf_event_context_sched_in(ctx, task); - } + perf_event_context_sched_in(task); if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); @@ -4063,8 +4092,8 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo * events. At the same time, make sure, having freq events does not change * the rate of unthrottling as that would introduce bias. */ -static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, - int needs_unthr) +static void +perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle) { struct perf_event *event; struct hw_perf_event *hwc; @@ -4076,16 +4105,16 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, * - context have events in frequency mode (needs freq adjust) * - there are events to unthrottle on this cpu */ - if (!(ctx->nr_freq || needs_unthr)) + if (!(ctx->nr_freq || unthrottle)) return; raw_spin_lock(&ctx->lock); - perf_pmu_disable(ctx->pmu); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; + // XXX use visit thingy to avoid the -1,cpu match if (!event_filter_match(event)) continue; @@ -4126,7 +4155,6 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, perf_pmu_enable(event->pmu); } - perf_pmu_enable(ctx->pmu); raw_spin_unlock(&ctx->lock); } @@ -4148,72 +4176,109 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) /* pick an event from the flexible_groups to rotate */ static inline struct perf_event * -ctx_event_to_rotate(struct perf_event_context *ctx) +ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx) { struct perf_event *event; + struct rb_node *node; + struct rb_root *tree; + struct __group_key key = { + .pmu = pmu_ctx->pmu, + }; /* pick the first active flexible event */ - event = list_first_entry_or_null(&ctx->flexible_active, + event = list_first_entry_or_null(&pmu_ctx->flexible_active, struct perf_event, active_list); + if (event) + goto out; /* if no active flexible event, pick the first event */ - if (!event) { - event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree), - typeof(*event), group_node); - } + tree = &pmu_ctx->ctx->flexible_groups.tree; + if (!pmu_ctx->ctx->task) { + key.cpu = smp_processor_id(); + + node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); + if (node) + event = __node_2_pe(node); + goto out; + } + + key.cpu = -1; + node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); + if (node) { + event = __node_2_pe(node); + goto out; + } + + key.cpu 
= smp_processor_id(); + node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); + if (node) + event = __node_2_pe(node); + +out: /* * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in() * finds there are unschedulable events, it will set it again. */ - ctx->rotate_necessary = 0; + pmu_ctx->rotate_necessary = 0; return event; } -static bool perf_rotate_context(struct perf_cpu_context *cpuctx) +static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_pmu_context *cpu_epc, *task_epc = NULL; struct perf_event *cpu_event = NULL, *task_event = NULL; - struct perf_event_context *task_ctx = NULL; int cpu_rotate, task_rotate; + struct pmu *pmu; /* * Since we run this from IRQ context, nobody can install new * events, thus the event count values are stable. */ - cpu_rotate = cpuctx->ctx.rotate_necessary; - task_ctx = cpuctx->task_ctx; - task_rotate = task_ctx ? task_ctx->rotate_necessary : 0; + cpu_epc = &cpc->epc; + pmu = cpu_epc->pmu; + task_epc = cpc->task_epc; + + cpu_rotate = cpu_epc->rotate_necessary; + task_rotate = task_epc ? task_epc->rotate_necessary : 0; if (!(cpu_rotate || task_rotate)) return false; perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); + perf_pmu_disable(pmu); if (task_rotate) - task_event = ctx_event_to_rotate(task_ctx); + task_event = ctx_event_to_rotate(task_epc); if (cpu_rotate) - cpu_event = ctx_event_to_rotate(&cpuctx->ctx); + cpu_event = ctx_event_to_rotate(cpu_epc); /* * As per the order given at ctx_resched() first 'pop' task flexible * and then, if needed CPU flexible. */ - if (task_event || (task_ctx && cpu_event)) - ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE); - if (cpu_event) - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (task_event || (task_epc && cpu_event)) { + update_context_time(task_epc->ctx); + __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE); + } - if (task_event) - rotate_ctx(task_ctx, task_event); - if (cpu_event) + if (cpu_event) { + update_context_time(&cpuctx->ctx); + __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE); rotate_ctx(&cpuctx->ctx, cpu_event); + __pmu_ctx_sched_in(&cpuctx->ctx, pmu); + } - perf_event_sched_in(cpuctx, task_ctx); + if (task_event) + rotate_ctx(task_epc->ctx, task_event); - perf_pmu_enable(cpuctx->ctx.pmu); + if (task_event || (task_epc && cpu_event)) + __pmu_ctx_sched_in(task_epc->ctx, pmu); + + perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); return true; @@ -4221,8 +4286,8 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx) void perf_event_task_tick(void) { - struct list_head *head = this_cpu_ptr(&active_ctx_list); - struct perf_event_context *ctx, *tmp; + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_context *ctx; int throttled; lockdep_assert_irqs_disabled(); @@ -4231,8 +4296,13 @@ void perf_event_task_tick(void) throttled = __this_cpu_xchg(perf_throttled_count, 0); tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); - list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) - perf_adjust_freq_unthr_context(ctx, throttled); + perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled); + + rcu_read_lock(); + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_adjust_freq_unthr_context(ctx, !!throttled); + rcu_read_unlock(); } static int event_enable_on_exec(struct perf_event *event, @@ -4254,9 +4324,9 @@ static int event_enable_on_exec(struct perf_event *event, * Enable all 
of a task's events that have been marked enable-on-exec. * This expects task == current. */ -static void perf_event_enable_on_exec(int ctxn) +static void perf_event_enable_on_exec(struct perf_event_context *ctx) { - struct perf_event_context *ctx, *clone_ctx = NULL; + struct perf_event_context *clone_ctx = NULL; enum event_type_t event_type = 0; struct perf_cpu_context *cpuctx; struct perf_event *event; @@ -4264,13 +4334,16 @@ static void perf_event_enable_on_exec(int ctxn) int enabled = 0; local_irq_save(flags); - ctx = current->perf_event_ctxp[ctxn]; - if (!ctx || !ctx->nr_events) + if (WARN_ON_ONCE(current->perf_event_ctxp != ctx)) + goto out; + + if (!ctx->nr_events) goto out; - cpuctx = __get_cpu_context(ctx); + cpuctx = this_cpu_ptr(&perf_cpu_context); perf_ctx_lock(cpuctx, ctx); - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_sched_out(ctx, EVENT_TIME); + list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); event_type |= get_event_type(event); @@ -4283,7 +4356,7 @@ static void perf_event_enable_on_exec(int ctxn) clone_ctx = unclone_ctx(ctx); ctx_resched(cpuctx, ctx, event_type); } else { - ctx_sched_in(ctx, cpuctx, EVENT_TIME); + ctx_sched_in(ctx, EVENT_TIME); } perf_ctx_unlock(cpuctx, ctx); @@ -4302,17 +4375,13 @@ static void perf_event_exit_event(struct perf_event *event, * Removes all events from the current task that have been marked * remove-on-exec, and feeds their values back to parent events. */ -static void perf_event_remove_on_exec(int ctxn) +static void perf_event_remove_on_exec(struct perf_event_context *ctx) { - struct perf_event_context *ctx, *clone_ctx = NULL; + struct perf_event_context *clone_ctx = NULL; struct perf_event *event, *next; unsigned long flags; bool modified = false; - ctx = perf_pin_task_context(current, ctxn); - if (!ctx) - return; - mutex_lock(&ctx->mutex); if (WARN_ON_ONCE(ctx->task != current)) @@ -4333,13 +4402,11 @@ static void perf_event_remove_on_exec(int ctxn) raw_spin_lock_irqsave(&ctx->lock, flags); if (modified) clone_ctx = unclone_ctx(ctx); - --ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); unlock: mutex_unlock(&ctx->mutex); - put_ctx(ctx); if (clone_ctx) put_ctx(clone_ctx); } @@ -4375,7 +4442,7 @@ static void __perf_event_read(void *info) struct perf_read_data *data = info; struct perf_event *sub, *event = data->event; struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu = event->pmu; /* @@ -4601,17 +4668,25 @@ static void __perf_event_init_context(struct perf_event_context *ctx) { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->active_ctx_list); + INIT_LIST_HEAD(&ctx->pmu_ctx_list); perf_event_groups_init(&ctx->pinned_groups); perf_event_groups_init(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); - INIT_LIST_HEAD(&ctx->pinned_active); - INIT_LIST_HEAD(&ctx->flexible_active); refcount_set(&ctx->refcount, 1); } +static void +__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu) +{ + epc->pmu = pmu; + INIT_LIST_HEAD(&epc->pmu_ctx_entry); + INIT_LIST_HEAD(&epc->pinned_active); + INIT_LIST_HEAD(&epc->flexible_active); + atomic_set(&epc->refcount, 1); +} + static struct perf_event_context * -alloc_perf_context(struct pmu *pmu, struct task_struct *task) +alloc_perf_context(struct task_struct *task) { struct perf_event_context *ctx; @@ -4622,7 +4697,6 @@ 
alloc_perf_context(struct pmu *pmu, struct task_struct *task) __perf_event_init_context(ctx); if (task) ctx->task = get_task_struct(task); - ctx->pmu = pmu; return ctx; } @@ -4651,15 +4725,12 @@ find_lively_task_by_vpid(pid_t vpid) * Returns a matching context with refcount and pincount. */ static struct perf_event_context * -find_get_context(struct pmu *pmu, struct task_struct *task, - struct perf_event *event) +find_get_context(struct task_struct *task, struct perf_event *event) { struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_cpu_context *cpuctx; - void *task_ctx_data = NULL; unsigned long flags; - int ctxn, err; - int cpu = event->cpu; + int err; if (!task) { /* Must be root to operate on a CPU event: */ @@ -4667,7 +4738,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, if (err) return ERR_PTR(err); - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); ctx = &cpuctx->ctx; get_ctx(ctx); raw_spin_lock_irqsave(&ctx->lock, flags); @@ -4678,43 +4749,22 @@ find_get_context(struct pmu *pmu, struct task_struct *task, } err = -EINVAL; - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto errout; - - if (event->attach_state & PERF_ATTACH_TASK_DATA) { - task_ctx_data = alloc_task_ctx_data(pmu); - if (!task_ctx_data) { - err = -ENOMEM; - goto errout; - } - } - retry: - ctx = perf_lock_task_context(task, ctxn, &flags); + ctx = perf_lock_task_context(task, &flags); if (ctx) { clone_ctx = unclone_ctx(ctx); ++ctx->pin_count; - if (task_ctx_data && !ctx->task_ctx_data) { - ctx->task_ctx_data = task_ctx_data; - task_ctx_data = NULL; - } raw_spin_unlock_irqrestore(&ctx->lock, flags); if (clone_ctx) put_ctx(clone_ctx); } else { - ctx = alloc_perf_context(pmu, task); + ctx = alloc_perf_context(task); err = -ENOMEM; if (!ctx) goto errout; - if (task_ctx_data) { - ctx->task_ctx_data = task_ctx_data; - task_ctx_data = NULL; - } - err = 0; mutex_lock(&task->perf_event_mutex); /* @@ -4723,12 +4773,12 @@ find_get_context(struct pmu *pmu, struct task_struct *task, */ if (task->flags & PF_EXITING) err = -ESRCH; - else if (task->perf_event_ctxp[ctxn]) + else if (task->perf_event_ctxp) err = -EAGAIN; else { get_ctx(ctx); ++ctx->pin_count; - rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); + rcu_assign_pointer(task->perf_event_ctxp, ctx); } mutex_unlock(&task->perf_event_mutex); @@ -4741,21 +4791,146 @@ find_get_context(struct pmu *pmu, struct task_struct *task, } } - free_task_ctx_data(pmu, task_ctx_data); return ctx; errout: - free_task_ctx_data(pmu, task_ctx_data); return ERR_PTR(err); } +static struct perf_event_pmu_context * +find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, + struct perf_event *event) +{ + struct perf_event_pmu_context *new = NULL, *epc; + void *task_ctx_data = NULL; + + if (!ctx->task) { + struct perf_cpu_pmu_context *cpc; + + cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); + epc = &cpc->epc; + + if (!epc->ctx) { + atomic_set(&epc->refcount, 1); + epc->embedded = 1; + raw_spin_lock_irq(&ctx->lock); + list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + epc->ctx = ctx; + raw_spin_unlock_irq(&ctx->lock); + } else { + WARN_ON_ONCE(epc->ctx != ctx); + atomic_inc(&epc->refcount); + } + + return epc; + } + + new = kzalloc(sizeof(*epc), GFP_KERNEL); + if (!new) + return ERR_PTR(-ENOMEM); + + if (event->attach_state & PERF_ATTACH_TASK_DATA) { + task_ctx_data = alloc_task_ctx_data(pmu); + if (!task_ctx_data) { + kfree(new); + return ERR_PTR(-ENOMEM); + } + } + + 
__perf_init_event_pmu_context(new, pmu); + + /* + * XXX + * + * lockdep_assert_held(&ctx->mutex); + * + * can't because perf_event_init_task() doesn't actually hold the + * child_ctx->mutex. + */ + + raw_spin_lock_irq(&ctx->lock); + list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) { + if (epc->pmu == pmu) { + WARN_ON_ONCE(epc->ctx != ctx); + atomic_inc(&epc->refcount); + goto found_epc; + } + } + + epc = new; + new = NULL; + + list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + epc->ctx = ctx; + +found_epc: + if (task_ctx_data && !epc->task_ctx_data) { + epc->task_ctx_data = task_ctx_data; + task_ctx_data = NULL; + ctx->nr_task_data++; + } + raw_spin_unlock_irq(&ctx->lock); + + free_task_ctx_data(pmu, task_ctx_data); + kfree(new); + + return epc; +} + +static void get_pmu_ctx(struct perf_event_pmu_context *epc) +{ + WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount)); +} + +static void free_epc_rcu(struct rcu_head *head) +{ + struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head); + + kfree(epc->task_ctx_data); + kfree(epc); +} + +static void put_pmu_ctx(struct perf_event_pmu_context *epc) +{ + unsigned long flags; + + if (!atomic_dec_and_test(&epc->refcount)) + return; + + if (epc->ctx) { + struct perf_event_context *ctx = epc->ctx; + + /* + * XXX + * + * lockdep_assert_held(&ctx->mutex); + * + * can't because of the call-site in _free_event()/put_event() + * which isn't always called under ctx->mutex. + */ + + WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); + raw_spin_lock_irqsave(&ctx->lock, flags); + list_del_init(&epc->pmu_ctx_entry); + epc->ctx = NULL; + raw_spin_unlock_irqrestore(&ctx->lock, flags); + } + + WARN_ON_ONCE(!list_empty(&epc->pinned_active)); + WARN_ON_ONCE(!list_empty(&epc->flexible_active)); + + if (epc->embedded) + return; + + call_rcu(&epc->rcu_head, free_epc_rcu); +} + static void perf_event_free_filter(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { - struct perf_event *event; + struct perf_event *event = container_of(head, typeof(*event), rcu_head); - event = container_of(head, struct perf_event, rcu_head); if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); @@ -4893,7 +5068,7 @@ static void perf_sched_delayed(struct work_struct *work) * * 1) cpu-wide events in the presence of per-task events, * 2) per-task events in the presence of cpu-wide events, - * 3) two matching events on the same context. + * 3) two matching events on the same perf_event_context. * * The former two cases are handled in the allocation path (perf_event_alloc(), * _free_event()), the latter -- before the first perf_install_in_context(). @@ -5017,6 +5192,9 @@ static void _free_event(struct perf_event *event) if (event->hw.target) put_task_struct(event->hw.target); + if (event->pmu_ctx) + put_pmu_ctx(event->pmu_ctx); + /* * perf_event_free_task() relies on put_ctx() being 'last', in particular * all task references must be cleaned up. @@ -5117,8 +5295,8 @@ int perf_event_release_kernel(struct perf_event *event) LIST_HEAD(free_list); /* - * If we got here through err_file: fput(event_file); we will not have - * attached to a context yet. + * If we got here through err_alloc: free_event(event); we will not + * have attached to a context yet. 
*/ if (!ctx) { WARN_ON_ONCE(event->attach_state & @@ -5550,7 +5728,7 @@ static void __perf_event_period(struct perf_event *event, active = (event->state == PERF_EVENT_STATE_ACTIVE); if (active) { - perf_pmu_disable(ctx->pmu); + perf_pmu_disable(event->pmu); /* * We could be throttled; unthrottle now to avoid the tick * trying to unthrottle while we already re-started the event. @@ -5566,7 +5744,7 @@ static void __perf_event_period(struct perf_event *event, if (active) { event->pmu->start(event, PERF_EF_RELOAD); - perf_pmu_enable(ctx->pmu); + perf_pmu_enable(event->pmu); } } @@ -7729,7 +7907,6 @@ perf_iterate_sb(perf_iterate_f output, void *data, struct perf_event_context *task_ctx) { struct perf_event_context *ctx; - int ctxn; rcu_read_lock(); preempt_disable(); @@ -7746,11 +7923,9 @@ perf_iterate_sb(perf_iterate_f output, void *data, perf_iterate_sb_cpu(output, data); - for_each_task_context_nr(ctxn) { - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_iterate_ctx(ctx, output, data, false); - } + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_iterate_ctx(ctx, output, data, false); done: preempt_enable(); rcu_read_unlock(); @@ -7792,20 +7967,17 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) void perf_event_exec(void) { struct perf_event_context *ctx; - int ctxn; - for_each_task_context_nr(ctxn) { - perf_event_enable_on_exec(ctxn); - perf_event_remove_on_exec(ctxn); + ctx = perf_pin_task_context(current); + if (!ctx) + return; - rcu_read_lock(); - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) { - perf_iterate_ctx(ctx, perf_event_addr_filters_exec, - NULL, true); - } - rcu_read_unlock(); - } + perf_event_enable_on_exec(ctx); + perf_event_remove_on_exec(ctx); + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); + + perf_unpin_context(ctx); + put_ctx(ctx); } struct remote_output { @@ -7845,8 +8017,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) static int __perf_pmu_output_stop(void *info) { struct perf_event *event = info; - struct pmu *pmu = event->ctx->pmu; - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct remote_output ro = { .rb = event->rb, }; @@ -8635,7 +8806,6 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data) static void perf_addr_filters_adjust(struct vm_area_struct *vma) { struct perf_event_context *ctx; - int ctxn; /* * Data tracing isn't supported yet and as such there is no need @@ -8645,13 +8815,9 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma) return; rcu_read_lock(); - for_each_task_context_nr(ctxn) { - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (!ctx) - continue; - + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); - } rcu_read_unlock(); } @@ -9826,6 +9992,44 @@ static struct pmu perf_swevent = { #ifdef CONFIG_EVENT_TRACING +static void tp_perf_event_destroy(struct perf_event *event) +{ + perf_trace_destroy(event); +} + +static int perf_tp_event_init(struct perf_event *event) +{ + int err; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -ENOENT; + + /* + * no branch sampling for tracepoint events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + err = perf_trace_init(event); + if (err) + return err; + + event->destroy = tp_perf_event_destroy; + + return 0; +} + +static struct 
pmu perf_tracepoint = { + .task_ctx_nr = perf_sw_context, + + .event_init = perf_tp_event_init, + .add = perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, +}; + static int perf_tp_filter_match(struct perf_event *event, struct perf_sample_data *data) { @@ -9875,6 +10079,44 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, } EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); +static void __perf_tp_event_target_task(u64 count, void *record, + struct pt_regs *regs, + struct perf_sample_data *data, + struct perf_event *event) +{ + struct trace_entry *entry = record; + + if (event->attr.config != entry->type) + return; + /* Cannot deliver synchronous signal to other task. */ + if (event->attr.sigtrap) + return; + if (perf_tp_event_match(event, data, regs)) + perf_swevent_event(event, count, data, regs); +} + +static void perf_tp_event_target_task(u64 count, void *record, + struct pt_regs *regs, + struct perf_sample_data *data, + struct perf_event_context *ctx) +{ + unsigned int cpu = smp_processor_id(); + struct pmu *pmu = &perf_tracepoint; + struct perf_event *event, *sibling; + + perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) { + __perf_tp_event_target_task(count, record, regs, data, event); + for_each_sibling_event(sibling, event) + __perf_tp_event_target_task(count, record, regs, data, sibling); + } + + perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) { + __perf_tp_event_target_task(count, record, regs, data, event); + for_each_sibling_event(sibling, event) + __perf_tp_event_target_task(count, record, regs, data, sibling); + } +} + void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task) @@ -9906,26 +10148,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, */ if (task && task != current) { struct perf_event_context *ctx; - struct trace_entry *entry = record; rcu_read_lock(); - ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); + ctx = rcu_dereference(task->perf_event_ctxp); if (!ctx) goto unlock; - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (event->cpu != smp_processor_id()) - continue; - if (event->attr.type != PERF_TYPE_TRACEPOINT) - continue; - if (event->attr.config != entry->type) - continue; - /* Cannot deliver synchronous signal to other task. 
*/ - if (event->attr.sigtrap) - continue; - if (perf_tp_event_match(event, &data, regs)) - perf_swevent_event(event, count, &data, regs); - } + raw_spin_lock(&ctx->lock); + perf_tp_event_target_task(count, record, regs, &data, ctx); + raw_spin_unlock(&ctx->lock); unlock: rcu_read_unlock(); } @@ -9934,44 +10165,6 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, } EXPORT_SYMBOL_GPL(perf_tp_event); -static void tp_perf_event_destroy(struct perf_event *event) -{ - perf_trace_destroy(event); -} - -static int perf_tp_event_init(struct perf_event *event) -{ - int err; - - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -ENOENT; - - /* - * no branch sampling for tracepoint events - */ - if (has_branch_stack(event)) - return -EOPNOTSUPP; - - err = perf_trace_init(event); - if (err) - return err; - - event->destroy = tp_perf_event_destroy; - - return 0; -} - -static struct pmu perf_tracepoint = { - .task_ctx_nr = perf_sw_context, - - .event_init = perf_tp_event_init, - .add = perf_trace_add, - .del = perf_trace_del, - .start = perf_swevent_start, - .stop = perf_swevent_stop, - .read = perf_swevent_read, -}; - #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) /* * Flags in config, used by dynamic PMU kprobe and uprobe @@ -11058,46 +11251,19 @@ static int perf_event_idx_default(struct perf_event *event) return 0; } +static void free_pmu_context(struct pmu *pmu) +{ + free_percpu(pmu->cpu_pmu_context); +} + /* - * Ensures all contexts with the same task_ctx_nr have the same - * pmu_cpu_context too. + * Let userspace know that this PMU supports address range filtering: */ -static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) +static ssize_t nr_addr_filters_show(struct device *dev, + struct device_attribute *attr, + char *page) { - struct pmu *pmu; - - if (ctxn < 0) - return NULL; - - list_for_each_entry(pmu, &pmus, entry) { - if (pmu->task_ctx_nr == ctxn) - return pmu->pmu_cpu_context; - } - - return NULL; -} - -static void free_pmu_context(struct pmu *pmu) -{ - /* - * Static contexts such as perf_sw_context have a global lifetime - * and may be shared between different PMUs. Avoid freeing them - * when a single PMU is going away. 
- */ - if (pmu->task_ctx_nr > perf_invalid_context) - return; - - free_percpu(pmu->pmu_cpu_context); -} - -/* - * Let userspace know that this PMU supports address range filtering: - */ -static ssize_t nr_addr_filters_show(struct device *dev, - struct device_attribute *attr, - char *page) -{ - struct pmu *pmu = dev_get_drvdata(dev); + struct pmu *pmu = dev_get_drvdata(dev); return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); } @@ -11151,12 +11317,11 @@ perf_event_mux_interval_ms_store(struct device *dev, /* update all cpuctx for this PMU */ cpus_read_lock(); for_each_online_cpu(cpu) { - struct perf_cpu_context *cpuctx; - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); + struct perf_cpu_pmu_context *cpc; + cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); + cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); - cpu_function_call(cpu, - (remote_function_f)perf_mux_hrtimer_restart, cpuctx); + cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc); } cpus_read_unlock(); mutex_unlock(&mux_interval_mutex); @@ -11269,47 +11434,19 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) } skip_type: - if (pmu->task_ctx_nr == perf_hw_context) { - static int hw_context_taken = 0; - - /* - * Other than systems with heterogeneous CPUs, it never makes - * sense for two PMUs to share perf_hw_context. PMUs which are - * uncore must use perf_invalid_context. - */ - if (WARN_ON_ONCE(hw_context_taken && - !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS))) - pmu->task_ctx_nr = perf_invalid_context; - - hw_context_taken = 1; - } - - pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); - if (pmu->pmu_cpu_context) - goto got_cpu_context; - ret = -ENOMEM; - pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); - if (!pmu->pmu_cpu_context) + pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context); + if (!pmu->cpu_pmu_context) goto free_dev; for_each_possible_cpu(cpu) { - struct perf_cpu_context *cpuctx; - - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - __perf_event_init_context(&cpuctx->ctx); - lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); - lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); - cpuctx->ctx.pmu = pmu; - cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); - - __perf_mux_hrtimer_init(cpuctx, cpu); + struct perf_cpu_pmu_context *cpc; - cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default); - cpuctx->heap = cpuctx->heap_default; + cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); + __perf_init_event_pmu_context(&cpc->epc, pmu); + __perf_mux_hrtimer_init(cpc, cpu); } -got_cpu_context: if (!pmu->start_txn) { if (pmu->pmu_enable) { /* @@ -11788,10 +11925,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } /* - * Disallow uncore-cgroup events, they don't make sense as the cgroup will - * be different on other CPUs in the uncore mask. + * Disallow uncore-task events. Similarly, disallow uncore-cgroup + * events (they don't make sense as the cgroup will be different + * on other CPUs in the uncore mask). */ - if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) { + if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) { err = -EINVAL; goto err_pmu; } @@ -12138,37 +12276,6 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) return 0; } -/* - * Variation on perf_event_ctx_lock_nested(), except we take two context - * mutexes. 
- */ -static struct perf_event_context * -__perf_event_ctx_lock_double(struct perf_event *group_leader, - struct perf_event_context *ctx) -{ - struct perf_event_context *gctx; - -again: - rcu_read_lock(); - gctx = READ_ONCE(group_leader->ctx); - if (!refcount_inc_not_zero(&gctx->refcount)) { - rcu_read_unlock(); - goto again; - } - rcu_read_unlock(); - - mutex_lock_double(&gctx->mutex, &ctx->mutex); - - if (group_leader->ctx != gctx) { - mutex_unlock(&ctx->mutex); - mutex_unlock(&gctx->mutex); - put_ctx(gctx); - goto again; - } - - return gctx; -} - static bool perf_check_permission(struct perf_event_attr *attr, struct task_struct *task) { @@ -12214,9 +12321,10 @@ SYSCALL_DEFINE5(perf_event_open, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_event *group_leader = NULL, *output_event = NULL; + struct perf_event_pmu_context *pmu_ctx; struct perf_event *event, *sibling; struct perf_event_attr attr; - struct perf_event_context *ctx, *gctx; + struct perf_event_context *ctx; struct file *event_file = NULL; struct fd group = {NULL, 0}; struct task_struct *task = NULL; @@ -12346,42 +12454,53 @@ SYSCALL_DEFINE5(perf_event_open, if (pmu->task_ctx_nr == perf_sw_context) event->event_caps |= PERF_EV_CAP_SOFTWARE; - if (group_leader) { - if (is_software_event(event) && - !in_software_context(group_leader)) { - /* - * If the event is a sw event, but the group_leader - * is on hw context. - * - * Allow the addition of software events to hw - * groups, this is safe because software events - * never fail to schedule. - */ - pmu = group_leader->ctx->pmu; - } else if (!is_software_event(event) && - is_software_event(group_leader) && - (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { - /* - * In case the group is a pure software group, and we - * try to add a hardware event, move the whole group to - * the hardware context. - */ - move_group = 1; - } + if (task) { + err = down_read_interruptible(&task->signal->exec_update_lock); + if (err) + goto err_alloc; + + /* + * We must hold exec_update_lock across this and any potential + * perf_install_in_context() call for this new event to + * serialize against exec() altering our credentials (and the + * perf_event_exit_task() that could imply). + */ + err = -EACCES; + if (!perf_check_permission(&attr, task)) + goto err_cred; } /* * Get the target context (task or percpu): */ - ctx = find_get_context(pmu, task, event); + ctx = find_get_context(task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); - goto err_alloc; + goto err_cred; + } + + mutex_lock(&ctx->mutex); + + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } + + if (!task) { + /* + * Check if the @cpu we're creating an event for is online. + * + * We use the perf_cpu_context::ctx::mutex to serialize against + * the hotplug notifiers. See perf_event_{init,exit}_cpu(). 
+ */ + struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); + + if (!cpuctx->online) { + err = -ENODEV; + goto err_locked; + } } - /* - * Look up the group leader (we will attach this event to it): - */ if (group_leader) { err = -EINVAL; @@ -12390,11 +12509,11 @@ SYSCALL_DEFINE5(perf_event_open, * becoming part of another group-sibling): */ if (group_leader->group_leader != group_leader) - goto err_context; + goto err_locked; /* All events in a group should have the same clock */ if (group_leader->clock != event->clock) - goto err_context; + goto err_locked; /* * Make sure we're both events for the same CPU; @@ -12402,145 +12521,76 @@ SYSCALL_DEFINE5(perf_event_open, * you can never concurrently schedule them anyhow. */ if (group_leader->cpu != event->cpu) - goto err_context; - - /* - * Make sure we're both on the same task, or both - * per-CPU events. - */ - if (group_leader->ctx->task != ctx->task) - goto err_context; + goto err_locked; /* - * Do not allow to attach to a group in a different task - * or CPU context. If we're moving SW events, we'll fix - * this up later, so allow that. - * - * Racy, not holding group_leader->ctx->mutex, see comment with - * perf_event_ctx_lock(). + * Make sure we're both on the same context; either task or cpu. */ - if (!move_group && group_leader->ctx != ctx) - goto err_context; + if (group_leader->ctx != ctx) + goto err_locked; /* * Only a group leader can be exclusive or pinned */ if (attr.exclusive || attr.pinned) - goto err_context; - } - - if (output_event) { - err = perf_event_set_output(event, output_event); - if (err) - goto err_context; - } - - event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, - f_flags); - if (IS_ERR(event_file)) { - err = PTR_ERR(event_file); - event_file = NULL; - goto err_context; - } - - if (task) { - err = down_read_interruptible(&task->signal->exec_update_lock); - if (err) - goto err_file; - - /* - * We must hold exec_update_lock across this and any potential - * perf_install_in_context() call for this new event to - * serialize against exec() altering our credentials (and the - * perf_event_exit_task() that could imply). - */ - err = -EACCES; - if (!perf_check_permission(&attr, task)) - goto err_cred; - } - - if (move_group) { - gctx = __perf_event_ctx_lock_double(group_leader, ctx); - - if (gctx->task == TASK_TOMBSTONE) { - err = -ESRCH; goto err_locked; - } - /* - * Check if we raced against another sys_perf_event_open() call - * moving the software group underneath us. - */ - if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { + if (is_software_event(event) && + !in_software_context(group_leader)) { /* - * If someone moved the group out from under us, check - * if this new event wound up on the same ctx, if so - * its the regular !move_group case, otherwise fail. + * If the event is a sw event, but the group_leader + * is on hw context. + * + * Allow the addition of software events to hw + * groups, this is safe because software events + * never fail to schedule. + * + * Note the comment that goes with struct + * perf_event_pmu_context. 
*/ - if (gctx != ctx) { - err = -EINVAL; - goto err_locked; - } else { - perf_event_ctx_unlock(group_leader, gctx); - move_group = 0; - goto not_move_group; + pmu = group_leader->pmu_ctx->pmu; + } else if (!is_software_event(event)) { + if (is_software_event(group_leader) && + (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { + /* + * In case the group is a pure software group, and we + * try to add a hardware event, move the whole group to + * the hardware context. + */ + move_group = 1; } - } - /* - * Failure to create exclusive events returns -EBUSY. - */ - err = -EBUSY; - if (!exclusive_event_installable(group_leader, ctx)) - goto err_locked; - - for_each_sibling_event(sibling, group_leader) { - if (!exclusive_event_installable(sibling, ctx)) + /* Don't allow group of multiple hw events from different pmus */ + if (!in_software_context(group_leader) && + group_leader->pmu_ctx->pmu != pmu) goto err_locked; } - } else { - mutex_lock(&ctx->mutex); - - /* - * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx, - * see the group_leader && !move_group test earlier. - */ - if (group_leader && group_leader->ctx != ctx) { - err = -EINVAL; - goto err_locked; - } } -not_move_group: - if (ctx->task == TASK_TOMBSTONE) { - err = -ESRCH; + /* + * Now that we're certain of the pmu; find the pmu_ctx. + */ + pmu_ctx = find_get_pmu_context(pmu, ctx, event); + if (IS_ERR(pmu_ctx)) { + err = PTR_ERR(pmu_ctx); goto err_locked; } + event->pmu_ctx = pmu_ctx; - if (!perf_event_validate_size(event)) { - err = -E2BIG; - goto err_locked; + if (output_event) { + err = perf_event_set_output(event, output_event); + if (err) + goto err_context; } - if (!task) { - /* - * Check if the @cpu we're creating an event for is online. - * - * We use the perf_cpu_context::ctx::mutex to serialize against - * the hotplug notifiers. See perf_event_{init,exit}_cpu(). - */ - struct perf_cpu_context *cpuctx = - container_of(ctx, struct perf_cpu_context, ctx); - - if (!cpuctx->online) { - err = -ENODEV; - goto err_locked; - } + if (!perf_event_validate_size(event)) { + err = -E2BIG; + goto err_context; } if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) { err = -EINVAL; - goto err_locked; + goto err_context; } /* @@ -12549,35 +12599,32 @@ SYSCALL_DEFINE5(perf_event_open, */ if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; - goto err_locked; + goto err_context; } WARN_ON_ONCE(ctx->parent_ctx); + event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags); + if (IS_ERR(event_file)) { + err = PTR_ERR(event_file); + event_file = NULL; + goto err_context; + } + /* * This is the point on no return; we cannot fail hereafter. This is * where we start modifying current state. */ if (move_group) { - /* - * See perf_event_ctx_lock() for comments on the details - * of swizzling perf_event::ctx. - */ perf_remove_from_context(group_leader, 0); - put_ctx(gctx); + put_pmu_ctx(group_leader->pmu_ctx); for_each_sibling_event(sibling, group_leader) { perf_remove_from_context(sibling, 0); - put_ctx(gctx); + put_pmu_ctx(sibling->pmu_ctx); } - /* - * Wait for everybody to stop referencing the events through - * the old lists, before installing it on new lists. - */ - synchronize_rcu(); - /* * Install the group siblings before the group leader. * @@ -12589,9 +12636,10 @@ SYSCALL_DEFINE5(perf_event_open, * reachable through the group lists. 
*/ for_each_sibling_event(sibling, group_leader) { + sibling->pmu_ctx = pmu_ctx; + get_pmu_ctx(pmu_ctx); perf_event__state_init(sibling); perf_install_in_context(ctx, sibling, sibling->cpu); - get_ctx(ctx); } /* @@ -12599,9 +12647,10 @@ SYSCALL_DEFINE5(perf_event_open, * event. What we want here is event in the initial * startup state, ready to be add into new context. */ + group_leader->pmu_ctx = pmu_ctx; + get_pmu_ctx(pmu_ctx); perf_event__state_init(group_leader); perf_install_in_context(ctx, group_leader, group_leader->cpu); - get_ctx(ctx); } /* @@ -12618,8 +12667,6 @@ SYSCALL_DEFINE5(perf_event_open, perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); - if (move_group) - perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); if (task) { @@ -12641,25 +12688,17 @@ SYSCALL_DEFINE5(perf_event_open, fd_install(event_fd, event_file); return event_fd; +err_context: + /* event->pmu_ctx freed by free_event() */ err_locked: - if (move_group) - perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); + perf_unpin_context(ctx); + put_ctx(ctx); err_cred: if (task) up_read(&task->signal->exec_update_lock); -err_file: - fput(event_file); -err_context: - perf_unpin_context(ctx); - put_ctx(ctx); err_alloc: - /* - * If event_file is set, the fput() above will have called ->release() - * and that will take care of freeing the event. - */ - if (!event_file) - free_event(event); + free_event(event); err_task: if (task) put_task_struct(task); @@ -12685,8 +12724,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, perf_overflow_handler_t overflow_handler, void *context) { + struct perf_event_pmu_context *pmu_ctx; struct perf_event_context *ctx; struct perf_event *event; + struct pmu *pmu; int err; /* @@ -12705,14 +12746,18 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, /* Mark owner so we could distinguish it from user events. */ event->owner = TASK_TOMBSTONE; + pmu = event->pmu; + + if (pmu->task_ctx_nr == perf_sw_context) + event->event_caps |= PERF_EV_CAP_SOFTWARE; /* * Get the target context (task or percpu): */ - ctx = find_get_context(event->pmu, task, event); + ctx = find_get_context(task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); - goto err_free; + goto err_alloc; } WARN_ON_ONCE(ctx->parent_ctx); @@ -12722,6 +12767,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_unlock; } + pmu_ctx = find_get_pmu_context(pmu, ctx, event); + if (IS_ERR(pmu_ctx)) { + err = PTR_ERR(pmu_ctx); + goto err_unlock; + } + event->pmu_ctx = pmu_ctx; + if (!task) { /* * Check if the @cpu we're creating an event for is online. 
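
For reference, the in-kernel API being reworked here is exercised as in the following minimal sketch (a hypothetical module-side caller; the attr values and the start/stop function names are illustrative and not part of this patch). With task == NULL the event is per-CPU, so find_get_context() now resolves directly to the per-CPU perf_cpu_context and find_get_pmu_context() attaches the event to the pmu's epc:

	#include <linux/err.h>
	#include <linux/perf_event.h>
	#include <linux/printk.h>

	static struct perf_event *cycles_event;

	static int start_cycle_counter(void)
	{
		struct perf_event_attr attr = {
			.type	= PERF_TYPE_HARDWARE,
			.config	= PERF_COUNT_HW_CPU_CYCLES,
			.size	= sizeof(attr),
		};

		/* cpu 0, task NULL, no overflow handler: a per-CPU counter */
		cycles_event = perf_event_create_kernel_counter(&attr, 0, NULL,
								NULL, NULL);
		return PTR_ERR_OR_ZERO(cycles_event);
	}

	static void stop_cycle_counter(void)
	{
		u64 enabled, running;

		pr_info("cycles: %llu\n",
			perf_event_read_value(cycles_event, &enabled, &running));
		perf_event_release_kernel(cycles_event);
	}

On teardown, perf_event_release_kernel() frees the event, and the put_pmu_ctx() added to _free_event() earlier in this patch drops the epc reference.
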
@@ -12733,13 +12785,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, container_of(ctx, struct perf_cpu_context, ctx); if (!cpuctx->online) { err = -ENODEV; - goto err_unlock; + goto err_pmu_ctx; } } if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; - goto err_unlock; + goto err_pmu_ctx; } perf_install_in_context(ctx, event, event->cpu); @@ -12748,44 +12800,61 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, return event; +err_pmu_ctx: + put_pmu_ctx(pmu_ctx); err_unlock: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); put_ctx(ctx); -err_free: +err_alloc: free_event(event); err: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); -void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) +static void __perf_pmu_remove(struct perf_event_context *ctx, + int cpu, struct pmu *pmu, + struct perf_event_groups *groups, + struct list_head *events) { - struct perf_event_context *src_ctx; - struct perf_event_context *dst_ctx; - struct perf_event *event, *tmp; - LIST_HEAD(events); - - src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; - dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; + struct perf_event *event, *sibling; - /* - * See perf_event_ctx_lock() for comments on the details - * of swizzling perf_event::ctx. - */ - mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); - list_for_each_entry_safe(event, tmp, &src_ctx->event_list, - event_entry) { + perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) { perf_remove_from_context(event, 0); - unaccount_event_cpu(event, src_cpu); - put_ctx(src_ctx); - list_add(&event->migrate_entry, &events); + unaccount_event_cpu(event, cpu); + put_pmu_ctx(event->pmu_ctx); + list_add(&event->migrate_entry, events); + + for_each_sibling_event(sibling, event) { + perf_remove_from_context(sibling, 0); + unaccount_event_cpu(sibling, cpu); + put_pmu_ctx(sibling->pmu_ctx); + list_add(&sibling->migrate_entry, events); + } } +} - /* - * Wait for the events to quiesce before re-instating them. - */ - synchronize_rcu(); +static void __perf_pmu_install_event(struct pmu *pmu, + struct perf_event_context *ctx, + int cpu, struct perf_event *event) +{ + struct perf_event_pmu_context *epc; + + event->cpu = cpu; + epc = find_get_pmu_context(pmu, ctx, event); + event->pmu_ctx = epc; + + if (event->state >= PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_INACTIVE; + account_event_cpu(event, cpu); + perf_install_in_context(ctx, event, cpu); +} + +static void __perf_pmu_install(struct perf_event_context *ctx, + int cpu, struct pmu *pmu, struct list_head *events) +{ + struct perf_event *event, *tmp; /* * Re-instate events in 2 passes. @@ -12795,30 +12864,48 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) * leader will enable its siblings, even if those are still on the old * context. */ - list_for_each_entry_safe(event, tmp, &events, migrate_entry) { + list_for_each_entry_safe(event, tmp, events, migrate_entry) { if (event->group_leader == event) continue; list_del(&event->migrate_entry); - if (event->state >= PERF_EVENT_STATE_OFF) - event->state = PERF_EVENT_STATE_INACTIVE; - account_event_cpu(event, dst_cpu); - perf_install_in_context(dst_ctx, event, dst_cpu); - get_ctx(dst_ctx); + __perf_pmu_install_event(pmu, ctx, cpu, event); } /* * Once all the siblings are setup properly, install the group leaders * to make it go. 
*/ - list_for_each_entry_safe(event, tmp, &events, migrate_entry) { + list_for_each_entry_safe(event, tmp, events, migrate_entry) { list_del(&event->migrate_entry); - if (event->state >= PERF_EVENT_STATE_OFF) - event->state = PERF_EVENT_STATE_INACTIVE; - account_event_cpu(event, dst_cpu); - perf_install_in_context(dst_ctx, event, dst_cpu); - get_ctx(dst_ctx); + __perf_pmu_install_event(pmu, ctx, cpu, event); } +} + +void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) +{ + struct perf_event_context *src_ctx, *dst_ctx; + LIST_HEAD(events); + + src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx; + dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx; + + /* + * See perf_event_ctx_lock() for comments on the details + * of swizzling perf_event::ctx. + */ + mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); + + __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events); + __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events); + + /* + * Wait for the events to quiesce before re-instating them. + */ + synchronize_rcu(); + + __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events); + mutex_unlock(&dst_ctx->mutex); mutex_unlock(&src_ctx->mutex); } @@ -12898,14 +12985,14 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) perf_event_wakeup(event); } -static void perf_event_exit_task_context(struct task_struct *child, int ctxn) +static void perf_event_exit_task_context(struct task_struct *child) { struct perf_event_context *child_ctx, *clone_ctx = NULL; struct perf_event *child_event, *next; WARN_ON_ONCE(child != current); - child_ctx = perf_pin_task_context(child, ctxn); + child_ctx = perf_pin_task_context(child); if (!child_ctx) return; @@ -12927,13 +13014,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * in. */ raw_spin_lock_irq(&child_ctx->lock); - task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL); + task_ctx_sched_out(child_ctx, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation * and mark the context dead. */ - RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL); + RCU_INIT_POINTER(child->perf_event_ctxp, NULL); put_ctx(child_ctx); /* cannot be last */ WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); put_task_struct(current); /* cannot be last */ @@ -12968,7 +13055,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) void perf_event_exit_task(struct task_struct *child) { struct perf_event *event, *tmp; - int ctxn; mutex_lock(&child->perf_event_mutex); list_for_each_entry_safe(event, tmp, &child->perf_event_list, @@ -12984,8 +13070,7 @@ void perf_event_exit_task(struct task_struct *child) } mutex_unlock(&child->perf_event_mutex); - for_each_task_context_nr(ctxn) - perf_event_exit_task_context(child, ctxn); + perf_event_exit_task_context(child); /* * The perf_event_exit_task_context calls perf_event_task @@ -13028,56 +13113,51 @@ void perf_event_free_task(struct task_struct *task) { struct perf_event_context *ctx; struct perf_event *event, *tmp; - int ctxn; - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (!ctx) - continue; + ctx = rcu_access_pointer(task->perf_event_ctxp); + if (!ctx) + return; - mutex_lock(&ctx->mutex); - raw_spin_lock_irq(&ctx->lock); - /* - * Destroy the task <-> ctx relation and mark the context dead. - * - * This is important because even though the task hasn't been - * exposed yet the context has been (through child_list). 
- */ - RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL); - WRITE_ONCE(ctx->task, TASK_TOMBSTONE); - put_task_struct(task); /* cannot be last */ - raw_spin_unlock_irq(&ctx->lock); + mutex_lock(&ctx->mutex); + raw_spin_lock_irq(&ctx->lock); + /* + * Destroy the task <-> ctx relation and mark the context dead. + * + * This is important because even though the task hasn't been + * exposed yet the context has been (through child_list). + */ + RCU_INIT_POINTER(task->perf_event_ctxp, NULL); + WRITE_ONCE(ctx->task, TASK_TOMBSTONE); + put_task_struct(task); /* cannot be last */ + raw_spin_unlock_irq(&ctx->lock); - list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) - perf_free_event(event, ctx); - mutex_unlock(&ctx->mutex); + list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) + perf_free_event(event, ctx); - /* - * perf_event_release_kernel() could've stolen some of our - * child events and still have them on its free_list. In that - * case we must wait for these events to have been freed (in - * particular all their references to this task must've been - * dropped). - * - * Without this copy_process() will unconditionally free this - * task (irrespective of its reference count) and - * _free_event()'s put_task_struct(event->hw.target) will be a - * use-after-free. - * - * Wait for all events to drop their context reference. - */ - wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); - put_ctx(ctx); /* must be last */ - } + mutex_unlock(&ctx->mutex); + + /* + * perf_event_release_kernel() could've stolen some of our + * child events and still have them on its free_list. In that + * case we must wait for these events to have been freed (in + * particular all their references to this task must've been + * dropped). + * + * Without this copy_process() will unconditionally free this + * task (irrespective of its reference count) and + * _free_event()'s put_task_struct(event->hw.target) will be a + * use-after-free. + * + * Wait for all events to drop their context reference. 
+ */ + wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); + put_ctx(ctx); /* must be last */ } void perf_event_delayed_put(struct task_struct *task) { - int ctxn; - - for_each_task_context_nr(ctxn) - WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); + WARN_ON_ONCE(task->perf_event_ctxp); } struct file *perf_event_get(unsigned int fd) @@ -13127,6 +13207,7 @@ inherit_event(struct perf_event *parent_event, struct perf_event_context *child_ctx) { enum perf_event_state parent_state = parent_event->state; + struct perf_event_pmu_context *pmu_ctx; struct perf_event *child_event; unsigned long flags; @@ -13147,17 +13228,12 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; - - if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) && - !child_ctx->task_ctx_data) { - struct pmu *pmu = child_event->pmu; - - child_ctx->task_ctx_data = alloc_task_ctx_data(pmu); - if (!child_ctx->task_ctx_data) { - free_event(child_event); - return ERR_PTR(-ENOMEM); - } + pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); + if (IS_ERR(pmu_ctx)) { + free_event(child_event); + return NULL; } + child_event->pmu_ctx = pmu_ctx; /* * is_orphaned_event() and list_add_tail(&parent_event->child_list) @@ -13280,11 +13356,11 @@ static int inherit_group(struct perf_event *parent_event, static int inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *parent_ctx, - struct task_struct *child, int ctxn, + struct task_struct *child, u64 clone_flags, int *inherited_all) { - int ret; struct perf_event_context *child_ctx; + int ret; if (!event->attr.inherit || (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) || @@ -13294,7 +13370,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, return 0; } - child_ctx = child->perf_event_ctxp[ctxn]; + child_ctx = child->perf_event_ctxp; if (!child_ctx) { /* * This is executed from the parent task context, so @@ -13302,16 +13378,14 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, * First allocate and initialize a context for the * child. */ - child_ctx = alloc_perf_context(parent_ctx->pmu, child); + child_ctx = alloc_perf_context(child); if (!child_ctx) return -ENOMEM; - child->perf_event_ctxp[ctxn] = child_ctx; + child->perf_event_ctxp = child_ctx; } - ret = inherit_group(event, parent, parent_ctx, - child, child_ctx); - + ret = inherit_group(event, parent, parent_ctx, child, child_ctx); if (ret) *inherited_all = 0; @@ -13321,8 +13395,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, /* * Initialize the perf_event context in task_struct */ -static int perf_event_init_context(struct task_struct *child, int ctxn, - u64 clone_flags) +static int perf_event_init_context(struct task_struct *child, u64 clone_flags) { struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; @@ -13332,14 +13405,14 @@ static int perf_event_init_context(struct task_struct *child, int ctxn, unsigned long flags; int ret = 0; - if (likely(!parent->perf_event_ctxp[ctxn])) + if (likely(!parent->perf_event_ctxp)) return 0; /* * If the parent's context is a clone, pin it so it won't get * swapped under us. 
*/ - parent_ctx = perf_pin_task_context(parent, ctxn); + parent_ctx = perf_pin_task_context(parent); if (!parent_ctx) return 0; @@ -13362,8 +13435,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn, */ perf_event_groups_for_each(event, &parent_ctx->pinned_groups) { ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, clone_flags, - &inherited_all); + child, clone_flags, &inherited_all); if (ret) goto out_unlock; } @@ -13379,8 +13451,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn, perf_event_groups_for_each(event, &parent_ctx->flexible_groups) { ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, clone_flags, - &inherited_all); + child, clone_flags, &inherited_all); if (ret) goto out_unlock; } @@ -13388,7 +13459,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn, raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 0; - child_ctx = child->perf_event_ctxp[ctxn]; + child_ctx = child->perf_event_ctxp; if (child_ctx && inherited_all) { /* @@ -13424,18 +13495,16 @@ static int perf_event_init_context(struct task_struct *child, int ctxn, */ int perf_event_init_task(struct task_struct *child, u64 clone_flags) { - int ctxn, ret; + int ret; - memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); + child->perf_event_ctxp = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); - for_each_task_context_nr(ctxn) { - ret = perf_event_init_context(child, ctxn, clone_flags); - if (ret) { - perf_event_free_task(child); - return ret; - } + ret = perf_event_init_context(child, clone_flags); + if (ret) { + perf_event_free_task(child); + return ret; } return 0; @@ -13444,6 +13513,7 @@ int perf_event_init_task(struct task_struct *child, u64 clone_flags) static void __init perf_event_init_all_cpus(void) { struct swevent_htable *swhash; + struct perf_cpu_context *cpuctx; int cpu; zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); @@ -13451,15 +13521,19 @@ static void __init perf_event_init_all_cpus(void) for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); - INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); -#ifdef CONFIG_CGROUP_PERF - INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu)); -#endif INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); + + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); + __perf_event_init_context(&cpuctx->ctx); + lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); + lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); + cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); + cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default); + cpuctx->heap = cpuctx->heap_default; } } @@ -13481,12 +13555,12 @@ static void perf_swevent_init_cpu(unsigned int cpu) #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *ctx = __info; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event *event; raw_spin_lock(&ctx->lock); - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_sched_out(ctx, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); @@ -13496,18 +13570,16 @@ static void 
perf_event_exit_cpu_context(int cpu) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - struct pmu *pmu; + // XXX simplify cpuctx->online mutex_lock(&pmus_lock); - list_for_each_entry(pmu, &pmus, entry) { - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - ctx = &cpuctx->ctx; + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); + ctx = &cpuctx->ctx; - mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); - cpuctx->online = 0; - mutex_unlock(&ctx->mutex); - } + mutex_lock(&ctx->mutex); + smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + cpuctx->online = 0; + mutex_unlock(&ctx->mutex); cpumask_clear_cpu(cpu, perf_online_mask); mutex_unlock(&pmus_lock); } @@ -13521,20 +13593,17 @@ int perf_event_init_cpu(unsigned int cpu) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - struct pmu *pmu; perf_swevent_init_cpu(cpu); mutex_lock(&pmus_lock); cpumask_set_cpu(cpu, perf_online_mask); - list_for_each_entry(pmu, &pmus, entry) { - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - ctx = &cpuctx->ctx; + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); + ctx = &cpuctx->ctx; - mutex_lock(&ctx->mutex); - cpuctx->online = 1; - mutex_unlock(&ctx->mutex); - } + mutex_lock(&ctx->mutex); + cpuctx->online = 1; + mutex_unlock(&ctx->mutex); mutex_unlock(&pmus_lock); return 0; @@ -13671,9 +13740,12 @@ static int perf_cgroup_css_online(struct cgroup_subsys_state *css) static int __perf_cgroup_move(void *info) { struct task_struct *task = info; - rcu_read_lock(); - perf_cgroup_switch(task); - rcu_read_unlock(); + + preempt_disable(); + if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) + perf_cgroup_switch(task); + preempt_enable(); + return 0; } -- 2.40.0 From 2acf657d1326d96d5b35da166bae86671f5295c6 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Thu, 8 Dec 2022 16:23:16 +0100 Subject: [PATCH 13/17] printk Signed-off-by: Peter Jung --- .clang-format | 1 + arch/m68k/emu/nfcon.c | 9 +- arch/um/kernel/kmsg_dump.c | 24 +- drivers/firmware/efi/earlycon.c | 8 +- drivers/net/netconsole.c | 21 +- drivers/tty/hvc/hvc_console.c | 4 +- drivers/tty/serial/8250/8250_core.c | 2 +- drivers/tty/serial/earlycon.c | 4 +- drivers/tty/serial/kgdboc.c | 46 +- drivers/tty/serial/pic32_uart.c | 4 +- drivers/tty/serial/samsung_tty.c | 2 +- drivers/tty/serial/serial_core.c | 14 +- drivers/tty/serial/sh-sci.c | 20 +- drivers/tty/serial/xilinx_uartps.c | 2 +- drivers/tty/tty_io.c | 18 +- drivers/usb/early/xhci-dbc.c | 2 +- drivers/video/fbdev/xen-fbfront.c | 12 +- fs/proc/consoles.c | 21 +- include/linux/console.h | 227 +++++++-- include/linux/printk.h | 2 - include/linux/rcutiny.h | 4 - include/linux/serial_core.h | 10 +- include/linux/srcu.h | 5 +- kernel/debug/kdb/kdb_io.c | 18 +- kernel/printk/internal.h | 45 ++ kernel/printk/printk.c | 725 ++++++++++++++++++++-------- kernel/printk/printk_ringbuffer.c | 2 +- kernel/rcu/Kconfig | 11 +- kernel/rcu/Kconfig.debug | 3 - kernel/rcu/rcu.h | 8 +- kernel/rcu/srcutiny.c | 10 + kernel/rcu/update.c | 18 +- lib/test_printf.c | 26 +- lib/vsprintf.c | 2 +- 34 files changed, 979 insertions(+), 351 deletions(-) diff --git a/.clang-format b/.clang-format index 8d01225bfcb7..bcf60344f9f5 100644 --- a/.clang-format +++ b/.clang-format @@ -222,6 +222,7 @@ ForEachMacros: - 'for_each_component_dais' - 'for_each_component_dais_safe' - 'for_each_console' + - 'for_each_console_srcu' - 'for_each_cpu' - 'for_each_cpu_and' - 'for_each_cpu_not' diff --git a/arch/m68k/emu/nfcon.c b/arch/m68k/emu/nfcon.c index 
557d60867f98..6fdc13610565 100644 --- a/arch/m68k/emu/nfcon.c +++ b/arch/m68k/emu/nfcon.c @@ -49,7 +49,7 @@ static void nfcon_write(struct console *con, const char *str, static struct tty_driver *nfcon_device(struct console *con, int *index) { *index = 0; - return (con->flags & CON_ENABLED) ? nfcon_tty_driver : NULL; + return console_is_registered(con) ? nfcon_tty_driver : NULL; } static struct console nf_console = { @@ -107,6 +107,11 @@ static int __init nf_debug_setup(char *arg) stderr_id = nf_get_id("NF_STDERR"); if (stderr_id) { + /* + * The console will be enabled when debug=nfcon is specified + * as a kernel parameter. Since this is a non-standard way + * of enabling consoles, it must be explicitly enabled. + */ nf_console.flags |= CON_ENABLED; register_console(&nf_console); } @@ -151,7 +156,7 @@ static int __init nfcon_init(void) nfcon_tty_driver = driver; - if (!(nf_console.flags & CON_ENABLED)) + if (!console_is_registered(&nf_console)) register_console(&nf_console); return 0; diff --git a/arch/um/kernel/kmsg_dump.c b/arch/um/kernel/kmsg_dump.c index 0224fcb36e22..427dd5a61a38 100644 --- a/arch/um/kernel/kmsg_dump.c +++ b/arch/um/kernel/kmsg_dump.c @@ -16,20 +16,26 @@ static void kmsg_dumper_stdout(struct kmsg_dumper *dumper, struct console *con; unsigned long flags; size_t len = 0; + int cookie; - /* only dump kmsg when no console is available */ - if (!console_trylock()) - return; + /* + * If no consoles are available to output crash information, dump + * the kmsg buffer to stdout. + */ - for_each_console(con) { - if(strcmp(con->name, "tty") == 0 && - (con->flags & (CON_ENABLED | CON_CONSDEV)) != 0) { + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { + /* + * The ttynull console and disabled consoles are ignored + * since they cannot output. All other consoles are + * expected to output the crash information. 
+ */ + if (strcmp(con->name, "ttynull") != 0 && + (console_srcu_read_flags(con) & CON_ENABLED)) { break; } } - - console_unlock(); - + console_srcu_read_unlock(cookie); if (con) return; diff --git a/drivers/firmware/efi/earlycon.c b/drivers/firmware/efi/earlycon.c index a52236e11e5f..4d6c5327471a 100644 --- a/drivers/firmware/efi/earlycon.c +++ b/drivers/firmware/efi/earlycon.c @@ -29,8 +29,8 @@ static void *efi_fb; */ static int __init efi_earlycon_remap_fb(void) { - /* bail if there is no bootconsole or it has been disabled already */ - if (!earlycon_console || !(earlycon_console->flags & CON_ENABLED)) + /* bail if there is no bootconsole or it was unregistered already */ + if (!earlycon_console || !console_is_registered(earlycon_console)) return 0; efi_fb = memremap(fb_base, screen_info.lfb_size, @@ -42,8 +42,8 @@ early_initcall(efi_earlycon_remap_fb); static int __init efi_earlycon_unmap_fb(void) { - /* unmap the bootconsole fb unless keep_bootcon has left it enabled */ - if (efi_fb && !(earlycon_console->flags & CON_ENABLED)) + /* unmap the bootconsole fb unless keep_bootcon left it registered */ + if (efi_fb && !console_is_registered(earlycon_console)) memunmap(efi_fb); return 0; } diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index bdff9ac5056d..4f4f79532c6c 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -332,10 +332,8 @@ static ssize_t enabled_store(struct config_item *item, } if (enabled) { /* true */ - if (nt->extended && !(netconsole_ext.flags & CON_ENABLED)) { - netconsole_ext.flags |= CON_ENABLED; + if (nt->extended && !console_is_registered(&netconsole_ext)) register_console(&netconsole_ext); - } /* * Skip netpoll_parse_options() -- all the attributes are @@ -869,7 +867,7 @@ static void write_msg(struct console *con, const char *msg, unsigned int len) static struct console netconsole_ext = { .name = "netcon_ext", - .flags = CON_EXTENDED, /* starts disabled, registered on first use */ + .flags = CON_ENABLED | CON_EXTENDED, .write = write_ext_msg, }; @@ -883,6 +881,7 @@ static int __init init_netconsole(void) { int err; struct netconsole_target *nt, *tmp; + bool extended = false; unsigned long flags; char *target_config; char *input = config; @@ -895,11 +894,12 @@ static int __init init_netconsole(void) goto fail; } /* Dump existing printks when we register */ - if (nt->extended) - netconsole_ext.flags |= CON_PRINTBUFFER | - CON_ENABLED; - else + if (nt->extended) { + extended = true; + netconsole_ext.flags |= CON_PRINTBUFFER; + } else { netconsole.flags |= CON_PRINTBUFFER; + } spin_lock_irqsave(&target_list_lock, flags); list_add(&nt->list, &target_list); @@ -915,7 +915,7 @@ static int __init init_netconsole(void) if (err) goto undonotifier; - if (netconsole_ext.flags & CON_ENABLED) + if (extended) register_console(&netconsole_ext); register_console(&netconsole); pr_info("network logging started\n"); @@ -945,7 +945,8 @@ static void __exit cleanup_netconsole(void) { struct netconsole_target *nt, *tmp; - unregister_console(&netconsole_ext); + if (console_is_registered(&netconsole_ext)) + unregister_console(&netconsole_ext); unregister_console(&netconsole); dynamic_netconsole_exit(); unregister_netdevice_notifier(&netconsole_netdev_notifier); diff --git a/drivers/tty/hvc/hvc_console.c b/drivers/tty/hvc/hvc_console.c index 4802cfaa107f..a683e21df19c 100644 --- a/drivers/tty/hvc/hvc_console.c +++ b/drivers/tty/hvc/hvc_console.c @@ -264,8 +264,8 @@ static void hvc_port_destruct(struct tty_port *port) static void hvc_check_console(int 
index) { - /* Already enabled, bail out */ - if (hvc_console.flags & CON_ENABLED) + /* Already registered, bail out */ + if (console_is_registered(&hvc_console)) return; /* If this index is what the user requested, then register diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index 94fbf0add2ce..74568292186f 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -565,7 +565,7 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev) up->port.dev = dev; - if (uart_console_enabled(&up->port)) + if (uart_console_registered(&up->port)) pm_runtime_get_sync(up->port.dev); serial8250_apply_quirks(up); diff --git a/drivers/tty/serial/earlycon.c b/drivers/tty/serial/earlycon.c index a5f380584cda..4f6e9bf57169 100644 --- a/drivers/tty/serial/earlycon.c +++ b/drivers/tty/serial/earlycon.c @@ -181,7 +181,7 @@ int __init setup_earlycon(char *buf) if (!buf || !buf[0]) return -EINVAL; - if (early_con.flags & CON_ENABLED) + if (console_is_registered(&early_con)) return -EALREADY; again: @@ -253,7 +253,7 @@ int __init of_setup_earlycon(const struct earlycon_id *match, bool big_endian; u64 addr; - if (early_con.flags & CON_ENABLED) + if (console_is_registered(&early_con)) return -EALREADY; spin_lock_init(&port->lock); diff --git a/drivers/tty/serial/kgdboc.c b/drivers/tty/serial/kgdboc.c index 7aa37be3216a..a3ed9b34e2ab 100644 --- a/drivers/tty/serial/kgdboc.c +++ b/drivers/tty/serial/kgdboc.c @@ -189,9 +189,27 @@ static int configure_kgdboc(void) if (kgdboc_register_kbd(&cptr)) goto do_register; + /* + * tty_find_polling_driver() can call uart_set_options() + * (via poll_init) to configure the uart. Take the console_list_lock + * in order to synchronize against register_console(), which can also + * configure the uart via uart_set_options(). This also allows safe + * traversal of the console list. + */ + console_list_lock(); + p = tty_find_polling_driver(cptr, &tty_line); - if (!p) + if (!p) { + console_list_unlock(); goto noconfig; + } + + /* + * Take console_lock to serialize device() callback with + * other console operations. For example, fg_console is + * modified under console_lock when switching vt. + */ + console_lock(); for_each_console(cons) { int idx; @@ -202,6 +220,10 @@ static int configure_kgdboc(void) } } + console_unlock(); + + console_list_unlock(); + kgdb_tty_driver = p; kgdb_tty_line = tty_line; @@ -449,6 +471,7 @@ static void kgdboc_earlycon_pre_exp_handler(void) { struct console *con; static bool already_warned; + int cookie; if (already_warned) return; @@ -461,9 +484,14 @@ static void kgdboc_earlycon_pre_exp_handler(void) * serial drivers might be OK with this, print a warning once per * boot if we detect this case. */ - for_each_console(con) + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { if (con == kgdboc_earlycon_io_ops.cons) - return; + break; + } + console_srcu_read_unlock(cookie); + if (con) + return; already_warned = true; pr_warn("kgdboc_earlycon is still using bootconsole\n"); @@ -528,7 +556,15 @@ static int __init kgdboc_earlycon_init(char *opt) * Look for a matching console, or if the name was left blank just * pick the first one we find. */ - console_lock(); + + /* + * Hold the console_list_lock to guarantee that no consoles are + * unregistered until the kgdboc_earlycon setup is complete. + * Trapping the exit() callback relies on exit() not being + * called until the trap is setup. 
This also allows safe + * traversal of the console list and race-free reading of @flags. + */ + console_list_lock(); for_each_console(con) { if (con->write && con->read && (con->flags & (CON_BOOT | CON_ENABLED)) && @@ -570,7 +606,7 @@ static int __init kgdboc_earlycon_init(char *opt) } unlock: - console_unlock(); + console_list_unlock(); /* Non-zero means malformed option so we always return zero */ return 0; diff --git a/drivers/tty/serial/pic32_uart.c b/drivers/tty/serial/pic32_uart.c index 2beada66c824..c38754d593ca 100644 --- a/drivers/tty/serial/pic32_uart.c +++ b/drivers/tty/serial/pic32_uart.c @@ -843,7 +843,7 @@ console_initcall(pic32_console_init); */ static int __init pic32_late_console_init(void) { - if (!(pic32_console.flags & CON_ENABLED)) + if (!console_is_registered(&pic32_console)) register_console(&pic32_console); return 0; @@ -919,7 +919,7 @@ static int pic32_uart_probe(struct platform_device *pdev) } #ifdef CONFIG_SERIAL_PIC32_CONSOLE - if (uart_console_enabled(port)) { + if (uart_console_registered(port)) { /* The peripheral clock has been enabled by console_setup, * so disable it till the port is used. */ diff --git a/drivers/tty/serial/samsung_tty.c b/drivers/tty/serial/samsung_tty.c index 77d1363029f5..9c252c9ca95a 100644 --- a/drivers/tty/serial/samsung_tty.c +++ b/drivers/tty/serial/samsung_tty.c @@ -1732,7 +1732,7 @@ static void __init s3c24xx_serial_register_console(void) static void s3c24xx_serial_unregister_console(void) { - if (s3c24xx_serial_console.flags & CON_ENABLED) + if (console_is_registered(&s3c24xx_serial_console)) unregister_console(&s3c24xx_serial_console); } diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 179ee199df34..b9fbbee598b8 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -2223,11 +2223,11 @@ uart_set_options(struct uart_port *port, struct console *co, /* * Ensure that the serial-console lock is initialised early. * - * Note that the console-enabled check is needed because of kgdboc, - * which can end up calling uart_set_options() for an already enabled + * Note that the console-registered check is needed because + * kgdboc can call uart_set_options() for an already registered * console via tty_find_polling_driver() and uart_poll_init(). */ - if (!uart_console_enabled(port) && !port->console_reinit) + if (!uart_console_registered_locked(port) && !port->console_reinit) uart_port_spin_lock_init(port); memset(&termios, 0, sizeof(struct ktermios)); @@ -2573,7 +2573,7 @@ uart_configure_port(struct uart_driver *drv, struct uart_state *state, * successfully registered yet, try to re-register it. * It may be that the port was not available. */ - if (port->cons && !(port->cons->flags & CON_ENABLED)) + if (port->cons && !console_is_registered(port->cons)) register_console(port->cons); /* @@ -2956,7 +2956,7 @@ static ssize_t console_show(struct device *dev, mutex_lock(&port->mutex); uport = uart_port_check(state); if (uport) - console = uart_console_enabled(uport); + console = uart_console_registered(uport); mutex_unlock(&port->mutex); return sprintf(buf, "%c\n", console ? 
'Y' : 'N'); @@ -2978,7 +2978,7 @@ static ssize_t console_store(struct device *dev, mutex_lock(&port->mutex); uport = uart_port_check(state); if (uport) { - oldconsole = uart_console_enabled(uport); + oldconsole = uart_console_registered(uport); if (oldconsole && !newconsole) { ret = unregister_console(uport->cons); } else if (!oldconsole && newconsole) { @@ -3086,7 +3086,7 @@ int uart_add_one_port(struct uart_driver *drv, struct uart_port *uport) * If this port is in use as a console then the spinlock is already * initialised. */ - if (!uart_console_enabled(uport)) + if (!uart_console_registered(uport)) uart_port_spin_lock_init(uport); if (uport->cons && uport->dev) diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c index 62f773286d44..76452fe2af86 100644 --- a/drivers/tty/serial/sh-sci.c +++ b/drivers/tty/serial/sh-sci.c @@ -3054,15 +3054,29 @@ static struct console serial_console = { }; #ifdef CONFIG_SUPERH +static char early_serial_buf[32]; + +static int early_serial_console_setup(struct console *co, char *options) +{ + /* + * This early console is always registered using the earlyprintk= + * parameter, which does not call add_preferred_console(). Thus + * @options is always NULL and the options for this early console + * are passed using a custom buffer. + */ + WARN_ON(options); + + return serial_console_setup(co, early_serial_buf); +} + static struct console early_serial_console = { .name = "early_ttySC", .write = serial_console_write, + .setup = early_serial_console_setup, .flags = CON_PRINTBUFFER, .index = -1, }; -static char early_serial_buf[32]; - static int sci_probe_earlyprintk(struct platform_device *pdev) { const struct plat_sci_port *cfg = dev_get_platdata(&pdev->dev); @@ -3074,8 +3088,6 @@ static int sci_probe_earlyprintk(struct platform_device *pdev) sci_init_single(pdev, &sci_ports[pdev->id], pdev->id, cfg, true); - serial_console_setup(&early_serial_console, early_serial_buf); - if (!strstr(early_serial_buf, "keep")) early_serial_console.flags |= CON_BOOT; diff --git a/drivers/tty/serial/xilinx_uartps.c b/drivers/tty/serial/xilinx_uartps.c index 2eff7cff57c4..0cbd1892c53b 100644 --- a/drivers/tty/serial/xilinx_uartps.c +++ b/drivers/tty/serial/xilinx_uartps.c @@ -1631,7 +1631,7 @@ static int cdns_uart_probe(struct platform_device *pdev) #ifdef CONFIG_SERIAL_XILINX_PS_UART_CONSOLE /* This is not port which is used for console that's why clean it up */ if (console_port == port && - !(cdns_uart_uart_driver.cons->flags & CON_ENABLED)) { + !console_is_registered(cdns_uart_uart_driver.cons)) { console_port = NULL; cdns_uart_console.index = -1; } diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 1ac6784ea1f9..8834ca24b24b 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -3528,7 +3528,14 @@ static ssize_t show_cons_active(struct device *dev, struct console *c; ssize_t count = 0; - console_lock(); + /* + * Hold the console_list_lock to guarantee that no consoles are + * unregistered until all console processing is complete. + * This also allows safe traversal of the console list and + * race-free reading of @flags. + */ + console_list_lock(); + for_each_console(c) { if (!c->device) continue; @@ -3540,6 +3547,13 @@ static ssize_t show_cons_active(struct device *dev, if (i >= ARRAY_SIZE(cs)) break; } + + /* + * Take console_lock to serialize device() callback with + * other console operations. For example, fg_console is + * modified under console_lock when switching vt. 
diff --git a/drivers/usb/early/xhci-dbc.c b/drivers/usb/early/xhci-dbc.c
index 7ef0a4b39762..f3e23be227d4 100644
--- a/drivers/usb/early/xhci-dbc.c
+++ b/drivers/usb/early/xhci-dbc.c
@@ -928,7 +928,7 @@ void __init early_xdbc_register_console(void)
 
 static void xdbc_unregister_console(void)
 {
-	if (early_xdbc_console.flags & CON_ENABLED)
+	if (console_is_registered(&early_xdbc_console))
 		unregister_console(&early_xdbc_console);
 }
 
diff --git a/drivers/video/fbdev/xen-fbfront.c b/drivers/video/fbdev/xen-fbfront.c
index 4d2694d904aa..8752d389e382 100644
--- a/drivers/video/fbdev/xen-fbfront.c
+++ b/drivers/video/fbdev/xen-fbfront.c
@@ -504,18 +504,14 @@ static void xenfb_make_preferred_console(void)
 	if (console_set_on_cmdline)
 		return;
 
-	console_lock();
+	console_list_lock();
 	for_each_console(c) {
 		if (!strcmp(c->name, "tty") && c->index == 0)
 			break;
 	}
-	console_unlock();
-	if (c) {
-		unregister_console(c);
-		c->flags |= CON_CONSDEV;
-		c->flags &= ~CON_PRINTBUFFER; /* don't print again */
-		register_console(c);
-	}
+	if (c)
+		console_force_preferred_locked(c);
+	console_list_unlock();
 }
 
 static int xenfb_resume(struct xenbus_device *dev)
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index dfe6ce3505ce..e0758fe7936d 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -33,7 +33,16 @@ static int show_console_dev(struct seq_file *m, void *v)
 	if (con->device) {
 		const struct tty_driver *driver;
 		int index;
+
+		/*
+		 * Take console_lock to serialize device() callback with
+		 * other console operations. For example, fg_console is
+		 * modified under console_lock when switching vt.
+		 */
+		console_lock();
 		driver = con->device(con, &index);
+		console_unlock();
+
 		if (driver) {
 			dev = MKDEV(driver->major, driver->minor_start);
 			dev += index;
@@ -63,7 +72,12 @@ static void *c_start(struct seq_file *m, loff_t *pos)
 	struct console *con;
 	loff_t off = 0;
 
-	console_lock();
+	/*
+	 * Hold the console_list_lock to guarantee safe traversal of the
+	 * console list. SRCU cannot be used because there is no
+	 * place to store the SRCU cookie.
+	 */
+	console_list_lock();
 	for_each_console(con)
 		if (off++ == *pos)
 			break;
@@ -74,13 +88,14 @@
 static void *c_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct console *con = v;
+
 	++*pos;
-	return con->next;
+	return hlist_entry_safe(con->node.next, struct console, node);
 }
 
 static void c_stop(struct seq_file *m, void *v)
 {
-	console_unlock();
+	console_list_unlock();
 }
 
 static const struct seq_operations consoles_op = {
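/*
 * Hypothetical sketch of the list step now used by c_next() above: with
 * the consoles kept on an hlist, hlist_entry_safe() maps the terminating
 * NULL node back to a NULL console instead of dereferencing it. Assumes
 * the struct console @node member added by this series, and must be
 * called with the console_list_lock held, as c_next() is.
 */
#include <linux/console.h>
#include <linux/list.h>

static struct console *console_after(struct console *con)
{
	/* NULL when @con is the last console on the registered list */
	return hlist_entry_safe(con->node.next, struct console, node);
}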
diff --git a/include/linux/console.h b/include/linux/console.h
index 8c1686e2c233..1e36958aa656 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -15,6 +15,8 @@
 #define _LINUX_CONSOLE_H_ 1
 
 #include <linux/atomic.h>
+#include <linux/bits.h>
+#include <linux/rculist.h>
 #include <linux/types.h>
 
 struct vc_data;
@@ -124,44 +126,207 @@ static inline int con_debug_leave(void)
 /*
  * The interface for a console, or any other device that wants to capture
  * console messages (printer driver?)
- *
- * If a console driver is marked CON_BOOT then it will be auto-unregistered
- * when the first real console is registered. This is for early-printk drivers.
  */
 
-#define CON_PRINTBUFFER	(1)
-#define CON_CONSDEV	(2) /* Preferred console, /dev/console */
-#define CON_ENABLED	(4)
-#define CON_BOOT	(8)
-#define CON_ANYTIME	(16) /* Safe to call when cpu is offline */
-#define CON_BRL		(32) /* Used for a braille device */
-#define CON_EXTENDED	(64) /* Use the extended output format a la /dev/kmsg */
+/**
+ * cons_flags - General console flags
+ * @CON_PRINTBUFFER:	Used by newly registered consoles to avoid duplicate
+ *			output of messages that were already shown by boot
+ *			consoles or read by userspace via syslog() syscall.
+ * @CON_CONSDEV:	Indicates that the console driver is backing
+ *			/dev/console.
+ * @CON_ENABLED:	Indicates if a console is allowed to print records. If
+ *			false, the console also will not advance to later
+ *			records.
+ * @CON_BOOT:		Marks the console driver as early console driver which
+ *			is used during boot before the real driver becomes
+ *			available. It will be automatically unregistered
+ *			when the real console driver is registered unless
+ *			"keep_bootcon" parameter is used.
+ * @CON_ANYTIME:	A misnomed historical flag which tells the core code
+ *			that the legacy @console::write callback can be invoked
+ *			on a CPU which is marked OFFLINE. That is misleading as
+ *			it suggests that there is no contextual limit for
+ *			invoking the callback. The original motivation was
+ *			readiness of the per-CPU areas.
+ * @CON_BRL:		Indicates a braille device which is exempt from
+ *			receiving the printk spam for obvious reasons.
+ * @CON_EXTENDED:	The console supports the extended output format of
+ *			/dev/kmsg which requires a larger output buffer.
+ */
+enum cons_flags {
+	CON_PRINTBUFFER		= BIT(0),
+	CON_CONSDEV		= BIT(1),
+	CON_ENABLED		= BIT(2),
+	CON_BOOT		= BIT(3),
+	CON_ANYTIME		= BIT(4),
+	CON_BRL			= BIT(5),
+	CON_EXTENDED		= BIT(6),
+};
 
+/**
+ * struct console - The console descriptor structure
+ * @name:		The name of the console driver
+ * @write:		Write callback to output messages (Optional)
+ * @read:		Read callback for console input (Optional)
+ * @device:		The underlying TTY device driver (Optional)
+ * @unblank:		Callback to unblank the console (Optional)
+ * @setup:		Callback for initializing the console (Optional)
+ * @exit:		Callback for teardown of the console (Optional)
+ * @match:		Callback for matching a console (Optional)
+ * @flags:		Console flags. See enum cons_flags
+ * @index:		Console index, e.g. port number
+ * @cflag:		TTY control mode flags
+ * @ispeed:		TTY input speed
+ * @ospeed:		TTY output speed
+ * @seq:		Sequence number of the next ringbuffer record to print
+ * @dropped:		Number of unreported dropped ringbuffer records
+ * @data:		Driver private data
+ * @node:		hlist node for the console li