From e031d3d7462fd0ad85fb162543983a73116d4db2 Mon Sep 17 00:00:00 2001
From: Peter Jung
Date: Mon, 12 Dec 2022 11:29:43 +0100
Subject: [PATCH] cacULE-cachy

Signed-off-by: Peter Jung
---
 Documentation/admin-guide/sysctl/kernel.rst | 4 +
 Documentation/scheduler/sched-CacULE.rst | 76 ++
 include/linux/sched.h | 13 +
 include/linux/sched/sysctl.h | 10 +
 include/linux/sched/topology.h | 1 -
 init/Kconfig | 46 +
 kernel/Kconfig.hz | 4 +
 kernel/sched/core.c | 24 +-
 kernel/sched/debug.c | 13 +-
 kernel/sched/fair.c | 1304 +++++++++++++++----
 kernel/sched/features.h | 3 +-
 kernel/sched/pelt.c | 60 -
 kernel/sched/pelt.h | 80 +-
 kernel/sched/sched.h | 68 +-
 kernel/sysctl.c | 53 +
 15 files changed, 1340 insertions(+), 419 deletions(-)
 create mode 100644 Documentation/scheduler/sched-CacULE.rst

diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 98d1b198b2b4..545be5a65935 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -1094,6 +1094,10 @@ reboot-cmd (SPARC only)
 ROM/Flash boot loader. Maybe to tell it what to do after rebooting. ???

+sched_interactivity_factor (CacULE scheduler only)
+==================================================
+Sets the value *m* for interactivity score calculations. See
+Figure 1 in https://web.cs.ucdavis.edu/~roper/ecs150/ULE.pdf

 sched_energy_aware
 ==================

diff --git a/Documentation/scheduler/sched-CacULE.rst b/Documentation/scheduler/sched-CacULE.rst
new file mode 100644
index 000000000000..82b0847c468a
--- /dev/null
+++ b/Documentation/scheduler/sched-CacULE.rst
@@ -0,0 +1,76 @@
+======================================
+The CacULE Scheduler by Hamad Al Marri.
+======================================
+
+1. Overview
+=============
+
+The CacULE CPU scheduler is based on an interactivity score mechanism.
+The interactivity score is inspired by the ULE scheduler (the FreeBSD
+scheduler).
+
+1.1 About CacULE Scheduler
+--------------------------
+
+  - Each CPU has its own runqueue.
+
+  - The NORMAL runqueue is a linked list of sched_entities (instead of an
+    RB-tree).
+
+  - RT and other runqueues are the same as in CFS.
+
+  - A waking task preempts the currently running task if it is more
+    interactive, i.e. has a lower interactivity score (the lower the
+    score, the more interactive the task).
+
+
+1.2. Complexity
+----------------
+
+The complexity of enqueuing and dequeuing a task is O(1).
+
+The complexity of picking the next task is O(n), where n is the number of
+tasks in a runqueue (each CPU has its own runqueue).
+
+Note: O(n) sounds scary, but on a typical 4-CPU machine running desktop or
+mobile workloads the number of runnable tasks rarely exceeds 10 at
+pick-next time - sleeping tasks are excluded, since they are dequeued when
+they sleep and enqueued again when they wake up.
+
+
+2. The CacULE Interactivity Score
+=======================================================
+
+The interactivity score is inspired by the ULE scheduler (the FreeBSD
+scheduler). For more information see:
+https://web.cs.ucdavis.edu/~roper/ecs150/ULE.pdf
+CacULE does not replace CFS with ULE; it only replaces CFS' pick-next-task
+mechanism with ULE's interactivity score mechanism for choosing the next
+task to run.
+
+
+2.3 sched_interactivity_factor
+==============================
+Sets the value *m* for interactivity score calculations. See Figure 1 in
+https://web.cs.ucdavis.edu/~roper/ecs150/ULE.pdf
+The default value in CacULE is 10, which means that the maximum interactive
+score is 20 (since m = maximum interactive score / 2).
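As a rough worked example on this scale, using the score formula that
calc_interactivity() implements later in this patch (the sleep/run ratios
below are only illustrative):

  m = 10
  task slept 4x longer than it ran:  score = m / (sleep/run)      = 10 / 4       = 2  (interactive)
  task ran 4x longer than it slept:  score = 2m - m / (run/sleep) = 20 - (10/4)  = 18 (not interactive, integer division)

The lower the score, the more interactive the task, and the lower score wins
at pick-next time.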
+You can tune sched_interactivity_factor with the sysctl command:
+
+  sysctl kernel.sched_interactivity_factor=50
+
+This command changes the sched_interactivity_factor from 10 to 50.
+
+
+3. Scheduling policies
+=======================
+
+CacULE, the same as CFS, implements three scheduling policies:
+
+  - SCHED_NORMAL (traditionally called SCHED_OTHER): The scheduling
+    policy that is used for regular tasks.
+
+  - SCHED_BATCH: Does not preempt nearly as often as regular tasks
+    would, thereby allowing tasks to run longer and make better use of
+    caches, but at the cost of interactivity. This is well suited for
+    batch jobs.
+
+  - SCHED_IDLE: This is even weaker than nice 19, but it is not a true
+    idle timer scheduler, in order to avoid getting into priority
+    inversion problems which would deadlock the machine.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5affff14993d..850473545f54 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -544,10 +544,23 @@ struct sched_statistics {
 #endif /* CONFIG_SCHEDSTATS */
 } ____cacheline_aligned;

+#ifdef CONFIG_CACULE_SCHED
+struct cacule_node {
+ struct cacule_node* next;
+ struct cacule_node* prev;
+ u64 cacule_start_time;
+ u64 last_run;
+ u64 vruntime;
+};
+#endif
+
 struct sched_entity {
 /* For load-balancing: */
 struct load_weight load;
 struct rb_node run_node;
+#ifdef CONFIG_CACULE_SCHED
+ struct cacule_node cacule_node;
+#endif
 struct list_head group_node;
 unsigned int on_rq;
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 303ee7dd0c7e..45a3a06dba5f 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -14,6 +14,16 @@ extern unsigned long sysctl_hung_task_timeout_secs;
 enum { sysctl_hung_task_timeout_secs = 0 };
 #endif

+#ifdef CONFIG_CACULE_SCHED
+extern unsigned int interactivity_factor;
+extern unsigned int cacule_max_lifetime;
+extern unsigned int cache_factor;
+extern unsigned int cache_divisor;
+extern unsigned int starve_factor;
+extern unsigned int starve_divisor;
+extern int cacule_yield;
+#endif
+
 enum sched_tunable_scaling {
 SCHED_TUNABLESCALING_NONE,
 SCHED_TUNABLESCALING_LOG,
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 816df6cc444e..56cffe42abbc 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -81,7 +81,6 @@ struct sched_domain_shared {
 atomic_t ref;
 atomic_t nr_busy_cpus;
 int has_idle_cores;
- int nr_idle_scan;
 };

 struct sched_domain {
diff --git a/init/Kconfig b/init/Kconfig
index 5cf5c424fbf1..8e49189e96df 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -884,6 +884,51 @@ config UCLAMP_BUCKETS_COUNT

 endmenu

+config CACULE_SCHED
+ bool "CacULE CPU scheduler"
+ default y
+ help
+   The CacULE CPU scheduler is based on an interactivity score
+   mechanism. The interactivity score is inspired by the ULE
+   scheduler (the FreeBSD scheduler).
+
+   If unsure, say Y here.
+
+config CACULE_RDB
+ bool "RDB (Response Driven Balancer)"
+ default y
+ depends on CACULE_SCHED
+ help
+   This is an experimental load balancer for CacULE. It is a
+   lightweight load balancer that replaces the CFS load balancer.
+   It migrates tasks based on their interactivity scores.
+
+   If unsure, say Y here.
+
+config RDB_INTERVAL
+ int "RDB load balancer interval"
+ default 19
+ depends on CACULE_RDB
+ help
+   This interval controls how often the RDB load balancer runs.
+   trigger_load_balance() runs on every tick; for high HZ values this
+   can be overwhelming, and RDB load balancing involves rq locking,
+   which can reduce performance. The balance interval avoids running
+   the load balancer on every tick. For example, RDB_INTERVAL=3 runs
+   the load balancer at most every 3ms. The right setting depends on
+   HZ: if you want the load balancer to run every 2ms and HZ=500, an
+   interval is not needed and it is better to set RDB_INTERVAL=0,
+   since at 500HZ the tick already fires every 2ms (1000ms / 500 =
+   2ms). However, with HZ=1000, if you want to keep the load balancer
+   from running every 1ms, you could set RDB_INTERVAL=4 to make it
+   run every 4ms. Lower RDB_INTERVAL values (or 0 to disable the
+   interval) ensure tasks are balanced as soon as possible, at the
+   cost of locking/blocking time. Higher RDB_INTERVAL values relax
+   the balancing locking, at the cost of an imbalanced workload for
+   that period of time (e.g. with RDB_INTERVAL=100 there will be no
+   balancing for 100ms, except for newidle_balance, which is not
+   affected by RDB_INTERVAL).
+
+   If in doubt, use the default value.
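As a rough illustration of how such an interval maps onto jiffies - this
mirrors the trigger_load_balance() changes further down in this patch; the
helper below is only a sketch, not code from the patch:

	/* illustrative sketch: gate RDB balancing to CONFIG_RDB_INTERVAL ms */
	static bool rdb_interval_elapsed(struct rq *rq)
	{
		if (time_before(jiffies, rq->next_balance))
			return false;	/* interval not over yet, skip balancing */

		/* e.g. RDB_INTERVAL=19 at HZ=500 (2 ms/tick) re-arms ~10 ticks ahead */
		rq->next_balance = jiffies + msecs_to_jiffies(CONFIG_RDB_INTERVAL);
		return true;
	}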
+
 #
 # For architectures that want to enable the support for NUMA-affine scheduler
 # balancing logic:
@@ -1317,6 +1362,7 @@ config SCHED_AUTOGROUP
 select CGROUPS
 select CGROUP_SCHED
 select FAIR_GROUP_SCHED
+ default y
 help
 This option optimizes the scheduler for common desktop workloads by
 automatically creating and populating task groups. This separation
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 0f78364efd4f..56e597b52210 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -67,6 +67,9 @@ choice
 1000 Hz is the preferred choice for desktop systems and other
 systems requiring fast interactive responses to events.

+ config HZ_2000
+ bool "2000 HZ"
+
 endchoice

 config HZ
@@ -78,6 +81,7 @@ config HZ
 default 600 if HZ_600
 default 750 if HZ_750
 default 1000 if HZ_1000
+ default 2000 if HZ_2000

 config SCHED_HRTICK
 def_bool HIGH_RES_TIMERS
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ac4446588b5f..5dfa80afe151 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -723,7 +723,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
 update_irq_load_avg(rq, irq_delta + steal);
 #endif
- update_rq_clock_task_mult(rq, delta);
+ update_rq_clock_pelt(rq, delta);
 }

 void update_rq_clock(struct rq *rq)
@@ -4375,6 +4375,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 p->se.prev_sum_exec_runtime = 0;
 p->se.nr_migrations = 0;
 p->se.vruntime = 0;
+
+#ifdef CONFIG_CACULE_SCHED
+ p->se.cacule_node.vruntime = 0;
+#endif
+
 INIT_LIST_HEAD(&p->se.group_node);

 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -4730,6 +4735,10 @@ void wake_up_new_task(struct task_struct *p)
 update_rq_clock(rq);
 post_init_entity_util_avg(p);

+#ifdef CONFIG_CACULE_SCHED
+ p->se.cacule_node.cacule_start_time = sched_clock();
+#endif
+
 activate_task(rq, p, ENQUEUE_NOCLOCK);
 trace_sched_wakeup_new(p);
 check_preempt_curr(rq, p, WF_FORK);
@@ -5568,7 +5577,9 @@ static void sched_tick_remote(struct work_struct *work)
 struct rq *rq = cpu_rq(cpu);
 struct task_struct *curr;
 struct rq_flags rf;
+#if !defined(CONFIG_CACULE_SCHED)
 u64 delta;
+#endif
 int os;

 /*
 *
@@ -5588,6 +5599,7 @@ static void sched_tick_remote(struct work_struct *work)
 update_rq_clock(rq);

+#if !defined(CONFIG_CACULE_SCHED)
 if (!is_idle_task(curr)) {
 /*
 * Make sure the next tick runs within a reasonable
 * amount of time.
 */
 delta = rq_clock_task(rq) - curr->se.exec_start;
 WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
} +#endif + curr->sched_class->task_tick(rq, curr, 0); calc_load_nohz_remote(rq); @@ -9710,6 +9724,14 @@ void __init sched_init(void) BUG_ON(&dl_sched_class != &stop_sched_class + 1); #endif +#ifdef CONFIG_CACULE_SCHED +#ifdef CONFIG_CACULE_RDB + printk(KERN_INFO "CacULE CPU scheduler (RDB) v6.0 by Hamad Al Marri."); +#else + printk(KERN_INFO "CacULE CPU scheduler v6.0 by Hamad Al Marri."); +#endif +#endif + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index a8377d0e5ebd..a6b6cc625aac 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -582,8 +582,11 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, - spread, rq0_min_vruntime, spread0; + s64 MIN_vruntime = -1, max_vruntime = -1, +#if !defined(CONFIG_CACULE_SCHED) + min_vruntime, rq0_min_vruntime, spread0, +#endif + spread; struct rq *rq = cpu_rq(cpu); struct sched_entity *last; unsigned long flags; @@ -604,21 +607,27 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) last = __pick_last_entity(cfs_rq); if (last) max_vruntime = last->vruntime; +#if !defined(CONFIG_CACULE_SCHED) min_vruntime = cfs_rq->min_vruntime; rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; +#endif raw_spin_rq_unlock_irqrestore(rq, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", SPLIT_NS(MIN_vruntime)); +#if !defined(CONFIG_CACULE_SCHED) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", SPLIT_NS(min_vruntime)); +#endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", SPLIT_NS(max_vruntime)); spread = max_vruntime - MIN_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); +#if !defined(CONFIG_CACULE_SCHED) spread0 = min_vruntime - rq0_min_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", SPLIT_NS(spread0)); +#endif SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9d01f3f27ea9..81ac3cca8c89 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -19,6 +19,10 @@ * * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * + * CacULE enhancements CPU cache and scheduler based on + * Interactivity Score. 
+ * (C) 2020 Hamad Al Marri */ #include #include @@ -56,6 +60,21 @@ #include "stats.h" #include "autogroup.h" +#ifdef CONFIG_CACULE_SCHED +unsigned int __read_mostly cacule_max_lifetime = 11000; // in ms +unsigned int __read_mostly interactivity_factor = 65536; +int __read_mostly cacule_yield = 1; + +#define YIELD_MARK 0x8000000000000000ULL +#define YIELD_UNMARK 0x7FFFFFFFFFFFFFFFULL + +unsigned int __read_mostly cache_factor = 13107; +unsigned int __read_mostly cache_divisor = 1000000; // 1ms + +unsigned int __read_mostly starve_factor = 19660; +unsigned int __read_mostly starve_divisor = 3000000; // 3ms +#endif + /* * Targeted preemption latency for CPU-bound tasks: * @@ -138,7 +157,11 @@ unsigned int sysctl_sched_wakeup_granularity = 1000000UL; static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; #endif +#ifdef CONFIG_CACULE_SCHED +const_debug unsigned int sysctl_sched_migration_cost = 200000UL; +#else const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#endif int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) @@ -353,6 +376,14 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight const struct sched_class fair_sched_class; + +#ifdef CONFIG_CACULE_SCHED +static inline struct sched_entity *se_of(struct cacule_node *cn) +{ + return container_of(cn, struct sched_entity, cacule_node); +} +#endif + /************************************************************** * CFS operations on generic schedulable entities: */ @@ -579,7 +610,7 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); /************************************************************** * Scheduling class tree data structure manipulation methods: */ - +#if !defined(CONFIG_CACULE_SCHED) static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) { s64 delta = (s64)(vruntime - max_vruntime); @@ -631,15 +662,230 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) } /* ensure we never gain time by being placed backwards. */ - u64_u32_store(cfs_rq->min_vruntime, - max_vruntime(cfs_rq->min_vruntime, vruntime)); + cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); +#ifndef CONFIG_64BIT + smp_wmb(); + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; +#endif } static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) { return entity_before(__node_2_se(a), __node_2_se(b)); } +#endif /* CONFIG_CACULE_SCHED */ + +#ifdef CONFIG_CACULE_SCHED +static unsigned int +calc_interactivity(u64 now, struct cacule_node *se) +{ + u64 l_se, vr_se, sleep_se = 1ULL, u64_factor_m, _2m; + unsigned int score_se; + + /* + * in case of vruntime==0, logical OR with 1 would + * make sure that the least sig. 
bit is 1 + */ + l_se = now - se->cacule_start_time; + vr_se = se->vruntime | 1; + u64_factor_m = interactivity_factor; + _2m = u64_factor_m << 1; + + /* safety check */ + if (likely(l_se > vr_se)) + sleep_se = (l_se - vr_se) | 1; + + if (sleep_se >= vr_se) + score_se = u64_factor_m / (sleep_se / vr_se); + else + score_se = _2m - (u64_factor_m / (vr_se / sleep_se)); + + return score_se; +} + +static unsigned int +calc_cache_score(u64 now, struct cacule_node *cn) +{ + struct sched_entity *se = se_of(cn); + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 c_div = cache_divisor; + u64 cache_period = 1ULL; + u64 u64_factor_m = cache_factor; + u64 _2m = u64_factor_m << 1; + unsigned int score; + + if (!cache_factor) + return 0; + + if (se == cfs_rq->curr) + return 0; + + cache_period = (now - se->exec_start) | 1; + + if (c_div >= cache_period) + score = u64_factor_m / (c_div / cache_period); + else + score = _2m - (u64_factor_m / (cache_period / c_div)); + + return score; +} + +static unsigned int +calc_starve_score(u64 now, struct cacule_node *cn) +{ + struct sched_entity *se = se_of(cn); + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 s_div = starve_divisor; + u64 starving = 1ULL; + u64 u64_factor_m = starve_factor; + u64 _2m = u64_factor_m << 1; + unsigned int score; + + if (!starve_factor) + return 0; + + if (se == cfs_rq->curr) + return _2m; + + starving = (now - cn->last_run) | 1; + + if (s_div >= starving) + score = _2m - (u64_factor_m / (s_div / starving)); + else + score = u64_factor_m / (starving / s_div); + + return score; +} + +static inline int cn_has_idle_policy(struct cacule_node *cn) +{ + struct sched_entity *se = se_of(cn); + + if (!entity_is_task(se)) + return false; + + return task_has_idle_policy(task_of(se)); +} + +/* + * Does se have lower interactivity score value (i.e. interactive) than curr? 
If yes, return 1, + * otherwise return -1 + * se is before curr if se has lower interactivity score value + * the lower score, the more interactive + */ +static inline int +entity_before(u64 now, struct cacule_node *curr, struct cacule_node *se) +{ + unsigned int score_curr, score_se; + int diff; + int is_curr_idle = cn_has_idle_policy(curr); + int is_se_idle = cn_has_idle_policy(se); + + /* if curr is normal but se is idle class, then no */ + if (!is_curr_idle && is_se_idle) + return -1; + + /* if curr is idle class and se is normal, then yes */ + if (is_curr_idle && !is_se_idle) + return 1; + + score_curr = calc_interactivity(now, curr); + score_curr += calc_cache_score(now, curr); + score_curr += calc_starve_score(now, curr); + + score_se = calc_interactivity(now, se); + score_se += calc_cache_score(now, se); + score_se += calc_starve_score(now, se); + + diff = score_se - score_curr; + + if (diff < 0) + return 1; + + return -1; +} + +#ifdef CONFIG_CACULE_RDB +static void update_IS(struct rq *rq) +{ + struct list_head *tasks = &rq->cfs_tasks; + struct task_struct *p, *to_migrate = NULL; + unsigned int max_IS = ~0, temp_IS; + + list_for_each_entry(p, tasks, se.group_node) { + if (task_running(rq, p)) + continue; + + temp_IS = calc_interactivity(sched_clock(), &p->se.cacule_node); + if (temp_IS < max_IS) { + to_migrate = p; + max_IS = temp_IS; + } + } + + if (to_migrate) { + WRITE_ONCE(rq->max_IS_score, max_IS); + WRITE_ONCE(rq->to_migrate_task, to_migrate); + } else if (rq->max_IS_score != ~0) { + WRITE_ONCE(rq->max_IS_score, ~0); + WRITE_ONCE(rq->to_migrate_task, NULL); + } +} +#endif + +/* + * Enqueue an entity + */ +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *_se) +{ + struct cacule_node *se = &(_se->cacule_node); + + se->next = NULL; + se->prev = NULL; + + if (cfs_rq->head) { + // insert se at head + se->next = cfs_rq->head; + cfs_rq->head->prev = se; + + // lastly reset the head + cfs_rq->head = se; + } else { + // if empty rq + cfs_rq->head = se; + } +} + +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *_se) +{ + struct cacule_node *se = &(_se->cacule_node); + + // if only one se in rq + if (cfs_rq->head->next == NULL) { + cfs_rq->head = NULL; + } else if (se == cfs_rq->head) { + // if it is the head + cfs_rq->head = cfs_rq->head->next; + cfs_rq->head->prev = NULL; + } else { + // if in the middle + struct cacule_node *prev = se->prev; + struct cacule_node *next = se->next; + + prev->next = next; + if (next) + next->prev = prev; + } +} + +struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) +{ + if (!cfs_rq->head) + return NULL; + return se_of(cfs_rq->head); +} +#else /* * Enqueue an entity into the rb-tree: */ @@ -672,16 +918,24 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) return __node_2_se(next); } +#endif /* CONFIG_CACULE_SCHED */ #ifdef CONFIG_SCHED_DEBUG struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { +#ifdef CONFIG_CACULE_SCHED + if (!cfs_rq->head) + return NULL; + + return se_of(cfs_rq->head); +#else struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); if (!last) return NULL; return __node_2_se(last); +#endif /* CONFIG_CACULE_SCHED */ } /************************************************************** @@ -782,6 +1036,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) return slice; } +#if !defined(CONFIG_CACULE_SCHED) /* * We calculate the vruntime slice of a to-be-inserted task. 
* @@ -791,6 +1046,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) { return calc_delta_fair(sched_slice(cfs_rq, se), se); } +#endif /* CONFIG_CACULE_SCHED */ #include "pelt.h" #ifdef CONFIG_SMP @@ -894,14 +1150,55 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) } #endif /* CONFIG_SMP */ +#ifdef CONFIG_CACULE_SCHED +static void normalize_lifetime(u64 now, struct sched_entity *se) +{ + struct cacule_node *cn = &se->cacule_node; + u64 max_life_ns, life_time, old_hrrn_x; + s64 diff; + + /* + * left shift 20 bits is approximately = * 1000000 + * we don't need the precision of life time + * Ex. for 30s, with left shift (20bits) == 31.457s + */ + max_life_ns = ((u64) cacule_max_lifetime) << 20; + life_time = now - cn->cacule_start_time; + diff = life_time - max_life_ns; + + if (diff > 0) { + // unmark YIELD. No need to check or remark since + // this normalize action doesn't happen very often + cn->vruntime &= YIELD_UNMARK; + + // multiply life_time by 1024 for more precision + old_hrrn_x = (life_time << 7) / ((cn->vruntime >> 3) | 1); + + // reset life to half max_life (i.e ~15s) + cn->cacule_start_time = now - (max_life_ns >> 1); + + // avoid division by zero + if (old_hrrn_x == 0) old_hrrn_x = 1; + + // reset vruntime based on old hrrn ratio + cn->vruntime = (max_life_ns << 9) / old_hrrn_x; + } +} +#endif /* CONFIG_CACULE_SCHED */ + /* * Update the current task's runtime statistics. */ static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; +#ifdef CONFIG_CACULE_SCHED + u64 now = sched_clock(); + u64 delta_exec, delta_fair; +#else u64 now = rq_clock_task(rq_of(cfs_rq)); u64 delta_exec; +#endif if (unlikely(!curr)) return; @@ -923,8 +1220,16 @@ static void update_curr(struct cfs_rq *cfs_rq) curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq->exec_clock, delta_exec); +#ifdef CONFIG_CACULE_SCHED + curr->cacule_node.last_run = now; + delta_fair = calc_delta_fair(delta_exec, curr); + curr->vruntime += delta_fair; + curr->cacule_node.vruntime += delta_fair; + normalize_lifetime(now, curr); +#else curr->vruntime += calc_delta_fair(delta_exec, curr); update_min_vruntime(cfs_rq); +#endif if (entity_is_task(curr)) { struct task_struct *curtask = task_of(curr); @@ -1025,7 +1330,6 @@ update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int fl static inline void update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - if (!schedstat_enabled()) return; @@ -1060,7 +1364,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * We are starting a new run period: */ +#ifdef CONFIG_CACULE_SCHED + se->exec_start = sched_clock(); + se->cacule_node.last_run = sched_clock(); +#else se->exec_start = rq_clock_task(rq_of(cfs_rq)); +#endif } /************************************************** @@ -3864,34 +4173,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) } #ifdef CONFIG_SMP -static inline bool load_avg_is_decayed(struct sched_avg *sa) -{ - if (sa->load_sum) - return false; - - if (sa->util_sum) - return false; - - if (sa->runnable_sum) - return false; - - /* - * _avg must be null when _sum are null because _avg = _sum / divider - * Make sure that rounding and/or propagation of PELT values never - * break this. 
- */ - SCHED_WARN_ON(sa->load_avg || - sa->util_avg || - sa->runnable_avg); - - return true; -} - -static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) -{ - return u64_u32_load_copy(cfs_rq->avg.last_update_time, - cfs_rq->last_update_time_copy); -} #ifdef CONFIG_FAIR_GROUP_SCHED /* * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list @@ -3924,12 +4205,27 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) if (cfs_rq->load.weight) return false; - if (!load_avg_is_decayed(&cfs_rq->avg)) + if (cfs_rq->avg.load_sum) + return false; + + if (cfs_rq->avg.util_sum) + return false; + + if (cfs_rq->avg.runnable_sum) return false; if (child_cfs_rq_on_list(cfs_rq)) return false; + /* + * _avg must be null when _sum are null because _avg = _sum / divider + * Make sure that rounding and/or propagation of PELT values never + * break this. + */ + SCHED_WARN_ON(cfs_rq->avg.load_avg || + cfs_rq->avg.util_avg || + cfs_rq->avg.runnable_avg); + return true; } @@ -3987,9 +4283,27 @@ void set_task_rq_fair(struct sched_entity *se, if (!(se->avg.last_update_time && prev)) return; - p_last_update_time = cfs_rq_last_update_time(prev); - n_last_update_time = cfs_rq_last_update_time(next); +#ifndef CONFIG_64BIT + { + u64 p_last_update_time_copy; + u64 n_last_update_time_copy; + + do { + p_last_update_time_copy = prev->load_last_update_time_copy; + n_last_update_time_copy = next->load_last_update_time_copy; + + smp_rmb(); + + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; + } while (p_last_update_time != p_last_update_time_copy || + n_last_update_time != n_last_update_time_copy); + } +#else + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; +#endif __update_load_avg_blocked_se(p_last_update_time, se); se->avg.last_update_time = n_last_update_time; } @@ -4268,89 +4582,6 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum #endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_NO_HZ_COMMON -static inline void migrate_se_pelt_lag(struct sched_entity *se) -{ - u64 throttled = 0, now, lut; - struct cfs_rq *cfs_rq; - struct rq *rq; - bool is_idle; - - if (load_avg_is_decayed(&se->avg)) - return; - - cfs_rq = cfs_rq_of(se); - rq = rq_of(cfs_rq); - - rcu_read_lock(); - is_idle = is_idle_task(rcu_dereference(rq->curr)); - rcu_read_unlock(); - - /* - * The lag estimation comes with a cost we don't want to pay all the - * time. Hence, limiting to the case where the source CPU is idle and - * we know we are at the greatest risk to have an outdated clock. 
- */ - if (!is_idle) - return; - - /* - * Estimated "now" is: last_update_time + cfs_idle_lag + rq_idle_lag, where: - * - * last_update_time (the cfs_rq's last_update_time) - * = cfs_rq_clock_pelt()@cfs_rq_idle - * = rq_clock_pelt()@cfs_rq_idle - * - cfs->throttled_clock_pelt_time@cfs_rq_idle - * - * cfs_idle_lag (delta between rq's update and cfs_rq's update) - * = rq_clock_pelt()@rq_idle - rq_clock_pelt()@cfs_rq_idle - * - * rq_idle_lag (delta between now and rq's update) - * = sched_clock_cpu() - rq_clock()@rq_idle - * - * We can then write: - * - * now = rq_clock_pelt()@rq_idle - cfs->throttled_clock_pelt_time + - * sched_clock_cpu() - rq_clock()@rq_idle - * Where: - * rq_clock_pelt()@rq_idle is rq->clock_pelt_idle - * rq_clock()@rq_idle is rq->clock_idle - * cfs->throttled_clock_pelt_time@cfs_rq_idle - * is cfs_rq->throttled_pelt_idle - */ - -#ifdef CONFIG_CFS_BANDWIDTH - throttled = u64_u32_load(cfs_rq->throttled_pelt_idle); - /* The clock has been stopped for throttling */ - if (throttled == U64_MAX) - return; -#endif - now = u64_u32_load(rq->clock_pelt_idle); - /* - * Paired with _update_idle_rq_clock_pelt(). It ensures at the worst case - * is observed the old clock_pelt_idle value and the new clock_idle, - * which lead to an underestimation. The opposite would lead to an - * overestimation. - */ - smp_rmb(); - lut = cfs_rq_last_update_time(cfs_rq); - - now -= throttled; - if (now < lut) - /* - * cfs_rq->avg.last_update_time is more recent than our - * estimation, let's use it. - */ - now = lut; - else - now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->clock_idle); - - __update_load_avg_blocked_se(now, se); -} -#else -static void migrate_se_pelt_lag(struct sched_entity *se) {} -#endif - /** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages * @now: current time, as per cfs_rq_clock_pelt() @@ -4424,9 +4655,12 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) } decayed |= __update_load_avg_cfs_rq(now, cfs_rq); - u64_u32_store_copy(sa->last_update_time, - cfs_rq->last_update_time_copy, - sa->last_update_time); + +#ifndef CONFIG_64BIT + smp_wmb(); + cfs_rq->load_last_update_time_copy = sa->last_update_time; +#endif + return decayed; } @@ -4566,6 +4800,27 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s } } +#ifndef CONFIG_64BIT +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + u64 last_update_time_copy; + u64 last_update_time; + + do { + last_update_time_copy = cfs_rq->load_last_update_time_copy; + smp_rmb(); + last_update_time = cfs_rq->avg.last_update_time; + } while (last_update_time != last_update_time_copy); + + return last_update_time; +} +#else +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.last_update_time; +} +#endif + /* * Synchronize entity load avg of dequeued entity without locking * the previous rq. 
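The !CONFIG_64BIT helpers restored in the hunks above pair the PELT
timestamp with a *_copy field so a 64-bit value is never read torn on
32-bit targets. A condensed sketch of that writer/reader pairing, lifted
from update_cfs_rq_load_avg() and cfs_rq_last_update_time() above (only
the framing around the two snippets is illustrative):

	/* writer side (update_cfs_rq_load_avg() above) */
	decayed |= __update_load_avg_cfs_rq(now, cfs_rq);	/* updates sa->last_update_time */
	smp_wmb();						/* publish the value before the copy */
	cfs_rq->load_last_update_time_copy = sa->last_update_time;

	/* reader side (cfs_rq_last_update_time() above) */
	u64 copy, lut;
	do {
		copy = cfs_rq->load_last_update_time_copy;
		smp_rmb();					/* pairs with the smp_wmb() above */
		lut = cfs_rq->avg.last_update_time;
	} while (lut != copy);					/* a racing write makes them differ; retry */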
@@ -4983,7 +5238,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) { -#ifdef CONFIG_SCHED_DEBUG +#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_CACULE_SCHED) s64 d = se->vruntime - cfs_rq->min_vruntime; if (d < 0) @@ -4994,6 +5249,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) #endif } +#if !defined(CONFIG_CACULE_SCHED) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { @@ -5030,6 +5286,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) /* ensure we never gain time by being placed backwards. */ se->vruntime = max_vruntime(se->vruntime, vruntime); } +#endif /* CONFIG_CACULE_SCHED */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq); @@ -5068,18 +5325,23 @@ static inline bool cfs_bandwidth_used(void); static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { +#if !defined(CONFIG_CACULE_SCHED) bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED); +#endif bool curr = cfs_rq->curr == se; +#if !defined(CONFIG_CACULE_SCHED) /* * If we're the current task, we must renormalise before calling * update_curr(). */ if (renorm && curr) se->vruntime += cfs_rq->min_vruntime; +#endif update_curr(cfs_rq); +#if !defined(CONFIG_CACULE_SCHED) /* * Otherwise, renormalise after, such that we're placed at the current * moment in time, instead of some random moment in the past. Being @@ -5088,6 +5350,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ if (renorm && !curr) se->vruntime += cfs_rq->min_vruntime; +#endif /* * When enqueuing a sched_entity, we must: @@ -5103,8 +5366,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_cfs_group(se); account_entity_enqueue(cfs_rq, se); +#if !defined(CONFIG_CACULE_SCHED) if (flags & ENQUEUE_WAKEUP) place_entity(cfs_rq, se, 0); +#endif check_schedstat_required(); update_stats_enqueue_fair(cfs_rq, se, flags); @@ -5120,6 +5385,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) } } +#if !defined(CONFIG_CACULE_SCHED) static void __clear_buddies_last(struct sched_entity *se) { for_each_sched_entity(se) { @@ -5164,6 +5430,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) if (cfs_rq->skip == se) __clear_buddies_skip(se); } +#endif /* !CONFIG_CACULE_SCHED */ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); @@ -5194,13 +5461,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_stats_dequeue_fair(cfs_rq, se, flags); +#if !defined(CONFIG_CACULE_SCHED) clear_buddies(cfs_rq, se); +#endif if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; account_entity_dequeue(cfs_rq, se); +#if !defined(CONFIG_CACULE_SCHED) /* * Normalize after update_curr(); which will also have moved * min_vruntime if @se is the one holding it back. 
But before doing @@ -5209,12 +5479,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ if (!(flags & DEQUEUE_SLEEP)) se->vruntime -= cfs_rq->min_vruntime; +#endif /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); update_cfs_group(se); +#if !defined(CONFIG_CACULE_SCHED) /* * Now advance min_vruntime if @se was the entity holding it back, * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be @@ -5223,11 +5495,23 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) update_min_vruntime(cfs_rq); - - if (cfs_rq->nr_running == 0) - update_idle_cfs_rq_clock_pelt(cfs_rq); +#endif } +#ifdef CONFIG_CACULE_SCHED +static struct sched_entity * +pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr); + +/* + * Preempt the current task with a newly woken task if needed: + */ +static void +check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +{ + if (pick_next_entity(cfs_rq, curr) != curr) + resched_curr(rq_of(cfs_rq)); +} +#else /* * Preempt the current task with a newly woken task if needed: */ @@ -5273,11 +5557,14 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (delta > ideal_runtime) resched_curr(rq_of(cfs_rq)); } +#endif /* CONFIG_CACULE_SCHED */ static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { +#if !defined(CONFIG_CACULE_SCHED) clear_buddies(cfs_rq, se); +#endif /* 'current' is not kept within the tree. */ if (se->on_rq) { @@ -5312,6 +5599,31 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } +#ifdef CONFIG_CACULE_SCHED +static struct sched_entity * +pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) +{ + struct cacule_node *se = cfs_rq->head; + struct cacule_node *next; + u64 now = sched_clock(); + + if (!se) + return curr; + + next = se->next; + while (next) { + if (entity_before(now, se, next) == 1) + se = next; + + next = next->next; + } + + if (curr && entity_before(now, se, &curr->cacule_node) == 1) + return curr; + + return se_of(se); +} +#else static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); @@ -5370,6 +5682,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) return se; } +#endif /* CONFIG_CACULE_SCHED */ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); @@ -6485,9 +6798,15 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) assert_list_leaf_cfs_rq(rq); hrtick_update(rq); + +#ifdef CONFIG_CACULE_RDB + update_IS(rq); +#endif } +#if !defined(CONFIG_CACULE_SCHED) static void set_next_buddy(struct sched_entity *se); +#endif /* * The dequeue_task method is called before nr_running is @@ -6522,12 +6841,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ se = parent_entity(se); +#if !defined(CONFIG_CACULE_SCHED) /* * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. 
*/ if (task_sleep && se && !throttled_hierarchy(cfs_rq)) set_next_buddy(se); +#endif break; } flags |= DEQUEUE_SLEEP; @@ -6562,6 +6883,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) dequeue_throttle: util_est_update(&rq->cfs, p, task_sleep); hrtick_update(rq); + +#ifdef CONFIG_CACULE_RDB + update_IS(rq); +#endif } #ifdef CONFIG_SMP @@ -6647,6 +6972,7 @@ static unsigned long capacity_of(int cpu) return cpu_rq(cpu)->cpu_capacity; } +#if !defined(CONFIG_CACULE_SCHED) static void record_wakee(struct task_struct *p) { /* @@ -6693,6 +7019,7 @@ static int wake_wide(struct task_struct *p) return 0; return 1; } +#endif /* CONFIG_CACULE_SCHED */ /* * The purpose of wake_affine() is to quickly determine on which CPU we can run @@ -7056,7 +7383,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; - struct sched_domain_shared *sd_share; struct rq *this_rq = this_rq(); int this = smp_processor_id(); struct sched_domain *this_sd = NULL; @@ -7096,17 +7422,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool time = cpu_clock(this); } - if (sched_feat(SIS_UTIL)) { - sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); - if (sd_share) { - /* because !--nr is the condition to stop scan */ - nr = READ_ONCE(sd_share->nr_idle_scan) + 1; - /* overloaded LLC is unlikely to have idle cpu/core */ - if (nr == 1) - return -1; - } - } - for_each_cpu_wrap(cpu, cpus, target + 1) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); @@ -7394,6 +7709,7 @@ struct energy_env { unsigned long pd_cap; }; +#if !defined(CONFIG_CACULE_SCHED) /* * Compute the task busy time for compute_energy(). This time cannot be * injected directly into effective_cpu_util() because of the IRQ scaling. @@ -7699,6 +8015,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) return target; } +#endif /* CONFIG_CACULE_SCHED */ /* * select_task_rq_fair: Select target runqueue for the waking task in domains @@ -7725,6 +8042,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) * required for stable ->cpus_allowed */ lockdep_assert_held(&p->pi_lock); + +#if !defined(CONFIG_CACULE_SCHED) if (wake_flags & WF_TTWU) { record_wakee(p); @@ -7737,6 +8056,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); } +#endif /* CONFIG_CACULE_SCHED */ rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -7783,8 +8103,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) */ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { - struct sched_entity *se = &p->se; - +#if !defined(CONFIG_CACULE_SCHED) /* * As blocked tasks retain absolute vruntime the migration needs to * deal with this by subtracting the old and adding the new @@ -7792,32 +8111,44 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) * the task on the new runqueue. 
*/ if (READ_ONCE(p->__state) == TASK_WAKING) { + struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 min_vruntime; + +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; - se->vruntime -= u64_u32_load(cfs_rq->min_vruntime); + do { + min_vruntime_copy = cfs_rq->min_vruntime_copy; + smp_rmb(); + min_vruntime = cfs_rq->min_vruntime; + } while (min_vruntime != min_vruntime_copy); +#else + min_vruntime = cfs_rq->min_vruntime; +#endif + + se->vruntime -= min_vruntime; } +#endif /* CONFIG_CACULE_SCHED */ if (!task_on_rq_migrating(p)) { - remove_entity_load_avg(se); /* - * Here, the task's PELT values have been updated according to - * the current rq's clock. But if that clock hasn't been - * updated in a while, a substantial idle time will be missed, - * leading to an inflation after wake-up on the new rq. - * - * Estimate the missing time from the cfs_rq last_update_time - * and update sched_avg to improve the PELT continuity after - * migration. + * We are supposed to update the task to "current" time, then + * its up to date and ready to go to new CPU/cfs_rq. But we + * have difficulty in getting what current time is, so simply + * throw away the out-of-date time. This will result in the + * wakee task is less decayed, but giving the wakee more load + * sounds not bad. */ - migrate_se_pelt_lag(se); + remove_entity_load_avg(&p->se); } /* Tell new CPU we are migrated */ - se->avg.last_update_time = 0; + p->se.avg.last_update_time = 0; /* We have migrated, no longer consider this task hot */ - se->exec_start = 0; + p->se.exec_start = 0; update_scan_period(p, new_cpu); } @@ -7837,6 +8168,7 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } #endif /* CONFIG_SMP */ +#if !defined(CONFIG_CACULE_SCHED) static unsigned long wakeup_gran(struct sched_entity *se) { unsigned long gran = sysctl_sched_wakeup_granularity; @@ -7913,6 +8245,7 @@ static void set_skip_buddy(struct sched_entity *se) for_each_sched_entity(se) cfs_rq_of(se)->skip = se; } +#endif /* CONFIG_CACULE_SCHED */ /* * Preempt the current task with a newly woken task if needed: @@ -7921,9 +8254,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; + +#if !defined(CONFIG_CACULE_SCHED) struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; int next_buddy_marked = 0; +#endif /* CONFIG_CACULE_SCHED */ int cse_is_idle, pse_is_idle; if (unlikely(se == pse)) @@ -7938,10 +8274,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) return; +#if !defined(CONFIG_CACULE_SCHED) if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { set_next_buddy(pse); next_buddy_marked = 1; } +#endif /* CONFIG_CACULE_SCHED */ /* * We can come here with TIF_NEED_RESCHED already set from new task @@ -7984,6 +8322,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; update_curr(cfs_rq_of(se)); +#ifdef CONFIG_CACULE_SCHED + if (entity_before(sched_clock(), &se->cacule_node, &pse->cacule_node) == 1) + goto preempt; +#else if (wakeup_preempt_entity(se, pse) == 1) { /* * Bias pick_next to pick the sched entity that is @@ -7993,11 +8335,14 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ set_next_buddy(pse); goto preempt; } +#endif /* CONFIG_CACULE_SCHED */ return; preempt: resched_curr(rq); + +#if 
!defined(CONFIG_CACULE_SCHED) /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved @@ -8012,6 +8357,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) set_last_buddy(se); +#endif /* CONFIG_CACULE_SCHED */ } #ifdef CONFIG_SMP @@ -8043,6 +8389,10 @@ static struct task_struct *pick_task_fair(struct rq *rq) cfs_rq = group_cfs_rq(se); } while (cfs_rq); +#ifdef CONFIG_CACULE_RDB + update_IS(rq); +#endif + return task_of(se); } #endif @@ -8106,6 +8456,11 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf cfs_rq = group_cfs_rq(se); } while (cfs_rq); + /* + * Here we picked a sched_entity starting from + * the same group of curr, but the task could + * be a child of the selected sched_entity. + */ p = task_of(se); /* @@ -8116,6 +8471,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf if (prev != p) { struct sched_entity *pse = &prev->se; + /* while se and pse are not in the same group */ while (!(cfs_rq = is_same_group(se, pse))) { int se_depth = se->depth; int pse_depth = pse->depth; @@ -8130,6 +8486,9 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf } } + /* Here we reached the point were both + * sched_entities are in the same group. + */ put_prev_entity(cfs_rq, pse); set_next_entity(cfs_rq, se); } @@ -8140,6 +8499,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf if (prev) put_prev_task(rq, prev); + /* Going down the hierarchy */ do { se = pick_next_entity(cfs_rq, NULL); set_next_entity(cfs_rq, se); @@ -8149,6 +8509,15 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf p = task_of(se); done: __maybe_unused; +#ifdef CONFIG_CACULE_SCHED + if (prev) + prev->se.cacule_node.vruntime &= YIELD_UNMARK; + +#ifdef CONFIG_CACULE_RDB + update_IS(rq); +#endif + +#endif #ifdef CONFIG_SMP /* * Move the next running task to the front of @@ -8166,6 +8535,11 @@ done: __maybe_unused; return p; idle: +#ifdef CONFIG_CACULE_RDB + WRITE_ONCE(rq->max_IS_score, ~0); + WRITE_ONCE(rq->to_migrate_task, NULL); +#endif + if (!rf) return NULL; @@ -8219,7 +8593,15 @@ static void yield_task_fair(struct rq *rq) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); + +#ifdef CONFIG_CACULE_SCHED + struct cacule_node *cn = &curr->se.cacule_node; + + if (cacule_yield) + cn->vruntime |= YIELD_MARK; +#else struct sched_entity *se = &curr->se; +#endif /* * Are we the only task in the tree? @@ -8227,7 +8609,9 @@ static void yield_task_fair(struct rq *rq) if (unlikely(rq->nr_running == 1)) return; +#if !defined(CONFIG_CACULE_SCHED) clear_buddies(cfs_rq, se); +#endif if (curr->policy != SCHED_BATCH) { update_rq_clock(rq); @@ -8243,7 +8627,9 @@ static void yield_task_fair(struct rq *rq) rq_clock_skip_update(rq); } +#if !defined(CONFIG_CACULE_SCHED) set_skip_buddy(se); +#endif } static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) @@ -8254,8 +8640,10 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) return false; +#if !defined(CONFIG_CACULE_SCHED) /* Tell the scheduler that we'd really like pse to run next. 
*/ set_next_buddy(se); +#endif yield_task_fair(rq); @@ -8463,6 +8851,7 @@ struct lb_env { struct list_head tasks; }; +#if !defined(CONFIG_CACULE_RDB) /* * Is this task likely cache-hot: */ @@ -8482,6 +8871,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) if (env->sd->flags & SD_SHARE_CPUCAPACITY) return 0; +#if !defined(CONFIG_CACULE_SCHED) /* * Buddy candidates are cache hot: */ @@ -8489,6 +8879,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) (&p->se == cfs_rq_of(&p->se)->next || &p->se == cfs_rq_of(&p->se)->last)) return 1; +#endif if (sysctl_sched_migration_cost == -1) return 1; @@ -8892,6 +9283,7 @@ static void attach_tasks(struct lb_env *env) rq_unlock(env->dst_rq, &rf); } +#endif #ifdef CONFIG_NO_HZ_COMMON static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) @@ -8941,6 +9333,7 @@ static inline void update_blocked_load_tick(struct rq *rq) {} static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} #endif +#if !defined(CONFIG_CACULE_RDB) static bool __update_blocked_others(struct rq *rq, bool *done) { const struct sched_class *curr_class; @@ -8966,9 +9359,11 @@ static bool __update_blocked_others(struct rq *rq, bool *done) return decayed; } +#endif #ifdef CONFIG_FAIR_GROUP_SCHED +#if !defined(CONFIG_CACULE_RDB) static bool __update_blocked_fair(struct rq *rq, bool *done) { struct cfs_rq *cfs_rq, *pos; @@ -8985,9 +9380,6 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { update_tg_load_avg(cfs_rq); - if (cfs_rq->nr_running == 0) - update_idle_cfs_rq_clock_pelt(cfs_rq); - if (cfs_rq == &rq->cfs) decayed = true; } @@ -9011,6 +9403,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) return decayed; } +#endif /* * Compute the hierarchical load factor for cfs_rq and all its ascendants. @@ -9077,6 +9470,7 @@ static unsigned long task_h_load(struct task_struct *p) } #endif +#if !defined(CONFIG_CACULE_RDB) static void update_blocked_averages(int cpu) { bool decayed = false, done = true; @@ -9095,6 +9489,7 @@ static void update_blocked_averages(int cpu) cpufreq_update_util(rq, 0); rq_unlock_irqrestore(rq, &rf); } +#endif /********** Helpers for find_busiest_group ************************/ @@ -10131,77 +10526,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) return idlest; } -static void update_idle_cpu_scan(struct lb_env *env, - unsigned long sum_util) -{ - struct sched_domain_shared *sd_share; - int llc_weight, pct; - u64 x, y, tmp; - /* - * Update the number of CPUs to scan in LLC domain, which could - * be used as a hint in select_idle_cpu(). The update of sd_share - * could be expensive because it is within a shared cache line. - * So the write of this hint only occurs during periodic load - * balancing, rather than CPU_NEWLY_IDLE, because the latter - * can fire way more frequently than the former. - */ - if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE) - return; - - llc_weight = per_cpu(sd_llc_size, env->dst_cpu); - if (env->sd->span_weight != llc_weight) - return; - - sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu)); - if (!sd_share) - return; - - /* - * The number of CPUs to search drops as sum_util increases, when - * sum_util hits 85% or above, the scan stops. - * The reason to choose 85% as the threshold is because this is the - * imbalance_pct(117) when a LLC sched group is overloaded. 
- * - * let y = SCHED_CAPACITY_SCALE - p * x^2 [1] - * and y'= y / SCHED_CAPACITY_SCALE - * - * x is the ratio of sum_util compared to the CPU capacity: - * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE) - * y' is the ratio of CPUs to be scanned in the LLC domain, - * and the number of CPUs to scan is calculated by: - * - * nr_scan = llc_weight * y' [2] - * - * When x hits the threshold of overloaded, AKA, when - * x = 100 / pct, y drops to 0. According to [1], - * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000 - * - * Scale x by SCHED_CAPACITY_SCALE: - * x' = sum_util / llc_weight; [3] - * - * and finally [1] becomes: - * y = SCHED_CAPACITY_SCALE - - * x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE) [4] - * - */ - /* equation [3] */ - x = sum_util; - do_div(x, llc_weight); - - /* equation [4] */ - pct = env->sd->imbalance_pct; - tmp = x * x * pct * pct; - do_div(tmp, 10000 * SCHED_CAPACITY_SCALE); - tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE); - y = SCHED_CAPACITY_SCALE - tmp; - - /* equation [2] */ - y *= llc_weight; - do_div(y, SCHED_CAPACITY_SCALE); - if ((int)y != sd_share->nr_idle_scan) - WRITE_ONCE(sd_share->nr_idle_scan, (int)y); -} - /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. @@ -10214,7 +10538,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; - unsigned long sum_util = 0; int sg_status = 0; do { @@ -10247,7 +10570,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; - sum_util += sgs->group_util; sg = sg->next; } while (sg != env->sd->groups); @@ -10273,8 +10595,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED); trace_sched_overutilized_tp(rd, SG_OVERUTILIZED); } - - update_idle_cpu_scan(env, sum_util); } /** @@ -10460,6 +10780,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * different in groups. */ +#if !defined(CONFIG_CACULE_RDB) /** * find_busiest_group - Returns the busiest group within the sched_domain * if there is an imbalance. @@ -10733,6 +11054,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, return busiest; } +#endif /* * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but @@ -10769,6 +11091,7 @@ imbalanced_active_balance(struct lb_env *env) return 0; } +#if !defined(CONFIG_CACULE_RDB) static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; @@ -11107,6 +11430,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, out: return ld_moved; } +#endif static inline unsigned long get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) @@ -11145,6 +11469,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance) *next_balance = next; } +#if !defined(CONFIG_CACULE_RDB) /* * active_load_balance_cpu_stop is run by the CPU stopper. It pushes * running tasks off the busiest CPU onto idle CPUs. It requires at @@ -11230,6 +11555,7 @@ static int active_load_balance_cpu_stop(void *data) } static DEFINE_SPINLOCK(balancing); +#endif /* * Scale the max load_balance interval with the number of CPUs in the system. 
@@ -11264,6 +11590,7 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) return false; } +#if !defined(CONFIG_CACULE_RDB) /* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. @@ -11351,6 +11678,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) rq->next_balance = next_balance; } +#endif static inline int on_null_domain(struct rq *rq) { @@ -11386,6 +11714,7 @@ static inline int find_new_ilb(void) return nr_cpu_ids; } +#if !defined(CONFIG_CACULE_RDB) /* * Kick a CPU to do the nohz balancing, if it is time for it. We pick any * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). @@ -11539,6 +11868,7 @@ static void nohz_balancer_kick(struct rq *rq) if (flags) kick_ilb(flags); } +#endif /* CONFIG_CACULE_RDB */ static void set_cpu_sd_state_busy(int cpu) { @@ -11660,11 +11990,17 @@ static bool update_nohz_stats(struct rq *rq) if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick))) return true; +#if !defined(CONFIG_CACULE_RDB) update_blocked_averages(cpu); +#endif return rq->has_blocked_load; } +#ifdef CONFIG_CACULE_RDB +static int idle_try_pull_any(struct cfs_rq *cfs_rq); +#endif + /* * Internal function that runs load balance for all idle cpus. The load balance * can be a simple update of blocked load or a complete load balance with @@ -11742,7 +12078,11 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) rq_unlock_irqrestore(rq, &rf); if (flags & NOHZ_BALANCE_KICK) +#if !defined(CONFIG_CACULE_RDB) rebalance_domains(rq, CPU_IDLE); +#else + idle_try_pull_any(&rq->cfs); +#endif } if (time_after(next_balance, rq->next_balance)) { @@ -11769,6 +12109,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) WRITE_ONCE(nohz.has_blocked, 1); } +#if !defined(CONFIG_CACULE_RDB) /* * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the * rebalancing for all the cpus for whom scheduler ticks are stopped. @@ -11789,6 +12130,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) return true; } +#endif /* * Check if we need to run the ILB for updating blocked load before entering @@ -11819,9 +12161,11 @@ static void nohz_newidle_balance(struct rq *this_rq) if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED)) return; +#if !defined(CONFIG_CACULE_SCHED) /* Will wake up very soon. No time for doing anything else*/ if (this_rq->avg_idle < sysctl_sched_migration_cost) return; +#endif /* Don't need to update blocked load of idle CPUs*/ if (!READ_ONCE(nohz.has_blocked) || @@ -11836,16 +12180,150 @@ static void nohz_newidle_balance(struct rq *this_rq) } #else /* !CONFIG_NO_HZ_COMMON */ +#if !defined(CONFIG_CACULE_RDB) static inline void nohz_balancer_kick(struct rq *rq) { } static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { return false; } +#endif static inline void nohz_newidle_balance(struct rq *this_rq) { } + #endif /* CONFIG_NO_HZ_COMMON */ +#ifdef CONFIG_CACULE_RDB +static int +can_migrate_task(struct task_struct *p, int dst_cpu, struct rq *src_rq) +{ + if (task_running(src_rq, p)) + return 0; + + /* Disregard pcpu kthreads; they are where they need to be. */ + if (kthread_is_per_cpu(p)) + return 0; + + if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr)) + return 0; + + if (p->se.exec_start == 0) + return 0; + + /* + * Don't migrate task if the task's cookie does not match + * with the destination CPU's core cookie. 
+ */ + if (!sched_core_cookie_match(cpu_rq(dst_cpu), p)) + return 0; + + return 1; +} + +static void push_to_unlock(struct rq *this_rq, + struct rq *dst_rq, + struct task_struct *p, + int dst_cpu) +{ + struct rq_flags rf; + + // detach task + deactivate_task(this_rq, p, DEQUEUE_NOCLOCK); + set_task_cpu(p, dst_cpu); + + // unlock this rq + raw_spin_unlock(&this_rq->__lock); + + /* push to */ + rq_lock_irqsave(dst_rq, &rf); + update_rq_clock(dst_rq); + + activate_task(dst_rq, p, ENQUEUE_NOCLOCK); + check_preempt_curr(dst_rq, p, 0); + + // unlock src rq + rq_unlock(dst_rq, &rf); + local_irq_restore(rf.flags); +} + +static void pull_from_unlock(struct rq *this_rq, + struct rq *src_rq, + struct rq_flags *rf, + struct task_struct *p, + int dst_cpu) +{ + // detach task + deactivate_task(src_rq, p, DEQUEUE_NOCLOCK); + set_task_cpu(p, dst_cpu); + + // unlock src rq + rq_unlock(src_rq, rf); + local_irq_restore(rf->flags); + + // lock this rq + raw_spin_lock(&this_rq->__lock); + update_rq_clock(this_rq); + + activate_task(this_rq, p, ENQUEUE_NOCLOCK); + check_preempt_curr(this_rq, p, 0); + + // unlock this rq + raw_spin_unlock(&this_rq->__lock); +} + +static inline struct rq * +find_max_IS_rq(struct rq *this_rq, int dst_cpu) +{ + struct rq *tmp_rq, *max_rq = NULL; + int cpu; + unsigned int max_IS = this_rq->max_IS_score; + unsigned int local_IS; + + // find max hrrn + for_each_online_cpu(cpu) { + if (cpu == dst_cpu) + continue; + + tmp_rq = cpu_rq(cpu); + + if (tmp_rq->nr_running < 2 || !(READ_ONCE(tmp_rq->to_migrate_task))) + continue; + + local_IS = READ_ONCE(tmp_rq->max_IS_score); + + if (local_IS < max_IS) { + max_IS = local_IS; + max_rq = tmp_rq; + } + } + + return max_rq; +} + +static int try_pull_from(struct rq *src_rq, struct rq *this_rq) +{ + struct rq_flags rf; + int dst_cpu = cpu_of(this_rq); + struct task_struct *p; + + rq_lock_irqsave(src_rq, &rf); + update_rq_clock(src_rq); + + if (src_rq->to_migrate_task && src_rq->nr_running > 1) { + p = src_rq->to_migrate_task; + + if (can_migrate_task(p, dst_cpu, src_rq)) { + pull_from_unlock(this_rq, src_rq, &rf, p, dst_cpu); + return 1; + } + } + + rq_unlock(src_rq, &rf); + local_irq_restore(rf.flags); + + return 0; +} + /* * newidle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. @@ -11856,6 +12334,111 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * > 0 - success, new (fair) tasks present */ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) +{ + int this_cpu = this_rq->cpu; + struct task_struct *p = NULL; + struct rq *src_rq; + int src_cpu; + struct rq_flags src_rf; + int pulled_task = 0; + int cores_round = 1; + + update_misfit_status(NULL, this_rq); + /* + * We must set idle_stamp _before_ calling idle_balance(), such that we + * measure the duration of idle_balance() as idle time. + */ + this_rq->idle_stamp = rq_clock(this_rq); + + /* + * Do not pull tasks towards !active CPUs... + */ + if (!cpu_active(this_cpu)) + return 0; + + /* + * This is OK, because current is on_cpu, which avoids it being picked + * for load-balance and preemption/IRQs are still disabled avoiding + * further scheduler activity on it and we're being very careful to + * re-start the picking loop. 
+ */ + rq_unpin_lock(this_rq, rf); + raw_spin_unlock(&this_rq->__lock); + +again: + for_each_online_cpu(src_cpu) { + + if (src_cpu == this_cpu) + continue; + + if (cores_round && !cpus_share_cache(src_cpu, this_cpu)) + continue; + + src_rq = cpu_rq(src_cpu); + + if (src_rq->nr_running < 2 + || !(READ_ONCE(src_rq->to_migrate_task))) + continue; + + rq_lock_irqsave(src_rq, &src_rf); + update_rq_clock(src_rq); + + if (src_rq->nr_running < 2 || !(src_rq->to_migrate_task)) + goto next; + + p = src_rq->to_migrate_task; + + if (can_migrate_task(p, this_cpu, src_rq)) { + pull_from_unlock(this_rq, src_rq, &src_rf, p, this_cpu); + + pulled_task = 1; + goto out; + } + +next: + rq_unlock(src_rq, &src_rf); + local_irq_restore(src_rf.flags); + + /* + * Stop searching for tasks to pull if there are + * now runnable tasks on this rq. + */ + if (pulled_task || this_rq->nr_running > 0) + goto out; + } + + if (cores_round) { + // now search for all cpus + cores_round = 0; + goto again; + } + +out: + raw_spin_lock(&this_rq->__lock); + + /* + * While browsing the domains, we released the rq lock, a task could + * have been enqueued in the meantime. Since we're not going idle, + * pretend we pulled a task. + */ + if (this_rq->cfs.h_nr_running && !pulled_task) + pulled_task = 1; + + /* Is there a task of a high priority class? */ + if (this_rq->nr_running != this_rq->cfs.h_nr_running) + pulled_task = -1; + + if (pulled_task) + this_rq->idle_stamp = 0; + else + nohz_newidle_balance(this_rq); + + rq_repin_lock(this_rq, rf); + + return pulled_task; +} +#else +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; @@ -12020,6 +12603,217 @@ void trigger_load_balance(struct rq *rq) nohz_balancer_kick(rq); } +#endif + +#ifdef CONFIG_CACULE_RDB +static int +idle_try_pull_any(struct cfs_rq *cfs_rq) +{ + struct task_struct *p = NULL; + struct rq *this_rq = rq_of(cfs_rq), *src_rq; + int dst_cpu = cpu_of(this_rq); + int src_cpu; + struct rq_flags rf; + int pulled = 0; + int cores_round = 1; + +again: + for_each_online_cpu(src_cpu) { + + if (src_cpu == dst_cpu) + continue; + + if (cores_round && !cpus_share_cache(src_cpu, dst_cpu)) + continue; + + src_rq = cpu_rq(src_cpu); + + if (src_rq->nr_running < 2 + || !(READ_ONCE(src_rq->to_migrate_task))) + continue; + + rq_lock_irqsave(src_rq, &rf); + update_rq_clock(src_rq); + + if (src_rq->nr_running < 2 || !(src_rq->to_migrate_task)) + goto next; + + p = src_rq->to_migrate_task; + + if (can_migrate_task(p, dst_cpu, src_rq)) { + pull_from_unlock(this_rq, src_rq, &rf, p, dst_cpu); + pulled = 1; + goto out; + } + +next: + rq_unlock(src_rq, &rf); + local_irq_restore(rf.flags); + } + + if (cores_round) { + // now search for all cpus + cores_round = 0; + goto again; + } + +out: + return pulled; +} + + +static int +try_pull_higher_IS(struct rq *this_rq) +{ + struct rq *max_rq; + int dst_cpu = cpu_of(this_rq); + + max_rq = find_max_IS_rq(this_rq, dst_cpu); + + if (!max_rq) + return 0; + + if (try_pull_from(max_rq, this_rq)) + return 1; + + return 0; +} + +static void try_push_any(struct rq *this_rq) +{ + struct task_struct *p = NULL; + struct rq *dst_rq; + int dst_cpu; + int src_cpu = cpu_of(this_rq); + int cores_round = 1; + +again: + for_each_online_cpu(dst_cpu) { + + if (dst_cpu == src_cpu) + continue; + + if (cores_round && !cpus_share_cache(src_cpu, dst_cpu)) + continue; + + dst_rq = cpu_rq(dst_cpu); + + if (dst_rq->nr_running >= this_rq->nr_running - 1) + continue; + + // lock this rq + 
raw_spin_lock(&this_rq->__lock); + update_rq_clock(this_rq); + + if (!this_rq->to_migrate_task) { + // unlock this rq + raw_spin_unlock(&this_rq->__lock); + return; + } + + p = this_rq->to_migrate_task; + + if (can_migrate_task(p, dst_cpu, this_rq)) { + push_to_unlock(this_rq, dst_rq, p, dst_cpu); + return; + } + + // unlock this rq + raw_spin_unlock(&this_rq->__lock); + } + + if (cores_round) { + // now search for all cpus + cores_round = 0; + goto again; + } +} + +static void try_pull_any(struct rq *this_rq) +{ + struct task_struct *p = NULL; + struct rq *src_rq; + int dst_cpu = cpu_of(this_rq); + int src_cpu; + struct rq_flags src_rf; + int cores_round = 1; + unsigned int this_max_IS = this_rq->max_IS_score; + +again: + for_each_online_cpu(src_cpu) { + + if (src_cpu == dst_cpu) + continue; + + if (cores_round && !cpus_share_cache(src_cpu, dst_cpu)) + continue; + + src_rq = cpu_rq(src_cpu); + + p = READ_ONCE(src_rq->to_migrate_task); + if (src_rq->nr_running < 2 || !p + || READ_ONCE(src_rq->max_IS_score) >= this_max_IS) + continue; + + rq_lock_irqsave(src_rq, &src_rf); + update_rq_clock(src_rq); + + if (src_rq->nr_running < 2 || !(src_rq->to_migrate_task) + || src_rq->max_IS_score >= this_max_IS) + goto next; + + p = src_rq->to_migrate_task; + + if (can_migrate_task(p, dst_cpu, src_rq)) { + pull_from_unlock(this_rq, src_rq, &src_rf, p, dst_cpu); + return; + } + +next: + rq_unlock(src_rq, &src_rf); + local_irq_restore(src_rf.flags); + } + + if (cores_round) { + // now search for all cpus + cores_round = 0; + goto again; + } +} + +static inline void +active_balance(struct rq *rq) +{ + if (rq->nr_running < 2) + try_pull_higher_IS(rq); + else { + try_push_any(rq); + try_pull_any(rq); + } +} + +void trigger_load_balance(struct rq *rq) +{ + unsigned long interval; + +#ifdef CONFIG_RDB_INTERVAL + if (time_before(jiffies, rq->next_balance)) + return; +#endif + + if (rq->idle_balance) + idle_try_pull_any(&rq->cfs); + else { + active_balance(rq); + +#ifdef CONFIG_RDB_INTERVAL + /* scale ms to jiffies */ + interval = msecs_to_jiffies(CONFIG_RDB_INTERVAL); + rq->next_balance = jiffies + interval; +#endif + } +} +#endif static void rq_online_fair(struct rq *rq) { @@ -12087,7 +12881,9 @@ static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forc cfs_rq->forceidle_seq = fi_seq; } +#if !defined(CONFIG_CACULE_SCHED) cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime; +#endif } } @@ -12142,8 +12938,12 @@ bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) * min_vruntime_fi, which would have been updated in prior calls * to se_fi_update(). 
*/ +#ifdef CONFIG_CACULE_SCHED + delta = (s64)(sea->vruntime - seb->vruntime); +#else delta = (s64)(sea->vruntime - seb->vruntime) + (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi); +#endif return delta > 0; } @@ -12169,6 +12969,10 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } +#ifdef CONFIG_CACULE_RDB + update_IS(rq); +#endif + if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); @@ -12178,11 +12982,28 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_core(rq, curr); } +#ifdef CONFIG_CACULE_SCHED /* * called on fork with the child task as argument from the parent's context * - child not yet on the tasklist * - preemption disabled */ + static void task_fork_fair(struct task_struct *p) +{ + struct cfs_rq *cfs_rq; + struct rq *rq = this_rq(); + struct rq_flags rf; + + rq_lock(rq, &rf); + update_rq_clock(rq); + + cfs_rq = task_cfs_rq(current); + if (cfs_rq->curr) + update_curr(cfs_rq); + + rq_unlock(rq, &rf); +} +#else static void task_fork_fair(struct task_struct *p) { struct cfs_rq *cfs_rq; @@ -12213,6 +13034,7 @@ static void task_fork_fair(struct task_struct *p) se->vruntime -= cfs_rq->min_vruntime; rq_unlock(rq, &rf); } +#endif /* CONFIG_CACULE_SCHED */ /* * Priority of the task has changed. Check to see if we preempt @@ -12337,6 +13159,8 @@ static void attach_entity_cfs_rq(struct sched_entity *se) static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; + +#if !defined(CONFIG_CACULE_SCHED) struct cfs_rq *cfs_rq = cfs_rq_of(se); if (!vruntime_normalized(p)) { @@ -12347,6 +13171,7 @@ static void detach_task_cfs_rq(struct task_struct *p) place_entity(cfs_rq, se, 0); se->vruntime -= cfs_rq->min_vruntime; } +#endif detach_entity_cfs_rq(se); } @@ -12354,12 +13179,17 @@ static void detach_task_cfs_rq(struct task_struct *p) static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; + +#if !defined(CONFIG_CACULE_SCHED) struct cfs_rq *cfs_rq = cfs_rq_of(se); +#endif attach_entity_cfs_rq(se); +#if !defined(CONFIG_CACULE_SCHED) if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; +#endif } static void switched_from_fair(struct rq *rq, struct task_struct *p) @@ -12415,10 +13245,21 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; - u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20))); + +#if !defined(CONFIG_CACULE_SCHED) + cfs_rq->min_vruntime = (u64)(-(1LL << 20)); +#ifndef CONFIG_64BIT + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; +#endif +#endif /* CONFIG_CACULE_SCHED */ + #ifdef CONFIG_SMP raw_spin_lock_init(&cfs_rq->removed.lock); #endif + +#ifdef CONFIG_CACULE_SCHED + cfs_rq->head = NULL; +#endif } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -12823,7 +13664,6 @@ __init void init_sched_fair_class(void) zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); } - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); #ifdef CONFIG_NO_HZ_COMMON diff --git a/kernel/sched/features.h b/kernel/sched/features.h index ee7f23c76bd3..1cf435bbcd9c 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -60,8 +60,7 @@ SCHED_FEAT(TTWU_QUEUE, true) /* * When doing wakeups, attempt to limit superfluous scans of the LLC domain. 
*/ -SCHED_FEAT(SIS_PROP, false) -SCHED_FEAT(SIS_UTIL, true) +SCHED_FEAT(SIS_PROP, true) /* * Issue a WARN when we do multiple update_rq_clock() calls diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 036b0e2cd2b4..0f310768260c 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -467,63 +467,3 @@ int update_irq_load_avg(struct rq *rq, u64 running) return ret; } #endif - -__read_mostly unsigned int sched_pelt_lshift; - -#ifdef CONFIG_SYSCTL -static unsigned int sysctl_sched_pelt_multiplier = 1; - -int sched_pelt_multiplier(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) -{ - static DEFINE_MUTEX(mutex); - unsigned int old; - int ret; - - mutex_lock(&mutex); - old = sysctl_sched_pelt_multiplier; - ret = proc_dointvec(table, write, buffer, lenp, ppos); - if (ret) - goto undo; - if (!write) - goto done; - - switch (sysctl_sched_pelt_multiplier) { - case 1: - fallthrough; - case 2: - fallthrough; - case 4: - WRITE_ONCE(sched_pelt_lshift, - sysctl_sched_pelt_multiplier >> 1); - goto done; - default: - ret = -EINVAL; - } - -undo: - sysctl_sched_pelt_multiplier = old; -done: - mutex_unlock(&mutex); - - return ret; -} - -static struct ctl_table sched_pelt_sysctls[] = { - { - .procname = "sched_pelt_multiplier", - .data = &sysctl_sched_pelt_multiplier, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_pelt_multiplier, - }, - {} -}; - -static int __init sched_pelt_sysctl_init(void) -{ - register_sysctl_init("kernel", sched_pelt_sysctls); - return 0; -} -late_initcall(sched_pelt_sysctl_init); -#endif diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 9b35b5072bae..4ff2ed4f8fa1 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -61,33 +61,6 @@ static inline void cfs_se_util_change(struct sched_avg *avg) WRITE_ONCE(avg->util_est.enqueued, enqueued); } -static inline u64 rq_clock_task_mult(struct rq *rq) -{ - lockdep_assert_rq_held(rq); - assert_clock_updated(rq); - - return rq->clock_task_mult; -} - -static inline u64 rq_clock_pelt(struct rq *rq) -{ - lockdep_assert_rq_held(rq); - assert_clock_updated(rq); - - return rq->clock_pelt - rq->lost_idle_time; -} - -/* The rq is idle, we can sync to clock_task */ -static inline void _update_idle_rq_clock_pelt(struct rq *rq) -{ - rq->clock_pelt = rq_clock_task_mult(rq); - - u64_u32_store(rq->clock_idle, rq_clock(rq)); - /* Paired with smp_rmb in migrate_se_pelt_lag() */ - smp_wmb(); - u64_u32_store(rq->clock_pelt_idle, rq_clock_pelt(rq)); -} - /* * The clock_pelt scales the time to reflect the effective amount of * computation done during the running delta time but then sync back to @@ -103,7 +76,8 @@ static inline void _update_idle_rq_clock_pelt(struct rq *rq) static inline void update_rq_clock_pelt(struct rq *rq, s64 delta) { if (unlikely(is_idle_task(rq->curr))) { - _update_idle_rq_clock_pelt(rq); + /* The rq is idle, we can sync to clock_task */ + rq->clock_pelt = rq_clock_task(rq); return; } @@ -129,27 +103,6 @@ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta) rq->clock_pelt += delta; } -extern unsigned int sched_pelt_lshift; - -/* - * absolute time |1 |2 |3 |4 |5 |6 | - * @ mult = 1 --------****************--------****************- - * @ mult = 2 --------********----------------********--------- - * @ mult = 4 --------****--------------------****------------- - * clock task mult - * @ mult = 2 | | |2 |3 | | | | |5 |6 | | | - * @ mult = 4 | | | | |2|3| | | | | | | | | | |5|6| | | | | | | - * - */ -static inline void 
update_rq_clock_task_mult(struct rq *rq, s64 delta) -{ - delta <<= READ_ONCE(sched_pelt_lshift); - - rq->clock_task_mult += delta; - - update_rq_clock_pelt(rq, delta); -} - /* * When rq becomes idle, we have to check if it has lost idle time * because it was fully busy. A rq is fully used when the /Sum util_sum @@ -176,24 +129,18 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq) * rq's clock_task. */ if (util_sum >= divider) - rq->lost_idle_time += rq_clock_task_mult(rq) - rq->clock_pelt; - - _update_idle_rq_clock_pelt(rq); + rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt; } -#ifdef CONFIG_CFS_BANDWIDTH -static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) +static inline u64 rq_clock_pelt(struct rq *rq) { - u64 throttled; - - if (unlikely(cfs_rq->throttle_count)) - throttled = U64_MAX; - else - throttled = cfs_rq->throttled_clock_pelt_time; + lockdep_assert_rq_held(rq); + assert_clock_updated(rq); - u64_u32_store(cfs_rq->throttled_pelt_idle, throttled); + return rq->clock_pelt - rq->lost_idle_time; } +#ifdef CONFIG_CFS_BANDWIDTH /* rq->task_clock normalized against any time this cfs_rq has spent throttled */ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { @@ -203,7 +150,6 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; } #else -static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { return rq_clock_pelt(rq_of(cfs_rq)); @@ -247,23 +193,17 @@ update_irq_load_avg(struct rq *rq, u64 running) return 0; } -static inline u64 rq_clock_task_mult(struct rq *rq) -{ - return rq_clock_task(rq); -} - static inline u64 rq_clock_pelt(struct rq *rq) { - return rq_clock_task_mult(rq); + return rq_clock_task(rq); } static inline void -update_rq_clock_task_mult(struct rq *rq, s64 delta) { } +update_rq_clock_pelt(struct rq *rq, s64 delta) { } static inline void update_idle_rq_clock_pelt(struct rq *rq) { } -static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2c89aaa9200f..a45f4e7439c3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -507,45 +507,6 @@ extern void unregister_rt_sched_group(struct task_group *tg); extern void free_rt_sched_group(struct task_group *tg); extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); -/* - * u64_u32_load/u64_u32_store - * - * Use a copy of a u64 value to protect against data race. This is only - * applicable for 32-bits architectures. - */ -#ifdef CONFIG_64BIT -# define u64_u32_load_copy(var, copy) var -# define u64_u32_store_copy(var, copy, val) (var = val) -#else -# define u64_u32_load_copy(var, copy) \ -({ \ - u64 __val, __val_copy; \ - do { \ - __val_copy = copy; \ - /* \ - * paired with u64_u32_store_copy(), ordering access \ - * to var and copy. \ - */ \ - smp_rmb(); \ - __val = var; \ - } while (__val != __val_copy); \ - __val; \ -}) -# define u64_u32_store_copy(var, copy, val) \ -do { \ - typeof(val) __val = (val); \ - var = __val; \ - /* \ - * paired with u64_u32_load_copy(), ordering access to var and \ - * copy. 
\ - */ \ - smp_wmb(); \ - copy = __val; \ -} while (0) -#endif -# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy) -# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val) - /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; @@ -555,6 +516,12 @@ struct cfs_rq { unsigned int idle_h_nr_running; /* SCHED_IDLE */ u64 exec_clock; + +#ifdef CONFIG_CACULE_SCHED +#ifdef CONFIG_SCHED_CORE + unsigned int forceidle_seq; +#endif +#else u64 min_vruntime; #ifdef CONFIG_SCHED_CORE unsigned int forceidle_seq; @@ -564,6 +531,7 @@ struct cfs_rq { #ifndef CONFIG_64BIT u64 min_vruntime_copy; #endif +#endif /* CONFIG_CACULE_SCHED */ struct rb_root_cached tasks_timeline; @@ -572,9 +540,13 @@ struct cfs_rq { * It is set to NULL otherwise (i.e when none are currently running). */ struct sched_entity *curr; +#ifdef CONFIG_CACULE_SCHED + struct cacule_node *head; +#else struct sched_entity *next; struct sched_entity *last; struct sched_entity *skip; +#endif // CONFIG_CACULE_SCHED #ifdef CONFIG_SCHED_DEBUG unsigned int nr_spread_over; @@ -586,7 +558,7 @@ struct cfs_rq { */ struct sched_avg avg; #ifndef CONFIG_64BIT - u64 last_update_time_copy; + u64 load_last_update_time_copy; #endif struct { raw_spinlock_t lock ____cacheline_aligned; @@ -635,10 +607,6 @@ struct cfs_rq { int runtime_enabled; s64 runtime_remaining; - u64 throttled_pelt_idle; -#ifndef CONFIG_64BIT - u64 throttled_pelt_idle_copy; -#endif u64 throttled_clock; u64 throttled_clock_pelt; u64 throttled_clock_pelt_time; @@ -991,6 +959,11 @@ struct rq { struct rt_rq rt; struct dl_rq dl; +#ifdef CONFIG_CACULE_RDB + unsigned int max_IS_score; + struct task_struct *to_migrate_task; +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ struct list_head leaf_cfs_rq_list; @@ -1015,15 +988,8 @@ struct rq { u64 clock; /* Ensure that all clocks are in the same cache line */ u64 clock_task ____cacheline_aligned; - u64 clock_task_mult; u64 clock_pelt; unsigned long lost_idle_time; - u64 clock_pelt_idle; - u64 clock_idle; -#ifndef CONFIG_64BIT - u64 clock_pelt_idle_copy; - u64 clock_idle_copy; -#endif atomic_t nr_iowait; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1c7c7c953876..f17155143eac 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1636,6 +1636,59 @@ int proc_do_static_key(struct ctl_table *table, int write, } static struct ctl_table kern_table[] = { +#ifdef CONFIG_CACULE_SCHED + { + .procname = "sched_interactivity_factor", + .data = &interactivity_factor, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_max_lifetime_ms", + .data = &cacule_max_lifetime, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_cache_factor", + .data = &cache_factor, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_cache_divisor", + .data = &cache_divisor, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_starve_factor", + .data = &starve_factor, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_starve_divisor", + .data = &starve_divisor, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_cacule_yield", + .data = &cacule_yield, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 
= SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif #ifdef CONFIG_NUMA_BALANCING { .procname = "numa_balancing", -- 2.39.0.rc2.1.gbd5df96b79
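
Editor's note (not part of the patch): the kern_table hunk above exports the CacULE/RDB tunables through procfs. The sketch below is a minimal user-space illustration, assuming a kernel built with CONFIG_CACULE_SCHED=y so that the /proc/sys/kernel/sched_* files exist; the file names mirror the .procname fields added above, but the program itself is hypothetical and only shows how the knobs could be read and written.

    /*
     * Illustrative only: read and adjust the CacULE sysctl knobs added by
     * this patch via procfs. Assumes CONFIG_CACULE_SCHED=y; otherwise the
     * files are absent and fopen() fails with ENOENT. Writing needs root.
     */
    #include <stdio.h>
    #include <string.h>
    #include <errno.h>

    static int read_knob(const char *name, unsigned int *val)
    {
            char path[128];
            FILE *f;

            snprintf(path, sizeof(path), "/proc/sys/kernel/%s", name);
            f = fopen(path, "r");
            if (!f)
                    return -errno;
            if (fscanf(f, "%u", val) != 1) {
                    fclose(f);
                    return -EINVAL;
            }
            fclose(f);
            return 0;
    }

    static int write_knob(const char *name, unsigned int val)
    {
            char path[128];
            FILE *f;

            snprintf(path, sizeof(path), "/proc/sys/kernel/%s", name);
            f = fopen(path, "w");   /* requires root */
            if (!f)
                    return -errno;
            fprintf(f, "%u\n", val);
            fclose(f);
            return 0;
    }

    int main(void)
    {
            /* .procname entries added to kern_table by this patch */
            const char *knobs[] = {
                    "sched_interactivity_factor",
                    "sched_max_lifetime_ms",
                    "sched_cache_factor",
                    "sched_cache_divisor",
                    "sched_starve_factor",
                    "sched_starve_divisor",
                    "sched_cacule_yield",
            };
            unsigned int v;
            size_t i;

            for (i = 0; i < sizeof(knobs) / sizeof(knobs[0]); i++) {
                    if (read_knob(knobs[i], &v) == 0)
                            printf("%-28s = %u\n", knobs[i], v);
                    else
                            printf("%-28s = <not available>\n", knobs[i]);
            }

            /*
             * Example: toggle the CacULE yield shortcut. The patch registers
             * sched_cacule_yield with proc_dointvec_minmax and a
             * SYSCTL_ZERO..SYSCTL_ONE range, so only 0 or 1 is accepted.
             */
            if (write_knob("sched_cacule_yield", 0) != 0)
                    fprintf(stderr, "sched_cacule_yield not writable: %s\n",
                            strerror(errno));

            return 0;
    }

The same effect can be had from a shell with sysctl(8), e.g. sysctl kernel.sched_cacule_yield=0, since these entries live in the "kernel" sysctl directory registered by kern_table.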