From f093ad7cee74b1820f2cb8d76aacc5b4827a8b0d Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 1 May 2023 10:19:24 +0200 Subject: [PATCH] tt-cachy Signed-off-by: Peter Jung --- include/linux/sched.h | 24 +- include/linux/sysctl.h | 5 + init/Kconfig | 7 + kernel/Kconfig.preempt | 1 + kernel/reboot.c | 8 + kernel/sched/core.c | 136 +++- kernel/sched/debug.c | 16 + kernel/sched/fair.c | 1510 +++++++++++++++++++++++++++++++++++++++- kernel/sched/idle.c | 13 +- kernel/sched/sched.h | 55 ++ kernel/sysctl.c | 5 + 11 files changed, 1738 insertions(+), 42 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 20756534fb47..dbcc3a52160c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -545,6 +545,23 @@ struct sched_statistics { #endif /* CONFIG_SCHEDSTATS */ } ____cacheline_aligned; +#ifdef CONFIG_TT_SCHED +struct tt_node { + struct tt_node* next; + struct tt_node* prev; + unsigned int task_type; + u64 vruntime; + u64 start_time; + + u64 prev_wait_time; + u64 wait_time; + u64 prev_burst; + u64 curr_burst; + u64 burst; + unsigned int rt_sticky; +}; +#endif + struct sched_entity { /* For load-balancing: */ struct load_weight load; @@ -554,9 +571,14 @@ struct sched_entity { u64 exec_start; u64 sum_exec_runtime; - u64 vruntime; u64 prev_sum_exec_runtime; +#ifdef CONFIG_TT_SCHED + struct tt_node tt_node; +#endif + + u64 vruntime; + u64 nr_migrations; u64 prev_sleep_sum_runtime; /* average duration of a task */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 780690dc08cd..81645f7d767a 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -53,6 +53,11 @@ struct ctl_dir; #define SYSCTL_MAXOLDUID ((void *)&sysctl_vals[10]) #define SYSCTL_NEG_ONE ((void *)&sysctl_vals[11]) +#ifdef CONFIG_TT_SCHED +#define SYSCTL_NEG_TWENTY ((void *)&sysctl_vals[12]) +#define SYSCTL_NINETEEN ((void *)&sysctl_vals[13]) +#endif + extern const int sysctl_vals[]; #define SYSCTL_LONG_ZERO ((void *)&sysctl_long_vals[0]) diff --git a/init/Kconfig b/init/Kconfig index 908d045dbe10..f9636ae447bb 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -121,6 +121,10 @@ config THREAD_INFO_IN_TASK One subtle change that will be needed is to use try_get_task_stack() and put_task_stack() in save_thread_stack_tsk() and get_wchan(). +config TT_SCHED + bool "TT Scheduler" + default y + menu "General setup" config CACHY @@ -1018,6 +1022,7 @@ config CGROUP_WRITEBACK menuconfig CGROUP_SCHED bool "CPU controller" + depends on !TT_SCHED default n help This feature lets CPU scheduler recognize task groups and control CPU @@ -1316,6 +1321,8 @@ config CHECKPOINT_RESTORE config SCHED_AUTOGROUP bool "Automatic process group scheduling" + default n + depends on !TT_SCHED select CGROUPS select CGROUP_SCHED select FAIR_GROUP_SCHED diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index c2f1fd95a821..92d9bbbd78b2 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -118,6 +118,7 @@ config PREEMPT_DYNAMIC config SCHED_CORE bool "Core Scheduling for SMT" depends on SCHED_SMT + depends on !TT_SCHED help This option permits Core Scheduling, a means of coordinated task selection across SMT siblings. 
When enabled -- see diff --git a/kernel/reboot.c b/kernel/reboot.c index 3bba88c7ffc6..c670fc1ae3df 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -23,6 +23,10 @@ * this indicates whether you can reboot with ctrl-alt-del: the default is yes */ +#ifdef CONFIG_TT_SCHED +extern int tt_dedicated_cpu_bound_enabled; +#endif + static int C_A_D = 1; struct pid *cad_pid; EXPORT_SYMBOL(cad_pid); @@ -230,6 +234,10 @@ void migrate_to_reboot_cpu(void) /* The boot cpu is always logical cpu 0 */ int cpu = reboot_cpu; +#ifdef CONFIG_TT_SCHED + tt_dedicated_cpu_bound_enabled = 0; +#endif + cpu_hotplug_disable(); /* Make certain the cpu I'm about to reboot on is online */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d66810d5f207..79d103c0af6e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3031,6 +3031,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, cpumask_and(rq->scratch_mask, ctx->new_mask, p->user_cpus_ptr)) ctx->new_mask = rq->scratch_mask; +#ifdef CONFIG_TT_SCHED + if (rq->cfs.dedicated_cpu_bound == &p->se.tt_node) + rq->cfs.dedicated_cpu_bound = NULL; +#endif + return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf); } @@ -3161,6 +3166,20 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p) WARN_ON_ONCE(ret); } +#ifdef CONFIG_TT_SCHED +inline void inc_nr_lat_sensitive(unsigned int cpu) +{ + if (tt_lat_sens_enabled) + per_cpu(nr_lat_sensitive, cpu)++; +} + +inline void dec_nr_lat_sensitive(unsigned int cpu) +{ + if (per_cpu(nr_lat_sensitive, cpu)) + per_cpu(nr_lat_sensitive, cpu)--; +} +#endif + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { #ifdef CONFIG_SCHED_DEBUG @@ -3206,6 +3225,12 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); if (task_cpu(p) != new_cpu) { +#ifdef CONFIG_TT_SCHED + if (task_is_lat_sensitive(p)) { + dec_nr_lat_sensitive(task_cpu(p)); + inc_nr_lat_sensitive(new_cpu); + } +#endif if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; @@ -4615,6 +4640,78 @@ static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, #ifdef CONFIG_SYSCTL static struct ctl_table sched_core_sysctls[] = { +#ifdef CONFIG_TT_SCHED + { + .procname = "sched_tt_balancer_opt", + .data = &tt_balancer_opt, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_THREE, + }, + { + .procname = "sched_tt_max_lifetime", + .data = &tt_max_lifetime, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_tt_rt_prio", + .data = &tt_rt_prio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_NEG_TWENTY, + .extra2 = SYSCTL_NINETEEN, + }, + { + .procname = "sched_tt_interactive_prio", + .data = &tt_interactive_prio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_NEG_TWENTY, + .extra2 = SYSCTL_NINETEEN, + }, + { + .procname = "sched_tt_cpu_bound_prio", + .data = &tt_cpu_bound_prio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_NEG_TWENTY, + .extra2 = SYSCTL_NINETEEN, + }, + { + .procname = "sched_tt_batch_prio", + .data = &tt_batch_prio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_NEG_TWENTY, + .extra2 = SYSCTL_NINETEEN, + }, + { + .procname = "sched_tt_lat_sens_enabled", + .data =
&tt_lat_sens_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_tt_dedicated_cpu_bound_enabled", + .data = &tt_dedicated_cpu_bound_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif #ifdef CONFIG_SCHEDSTATS { .procname = "sched_schedstats", @@ -4797,7 +4894,9 @@ void wake_up_new_task(struct task_struct *p) { struct rq_flags rf; struct rq *rq; - +#ifdef CONFIG_TT_SCHED + int target_cpu = 0; +#endif raw_spin_lock_irqsave(&p->pi_lock, rf.flags); WRITE_ONCE(p->__state, TASK_RUNNING); #ifdef CONFIG_SMP @@ -4811,12 +4910,25 @@ void wake_up_new_task(struct task_struct *p) */ p->recent_used_cpu = task_cpu(p); rseq_migrate(p); +#ifdef CONFIG_TT_SCHED + target_cpu = select_task_rq(p, task_cpu(p), WF_FORK); + __set_task_cpu(p, target_cpu); +#else __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK)); +#endif #endif rq = __task_rq_lock(p, &rf); + +#ifdef CONFIG_TT_SCHED + if (task_is_lat_sensitive(p)) + inc_nr_lat_sensitive(target_cpu); +#endif + update_rq_clock(rq); post_init_entity_util_avg(p); - +#ifdef CONFIG_TT_SCHED + p->se.tt_node.start_time = sched_clock(); +#endif activate_task(rq, p, ENQUEUE_NOCLOCK); trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); @@ -5612,6 +5724,10 @@ void scheduler_tick(void) perf_event_task_tick(); +#ifdef CONFIG_TT_SCHED + dec_nr_lat_sensitive(rq->cpu); +#endif + #ifdef CONFIG_SMP rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq); @@ -5663,7 +5779,9 @@ static void sched_tick_remote(struct work_struct *work) struct rq *rq = cpu_rq(cpu); struct task_struct *curr; struct rq_flags rf; +#ifndef CONFIG_TT_SCHED u64 delta; +#endif int os; /* @@ -5682,7 +5800,7 @@ static void sched_tick_remote(struct work_struct *work) goto out_unlock; update_rq_clock(rq); - +#ifndef CONFIG_TT_SCHED if (!is_idle_task(curr)) { /* * Make sure the next tick runs within a reasonable @@ -5691,6 +5809,7 @@ static void sched_tick_remote(struct work_struct *work) delta = rq_clock_task(rq) - curr->se.exec_start; WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); } +#endif curr->sched_class->task_tick(rq, curr, 0); calc_load_nohz_remote(rq); @@ -9852,6 +9971,10 @@ LIST_HEAD(task_groups); static struct kmem_cache *task_group_cache __read_mostly; #endif +#ifdef CONFIG_TT_SCHED +DEFINE_PER_CPU(int, nr_lat_sensitive); +#endif + void __init sched_init(void) { unsigned long ptr = 0; @@ -9867,6 +9990,10 @@ void __init sched_init(void) wait_bit_init(); +#ifdef CONFIG_TT_SCHED + printk(KERN_INFO "TT CPU scheduler v6.1 by Hamad Al Marri."); +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED ptr += 2 * nr_cpu_ids * sizeof(void **); #endif @@ -9987,6 +10114,9 @@ void __init sched_init(void) #endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); +#ifdef CONFIG_TT_SCHED + per_cpu(nr_lat_sensitive, i) = 0; +#endif #ifdef CONFIG_SCHED_CORE rq->core = rq; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 7bdc6cd46c39..8596d7fab7f2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -957,6 +957,22 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, #define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->stats.F)) #define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->stats.F)) + +#ifdef CONFIG_TT_SCHED +#define PN_TT(F, S) SEQ_printf(m, "%-45s: %20s\n", #F, #S) + + if (p->se.tt_node.task_type == TT_NO_TYPE) + PN_TT(task_type, 
NO_TYPE); + else if (p->se.tt_node.task_type == TT_INTERACTIVE) + PN_TT(task_type, INTERACTIVE); + else if (p->se.tt_node.task_type == TT_REALTIME) + PN_TT(task_type, REALTIME); + else if (p->se.tt_node.task_type == TT_CPU_BOUND) + PN_TT(task_type, CPU_BOUND); + else if (p->se.tt_node.task_type == TT_BATCH) + PN_TT(task_type, BATCH); +#endif + PN(se.exec_start); PN(se.vruntime); PN(se.sum_exec_runtime); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 339c8d9e53b7..420797e58a0a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -115,6 +115,66 @@ unsigned int sysctl_sched_idle_min_granularity = 750000ULL; */ static unsigned int sched_nr_latency = 8; +#ifdef CONFIG_TT_SCHED +unsigned int __read_mostly tt_balancer_opt = TT_BL_CFS; +unsigned int __read_mostly tt_max_lifetime = 22000; // in ms +int __read_mostly tt_rt_prio = -20; +int __read_mostly tt_interactive_prio = -10; +int __read_mostly tt_cpu_bound_prio = -15; +int __read_mostly tt_batch_prio = 19; +int __read_mostly tt_lat_sens_enabled = 1; +int __read_mostly tt_dedicated_cpu_bound_enabled = 1; + +#define MAX_HRRN 0xFFFFFFFFFFFFFFFULL +#define YIELD_MARK(ttn) ((ttn)->vruntime |= 0x8000000000000000ULL) +#define YIELD_UNMARK(ttn) ((ttn)->vruntime &= 0x7FFFFFFFFFFFFFFFULL) + +struct global_candidate { + struct rq *rq; + struct tt_node *candidate; + u64 hrrn; + + // for update + raw_spinlock_t lock; +}; + +struct global_candidate global_candidate = {0, 0, MAX_HRRN}; + +#define LOCK_GRQ(grf) ({ \ + rq_lock_irqsave(grq, &(grf)); \ + update_rq_clock(grq); \ +}) + +#define UNLOCK_GRQ(grf) ({ \ + rq_unlock(grq, &(grf)); \ + local_irq_restore((grf).flags); \ +}) + +#define INTERACTIVE_HRRN 2U +#define RT_WAIT_DELTA 800000U +#define RT_BURST_DELTA 2000000U +#define RT_BURST_MAX 4000000U + +#define HZ_PERIOD (1000000000 / HZ) +#define RACE_TIME 40000000 +#define FACTOR (RACE_TIME / HZ_PERIOD) + +#define IS_REALTIME(ttn) ((ttn)->task_type == TT_REALTIME) +#define IS_INTERACTIVE(ttn) ((ttn)->task_type == TT_INTERACTIVE) +#define IS_NO_TYPE(ttn) ((ttn)->task_type == TT_NO_TYPE) +#define IS_CPU_BOUND(ttn) ((ttn)->task_type == TT_CPU_BOUND) +#define IS_BATCH(ttn) ((ttn)->task_type == TT_BATCH) + +#define GEQ(a, b) ((s64)((a) - (b)) >= 0) // is a >= b +#define LEQ(a, b) ((s64)((a) - (b)) <= 0) // is a <= b +#define LES(a, b) ((s64)((a) - (b)) < 0) // is a < b +#define EQ_D(a, b, d) (LEQ(a, b + d) && GEQ(a, b - d)) + +#define HRRN_PERCENT(ttn, now) \ + (((ttn)->vruntime * 1000ULL) / (((now) - (ttn)->start_time) | 1)) + +#endif + /* * After fork, child runs first. If set to 0 (default) then * parent will (try to) run first. */ @@ -140,6 +200,13 @@ static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#ifdef CONFIG_TT_SCHED +static inline struct sched_entity *se_of(struct tt_node *ttn) +{ + return container_of(ttn, struct sched_entity, tt_node); +} +#endif + int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) { @@ -613,11 +680,23 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } -static inline bool entity_before(const struct sched_entity *a, - const struct sched_entity *b) +#ifdef CONFIG_TT_SCHED +/** + * Should `a` preempt `b`?
+ */ +static inline bool entity_before(struct tt_node *a, struct tt_node *b) +{ + u64 now = sched_clock(); + + return (s64)(HRRN_PERCENT(a, now) - HRRN_PERCENT(b, now)) < 0; +} +#else +static inline bool entity_before(struct sched_entity *a, + struct sched_entity *b) { return (s64)(a->vruntime - b->vruntime) < 0; } +#endif #define __node_2_se(node) \ rb_entry((node), struct sched_entity, run_node) @@ -652,12 +731,73 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) { +#ifdef CONFIG_TT_SCHED + return entity_before(&(__node_2_se(a)->tt_node), &(__node_2_se(b)->tt_node)); +#else return entity_before(__node_2_se(a), __node_2_se(b)); +#endif } +#ifdef CONFIG_TT_SCHED /* * Enqueue an entity into the rb-tree: */ +static void __enqueue_entity_cfs(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less); +} + +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct tt_node *ttn = &se->tt_node; + + ttn->next = ttn->prev = NULL; + + // if empty + if (!cfs_rq->head) { + cfs_rq->head = ttn; + } + else { + ttn->next = cfs_rq->head; + cfs_rq->head->prev = ttn; + cfs_rq->head = ttn; + } + + __enqueue_entity_cfs(cfs_rq, se); +} + +static void __dequeue_entity_cfs(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); +} + +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct tt_node *ttn = &se->tt_node; + struct tt_node *prev, *next; + + // if only one se in rq + if (cfs_rq->head->next == NULL) { + cfs_rq->head = NULL; + } + // if it is the head + else if (ttn == cfs_rq->head) { + cfs_rq->head = cfs_rq->head->next; + cfs_rq->head->prev = NULL; + } + // if in the middle + else { + prev = ttn->prev; + next = ttn->next; + + prev->next = next; + if (next) + next->prev = prev; + } + + __dequeue_entity_cfs(cfs_rq, se); +} +#else static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less); @@ -667,7 +807,58 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); } +#endif + +#ifdef CONFIG_TT_SCHED +struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) +{ + struct tt_node *ttn = cfs_rq->head; + struct tt_node *next; + + if (!ttn) + return NULL; + + next = ttn->next; + while (next) { + if (entity_before(next, ttn)) + ttn = next; + + next = next->next; + } + + return se_of(ttn); +} + +static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct tt_node *ttn = cfs_rq->head; + struct tt_node *se_ttn; + struct tt_node *next; + + if (!ttn) + return NULL; + + if (!se) + return __pick_first_entity(cfs_rq); + + se_ttn = &se->tt_node; + next = ttn->next; + + while (next) { + if (next != se_ttn) { + if (ttn == se_ttn || entity_before(next, ttn)) + ttn = next; + } + + next = next->next; + } + + if (ttn == se_ttn) + return NULL; + return se_of(ttn); +} +#else struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline); @@ -687,6 +878,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) return __node_2_se(next); } +#endif #ifdef CONFIG_SCHED_DEBUG struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) @@ -909,13 +1101,244 @@ static void 
update_tg_load_avg(struct cfs_rq *cfs_rq) } #endif /* CONFIG_SMP */ +#ifdef CONFIG_TT_SCHED +static inline bool is_interactive(struct tt_node *ttn, u64 now, u64 _hrrn) +{ + u64 wait; + + if (LES(_hrrn, (u64) INTERACTIVE_HRRN)) + return false; + + wait = now - se_of(ttn)->exec_start; + if (wait && EQ_D(wait, ttn->prev_wait_time, RT_WAIT_DELTA)) + return false; + + return true; +} + +static inline bool is_realtime(struct tt_node *ttn, u64 now, int flags) +{ + u64 life_time, wait; + + // it has slept at least once + if (!ttn->wait_time) + return false; + + // life time >= 0.5s + life_time = now - task_of(se_of(ttn))->start_time; + if (LES(life_time, 500000000ULL)) + return false; + + // don't check wait time for migrated tasks + if (!(flags & ENQUEUE_MIGRATED)) { + /* it has relatively equal sleeping/waiting times + * (ex. it sleeps for ~10ms and run repeatedly) + */ + wait = now - se_of(ttn)->exec_start; + if (wait && !EQ_D(wait, ttn->prev_wait_time, RT_WAIT_DELTA)) + return false; + } + + // bursts before sleep are relatively equal (delta 2ms) + if (!EQ_D(ttn->burst, ttn->prev_burst, RT_BURST_DELTA)) + return false; + + // burst before sleep is <= 4ms + if (LEQ(ttn->burst, RT_BURST_MAX) && + LEQ(ttn->curr_burst, RT_BURST_MAX)) + return true; + + return false; +} + +static inline bool is_cpu_bound(struct tt_node *ttn) +{ + u64 _hrrn_percent; + + _hrrn_percent = ttn->vruntime * 100ULL; + _hrrn_percent /= ttn->wait_time + ttn->vruntime; + + // HRRN >= 90% + return (GEQ(_hrrn_percent, 90ULL)); +} + +static inline bool is_batch(struct tt_node *ttn, u64 _hrrn) +{ + // HRRN > 50% + return (LES(_hrrn, 2ULL)); +} + +static inline void set_dedicated_cpu_bound(struct cfs_rq *cfs_rq, struct tt_node *ttn) +{ + if (tt_dedicated_cpu_bound_enabled) + cfs_rq->dedicated_cpu_bound = ttn; +} + +static inline void unset_dedicated_cpu_bound(struct cfs_rq *cfs_rq) +{ + cfs_rq->dedicated_cpu_bound = NULL; +} + +static void check_dedicated_cpu_bound(struct cfs_rq *cfs_rq) +{ + struct tt_node *ttn = cfs_rq->head; + struct tt_node *curr_ttn = &cfs_rq->curr->tt_node; + struct tt_node *d_ttn = cfs_rq->dedicated_cpu_bound; + + if (!tt_dedicated_cpu_bound_enabled) + return; + + if (!d_ttn) + goto out; + + if (curr_ttn == d_ttn) + return; + + while (ttn) { + if (ttn == d_ttn) + return; + + ttn = ttn->next; + } + +out: + unset_dedicated_cpu_bound(cfs_rq); +} + +static void detect_type(struct cfs_rq *cfs_rq, struct tt_node *ttn, u64 now, int flags) +{ + unsigned int new_type = TT_NO_TYPE; + unsigned int old_type = ttn->task_type; + u64 _hrrn; + unsigned int cpu; + + if (ttn->vruntime == 1) { + ttn->task_type = TT_NO_TYPE; + return; + } + + _hrrn = (ttn->wait_time + ttn->vruntime) / ttn->vruntime; + + if (is_realtime(ttn, now, flags)) + new_type = TT_REALTIME; + else if (is_interactive(ttn, now, _hrrn)) + new_type = TT_INTERACTIVE; + else if (is_cpu_bound(ttn)) + new_type = TT_CPU_BOUND; + else if (is_batch(ttn, _hrrn)) + new_type = TT_BATCH; + + if (new_type == TT_REALTIME) { + ttn->rt_sticky = 4; + } else if (IS_REALTIME(ttn) && ttn->rt_sticky) { + ttn->rt_sticky--; + return; + } + + if (new_type != old_type) { + cpu = task_cpu(task_of(se_of(ttn))); + + /* + * Recall: + * TT_REALTIME 0 + * TT_INTERACTIVE 1 + * TT_NO_TYPE 2 + * TT_CPU_BOUND 3 + * TT_BATCH 4 + */ + + /* set latency sensitive */ + if (new_type <= TT_INTERACTIVE && old_type > TT_INTERACTIVE) + inc_nr_lat_sensitive(cpu); + else if (old_type <= TT_INTERACTIVE && new_type > TT_INTERACTIVE) + dec_nr_lat_sensitive(cpu); + + /* set dedicated cpu bound */ + if 
(!cfs_rq->dedicated_cpu_bound && new_type == TT_CPU_BOUND) + set_dedicated_cpu_bound(cfs_rq, ttn); + else if (cfs_rq->dedicated_cpu_bound == ttn && old_type == TT_CPU_BOUND) + unset_dedicated_cpu_bound(cfs_rq); + } + + ttn->task_type = new_type; +} + +static void normalize_lifetime(u64 now, struct tt_node *ttn) +{ + u64 max_life_ns, life_time, old_hrrn_x; + s64 diff; + + /* + * left shift 20 bits is approximately = * 1000000 + * we don't need the precision of life time + * Ex. for 22s, with left shift (20bits) == 23.06s + */ + max_life_ns = ((u64) tt_max_lifetime) << 20; + life_time = now - ttn->start_time; + diff = life_time - max_life_ns; + + if (likely(diff < 0)) + return; + + // unmark YIELD. No need to check or remark since + // this normalize action doesn't happen very often + YIELD_UNMARK(ttn); + + // multiply life_time by 1024 for more precision + old_hrrn_x = (life_time << 7) / ((ttn->vruntime >> 3) | 1); + + // reset life to half max_life (i.e ~15s) + ttn->start_time = now - (max_life_ns >> 1); + + // avoid division by zero + if (old_hrrn_x == 0) old_hrrn_x = 1; + + // reset vruntime based on old hrrn ratio + ttn->vruntime = ((max_life_ns << 9) / old_hrrn_x) | 1; +} + +static u64 convert_to_vruntime(u64 delta, struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + s64 prio_diff; + int prio = PRIO_TO_NICE(p->prio); + + if (IS_REALTIME(&se->tt_node)) + prio = tt_rt_prio; + else if (IS_INTERACTIVE(&se->tt_node)) + prio = tt_interactive_prio; + else if (IS_CPU_BOUND(&se->tt_node)) + prio = tt_cpu_bound_prio; + else if (IS_BATCH(&se->tt_node)) + prio = tt_batch_prio; + + if (prio == 0) + return delta; + + prio_diff = prio * 1000000; + prio_diff /= FACTOR; + + if ((s64)(delta + prio_diff) < 0) + return 1; + + return delta + prio_diff; +} + +static void update_candidate(struct cfs_rq *cfs_rq); +#endif + /* * Update the current task's runtime statistics. 
*/ static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; +#ifdef CONFIG_TT_SCHED + struct tt_node *ttn = &curr->tt_node; + u64 now = sched_clock(); +#else u64 now = rq_clock_task(rq_of(cfs_rq)); +#endif u64 delta_exec; if (unlikely(!curr)) @@ -941,6 +1364,18 @@ static void update_curr(struct cfs_rq *cfs_rq) curr->vruntime += calc_delta_fair(delta_exec, curr); update_min_vruntime(cfs_rq); +#ifdef CONFIG_TT_SCHED + ttn->curr_burst += delta_exec; + ttn->vruntime += convert_to_vruntime(delta_exec, curr); + detect_type(cfs_rq, ttn, now, 0); + normalize_lifetime(now, &curr->tt_node); + + if (IS_CAND_BL_ENABLED) { + cfs_rq->local_cand_hrrn = HRRN_PERCENT(&curr->tt_node, now); + update_candidate(cfs_rq); + } +#endif + if (entity_is_task(curr)) { struct task_struct *curtask = task_of(curr); @@ -1075,7 +1510,11 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * We are starting a new run period: */ +#ifdef CONFIG_TT_SCHED + se->exec_start = sched_clock(); +#else se->exec_start = rq_clock_task(rq_of(cfs_rq)); +#endif } /************************************************** @@ -4787,6 +5226,22 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED); bool curr = cfs_rq->curr == se; +#ifdef CONFIG_TT_SCHED + struct tt_node *ttn = &se->tt_node; + bool wakeup = (flags & ENQUEUE_WAKEUP); + u64 now = sched_clock(); + u64 wait; + + if (wakeup) { + wait = now - se->exec_start; + ttn->wait_time += wait; + detect_type(cfs_rq, ttn, now, flags); + + ttn->prev_wait_time = wait; + } else { + detect_type(cfs_rq, ttn, now, flags); + } +#endif /* * If we're the current task, we must renormalise before calling * update_curr(). @@ -4886,10 +5341,135 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); +#ifdef CONFIG_TT_SCHED +static inline int clear_this_candidate(struct sched_entity *se) +{ + struct tt_node *ttn = &se->tt_node; + struct tt_node *curr_can = READ_ONCE(global_candidate.candidate); + + if (ttn != curr_can) + return 0; + + WRITE_ONCE(global_candidate.candidate, NULL); + WRITE_ONCE(global_candidate.rq, NULL); + WRITE_ONCE(global_candidate.hrrn, MAX_HRRN); + + return 1; +} + +static inline void clear_rq_candidate(struct cfs_rq *cfs_rq) +{ + struct rq *rq = READ_ONCE(global_candidate.rq); + + if (rq != rq_of(cfs_rq)) + return; + + WRITE_ONCE(global_candidate.candidate, NULL); + WRITE_ONCE(global_candidate.rq, NULL); + WRITE_ONCE(global_candidate.hrrn, MAX_HRRN); +} + +static inline bool +can_be_candidate(struct tt_node *ttn, int this_cpu) +{ + struct task_struct *p = task_of(se_of(ttn)); + + if (kthread_is_per_cpu(p)) + return 0; + + /* + * only realtime and interactive can + * be candidates + */ + if (ttn->task_type > TT_INTERACTIVE) + return 0; + + // just migrated + if (p->se.avg.last_update_time == 0) + return 0; + + if (task_on_cpu(cpu_rq(this_cpu), p)) + return 0; + + // some tasks are pinned to this cpu + if (p->nr_cpus_allowed <= 1) + return 0; + + if (is_migration_disabled(p)) + return 0; + + return 1; +} + +static void __update_candidate(struct cfs_rq *cfs_rq, struct tt_node *ttn) +{ + unsigned long flags; + u64 hrrn, curr_can_hrrn; + + curr_can_hrrn = READ_ONCE(global_candidate.hrrn); + hrrn = HRRN_PERCENT(ttn, sched_clock()); + + if ((s64)(hrrn - curr_can_hrrn) < 0) { + raw_spin_lock_irqsave(&global_candidate.lock, flags); + global_candidate.rq = 
rq_of(cfs_rq); + global_candidate.candidate = ttn; + global_candidate.hrrn = hrrn; + raw_spin_unlock_irqrestore(&global_candidate.lock, flags); + } +} + +static void update_candidate(struct cfs_rq *cfs_rq) +{ + struct tt_node *ttn = cfs_rq->head; + struct tt_node *next; + int this_cpu = cpu_of(rq_of(cfs_rq)); + + while (ttn && !can_be_candidate(ttn, this_cpu)) + ttn = ttn->next; + + if (!ttn) { + clear_rq_candidate(cfs_rq); + return; + } + + next = ttn->next; + while (next) { + if (can_be_candidate(next, this_cpu) && entity_before(next, ttn)) + ttn = next; + + next = next->next; + } + + __update_candidate(cfs_rq, ttn); +} +#endif + static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { int action = UPDATE_TG; +#ifdef CONFIG_TT_SCHED + struct tt_node *ttn = &se->tt_node; + bool sleep = (flags & DEQUEUE_SLEEP); + u64 avg_wait; + + if (sleep) { + ttn->prev_burst = ttn->burst; + ttn->burst = ttn->curr_burst; + ttn->curr_burst = 0; + + if (IS_CPU_BOUND(ttn)) + ttn->task_type = TT_BATCH; + else if (IS_REALTIME(ttn)) { + avg_wait = ttn->prev_wait_time; + avg_wait += ttn->wait_time; + avg_wait /= 2ULL; + + if (LEQ(avg_wait, HZ_PERIOD)) + inc_nr_lat_sensitive(cpu_of(rq_of(cfs_rq))); + } + } +#endif if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) action |= DO_DETACH; @@ -4917,6 +5497,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); + +#ifdef CONFIG_TT_SCHED + if (IS_CAND_BL_ENABLED && clear_this_candidate(se)) + update_candidate(cfs_rq); +#endif + se->on_rq = 0; account_entity_dequeue(cfs_rq, se); @@ -4966,7 +5552,14 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { +#ifdef CONFIG_TT_SCHED + if (IS_CAND_BL_ENABLED) { + cfs_rq->local_cand_hrrn = 0; + clear_rq_candidate(cfs_rq); + } +#endif resched_curr(rq_of(cfs_rq)); + /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. @@ -4980,22 +5573,55 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) * narrow margin doesn't have to wait for a full slice. * This also mitigates buddy induced latencies under load. */ +#ifdef CONFIG_TT_SCHED + if (delta_exec < sysctl_sched_min_granularity) { + if (IS_CAND_BL_ENABLED) + clear_this_candidate(curr); + return; + } +#else if (delta_exec < sysctl_sched_min_granularity) return; +#endif se = __pick_first_entity(cfs_rq); delta = curr->vruntime - se->vruntime; +#ifdef CONFIG_TT_SCHED + if (entity_before(&curr->tt_node, &se->tt_node)) { + if (IS_CAND_BL_ENABLED) + clear_this_candidate(curr); + return; + } +#else if (delta < 0) return; +#endif + +#ifdef CONFIG_TT_SCHED + if (delta > ideal_runtime) { + if (IS_CAND_BL_ENABLED) { + clear_this_candidate(se); + cfs_rq->local_cand_hrrn = HRRN_PERCENT(&se->tt_node, sched_clock()); + } + resched_curr(rq_of(cfs_rq)); + } else if (IS_CAND_BL_ENABLED) { + clear_this_candidate(curr); + } +#else if (delta > ideal_runtime) resched_curr(rq_of(cfs_rq)); +#endif } static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { +#ifdef CONFIG_TT_SCHED + if (IS_CAND_BL_ENABLED) + clear_this_candidate(se); +#endif clear_buddies(cfs_rq, se); /* 'current' is not kept within the tree. 
*/ @@ -5010,6 +5636,13 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_avg(cfs_rq, se, UPDATE_TG); } +#ifdef CONFIG_TT_SCHED + if (IS_CAND_BL_ENABLED) { + update_candidate(cfs_rq); + cfs_rq->local_cand_hrrn = HRRN_PERCENT(&se->tt_node, sched_clock()); + } +#endif + update_stats_curr_start(cfs_rq, se); cfs_rq->curr = se; @@ -5051,7 +5684,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) * If curr is set we have to see if its left of the leftmost entity * still in the tree, provided there was anything in the tree at all. */ - if (!left || (curr && entity_before(curr, left))) + if (!left || (curr && entity_before(&curr->tt_node, &left->tt_node))) left = curr; se = left; /* ideally we run the leftmost entity */ @@ -5066,8 +5699,8 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (se == curr) { second = __pick_first_entity(cfs_rq); } else { - second = __pick_next_entity(se); - if (!second || (curr && entity_before(curr, second))) + second = __pick_next_entity(cfs_rq, se); + if (!second || (curr && entity_before(&curr->tt_node, &second->tt_node))) second = curr; } @@ -5124,6 +5757,9 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) */ update_curr(cfs_rq); +#ifdef CONFIG_TT_SCHED + check_dedicated_cpu_bound(cfs_rq); +#endif /* * Ensure that runnable average is periodically updated. */ @@ -5149,6 +5785,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) if (cfs_rq->nr_running > 1) check_preempt_tick(cfs_rq, curr); +#ifdef CONFIG_TT_SCHED + else if (IS_CAND_BL_ENABLED) + clear_rq_candidate(cfs_rq); +#endif } @@ -6309,6 +6949,11 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) goto enqueue_throttle; } +#ifdef CONFIG_TT_SCHED + if (IS_CAND_BL_ENABLED) + update_candidate(cfs_rq); +#endif + /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); @@ -6677,6 +7322,21 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, return this_eff_load < prev_eff_load ? 
this_cpu : nr_cpumask_bits; } +#ifdef CONFIG_TT_SCHED +static int +wake_affine_tt(struct task_struct *p, int this_cpu, int prev_cpu, int sync) +{ + int target = nr_cpumask_bits; + + target = wake_affine_idle(this_cpu, prev_cpu, sync); + + if (target == nr_cpumask_bits) + return prev_cpu; + + return target; +} +#endif + static int wake_affine(struct sched_domain *sd, struct task_struct *p, int this_cpu, int prev_cpu, int sync) { @@ -7678,15 +8338,31 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) int want_affine = 0; /* SD_flags and WF_flags share the first nibble */ int sd_flag = wake_flags & 0xF; +#ifdef CONFIG_TT_SCHED + struct rq *rq = cpu_rq(prev_cpu); + unsigned int min_prev = rq->nr_running; + unsigned int min = rq->nr_running; + int this_cpu = smp_processor_id(); +#endif + /* * required for stable ->cpus_allowed */ lockdep_assert_held(&p->pi_lock); + +#ifdef CONFIG_TT_SCHED + if (IS_PWR_BL_ENABLED || (wake_flags & WF_TTWU)) { +#else if (wake_flags & WF_TTWU) { +#endif record_wakee(p); +#ifdef CONFIG_TT_SCHED + if (IS_PWR_BL_ENABLED || sched_energy_enabled()) { +#else if (sched_energy_enabled()) { +#endif new_cpu = find_energy_efficient_cpu(p, prev_cpu); if (new_cpu >= 0) return new_cpu; @@ -7696,41 +8372,66 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); } - rcu_read_lock(); - for_each_domain(cpu, tmp) { - /* - * If both 'cpu' and 'prev_cpu' are part of this domain, - * cpu is a valid SD_WAKE_AFFINE target. - */ - if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && - cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { - if (cpu != prev_cpu) - new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync); +#ifdef CONFIG_TT_SCHED + if (!(IS_CFS_BL_ENABLED)) { + for_each_cpu_wrap(cpu, cpu_online_mask, this_cpu) { + if (unlikely(!cpumask_test_cpu(cpu, p->cpus_ptr))) + continue; - sd = NULL; /* Prefer wake_affine over balance flags */ - break; + if (want_affine) { + if (cpu != prev_cpu) + new_cpu = wake_affine_tt(p, cpu, prev_cpu, sync); + + return new_cpu; + } + + if (cpu_rq(cpu)->nr_running < min) { + new_cpu = cpu; + min = cpu_rq(cpu)->nr_running; + } } - /* - * Usually only true for WF_EXEC and WF_FORK, as sched_domains - * usually do not have SD_BALANCE_WAKE set. That means wakeup - * will usually go to the fast path. - */ - if (tmp->flags & sd_flag) - sd = tmp; - else if (!want_affine) - break; - } + if (min == min_prev) + return prev_cpu; + } else { +#endif + rcu_read_lock(); + for_each_domain(cpu, tmp) { + /* + * If both 'cpu' and 'prev_cpu' are part of this domain, + * cpu is a valid SD_WAKE_AFFINE target. + */ + if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && + cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + if (cpu != prev_cpu) + new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync); - if (unlikely(sd)) { - /* Slow path */ - new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); - } else if (wake_flags & WF_TTWU) { /* XXX always ? */ - /* Fast path */ - new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - } - rcu_read_unlock(); + sd = NULL; /* Prefer wake_affine over balance flags */ + break; + } + /* + * Usually only true for WF_EXEC and WF_FORK, as sched_domains + * usually do not have SD_BALANCE_WAKE set. That means wakeup + * will usually go to the fast path. 
+ */ + if (tmp->flags & sd_flag) + sd = tmp; + else if (!want_affine) + break; + } + + if (unlikely(sd)) { + /* Slow path */ + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); + } else if (wake_flags & WF_TTWU) { /* XXX always ? */ + /* Fast path */ + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); + } + rcu_read_unlock(); +#ifdef CONFIG_TT_SCHED + } +#endif return new_cpu; } @@ -7774,6 +8475,10 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) /* Tell new CPU we are migrated */ se->avg.last_update_time = 0; +#ifdef CONFIG_TT_SCHED + YIELD_UNMARK(&p->se.tt_node); +#endif + update_scan_period(p, new_cpu); } @@ -8010,6 +8715,17 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf struct task_struct *p; int new_tasks; +#ifdef CONFIG_TT_SCHED + if (IS_CAND_BL_ENABLED) { + /* + * to cpu0, don't push any + * candidates to this rq + */ + cfs_rq->local_cand_hrrn = 0; + clear_rq_candidate(cfs_rq); + } +#endif + again: if (!sched_fair_runnable(rq)) goto idle; @@ -8103,6 +8819,11 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf p = task_of(se); +#ifdef CONFIG_TT_SCHED + if (prev) + YIELD_UNMARK(&prev->se.tt_node); +#endif + done: __maybe_unused; #ifdef CONFIG_SMP /* @@ -8121,6 +8842,11 @@ done: __maybe_unused; return p; idle: +#ifdef CONFIG_TT_SCHED + if (IS_CAND_BL_ENABLED) + cfs_rq->local_cand_hrrn = MAX_HRRN; +#endif + if (!rf) return NULL; @@ -8184,6 +8910,11 @@ static void yield_task_fair(struct rq *rq) clear_buddies(cfs_rq, se); +#ifdef CONFIG_TT_SCHED + if (cfs_rq->h_nr_running > 1) + YIELD_MARK(&curr->se.tt_node); +#endif + if (curr->policy != SCHED_BATCH) { update_rq_clock(rq); /* @@ -8421,6 +9152,47 @@ struct lb_env { /* * Is this task likely cache-hot: */ +#ifdef CONFIG_TT_SCHED +static int task_hot_tt(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) +{ + s64 delta; + + if (p->sched_class != &fair_sched_class) + return 0; + + if (unlikely(task_has_idle_policy(p))) + return 0; + + /* SMT siblings share cache */ + if (cpus_share_cache(cpu_of(dst_rq), cpu_of(src_rq))) + return 0; + + /* + * Buddy candidates are cache hot: + */ + if (sched_feat(CACHE_HOT_BUDDY) && dst_rq->nr_running && + (&p->se == cfs_rq_of(&p->se)->next || + &p->se == cfs_rq_of(&p->se)->last)) + return 1; + + if (sysctl_sched_migration_cost == -1) + return 1; + + /* + * Don't migrate task if the task's cookie does not match + * with the destination CPU's core cookie. 
+ */ + if (!sched_core_cookie_match(dst_rq, p)) + return 1; + + if (sysctl_sched_migration_cost == 0) + return 0; + + delta = sched_clock() - p->se.exec_start; + + return delta < (s64)sysctl_sched_migration_cost; +} +#endif static int task_hot(struct task_struct *p, struct lb_env *env) { s64 delta; @@ -8458,12 +9230,62 @@ static int task_hot(struct task_struct *p, struct lb_env *env) if (sysctl_sched_migration_cost == 0) return 0; +#ifdef CONFIG_TT_SCHED + delta = sched_clock() - p->se.exec_start; +#else delta = rq_clock_task(env->src_rq) - p->se.exec_start; +#endif return delta < (s64)sysctl_sched_migration_cost; } #ifdef CONFIG_NUMA_BALANCING +#ifdef CONFIG_TT_SCHED +static int +migrate_degrades_locality_tt(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) +{ + struct numa_group *numa_group = rcu_dereference(p->numa_group); + unsigned long src_weight, dst_weight; + int src_nid, dst_nid, dist; + + if (!static_branch_likely(&sched_numa_balancing)) + return -1; + + src_nid = cpu_to_node(cpu_of(src_rq)); + dst_nid = cpu_to_node(cpu_of(dst_rq)); + + if (src_nid == dst_nid) + return -1; + + /* Migrating away from the preferred node is always bad. */ + if (src_nid == p->numa_preferred_nid) { + if (src_rq->nr_running > src_rq->nr_preferred_running) + return 1; + else + return -1; + } + + /* Encourage migration to the preferred node. */ + if (dst_nid == p->numa_preferred_nid) + return 0; + + /* Leaving a core idle is often worse than degrading locality. */ + if (dst_rq->idle_balance) + return -1; + + dist = node_distance(src_nid, dst_nid); + if (numa_group) { + src_weight = group_weight(p, src_nid, dist); + dst_weight = group_weight(p, dst_nid, dist); + } else { + src_weight = task_weight(p, src_nid, dist); + dst_weight = task_weight(p, dst_nid, dist); + } + + return dst_weight < src_weight; +} +#endif + /* * Returns 1, if task migration degrades locality * Returns 0, if task migration improves locality i.e migration preferred. @@ -8516,6 +9338,13 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) } #else +#if CONFIG_TT_SCHED +static int migrate_degrades_locality_tt(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) +{ + return -1; +} +#endif + static inline int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { @@ -8523,6 +9352,66 @@ static inline int migrate_degrades_locality(struct task_struct *p, } #endif +#ifdef CONFIG_TT_SCHED +static int +can_migrate_task_powersave(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) +{ + int tsk_cache_hot; + + /* Disregard pcpu kthreads; they are where they need to be. */ + if (kthread_is_per_cpu(p)) + return 0; + + if (!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) + return 0; + + if (task_on_cpu(src_rq, p)) + return 0; + + tsk_cache_hot = migrate_degrades_locality_tt(p, dst_rq, src_rq); + if (tsk_cache_hot == -1) + tsk_cache_hot = task_hot_tt(p, dst_rq, src_rq); + + if (tsk_cache_hot > 0) + return 0; + + if (p->se.tt_node.task_type < TT_CPU_BOUND) + return 0; + + return 1; +} + +static int +can_migrate_task_tt(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) +{ + int tsk_cache_hot; + + /* Disregard pcpu kthreads; they are where they need to be. 
*/ + if (kthread_is_per_cpu(p)) + return 0; + + if (!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) + return 0; + + /* if this is a cpu-bound task dedicated to this cpu, then no */ + if (tt_dedicated_cpu_bound_enabled && + src_rq->cfs.dedicated_cpu_bound == &p->se.tt_node && + cpumask_test_cpu(cpu_of(src_rq), p->cpus_ptr)) + return 0; + + if (task_on_cpu(src_rq, p)) + return 0; + + tsk_cache_hot = migrate_degrades_locality_tt(p, dst_rq, src_rq); + if (tsk_cache_hot == -1) + tsk_cache_hot = task_hot_tt(p, dst_rq, src_rq); + + if (tsk_cache_hot > 0) + return 0; + + return 1; +} +#endif /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ @@ -8580,6 +9469,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; } +#ifdef CONFIG_TT_SCHED + /* if this is a cpu-bound task dedicated to this cpu, then no */ + if (tt_dedicated_cpu_bound_enabled && + env->src_rq->cfs.dedicated_cpu_bound == &p->se.tt_node && + cpumask_test_cpu(cpu_of(env->src_rq), p->cpus_ptr)) + return 0; +#endif + /* Record that we found at least one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; @@ -11575,6 +12472,62 @@ static bool update_nohz_stats(struct rq *rq) return rq->has_blocked_load; } +#ifdef CONFIG_TT_SCHED +static int move_task(struct rq *dist_rq, struct rq *src_rq, struct rq_flags *src_rf); + +static void idle_balance(struct rq *this_rq) +{ + int this_cpu = this_rq->cpu; + struct rq *src_rq; + int src_cpu = -1, cpu; + unsigned int max = 0; + struct rq_flags src_rf; + + if (IS_CAND_BL_ENABLED) { + if (idle_pull_global_candidate(this_rq)) + return; + } else if (IS_PWR_BL_ENABLED) + return; + + for_each_online_cpu(cpu) { + /* + * Stop searching for tasks to pull if there are + * now runnable tasks on this rq. + */ + if (this_rq->nr_running > 0) + return; + + if (cpu == this_cpu) + continue; + + src_rq = cpu_rq(cpu); + + if (src_rq->nr_running < 2) + continue; + + if (src_rq->nr_running > max) { + max = src_rq->nr_running; + src_cpu = cpu; + } + } + + if (src_cpu == -1) + return; + + src_rq = cpu_rq(src_cpu); + + rq_lock_irqsave(src_rq, &src_rf); + update_rq_clock(src_rq); + + if (src_rq->nr_running < 2) { + rq_unlock(src_rq, &src_rf); + local_irq_restore(src_rf.flags); + } else { + move_task(this_rq, src_rq, &src_rf); + } +} +#endif + /* * Internal function that runs load balance for all idle cpus.
The load balance * can be a simple update of blocked load or a complete load balance with @@ -11651,8 +12604,17 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) update_rq_clock(rq); rq_unlock_irqrestore(rq, &rf); +#ifdef CONFIG_TT_SCHED + if (flags & NOHZ_BALANCE_KICK) { + if (!(IS_CFS_BL_ENABLED)) + idle_balance(rq); + else + rebalance_domains(rq, CPU_IDLE); + } +#else if (flags & NOHZ_BALANCE_KICK) rebalance_domains(rq, CPU_IDLE); +#endif } if (time_after(next_balance, rq->next_balance)) { @@ -11756,6 +12718,64 @@ static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle static inline void nohz_newidle_balance(struct rq *this_rq) { } #endif /* CONFIG_NO_HZ_COMMON */ +#ifdef CONFIG_TT_SCHED +static void update_curr_lightweight(struct cfs_rq *cfs_rq) +{ + struct sched_entity *curr = cfs_rq->curr; + struct tt_node *ttn = &curr->tt_node; + u64 now = sched_clock(); + u64 delta_exec; + + if (!curr) + return; + + delta_exec = now - curr->exec_start; + if (unlikely((s64)delta_exec <= 0)) + return; + + curr->exec_start = now; + curr->sum_exec_runtime += delta_exec; + curr->vruntime += calc_delta_fair(delta_exec, curr); + update_min_vruntime(cfs_rq); + + ttn->curr_burst += delta_exec; + ttn->vruntime += convert_to_vruntime(delta_exec, curr); + cfs_rq->local_cand_hrrn = HRRN_PERCENT(&curr->tt_node, now); +} + +static void active_pull_global_candidate(struct rq *dist_rq); + +static void nohz_try_pull_from_candidate(void) +{ + int cpu; + struct rq *rq; + struct cfs_rq *cfs_rq; +#ifdef CONFIG_NO_HZ_FULL + struct rq_flags rf; +#endif + + /* first, push to grq*/ + for_each_online_cpu(cpu) { + rq = cpu_rq(cpu); +#ifdef CONFIG_NO_HZ_FULL + cfs_rq = &rq->cfs; + + if (idle_cpu(cpu) || cfs_rq->nr_running > 1) + goto out; + + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); + update_curr_lightweight(cfs_rq); + rq_unlock_irqrestore(rq, &rf); +out: +#endif + if (idle_cpu(cpu) || !sched_fair_runnable(rq)) + idle_pull_global_candidate(rq); + else + active_pull_global_candidate(rq); + } +} + /* * newidle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. 
@@ -11765,7 +12785,10 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * 0 - failed, no new tasks * > 0 - success, new (fair) tasks present */ +static int newidle_balance_cfs(struct rq *this_rq, struct rq_flags *rf) +#else static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) +#endif { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; @@ -11887,6 +12910,327 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) return pulled_task; } +#ifdef CONFIG_TT_SCHED +static void pull_from(struct rq *dist_rq, + struct rq *src_rq, + struct rq_flags *src_rf, + struct task_struct *p) +{ + struct rq_flags rf; + + // detach task + deactivate_task(src_rq, p, DEQUEUE_NOCLOCK); + set_task_cpu(p, cpu_of(dist_rq)); + + // unlock src rq + rq_unlock(src_rq, src_rf); + + // lock dist rq + rq_lock(dist_rq, &rf); + update_rq_clock(dist_rq); + + activate_task(dist_rq, p, ENQUEUE_NOCLOCK); + check_preempt_curr(dist_rq, p, 0); + + // unlock dist rq + rq_unlock(dist_rq, &rf); + + local_irq_restore(src_rf->flags); +} + +static int move_task_powersave(struct rq *dist_rq, struct rq *src_rq, + struct rq_flags *src_rf) +{ + struct cfs_rq *src_cfs_rq = &src_rq->cfs; + struct task_struct *p; + struct tt_node *ttn = src_cfs_rq->head; + + while (ttn) { + p = task_of(se_of(ttn)); + if (can_migrate_task_powersave(p, dist_rq, src_rq)) { + pull_from(dist_rq, src_rq, src_rf, p); + return 1; + } + + ttn = ttn->next; + } + + /* + * Here we know we have not migrated any task, + * thus, we need to unlock and return 0 + * Note: the pull_from does the unlocking for us. + */ + rq_unlock(src_rq, src_rf); + local_irq_restore(src_rf->flags); + + return 0; +} + +static int move_task(struct rq *dist_rq, struct rq *src_rq, + struct rq_flags *src_rf) +{ + struct cfs_rq *src_cfs_rq = &src_rq->cfs; + struct task_struct *p; + struct tt_node *ttn = src_cfs_rq->head; + + while (ttn) { + p = task_of(se_of(ttn)); + if (can_migrate_task_tt(p, dist_rq, src_rq)) { + pull_from(dist_rq, src_rq, src_rf, p); + return 1; + } + + ttn = ttn->next; + } + + /* + * Here we know we have not migrated any task, + * thus, we need to unlock and return 0 + * Note: the pull_from does the unlocking for us. + */ + rq_unlock(src_rq, src_rf); + local_irq_restore(src_rf->flags); + + return 0; +} + +static int +can_migrate_candidate(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) +{ + /* Disregard pcpu kthreads; they are where they need to be. 
*/ + if (kthread_is_per_cpu(p)) + return 0; + + if (!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) + return 0; + + if (task_on_cpu(src_rq, p)) + return 0; + + return 1; +} + +int idle_pull_global_candidate(struct rq *dist_rq) +{ + struct rq *src_rq; + struct task_struct *p; + struct rq_flags rf, src_rf; + struct tt_node *cand = READ_ONCE(global_candidate.candidate); + + if (!cand) + return 0; + + src_rq = READ_ONCE(global_candidate.rq); + if (!src_rq || src_rq == dist_rq) + return 0; + + rq_lock_irqsave(src_rq, &src_rf); + update_rq_clock(src_rq); + raw_spin_lock(&global_candidate.lock); + cand = global_candidate.candidate; + if (!cand) + goto fail_unlock; + + p = task_of(se_of(cand)); + if (task_rq(p) != src_rq || + !can_migrate_candidate(p, dist_rq, src_rq)) + goto fail_unlock; + + global_candidate.rq = NULL; + global_candidate.candidate = NULL; + global_candidate.hrrn = MAX_HRRN; + raw_spin_unlock(&global_candidate.lock); + + // detach task + deactivate_task(src_rq, p, DEQUEUE_NOCLOCK); + set_task_cpu(p, cpu_of(dist_rq)); + // unlock src rq + rq_unlock(src_rq, &src_rf); + + // lock dist rq + rq_lock(dist_rq, &rf); + update_rq_clock(dist_rq); + activate_task(dist_rq, p, ENQUEUE_NOCLOCK); + check_preempt_curr(dist_rq, p, 0); + // unlock dist rq + rq_unlock(dist_rq, &rf); + + local_irq_restore(src_rf.flags); + + return 1; + +fail_unlock: + raw_spin_unlock(&global_candidate.lock); + rq_unlock(src_rq, &src_rf); + local_irq_restore(src_rf.flags); + return 0; +} + +static void active_pull_global_candidate(struct rq *dist_rq) +{ + struct cfs_rq *cfs_rq = &dist_rq->cfs; + u64 cand_hrrn = READ_ONCE(global_candidate.hrrn); + u64 local_hrrn = READ_ONCE(cfs_rq->local_cand_hrrn); + struct rq *src_rq; + struct task_struct *p; + struct rq_flags rf, src_rf; + struct tt_node *cand; + + cand = READ_ONCE(global_candidate.candidate); + + if (!cand) + return; + + if ((s64)(local_hrrn - cand_hrrn) <= 0) + return; + + src_rq = READ_ONCE(global_candidate.rq); + if (!src_rq || src_rq == dist_rq) + return; + + rq_lock_irqsave(src_rq, &src_rf); + update_rq_clock(src_rq); + raw_spin_lock(&global_candidate.lock); + cand = global_candidate.candidate; + cand_hrrn = global_candidate.hrrn; + + if (!cand) + goto fail_unlock; + + p = task_of(se_of(cand)); + if (task_rq(p) != src_rq || + !can_migrate_candidate(p, dist_rq, src_rq)) + goto fail_unlock; + + if ((s64)(local_hrrn - cand_hrrn) <= 0) + goto fail_unlock; + + global_candidate.rq = NULL; + global_candidate.candidate = NULL; + global_candidate.hrrn = MAX_HRRN; + raw_spin_unlock(&global_candidate.lock); + + // detach task + deactivate_task(src_rq, p, DEQUEUE_NOCLOCK); + set_task_cpu(p, cpu_of(dist_rq)); + // unlock src rq + rq_unlock(src_rq, &src_rf); + + // lock dist rq + rq_lock(dist_rq, &rf); + update_rq_clock(dist_rq); + activate_task(dist_rq, p, ENQUEUE_NOCLOCK); + check_preempt_curr(dist_rq, p, 0); + // unlock dist rq + rq_unlock(dist_rq, &rf); + + local_irq_restore(src_rf.flags); + + return; + +fail_unlock: + raw_spin_unlock(&global_candidate.lock); + rq_unlock(src_rq, &src_rf); + local_irq_restore(src_rf.flags); +} + +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) +{ + int this_cpu = this_rq->cpu; + struct rq *src_rq; + int src_cpu = -1, cpu; + int pulled_task = 0; + unsigned int max = 0; + struct rq_flags src_rf; + + if (IS_CFS_BL_ENABLED) + return newidle_balance_cfs(this_rq, rf); + + /* + * We must set idle_stamp _before_ calling idle_balance(), such that we + * measure the duration of idle_balance() as idle time. 
+ */ + this_rq->idle_stamp = rq_clock(this_rq); + + /* + * Do not pull tasks towards !active CPUs... + */ + if (IS_PWR_BL_ENABLED || !cpu_active(this_cpu)) + return 0; + + rq_unpin_lock(this_rq, rf); + raw_spin_unlock(&this_rq->__lock); + + update_blocked_averages(this_cpu); + + if (IS_CAND_BL_ENABLED) { + pulled_task = idle_pull_global_candidate(this_rq); + if (pulled_task) + goto out; + } + + for_each_online_cpu(cpu) { + /* + * Stop searching for tasks to pull if there are + * now runnable tasks on this rq. + */ + if (this_rq->nr_running > 0) + goto out; + + if (cpu == this_cpu) + continue; + + src_rq = cpu_rq(cpu); + + if (src_rq->nr_running < 2) + continue; + + if (src_rq->nr_running > max) { + max = src_rq->nr_running; + src_cpu = cpu; + } + } + + if (src_cpu != -1) { + src_rq = cpu_rq(src_cpu); + + rq_lock_irqsave(src_rq, &src_rf); + update_rq_clock(src_rq); + + if (src_rq->nr_running < 2) { + rq_unlock(src_rq, &src_rf); + local_irq_restore(src_rf.flags); + } else { + pulled_task = move_task(this_rq, src_rq, &src_rf); + } + } + +out: + raw_spin_lock(&this_rq->__lock); + + /* + * While browsing the domains, we released the rq lock, a task could + * have been enqueued in the meantime. Since we're not going idle, + * pretend we pulled a task. + */ + if (this_rq->cfs.h_nr_running && !pulled_task) + pulled_task = 1; + + /* Is there a task of a high priority class? */ + if (this_rq->nr_running != this_rq->cfs.h_nr_running) + pulled_task = -1; + + if (pulled_task) + this_rq->idle_stamp = 0; + else + nohz_newidle_balance(this_rq); + + rq_repin_lock(this_rq, rf); + + return pulled_task; +} +#endif + /* * run_rebalance_domains is triggered when needed from the scheduler tick. * Also triggered for nohz idle balancing (with nohz_balancing_kick set). @@ -11910,7 +13254,11 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) /* normal load balance */ update_blocked_averages(this_rq->cpu); - rebalance_domains(this_rq, idle); + +#ifdef CONFIG_TT_SCHED + if (IS_CFS_BL_ENABLED) +#endif + rebalance_domains(this_rq, idle); } /* @@ -11918,6 +13266,14 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) */ void trigger_load_balance(struct rq *rq) { +#ifdef CONFIG_TT_SCHED + struct rq *this_rq = rq; + int this_cpu = cpu_of(this_rq); + int cpu; + unsigned int max, min; + struct rq *max_rq, *min_rq, *c_rq; + struct rq_flags src_rf; +#endif /* * Don't need to rebalance while attached to NULL domain or * runqueue CPU is not active @@ -11925,6 +13281,68 @@ void trigger_load_balance(struct rq *rq) if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq)))) return; +#ifdef CONFIG_TT_SCHED + if (IS_CFS_BL_ENABLED) + goto cfs_balance; + + if (this_cpu != 0) + goto out; + + if (IS_CAND_BL_ENABLED) + nohz_try_pull_from_candidate(); + + max = min = this_rq->nr_running; + max_rq = min_rq = this_rq; + + for_each_online_cpu(cpu) { + c_rq = cpu_rq(cpu); + + /* + * Don't need to rebalance while attached to NULL domain or + * runqueue CPU is not active + */ + if (unlikely(on_null_domain(c_rq) || !cpu_active(cpu))) + continue; + + if (c_rq->nr_running < min) { + min = c_rq->nr_running; + min_rq = c_rq; + } + + if (c_rq->nr_running > max) { + max = c_rq->nr_running; + max_rq = c_rq; + } + } + + if (min_rq == max_rq || max - min < 2) + goto out; + + rq_lock_irqsave(max_rq, &src_rf); + update_rq_clock(max_rq); + + if (max_rq->nr_running < 2) { + rq_unlock(max_rq, &src_rf); + local_irq_restore(src_rf.flags); + goto out; + } + + if (IS_PWR_BL_ENABLED && 
idle_cpu(cpu_of(min_rq)) && max - min == 2) + move_task_powersave(min_rq, max_rq, &src_rf); + else + move_task(min_rq, max_rq, &src_rf); + +out: + if (time_after_eq(jiffies, this_rq->next_balance)) { + this_rq->next_balance = jiffies + msecs_to_jiffies(19); + update_blocked_averages(this_rq->cpu); + } + + nohz_balancer_kick(rq); + return; + +cfs_balance: +#endif if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); @@ -12114,6 +13532,19 @@ static void task_fork_fair(struct task_struct *p) struct rq *rq = this_rq(); struct rq_flags rf; +#ifdef CONFIG_TT_SCHED + struct tt_node *ttn = &p->se.tt_node; + + ttn->task_type = TT_NO_TYPE; + ttn->vruntime = 1; + ttn->prev_wait_time = 0; + ttn->wait_time = 0; + ttn->prev_burst = 0; + ttn->burst = 0; + ttn->curr_burst = 0; + ttn->rt_sticky = 0; +#endif + rq_lock(rq, &rf); update_rq_clock(rq); @@ -12125,7 +13556,7 @@ static void task_fork_fair(struct task_struct *p) } place_entity(cfs_rq, se, 1); - if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { + if (sysctl_sched_child_runs_first && curr && entity_before(&curr->tt_node, &se->tt_node)) { /* * Upon rescheduling, sched_class::put_prev_task() will place * 'current' within the tree based on its new key value. @@ -12340,6 +13771,11 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20))); + +#ifdef CONFIG_TT_SCHED + cfs_rq->dedicated_cpu_bound = NULL; +#endif + #ifdef CONFIG_SMP raw_spin_lock_init(&cfs_rq->removed.lock); #endif diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index e9ef66be2870..d74001e371e0 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -238,6 +238,13 @@ static void do_idle(void) { int cpu = smp_processor_id(); +#ifdef CONFIG_TT_SCHED + int pm_disabled = per_cpu(nr_lat_sensitive, cpu) * tt_lat_sens_enabled; + + if (IS_PWR_BL_ENABLED) + pm_disabled = 0; +#endif + /* * Check if we need to update blocked load */ @@ -275,7 +282,11 @@ static void do_idle(void) * broadcast device expired for us, we don't want to go deep * idle as we know that the IPI is going to arrive right away. 
*/ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) { + if (cpu_idle_force_poll || tick_check_broadcast_expired() +#ifdef CONFIG_TT_SCHED + || pm_disabled > 0 +#endif + ) { tick_nohz_idle_restart_tick(); cpu_idle_poll(); } else { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 70abce91b549..93226c928dbb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -97,6 +97,23 @@ # define SCHED_WARN_ON(x) ({ (void)(x), 0; }) #endif +#ifdef CONFIG_TT_SCHED +#define TT_REALTIME 0 +#define TT_INTERACTIVE 1 +#define TT_NO_TYPE 2 +#define TT_CPU_BOUND 3 +#define TT_BATCH 4 + +#define TT_BL_NORM 0 +#define TT_BL_CAND 1 +#define TT_BL_CFS 2 +#define TT_BL_PWR 3 + +#define IS_CAND_BL_ENABLED (tt_balancer_opt == TT_BL_CAND) +#define IS_CFS_BL_ENABLED (tt_balancer_opt == TT_BL_CFS) +#define IS_PWR_BL_ENABLED (tt_balancer_opt == TT_BL_PWR) +#endif + struct rq; struct cpuidle_state; @@ -111,6 +128,17 @@ extern atomic_long_t calc_load_tasks; extern unsigned int sysctl_sched_child_runs_first; +#ifdef CONFIG_TT_SCHED +extern unsigned int tt_balancer_opt; +extern unsigned int tt_max_lifetime; +extern int tt_rt_prio; +extern int tt_interactive_prio; +extern int tt_cpu_bound_prio; +extern int tt_batch_prio; +extern int tt_lat_sens_enabled; +extern int tt_dedicated_cpu_bound_enabled; +#endif + extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); @@ -217,6 +245,15 @@ static inline int task_has_dl_policy(struct task_struct *p) return dl_policy(p->policy); } +#ifdef CONFIG_TT_SCHED +static inline int task_is_lat_sensitive(struct task_struct *p) +{ + unsigned int tt = p->se.tt_node.task_type; + + return (tt <= TT_INTERACTIVE); +} +#endif + #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) static inline void update_avg(u64 *avg, u64 sample) @@ -572,6 +609,12 @@ struct cfs_rq { * It is set to NULL otherwise (i.e when none are currently running). 
*/ struct sched_entity *curr; +#ifdef CONFIG_TT_SCHED + struct tt_node *head; + struct tt_node *dedicated_cpu_bound; + u64 local_cand_hrrn; +#endif /* CONFIG_TT_SCHED */ + struct sched_entity *next; struct sched_entity *last; struct sched_entity *skip; @@ -1829,6 +1872,9 @@ DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); +#ifdef CONFIG_TT_SCHED +DECLARE_PER_CPU(int, nr_lat_sensitive); +#endif extern struct static_key_false sched_asym_cpucapacity; static __always_inline bool sched_asym_cpucap_active(void) @@ -2325,6 +2371,10 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); +#ifdef CONFIG_TT_SCHED +extern int idle_pull_global_candidate(struct rq *dist_rq); +#endif + extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx); static inline struct task_struct *get_push_task(struct rq *rq) @@ -2475,6 +2525,11 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); #define SCHED_NR_MIGRATE_BREAK 32 #endif +#ifdef CONFIG_TT_SCHED +extern inline void inc_nr_lat_sensitive(unsigned int cpu); +extern inline void dec_nr_lat_sensitive(unsigned int cpu); +#endif + extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 98e1a7472fd2..382f80026337 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -83,7 +83,12 @@ #endif /* shared constants to be used in various sysctls */ +#ifdef CONFIG_TT_SCHED +const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1, -20, 19 }; +#else const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; +#endif + EXPORT_SYMBOL(sysctl_vals); const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX }; -- 2.40.1
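The sysctl table added to kernel/sched/core.c registers eight TT knobs; on a kernel built with this patch and CONFIG_TT_SCHED=y they appear under /proc/sys/kernel/, named by the .procname fields above. sched_tt_balancer_opt selects the balancer per the TT_BL_* defines added to kernel/sched/sched.h (0 = TT_BL_NORM, 1 = TT_BL_CAND, 2 = TT_BL_CFS, 3 = TT_BL_PWR). A minimal userspace sketch that dumps them all; tt_knobs.c is a hypothetical helper, not part of the patch:

/* tt_knobs.c - dump the TT sysctls added above (hypothetical helper).
 * The files exist only on a kernel built with this patch. */
#include <stdio.h>

int main(void)
{
	static const char *knobs[] = {
		"sched_tt_balancer_opt", "sched_tt_max_lifetime",
		"sched_tt_rt_prio", "sched_tt_interactive_prio",
		"sched_tt_cpu_bound_prio", "sched_tt_batch_prio",
		"sched_tt_lat_sens_enabled", "sched_tt_dedicated_cpu_bound_enabled",
	};
	char path[96], val[32];

	for (unsigned int i = 0; i < sizeof(knobs) / sizeof(knobs[0]); i++) {
		snprintf(path, sizeof(path), "/proc/sys/kernel/%s", knobs[i]);
		FILE *f = fopen(path, "r");

		if (!f) {
			printf("%-40s (not present)\n", knobs[i]);
			continue;
		}
		if (fgets(val, sizeof(val), f))
			printf("%-40s %s", knobs[i], val);
		fclose(f);
	}
	return 0;
}

On an unpatched kernel every knob prints "(not present)", which doubles as a quick check that the patch is active.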
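Under CONFIG_TT_SCHED, entity_before() replaces the plain vruntime comparison with Highest-Response-Ratio-Next ordering: HRRN_PERCENT() divides the task's accumulated (priority-scaled) vruntime by its lifetime, so the task that has run least relative to its age yields the smaller percentage and is picked first; the `| 1` in the divisor guards against division by zero, and the signed cast keeps the comparison sane across u64 wrap. A toy model of that ordering in userspace types, with illustrative values only:

/* hrrn.c - toy model of the TT pick ordering; userspace, not kernel code. */
#include <stdio.h>
#include <stdint.h>

struct tt_node { uint64_t vruntime, start_time; };

/* smaller percentage == better response ratio == runs first */
static uint64_t hrrn_percent(const struct tt_node *ttn, uint64_t now)
{
	return (ttn->vruntime * 1000ULL) / ((now - ttn->start_time) | 1);
}

static int entity_before(const struct tt_node *a, const struct tt_node *b,
			 uint64_t now)
{
	return (int64_t)(hrrn_percent(a, now) - hrrn_percent(b, now)) < 0;
}

int main(void)
{
	uint64_t now = 10000000000ULL;			/* both are 10 s old */
	struct tt_node a = { 100000000ULL, 0 };		/* ran 0.1 s total   */
	struct tt_node b = { 2000000000ULL, 0 };	/* ran 2 s total     */

	printf("hrrn(a)=%llu hrrn(b)=%llu -> a runs first? %d\n",
	       (unsigned long long)hrrn_percent(&a, now),
	       (unsigned long long)hrrn_percent(&b, now),
	       entity_before(&a, &b, now));
	return 0;
}

Here task a ran 0.1 s of its 10 s lifetime (percent 10) while task b ran 2 s (percent 200), so a wins the pick.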
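The GEQ/LEQ/LES/EQ_D macros compare u64 times through a signed difference, which stays correct even when the subtraction underflows. EQ_D(a, b, d) asks whether a lies within plus or minus d of b; is_realtime() uses it with RT_WAIT_DELTA (800 us) and RT_BURST_DELTA (2 ms) to decide that a task's sleep/burst pattern is repeating. A small standalone demonstration of the same macro shapes:

/* eq_d.c - wrap-safe u64 comparisons, same shape as the macros above. */
#include <stdio.h>
#include <stdint.h>

#define GEQ(a, b)	((int64_t)((a) - (b)) >= 0)	/* is a >= b */
#define LEQ(a, b)	((int64_t)((a) - (b)) <= 0)	/* is a <= b */
#define EQ_D(a, b, d)	(LEQ(a, b + d) && GEQ(a, b - d))

int main(void)
{
	uint64_t prev_wait = 9500000, wait = 10200000;	/* ns */

	/* within RT_WAIT_DELTA (800 us) of the previous wait? -> 1 */
	printf("%d\n", EQ_D(wait, prev_wait, 800000));

	/* still correct when b - d underflows past zero -> 1 */
	printf("%d\n", EQ_D((uint64_t)100, (uint64_t)0, 800000));
	return 0;
}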
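Besides the rbtree, the TT cfs_rq keeps its entities on a plain doubly-linked list rooted at cfs_rq->head: __enqueue_entity() pushes at the head in O(1), __dequeue_entity() unlinks in O(1), and __pick_first_entity() does an O(n) scan for the best HRRN, trading the rbtree's ordered insert for cheap enqueue plus a linear pick. A toy model of those three operations; here hrrn is stored directly rather than recomputed from vruntime and lifetime as the patch does:

/* ttlist.c - toy model of the cfs_rq->head list operations. */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct tt_node {
	struct tt_node *next, *prev;
	uint64_t hrrn;		/* stored directly in this toy */
};

static struct tt_node *head;

static void enqueue(struct tt_node *ttn)	/* O(1) push at head */
{
	ttn->next = ttn->prev = NULL;
	if (head) {
		ttn->next = head;
		head->prev = ttn;
	}
	head = ttn;
}

static void dequeue(struct tt_node *ttn)	/* O(1) unlink */
{
	if (ttn->prev)
		ttn->prev->next = ttn->next;
	else
		head = ttn->next;
	if (ttn->next)
		ttn->next->prev = ttn->prev;
}

static struct tt_node *pick_first(void)		/* O(n) scan, lowest hrrn wins */
{
	struct tt_node *best = head, *n;

	if (!best)
		return NULL;
	for (n = head->next; n; n = n->next)
		if (n->hrrn < best->hrrn)
			best = n;
	return best;
}

int main(void)
{
	struct tt_node a = { .hrrn = 200 }, b = { .hrrn = 10 }, c = { .hrrn = 90 };

	enqueue(&a); enqueue(&b); enqueue(&c);
	printf("pick: %llu\n", (unsigned long long)pick_first()->hrrn);	/* 10 */
	dequeue(&b);
	printf("pick: %llu\n", (unsigned long long)pick_first()->hrrn);	/* 90 */
	return 0;
}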
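detect_type() classifies a task mainly from the integer ratio _hrrn = (wait_time + vruntime) / vruntime: is_cpu_bound() fires when the task has run at least 90% of its life, is_batch() when it has run more than half (ratio < 2), and is_interactive() requires the ratio to be at least INTERACTIVE_HRRN (2) along with a wait-pattern check; is_realtime() additionally demands a stable burst history. A reduced sketch of just the ratio thresholds, deliberately omitting the wait/burst-history and lifetime conditions:

/* classify.c - the _hrrn ratio thresholds only; userspace sketch. */
#include <stdio.h>
#include <stdint.h>

static const char *classify(uint64_t wait_time, uint64_t vruntime)
{
	uint64_t hrrn = (wait_time + vruntime) / vruntime;	/* integer, >= 1 */
	uint64_t pct  = vruntime * 100ULL / (wait_time + vruntime);

	if (pct >= 90)
		return "TT_CPU_BOUND";		/* ran >= 90% of its life          */
	if (hrrn >= 2)
		return "TT_INTERACTIVE";	/* waited at least as long as it ran */
	return "TT_BATCH";			/* ratio < 2: mostly running       */
}

int main(void)
{
	printf("%s\n", classify(900, 100));	/* waits 9x longer than it runs */
	printf("%s\n", classify(5, 95));	/* runs 95 percent of the time  */
	printf("%s\n", classify(60, 100));	/* runs more than it waits      */
	return 0;
}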
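convert_to_vruntime() is where the tt_*_prio sysctls bite: the effective nice value (the task's own, or the per-type override) shifts each vruntime charge by prio * 1000000 / FACTOR nanoseconds, clamped so the charge never goes negative. A worked userspace example; HZ=250 is an assumption here (not fixed by the patch), which makes HZ_PERIOD 4 ms and FACTOR 10:

/* vrt.c - priority bias of vruntime charging; assumes HZ=250. */
#include <stdio.h>
#include <stdint.h>

#define HZ		250
#define HZ_PERIOD	(1000000000 / HZ)		/* 4,000,000 ns */
#define RACE_TIME	40000000
#define FACTOR		(RACE_TIME / HZ_PERIOD)		/* 10           */

static uint64_t convert_to_vruntime(uint64_t delta, int prio)
{
	int64_t prio_diff;

	if (prio == 0)
		return delta;

	prio_diff = (int64_t)prio * 1000000;
	prio_diff /= FACTOR;

	if ((int64_t)(delta + prio_diff) < 0)
		return 1;	/* never charge a negative amount */

	return delta + prio_diff;
}

int main(void)
{
	uint64_t tick = HZ_PERIOD;	/* charge for one full tick */

	printf("nice   0: %llu ns\n", (unsigned long long)convert_to_vruntime(tick, 0));
	printf("nice -20: %llu ns\n", (unsigned long long)convert_to_vruntime(tick, -20));
	printf("nice  19: %llu ns\n", (unsigned long long)convert_to_vruntime(tick, 19));
	printf("clamped : %llu ns\n", (unsigned long long)convert_to_vruntime(1000000, -20));
	return 0;
}

With these numbers a nice -20 task is charged 2 ms less per 4 ms tick (half price), a nice 19 task nearly 2 ms more, and a charge that would go negative is clamped to 1 ns.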
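normalize_lifetime() keeps the response ratio responsive by shrinking history: `tt_max_lifetime << 20` is a cheap ms-to-ns conversion (the default 22000 ms becomes about 23.07 s), and once a task's lifetime exceeds that cap, start_time and vruntime are rescaled so the lifetime window halves while the lifetime/vruntime ratio is preserved. The same arithmetic replayed in userspace:

/* normalize.c - replay of the rescaling arithmetic; userspace sketch. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t tt_max_lifetime = 22000;		/* ms, sysctl default */
	uint64_t max_life_ns = tt_max_lifetime << 20;	/* ~= ms * 1e6 -> ns  */
	uint64_t start_time = 0;
	uint64_t vruntime = 4000000000ULL;		/* charged 4 s so far */
	uint64_t now = max_life_ns + 123456789;		/* just past the cap  */
	uint64_t life_time, old_hrrn_x;

	printf("cap = %.3f s\n", max_life_ns / 1e9);	/* ~23.069 s */

	/* multiply life_time by 1024 for more precision (<<7 over >>3) */
	life_time  = now - start_time;
	old_hrrn_x = (life_time << 7) / ((vruntime >> 3) | 1);

	/* reset life to half the cap, rescale vruntime to keep the ratio */
	start_time = now - (max_life_ns >> 1);
	if (old_hrrn_x == 0)
		old_hrrn_x = 1;
	vruntime = ((max_life_ns << 9) / old_hrrn_x) | 1;

	printf("new life = %.3f s, new vruntime = %.3f s, life/vruntime ~ %.2f\n",
	       (now - start_time) / 1e9, vruntime / 1e9,
	       (double)(now - start_time) / (double)vruntime);
	return 0;
}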