From 2081d38ee9220283ce3cc986db324dfa6e0264e2 Mon Sep 17 00:00:00 2001
From: Peter Jung
Date: Thu, 5 May 2022 20:22:12 +0200
Subject: [PATCH 12/17] xanmod

Signed-off-by: Peter Jung
---
 .../admin-guide/kernel-parameters.txt |    6 +-
 block/mq-deadline.c                   |    2 +-
 init/Kconfig                          |   12 +
 kernel/rcu/rcutorture.c               |    2 +-
 kernel/rcu/tree.c                     |    2 +-
 kernel/rcu/tree_plugin.h              |    4 +-
 kernel/sched/autogroup.c              |   10 +-
 kernel/sched/wait.c                   |    1 +
 net/ipv4/netfilter/Kconfig            |    9 +
 net/mac80211/iface.c                  |    3 +-
 net/netfilter/Kconfig                 |    8 +
 net/netfilter/Makefile                |    3 +
 net/netfilter/xt_FULLCONENAT.c        | 1406 +++++++++++++++++
 13 files changed, 1456 insertions(+), 12 deletions(-)
 create mode 100644 net/netfilter/xt_FULLCONENAT.c

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 42d0b47abfaf..dee10d627153 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -376,6 +376,10 @@
 	autoconf=	[IPV6]
 			See Documentation/networking/ipv6.rst.
 
+	autogroup=	[KNL] Enable or disable scheduler automatic task group
+			creation.
+			Format: <bool>
+
 	show_lapic=	[APIC,X86] Advanced Programmable Interrupt Controller
 			Limit apic dumping. The parameter defines the maximal
 			number of local apics being dumped. Also it is possible
@@ -3365,8 +3369,6 @@
 	noapic		[SMP,APIC] Tells the kernel to not make use of any
 			IOAPICs that may be present in the system.
 
-	noautogroup	Disable scheduler automatic task group creation.
-
 	nobats		[PPC] Do not use BATs for mapping kernel lowmem
 			on "Classic" PPC cores.
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 32a6c35de2f0..625778c83799 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -635,7 +635,7 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
 	dd->fifo_expire[DD_READ] = read_expire;
 	dd->fifo_expire[DD_WRITE] = write_expire;
 	dd->writes_starved = writes_starved;
-	dd->front_merges = 1;
+	dd->front_merges = 0;
 	dd->last_dir = DD_WRITE;
 	dd->fifo_batch = fifo_batch;
 	dd->prio_aging_expire = prio_aging_expire;
diff --git a/init/Kconfig b/init/Kconfig
index 40e86924d221..d5aebf11d2ae 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1294,6 +1294,18 @@ config SCHED_AUTOGROUP
 	  desktop applications.  Task group autogeneration is currently based
 	  upon task session.
 
+config SCHED_AUTOGROUP_DEFAULT_ENABLED
+	bool "Enable automatic process group scheduling feature"
+	default y
+	depends on SCHED_AUTOGROUP
+	help
+	  If set, automatic process group scheduling will be enabled by
+	  default, but it can be disabled by passing autogroup=0 on the
+	  kernel command line during boot, or at runtime by writing 0 to
+	  /proc/sys/kernel/sched_autogroup_enabled.
+
+	  If unsure, say Y.
+
 config SYSFS_DEPRECATED
 	bool "Enable deprecated sysfs features to support old userspace tools"
 	depends on SYSFS
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 422f7e4cc08d..e61bb8bcbcb2 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -3282,7 +3282,7 @@ rcu_torture_init(void)
 			t = per_cpu(ksoftirqd, cpu);
 			WARN_ON_ONCE(!t);
 			sp.sched_priority = 2;
-			sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+			sched_setscheduler_nocheck(t, SCHED_RR, &sp);
 		}
 	}
 }
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5dae0da879ae..71b8522fc0d2 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4466,7 +4466,7 @@ static int __init rcu_spawn_gp_kthread(void)
 		return 0;
 	if (kthread_prio) {
 		sp.sched_priority = kthread_prio;
-		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+		sched_setscheduler_nocheck(t, SCHED_RR, &sp);
 	}
 	rnp = rcu_get_root();
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 5678bee7aefe..dfa39cc105b0 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1000,7 +1000,7 @@ static void rcu_cpu_kthread_setup(unsigned int cpu)
 	struct sched_param sp;
 
 	sp.sched_priority = kthread_prio;
-	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+	sched_setscheduler_nocheck(current, SCHED_RR, &sp);
 #endif /* #ifdef CONFIG_RCU_BOOST */
 }
 
@@ -1186,7 +1186,7 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
 	rnp->boost_kthread_task = t;
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	sp.sched_priority = kthread_prio;
-	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+	sched_setscheduler_nocheck(t, SCHED_RR, &sp);
 	wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
 }
 
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 8629b37d118e..e6c8d28a85ca 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -5,7 +5,8 @@
 #include
 #include "sched.h"
 
-unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
+unsigned int __read_mostly sysctl_sched_autogroup_enabled =
+	IS_ENABLED(CONFIG_SCHED_AUTOGROUP_DEFAULT_ENABLED) ? 1 : 0;
 static struct autogroup autogroup_default;
 static atomic_t autogroup_seq_nr;
 
@@ -197,11 +198,12 @@ void sched_autogroup_exit(struct signal_struct *sig)
 
 static int __init setup_autogroup(char *str)
 {
-	sysctl_sched_autogroup_enabled = 0;
-
+	unsigned long enabled;
+	if (!kstrtoul(str, 0, &enabled))
+		sysctl_sched_autogroup_enabled = enabled ? 1 : 0;
 	return 1;
 }
-__setup("noautogroup", setup_autogroup);
+__setup("autogroup=", setup_autogroup);
 
 #ifdef CONFIG_PROC_FS
 
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 0306fa23b4f8..7a674a911f7a 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -255,6 +255,7 @@ void __wake_up_pollfree(struct wait_queue_head *wq_head)
 	/* POLLFREE must have cleared the queue. */
 	WARN_ON_ONCE(waitqueue_active(wq_head));
 }
+EXPORT_SYMBOL(__wake_up_pollfree);
 
 /*
  * Note: we use "set_current_state()" _after_ the wait-queue add,
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index aab384126f61..83ecd0e61bbf 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -237,6 +237,15 @@ config IP_NF_TARGET_NETMAP
 	  (e.g. when running oldconfig). It selects
 	  CONFIG_NETFILTER_XT_TARGET_NETMAP.
 
+config IP_NF_TARGET_FULLCONENAT
+	tristate "FULLCONENAT target support"
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_XT_TARGET_FULLCONENAT
+	help
+	  This is a backwards-compat option for the user's convenience
+	  (e.g. when running oldconfig). It selects
+	  CONFIG_NETFILTER_XT_TARGET_FULLCONENAT.
+
 config IP_NF_TARGET_REDIRECT
 	tristate "REDIRECT target support"
 	depends on NETFILTER_ADVANCED
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 41531478437c..b0a941c4f46d 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -63,7 +63,8 @@ bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata)
 	if (sdata->user_power_level != IEEE80211_UNSET_POWER_LEVEL)
 		power = min(power, sdata->user_power_level);
 
-	if (sdata->ap_power_level != IEEE80211_UNSET_POWER_LEVEL)
+	if (sdata->ap_power_level != IEEE80211_UNSET_POWER_LEVEL &&
+	    sdata->vif.bss_conf.txpower_type != NL80211_TX_POWER_FIXED)
 		power = min(power, sdata->ap_power_level);
 
 	if (power != sdata->vif.bss_conf.txpower) {
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index ddc54b6d18ee..8af6d1f8dbb4 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -988,6 +988,14 @@ config NETFILTER_XT_TARGET_NETMAP
 
 	  To compile it as a module, choose M here. If unsure, say N.
 
+config NETFILTER_XT_TARGET_FULLCONENAT
+	tristate '"FULLCONENAT" target support'
+	depends on NF_NAT
+	help
+	  Full Cone NAT
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
 config NETFILTER_XT_TARGET_NFLOG
 	tristate '"NFLOG" target support'
 	default m if NETFILTER_ADVANCED=n
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index a135b1a46014..6003882708b8 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -214,3 +214,6 @@ obj-$(CONFIG_IP_VS) += ipvs/
 
 # lwtunnel
 obj-$(CONFIG_LWTUNNEL) += nf_hooks_lwtunnel.o
+
+# FullconeNAT
+obj-$(CONFIG_NETFILTER_XT_TARGET_FULLCONENAT) += xt_FULLCONENAT.o
diff --git a/net/netfilter/xt_FULLCONENAT.c b/net/netfilter/xt_FULLCONENAT.c
new file mode 100644
index 000000000000..7bc0baedd47b
--- /dev/null
+++ b/net/netfilter/xt_FULLCONENAT.c
@@ -0,0 +1,1406 @@
+/*
+ * Copyright (c) 2018 Chion Tang
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +#include +#endif +#include +#include +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +#include +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0) +#include +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) +#include +#if IS_ENABLED(CONFIG_NF_NAT_MASQUERADE_IPV6) +#include +#endif +#endif + +#define HASH_2(x, y) ((x + y) / 2 * (x + y + 1) + y) + +#define HASHTABLE_BUCKET_BITS 10 + +#ifndef NF_NAT_RANGE_PROTO_RANDOM_FULLY +#define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) + +static inline int nf_ct_netns_get(struct net *net, u8 nfproto) { return 0; } + +static inline void nf_ct_netns_put(struct net *net, u8 nfproto) {} + +static inline struct net_device *xt_in(const struct xt_action_param *par) { + return (struct net_device *)par->in; +} + +static inline struct net_device *xt_out(const struct xt_action_param *par) { + return (struct net_device *)par->out; +} + +static inline unsigned int xt_hooknum(const struct xt_action_param *par) { + return par->hooknum; +} + +#endif + +struct nat_mapping_original_tuple { + struct nf_conntrack_tuple tuple; + + struct list_head node; +}; + +struct nat_mapping { + uint16_t port; /* external source port */ + __be32 addr; /* external source ip address */ + + __be32 int_addr; /* internal source ip address */ + uint16_t int_port; /* internal source port */ + + int refer_count; /* how many references linked to this mapping + * aka. length of original_tuple_list */ + + struct list_head original_tuple_list; + + struct hlist_node node_by_ext_port; + struct hlist_node node_by_int_src; + +}; + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +struct nat_mapping6 { + uint16_t port; /* external source port */ + union nf_inet_addr addr; /* external source ip address */ + + union nf_inet_addr int_addr; /* internal source ip address */ + uint16_t int_port; /* internal source port */ + + int refer_count; /* how many references linked to this mapping + * aka. 
length of original_tuple_list */ + + struct list_head original_tuple_list; + + struct hlist_node node_by_ext_port; + struct hlist_node node_by_int_src; + +}; +#endif + +struct tuple_list { + struct nf_conntrack_tuple tuple_original; + struct nf_conntrack_tuple tuple_reply; + struct list_head list; +}; + +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +struct notifier_block ct_event_notifier; +#else +struct nf_ct_event_notifier ct_event_notifier; +#endif +int tg_refer_count = 0; +int ct_event_notifier_registered = 0; + +static DEFINE_MUTEX(nf_ct_net_event_lock); + +static DEFINE_HASHTABLE(mapping_table_by_ext_port, HASHTABLE_BUCKET_BITS); +static DEFINE_HASHTABLE(mapping_table_by_int_src, HASHTABLE_BUCKET_BITS); + +static DEFINE_SPINLOCK(fullconenat_lock); + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +static DEFINE_HASHTABLE(mapping6_table_by_ext_port, HASHTABLE_BUCKET_BITS); +static DEFINE_HASHTABLE(mapping6_table_by_int_src, HASHTABLE_BUCKET_BITS); + +static DEFINE_SPINLOCK(fullconenat6_lock); +#endif + +static LIST_HEAD(dying_tuple_list); +static DEFINE_SPINLOCK(dying_tuple_list_lock); +static void gc_worker(struct work_struct *work); +static struct workqueue_struct *wq __read_mostly = NULL; +static DECLARE_DELAYED_WORK(gc_worker_wk, gc_worker); + +static char tuple_tmp_string[512]; + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +/* non-atomic: can only be called serially within lock zones. */ +static char* nf_ct_stringify_tuple6(const struct nf_conntrack_tuple *t) { + snprintf(tuple_tmp_string, sizeof(tuple_tmp_string), "[%pI6c]:%hu -> [%pI6c]:%hu", + &t->src.u3.ip6, be16_to_cpu(t->src.u.all), + &t->dst.u3.ip6, be16_to_cpu(t->dst.u.all)); + return tuple_tmp_string; +} + +static struct nat_mapping6* allocate_mapping6(const union nf_inet_addr *int_addr, const uint16_t int_port, const uint16_t port, const union nf_inet_addr *addr) { + struct nat_mapping6 *p_new; + u32 hash_src; + + p_new = kmalloc(sizeof(struct nat_mapping6), GFP_ATOMIC); + if (p_new == NULL) { + pr_debug("xt_FULLCONENAT: ERROR: kmalloc() for new nat_mapping failed.\n"); + return NULL; + } + p_new->addr = *addr; + p_new->port = port; + p_new->int_addr = *int_addr; + p_new->int_port = int_port; + p_new->refer_count = 0; + (p_new->original_tuple_list).next = &(p_new->original_tuple_list); + (p_new->original_tuple_list).prev = &(p_new->original_tuple_list); + + hash_src = jhash2((u32 *)int_addr->all, 4, (u32)int_port); + + hash_add(mapping6_table_by_ext_port, &p_new->node_by_ext_port, port); + hash_add(mapping6_table_by_int_src, &p_new->node_by_int_src, hash_src); + + pr_debug("xt_FULLCONENAT: new mapping allocated for [%pI6c]:%d ==> [%pI6c]:%d\n", + &p_new->int_addr, p_new->int_port, &p_new->addr, p_new->port); + + return p_new; +} + +static void add_original_tuple_to_mapping6(struct nat_mapping6 *mapping, const struct nf_conntrack_tuple* original_tuple) { + struct nat_mapping_original_tuple *item = kmalloc(sizeof(struct nat_mapping_original_tuple), GFP_ATOMIC); + if (item == NULL) { + pr_debug("xt_FULLCONENAT: ERROR: kmalloc() for nat_mapping_original_tuple failed.\n"); + return; + } + memcpy(&item->tuple, original_tuple, sizeof(struct nf_conntrack_tuple)); + list_add(&item->node, &mapping->original_tuple_list); + (mapping->refer_count)++; +} + +static struct nat_mapping6* get_mapping6_by_int_src(const union nf_inet_addr *src_ip, const uint16_t src_port, const union nf_inet_addr *ext_ip) { + 
struct nat_mapping6 *p_current; + u32 hash_src = jhash2((u32 *)src_ip->all, 4, (u32)src_port); + + hash_for_each_possible(mapping6_table_by_int_src, p_current, node_by_int_src, hash_src) { + if (nf_inet_addr_cmp(&p_current->int_addr, src_ip) && p_current->int_port == src_port && nf_inet_addr_cmp(&p_current->addr, ext_ip)) { + return p_current; + } + } + + return NULL; +} + +static struct nat_mapping6* get_mapping6_by_int_src_inrange(const union nf_inet_addr *src_ip, const uint16_t src_port, const union nf_inet_addr *min_ip, const union nf_inet_addr *max_ip) { + struct nat_mapping6 *p_current; + u32 hash_src = jhash2((u32 *)src_ip->all, 4, (u32)src_port); + + hash_for_each_possible(mapping6_table_by_int_src, p_current, node_by_int_src, hash_src) { + if (nf_inet_addr_cmp(&p_current->int_addr, src_ip) && p_current->int_port == src_port && memcmp(&p_current->addr, min_ip, sizeof(union nf_inet_addr)) >= 0 && memcmp(&p_current->addr, max_ip, sizeof(union nf_inet_addr)) <= 0) { + return p_current; + } + } + + return NULL; +} + +static void kill_mapping6(struct nat_mapping6 *mapping) { + struct list_head *iter, *tmp; + struct nat_mapping_original_tuple *original_tuple_item; + + if (mapping == NULL) { + return; + } + + list_for_each_safe(iter, tmp, &mapping->original_tuple_list) { + original_tuple_item = list_entry(iter, struct nat_mapping_original_tuple, node); + list_del(&original_tuple_item->node); + kfree(original_tuple_item); + } + + hash_del(&mapping->node_by_ext_port); + hash_del(&mapping->node_by_int_src); + kfree(mapping); +} + +/* check if a mapping is valid. + * possibly delete and free an invalid mapping. + * the mapping should not be used anymore after check_mapping6() returns 0. */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static int check_mapping6(struct nat_mapping6* mapping, struct net *net, const struct nf_conntrack_zone *zone) { +#else +static int check_mapping6(struct nat_mapping6* mapping, struct net *net, const u16 zone) { +#endif + struct list_head *iter, *tmp; + struct nat_mapping_original_tuple *original_tuple_item; + struct nf_conntrack_tuple_hash *tuple_hash; + struct nf_conn *ct; + + /* for dying/unconfirmed conntrack tuples, an IPCT_DESTROY event may NOT be fired. + * so we manually kill one of those tuples once we acquire one. */ + + list_for_each_safe(iter, tmp, &mapping->original_tuple_list) { + original_tuple_item = list_entry(iter, struct nat_mapping_original_tuple, node); + + tuple_hash = nf_conntrack_find_get(net, zone, &original_tuple_item->tuple); + + if (tuple_hash == NULL) { + pr_debug("xt_FULLCONENAT: check_mapping6(): tuple %s dying/unconfirmed. 
free this tuple.\n", nf_ct_stringify_tuple6(&original_tuple_item->tuple)); + + list_del(&original_tuple_item->node); + kfree(original_tuple_item); + (mapping->refer_count)--; + } else { + ct = nf_ct_tuplehash_to_ctrack(tuple_hash); + if (likely(ct != NULL)) + nf_ct_put(ct); + } + + } + + /* kill the mapping if need */ + pr_debug("xt_FULLCONENAT: check_mapping6() refer_count for mapping at ext_port %d is now %d\n", mapping->port, mapping->refer_count); + if (mapping->refer_count <= 0) { + pr_debug("xt_FULLCONENAT: check_mapping6(): kill dying/unconfirmed mapping at ext port %d\n", mapping->port); + kill_mapping6(mapping); + return 0; + } else { + return 1; + } +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static struct nat_mapping6* get_mapping6_by_ext_port(const uint16_t port, const union nf_inet_addr *ext_ip, struct net *net, const struct nf_conntrack_zone *zone) { +#else +static struct nat_mapping6* get_mapping6_by_ext_port(const uint16_t port, const union nf_inet_addr *ext_ip, struct net *net, const u16 zone) { +#endif + struct nat_mapping6 *p_current; + struct hlist_node *tmp; + + hash_for_each_possible_safe(mapping6_table_by_ext_port, p_current, tmp, node_by_ext_port, port) { + if (p_current->port == port && check_mapping6(p_current, net, zone) && nf_inet_addr_cmp(&p_current->addr, ext_ip)) { + return p_current; + } + } + + return NULL; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static uint16_t find_appropriate_port6(struct net *net, const struct nf_conntrack_zone *zone, const uint16_t original_port, const union nf_inet_addr *ext_ip, const struct nf_nat_range *range) { +#else +static uint16_t find_appropriate_port6(struct net *net, const u16 zone, const uint16_t original_port, const union nf_inet_addr *ext_ip, const struct nf_nat_range *range) { +#endif + uint16_t min, start, selected, range_size, i; + struct nat_mapping6* mapping = NULL; + + if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { + min = be16_to_cpu((range->min_proto).udp.port); + range_size = be16_to_cpu((range->min_proto).udp.port) - min + 1; + } else { + /* minimum port is 1024. same behavior as default linux NAT. */ + min = 1024; + range_size = 65535 - min + 1; + } + + if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM) + || (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY)) { + /* for now we do the same thing for both --random and --random-fully */ + + /* select a random starting point */ + start = (uint16_t)(prandom_u32() % (u32)range_size); + } else { + + if ((original_port >= min && original_port <= min + range_size - 1) + || !(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { + /* 1. try to preserve the port if it's available */ + mapping = get_mapping6_by_ext_port(original_port, ext_ip, net, zone); + if (mapping == NULL) { + return original_port; + } + } + + /* otherwise, we start from zero */ + start = 0; + } + + for (i = 0; i < range_size; i++) { + /* 2. try to find an available port */ + selected = min + ((start + i) % range_size); + mapping = get_mapping6_by_ext_port(selected, ext_ip, net, zone); + if (mapping == NULL) { + return selected; + } + } + + /* 3. at least we tried. override a previous mapping. 
*/ + selected = min + start; + mapping = get_mapping6_by_ext_port(selected, ext_ip, net, zone); + kill_mapping6(mapping); + + return selected; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static void find_leastused_ip6(const struct nf_conntrack_zone *zone, const struct nf_nat_range *range, const union nf_inet_addr *src, const union nf_inet_addr *dst, union nf_inet_addr *var_ipp) +#else +static void find_leastused_ip6(const u16 zone, const struct nf_nat_range *range, const union nf_inet_addr *src, const union nf_inet_addr *dst, union nf_inet_addr *var_ipp) +#endif +{ + unsigned int i; + /* Host order */ + u32 minip, maxip, j, dist; + bool full_range; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) + j = jhash2((u32 *)src, 4, range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst->all[3] ^ zone->id); +#else + j = jhash2((u32 *)src, 4, range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst->all[3] ^ zone); +#endif + + full_range = false; + for (i = 0; i <= 3; i++) { + /* If first bytes of the address are at the maximum, use the + * distance. Otherwise use the full range. */ + if (!full_range) { + minip = ntohl(range->min_addr.all[i]); + maxip = ntohl(range->max_addr.all[i]); + dist = maxip - minip + 1; + } else { + minip = 0; + dist = ~0; + } + + var_ipp->all[i] = (__force __be32) htonl(minip + reciprocal_scale(j, dist)); + if (var_ipp->all[i] != range->max_addr.all[i]) + full_range = true; + + if (!(range->flags & NF_NAT_RANGE_PERSISTENT)) + j ^= (__force u32)dst->all[i]; + } +} + +static unsigned int fullconenat_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_range *range; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) + const struct nf_conntrack_zone *zone; +#else + u16 zone; +#endif + struct net *net; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct nf_conn_nat *nat; + struct nf_conntrack_tuple *ct_tuple, *ct_tuple_origin; + + struct nat_mapping6 *mapping, *src_mapping; + unsigned int ret; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 newrange; +#else + struct nf_nat_range newrange; +#endif + + union nf_inet_addr *ip; + uint16_t port, original_port, want_port; + uint8_t protonum; + + ip = NULL; + original_port = 0; + src_mapping = NULL; + + range = par->targinfo; + + mapping = NULL; + ret = XT_CONTINUE; + + ct = nf_ct_get(skb, &ctinfo); + net = nf_ct_net(ct); + zone = nf_ct_zone(ct); + + newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + + if (xt_hooknum(par) == NF_INET_PRE_ROUTING) { + /* inbound packets */ + ct_tuple_origin = &(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + + protonum = (ct_tuple_origin->dst).protonum; + if (protonum != IPPROTO_UDP) { + return ret; + } + ip = &(ct_tuple_origin->dst).u3; + port = be16_to_cpu((ct_tuple_origin->dst).u.udp.port); + + spin_lock_bh(&fullconenat6_lock); + + /* find an active mapping based on the inbound port */ + mapping = get_mapping6_by_ext_port(port, ip, net, zone); + if (mapping != NULL) { + newrange.flags = NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED; + newrange.min_addr = mapping->int_addr; + newrange.max_addr = mapping->int_addr; + newrange.min_proto.udp.port = cpu_to_be16(mapping->int_port); + newrange.max_proto = newrange.min_proto; + + pr_debug("xt_FULLCONENAT: %s ==> [%pI6c]:%d\n", nf_ct_stringify_tuple6(ct_tuple_origin), &mapping->int_addr, mapping->int_port); + + ret = nf_nat_setup_info(ct, &newrange, HOOK2MANIP(xt_hooknum(par))); + + if (ret == 
NF_ACCEPT) { + add_original_tuple_to_mapping6(mapping, ct_tuple_origin); + pr_debug("xt_FULLCONENAT: fullconenat_tg6(): INBOUND: refer_count for mapping at ext_port %d is now %d\n", mapping->port, mapping->refer_count); + } + } + spin_unlock_bh(&fullconenat6_lock); + return ret; + + } else if (xt_hooknum(par) == NF_INET_POST_ROUTING) { + /* outbound packets */ + ct_tuple_origin = &(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + protonum = (ct_tuple_origin->dst).protonum; + + if(range->flags & NF_NAT_RANGE_MAP_IPS) { + newrange.min_addr = range->min_addr; + newrange.max_addr = range->max_addr; + } else { + if (unlikely(ipv6_dev_get_saddr(nf_ct_net(ct), xt_out(par), &ipv6_hdr(skb)->daddr, 0, (struct in6_addr*)&newrange.min_addr) < 0)) + return NF_DROP; + newrange.max_addr = newrange.min_addr; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) + nat = nf_ct_nat_ext_add(ct); +#else + nat = nfct_nat(ct); +#endif + if (likely(nat)) + nat->masq_index = xt_out(par)->ifindex; + + } + + if (protonum == IPPROTO_UDP) { + ip = &(ct_tuple_origin->src).u3; + original_port = be16_to_cpu((ct_tuple_origin->src).u.udp.port); + + spin_lock_bh(&fullconenat6_lock); + + if (!nf_inet_addr_cmp(&newrange.min_addr, &newrange.max_addr)) + src_mapping = get_mapping6_by_int_src_inrange(ip, original_port, &newrange.min_addr, &newrange.max_addr); + else + src_mapping = get_mapping6_by_int_src(ip, original_port, &newrange.min_addr); + + if (src_mapping != NULL && check_mapping6(src_mapping, net, zone)) { + + /* outbound nat: if a previously established mapping is active, + * we will reuse that mapping. */ + + newrange.flags = NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED; + newrange.min_proto.udp.port = cpu_to_be16(src_mapping->port); + newrange.max_proto = newrange.min_proto; + if (!nf_inet_addr_cmp(&newrange.min_addr, &newrange.max_addr)) { + newrange.min_addr = src_mapping->addr; + newrange.max_addr = newrange.min_addr; + } + + } else { + /* if not, we find a new external IP:port to map to. + * the SNAT may fail so we should re-check the mapped port later. */ + if (!nf_inet_addr_cmp(&newrange.min_addr, &newrange.max_addr)) { + find_leastused_ip6(zone, range, ip, &(ct_tuple_origin->dst).u3, &newrange.min_addr); + newrange.max_addr = newrange.min_addr; + } + + want_port = find_appropriate_port6(net, zone, original_port, &newrange.min_addr, range); + + newrange.flags = NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED; + newrange.min_proto.udp.port = cpu_to_be16(want_port); + newrange.max_proto = newrange.min_proto; + + src_mapping = NULL; + + } + } + + /* do SNAT now */ + ret = nf_nat_setup_info(ct, &newrange, HOOK2MANIP(xt_hooknum(par))); + + if (protonum != IPPROTO_UDP) { + /* non-UDP packets, bailout */ + return ret; + } + if (ret != NF_ACCEPT) { + /* failed SNAT, bailout */ + spin_unlock_bh(&fullconenat6_lock); + return ret; + } + + /* the reply tuple contains the mapped port. */ + ct_tuple = &(ct->tuplehash[IP_CT_DIR_REPLY].tuple); + /* this is the resulted mapped port. 
*/ + port = be16_to_cpu((ct_tuple->dst).u.udp.port); + + pr_debug("xt_FULLCONENAT: %s ==> %d\n", nf_ct_stringify_tuple6(ct_tuple_origin), port); + + /* save the mapping information into our mapping table */ + mapping = src_mapping; + if (mapping == NULL) { + mapping = allocate_mapping6(ip, original_port, port, &(ct_tuple->dst).u3); + } + if (likely(mapping != NULL)) { + add_original_tuple_to_mapping6(mapping, ct_tuple_origin); + pr_debug("xt_FULLCONENAT: fullconenat_tg6(): OUTBOUND: refer_count for mapping at ext_port %d is now %d\n", mapping->port, mapping->refer_count); + } + + spin_unlock_bh(&fullconenat6_lock); + return ret; + } + + return ret; +} +#endif + +/* non-atomic: can only be called serially within lock zones. */ +static char* nf_ct_stringify_tuple(const struct nf_conntrack_tuple *t) { + snprintf(tuple_tmp_string, sizeof(tuple_tmp_string), "%pI4:%hu -> %pI4:%hu", + &t->src.u3.ip, be16_to_cpu(t->src.u.all), + &t->dst.u3.ip, be16_to_cpu(t->dst.u.all)); + return tuple_tmp_string; +} + +static struct nat_mapping* allocate_mapping(const __be32 int_addr, const uint16_t int_port, const uint16_t port, const __be32 addr) { + struct nat_mapping *p_new; + u32 hash_src; + + p_new = kmalloc(sizeof(struct nat_mapping), GFP_ATOMIC); + if (p_new == NULL) { + pr_debug("xt_FULLCONENAT: ERROR: kmalloc() for new nat_mapping failed.\n"); + return NULL; + } + p_new->addr = addr; + p_new->port = port; + p_new->int_addr = int_addr; + p_new->int_port = int_port; + p_new->refer_count = 0; + (p_new->original_tuple_list).next = &(p_new->original_tuple_list); + (p_new->original_tuple_list).prev = &(p_new->original_tuple_list); + + hash_src = HASH_2(int_addr, (u32)int_port); + + hash_add(mapping_table_by_ext_port, &p_new->node_by_ext_port, port); + hash_add(mapping_table_by_int_src, &p_new->node_by_int_src, hash_src); + + pr_debug("xt_FULLCONENAT: new mapping allocated for %pI4:%d ==> %pI4:%d\n", + &p_new->int_addr, p_new->int_port, &p_new->addr, p_new->port); + + return p_new; +} + +static void add_original_tuple_to_mapping(struct nat_mapping *mapping, const struct nf_conntrack_tuple* original_tuple) { + struct nat_mapping_original_tuple *item = kmalloc(sizeof(struct nat_mapping_original_tuple), GFP_ATOMIC); + if (item == NULL) { + pr_debug("xt_FULLCONENAT: ERROR: kmalloc() for nat_mapping_original_tuple failed.\n"); + return; + } + memcpy(&item->tuple, original_tuple, sizeof(struct nf_conntrack_tuple)); + list_add(&item->node, &mapping->original_tuple_list); + (mapping->refer_count)++; +} + +static struct nat_mapping* get_mapping_by_int_src(const __be32 src_ip, const uint16_t src_port, const __be32 ext_ip) { + struct nat_mapping *p_current; + u32 hash_src = HASH_2(src_ip, (u32)src_port); + + hash_for_each_possible(mapping_table_by_int_src, p_current, node_by_int_src, hash_src) { + if (p_current->int_addr == src_ip && p_current->int_port == src_port && p_current->addr == ext_ip) { + return p_current; + } + } + + return NULL; +} + +static struct nat_mapping* get_mapping_by_int_src_inrange(const __be32 src_ip, const uint16_t src_port, const __be32 min_ip, const __be32 max_ip) { + struct nat_mapping *p_current; + u32 hash_src = HASH_2(src_ip, (u32)src_port); + + hash_for_each_possible(mapping_table_by_int_src, p_current, node_by_int_src, hash_src) { + if (p_current->int_addr == src_ip && p_current->int_port == src_port && memcmp(&p_current->addr, &min_ip, sizeof(__be32)) >=0 && memcmp(&p_current->addr, &max_ip, sizeof(__be32)) <= 0) { + return p_current; + } + } + + return NULL; +} + +static void 
kill_mapping(struct nat_mapping *mapping) { + struct list_head *iter, *tmp; + struct nat_mapping_original_tuple *original_tuple_item; + + if (mapping == NULL) { + return; + } + + list_for_each_safe(iter, tmp, &mapping->original_tuple_list) { + original_tuple_item = list_entry(iter, struct nat_mapping_original_tuple, node); + list_del(&original_tuple_item->node); + kfree(original_tuple_item); + } + + hash_del(&mapping->node_by_ext_port); + hash_del(&mapping->node_by_int_src); + kfree(mapping); +} + +static void destroy_mappings(void) { + struct nat_mapping *p_current; +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) + struct nat_mapping6 *p6_current; +#endif + struct hlist_node *tmp; + int i; + + spin_lock_bh(&fullconenat_lock); + + hash_for_each_safe(mapping_table_by_ext_port, i, tmp, p_current, node_by_ext_port) { + kill_mapping(p_current); + } + + spin_unlock_bh(&fullconenat_lock); + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) + spin_lock_bh(&fullconenat6_lock); + + hash_for_each_safe(mapping6_table_by_ext_port, i, tmp, p6_current, node_by_ext_port) { + kill_mapping6(p6_current); + } + + spin_unlock_bh(&fullconenat6_lock); +#endif +} + +/* check if a mapping is valid. + * possibly delete and free an invalid mapping. + * the mapping should not be used anymore after check_mapping() returns 0. */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static int check_mapping(struct nat_mapping* mapping, struct net *net, const struct nf_conntrack_zone *zone) { +#else +static int check_mapping(struct nat_mapping* mapping, struct net *net, const u16 zone) { +#endif + struct list_head *iter, *tmp; + struct nat_mapping_original_tuple *original_tuple_item; + struct nf_conntrack_tuple_hash *tuple_hash; + struct nf_conn *ct; + + /* for dying/unconfirmed conntrack tuples, an IPCT_DESTROY event may NOT be fired. + * so we manually kill one of those tuples once we acquire one. */ + + list_for_each_safe(iter, tmp, &mapping->original_tuple_list) { + original_tuple_item = list_entry(iter, struct nat_mapping_original_tuple, node); + + tuple_hash = nf_conntrack_find_get(net, zone, &original_tuple_item->tuple); + + if (tuple_hash == NULL) { + pr_debug("xt_FULLCONENAT: check_mapping(): tuple %s dying/unconfirmed. 
free this tuple.\n", nf_ct_stringify_tuple(&original_tuple_item->tuple)); + + list_del(&original_tuple_item->node); + kfree(original_tuple_item); + (mapping->refer_count)--; + } else { + ct = nf_ct_tuplehash_to_ctrack(tuple_hash); + if (likely(ct != NULL)) + nf_ct_put(ct); + } + + } + + /* kill the mapping if need */ + pr_debug("xt_FULLCONENAT: check_mapping() refer_count for mapping at ext_port %d is now %d\n", mapping->port, mapping->refer_count); + if (mapping->refer_count <= 0) { + pr_debug("xt_FULLCONENAT: check_mapping(): kill dying/unconfirmed mapping at ext port %d\n", mapping->port); + kill_mapping(mapping); + return 0; + } else { + return 1; + } +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static struct nat_mapping* get_mapping_by_ext_port(const uint16_t port, const __be32 ext_ip, struct net *net, const struct nf_conntrack_zone *zone) { +#else +static struct nat_mapping* get_mapping_by_ext_port(const uint16_t port, const __be32 ext_ip, struct net *net, const u16 zone) { +#endif + struct nat_mapping *p_current; + struct hlist_node *tmp; + + hash_for_each_possible_safe(mapping_table_by_ext_port, p_current, tmp, node_by_ext_port, port) { + if (p_current->port == port && check_mapping(p_current, net, zone) && p_current->addr == ext_ip) { + return p_current; + } + } + + return NULL; +} + +static void handle_dying_tuples(void) { + struct list_head *iter, *tmp, *iter_2, *tmp_2; + struct tuple_list *item; + struct nf_conntrack_tuple *ct_tuple; + struct nat_mapping *mapping; + __be32 ip, ext_ip; + uint16_t port; + struct nat_mapping_original_tuple *original_tuple_item; +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) + struct nat_mapping6 *mapping6; + union nf_inet_addr *ip6, *ext_ip6; + spin_lock_bh(&fullconenat6_lock); +#endif + + spin_lock_bh(&fullconenat_lock); + spin_lock_bh(&dying_tuple_list_lock); + + list_for_each_safe(iter, tmp, &dying_tuple_list) { + item = list_entry(iter, struct tuple_list, list); + + /* we dont know the conntrack direction for now so we try in both ways. */ + ct_tuple = &(item->tuple_original); +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) + if (ct_tuple->src.l3num == PF_INET6) { + ip6 = &(ct_tuple->src).u3; + port = be16_to_cpu((ct_tuple->src).u.udp.port); + ext_ip6 = &item->tuple_reply.dst.u3; + mapping6 = get_mapping6_by_int_src(ip6, port, ext_ip6); + if (mapping6 == NULL) { + ext_ip6 = &(ct_tuple->dst).u3; + ct_tuple = &(item->tuple_reply); + ip6 = &(ct_tuple->src).u3; + port = be16_to_cpu((ct_tuple->src).u.udp.port); + mapping6 = get_mapping6_by_int_src(ip6, port, ext_ip6); + if (mapping6 != NULL) { + pr_debug("xt_FULLCONENAT: handle_dying_tuples(): INBOUND dying conntrack at ext port %d\n", mapping6->port); + } + } else { + pr_debug("xt_FULLCONENAT: handle_dying_tuples(): OUTBOUND dying conntrack at ext port %d\n", mapping6->port); + } + + if (mapping6 == NULL) { + goto next; + } + + /* look for the corresponding out-dated tuple and free it */ + list_for_each_safe(iter_2, tmp_2, &mapping6->original_tuple_list) { + original_tuple_item = list_entry(iter_2, struct nat_mapping_original_tuple, node); + + if (nf_ct_tuple_equal(&original_tuple_item->tuple, &(item->tuple_original))) { + pr_debug("xt_FULLCONENAT: handle_dying_tuples(): tuple %s expired. 
free this tuple.\n", + nf_ct_stringify_tuple6(&original_tuple_item->tuple)); + list_del(&original_tuple_item->node); + kfree(original_tuple_item); + (mapping6->refer_count)--; + } + } + + /* then kill the mapping if needed*/ + pr_debug("xt_FULLCONENAT: handle_dying_tuples(): refer_count for mapping at ext_port %d is now %d\n", mapping6->port, mapping6->refer_count); + if (mapping6->refer_count <= 0) { + pr_debug("xt_FULLCONENAT: handle_dying_tuples(): kill expired mapping at ext port %d\n", mapping6->port); + kill_mapping6(mapping6); + } + goto next; + } + if (unlikely(ct_tuple->src.l3num != PF_INET)) +#else + if (ct_tuple->src.l3num != PF_INET) +#endif + goto next; + + ip = (ct_tuple->src).u3.ip; + port = be16_to_cpu((ct_tuple->src).u.udp.port); + ext_ip = item->tuple_reply.dst.u3.ip; + mapping = get_mapping_by_int_src(ip, port, ext_ip); + if (mapping == NULL) { + ext_ip = (ct_tuple->dst).u3.ip; + ct_tuple = &(item->tuple_reply); + ip = (ct_tuple->src).u3.ip; + port = be16_to_cpu((ct_tuple->src).u.udp.port); + mapping = get_mapping_by_int_src(ip, port, ext_ip); + if (mapping != NULL) { + pr_debug("xt_FULLCONENAT: handle_dying_tuples(): INBOUND dying conntrack at ext port %d\n", mapping->port); + } + } else { + pr_debug("xt_FULLCONENAT: handle_dying_tuples(): OUTBOUND dying conntrack at ext port %d\n", mapping->port); + } + + if (mapping == NULL) { + goto next; + } + + /* look for the corresponding out-dated tuple and free it */ + list_for_each_safe(iter_2, tmp_2, &mapping->original_tuple_list) { + original_tuple_item = list_entry(iter_2, struct nat_mapping_original_tuple, node); + + if (nf_ct_tuple_equal(&original_tuple_item->tuple, &(item->tuple_original))) { + pr_debug("xt_FULLCONENAT: handle_dying_tuples(): tuple %s expired. free this tuple.\n", + nf_ct_stringify_tuple(&original_tuple_item->tuple)); + list_del(&original_tuple_item->node); + kfree(original_tuple_item); + (mapping->refer_count)--; + } + } + + /* then kill the mapping if needed*/ + pr_debug("xt_FULLCONENAT: handle_dying_tuples(): refer_count for mapping at ext_port %d is now %d\n", mapping->port, mapping->refer_count); + if (mapping->refer_count <= 0) { + pr_debug("xt_FULLCONENAT: handle_dying_tuples(): kill expired mapping at ext port %d\n", mapping->port); + kill_mapping(mapping); + } + +next: + list_del(&item->list); + kfree(item); + } + + spin_unlock_bh(&dying_tuple_list_lock); + spin_unlock_bh(&fullconenat_lock); +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) + spin_unlock_bh(&fullconenat6_lock); +#endif +} + +static void gc_worker(struct work_struct *work) { + handle_dying_tuples(); +} + +/* conntrack destroy event callback function */ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +static int ct_event_cb(struct notifier_block *this, unsigned long events, void *ptr) { + struct nf_ct_event *item = ptr; +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) +static int ct_event_cb(unsigned int events, const struct nf_ct_event *item) { +#else +static int ct_event_cb(unsigned int events, struct nf_ct_event *item) { +#endif + struct nf_conn *ct; + struct nf_conntrack_tuple *ct_tuple_reply, *ct_tuple_original; + uint8_t protonum; + struct tuple_list *dying_tuple_item; + + ct = item->ct; + /* we handle only conntrack destroy events */ + if (ct == NULL || !(events & (1 << IPCT_DESTROY))) { + return 0; + } + + ct_tuple_original = &(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + + ct_tuple_reply = &(ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + protonum = 
(ct_tuple_original->dst).protonum; + if (protonum != IPPROTO_UDP) { + return 0; + } + + dying_tuple_item = kmalloc(sizeof(struct tuple_list), GFP_ATOMIC); + + if (dying_tuple_item == NULL) { + pr_debug("xt_FULLCONENAT: warning: ct_event_cb(): kmalloc failed.\n"); + return 0; + } + + memcpy(&(dying_tuple_item->tuple_original), ct_tuple_original, sizeof(struct nf_conntrack_tuple)); + memcpy(&(dying_tuple_item->tuple_reply), ct_tuple_reply, sizeof(struct nf_conntrack_tuple)); + + spin_lock_bh(&dying_tuple_list_lock); + + list_add(&(dying_tuple_item->list), &dying_tuple_list); + + spin_unlock_bh(&dying_tuple_list_lock); + + if (wq != NULL) + queue_delayed_work(wq, &gc_worker_wk, msecs_to_jiffies(100)); + + return 0; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) && !defined(CONFIG_NF_CONNTRACK_CHAIN_EVENTS) +static int exp_event_cb(unsigned int events, const struct nf_exp_event *item) { + return 0; +} +#endif + +static __be32 get_device_ip(const struct net_device* dev) { + struct in_device* in_dev; + struct in_ifaddr* if_info; + __be32 result; + + if (dev == NULL) { + return 0; + } + + rcu_read_lock(); + in_dev = dev->ip_ptr; + if (in_dev == NULL) { + rcu_read_unlock(); + return 0; + } + if_info = in_dev->ifa_list; + if (if_info) { + result = if_info->ifa_local; + rcu_read_unlock(); + return result; + } else { + rcu_read_unlock(); + return 0; + } +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static uint16_t find_appropriate_port(struct net *net, const struct nf_conntrack_zone *zone, const uint16_t original_port, const __be32 ext_ip, const struct nf_nat_ipv4_range *range) { +#else +static uint16_t find_appropriate_port(struct net *net, const u16 zone, const uint16_t original_port, const __be32 ext_ip, const struct nf_nat_ipv4_range *range) { +#endif + uint16_t min, start, selected, range_size, i; + struct nat_mapping* mapping = NULL; + + if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { + min = be16_to_cpu((range->min).udp.port); + range_size = be16_to_cpu((range->max).udp.port) - min + 1; + } else { + /* minimum port is 1024. same behavior as default linux NAT. */ + min = 1024; + range_size = 65535 - min + 1; + } + + if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM) + || (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY)) { + /* for now we do the same thing for both --random and --random-fully */ + + /* select a random starting point */ + start = (uint16_t)(prandom_u32() % (u32)range_size); + } else { + + if ((original_port >= min && original_port <= min + range_size - 1) + || !(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { + /* 1. try to preserve the port if it's available */ + mapping = get_mapping_by_ext_port(original_port, ext_ip, net, zone); + if (mapping == NULL) { + return original_port; + } + } + + /* otherwise, we start from zero */ + start = 0; + } + + for (i = 0; i < range_size; i++) { + /* 2. try to find an available port */ + selected = min + ((start + i) % range_size); + mapping = get_mapping_by_ext_port(selected, ext_ip, net, zone); + if (mapping == NULL) { + return selected; + } + } + + /* 3. at least we tried. override a previous mapping. 
*/ + selected = min + start; + mapping = get_mapping_by_ext_port(selected, ext_ip, net, zone); + kill_mapping(mapping); + + return selected; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static __be32 find_leastused_ip(const struct nf_conntrack_zone *zone, const struct nf_nat_ipv4_range *range, const __be32 src, const __be32 dst) +#else +static __be32 find_leastused_ip(const u16 zone, const struct nf_nat_ipv4_range *range, const __be32 src, const __be32 dst) +#endif +{ + /* Host order */ + u32 minip, maxip, j, dist; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) + j = jhash_1word((u32)src, range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst ^ zone->id); +#else + j = jhash_1word((u32)src, range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst ^ zone); +#endif + + minip = ntohl(range->min_ip); + maxip = ntohl(range->max_ip); + dist = maxip - minip + 1; + + return (__be32) htonl(minip + reciprocal_scale(j, dist)); +} + +static unsigned int fullconenat_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_ipv4_multi_range_compat *mr; + const struct nf_nat_ipv4_range *range; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) + const struct nf_conntrack_zone *zone; +#else + u16 zone; +#endif + struct net *net; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct nf_conn_nat *nat; + struct nf_conntrack_tuple *ct_tuple, *ct_tuple_origin; + + struct nat_mapping *mapping, *src_mapping; + unsigned int ret; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 newrange; +#else + struct nf_nat_range newrange; +#endif + + __be32 ip; + uint16_t port, original_port, want_port; + uint8_t protonum; + + ip = 0; + original_port = 0; + src_mapping = NULL; + + mr = par->targinfo; + range = &mr->range[0]; + + mapping = NULL; + ret = XT_CONTINUE; + + ct = nf_ct_get(skb, &ctinfo); + net = nf_ct_net(ct); + zone = nf_ct_zone(ct); + + memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); + memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); + newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_proto = mr->range[0].min; + newrange.max_proto = mr->range[0].max; + + if (xt_hooknum(par) == NF_INET_PRE_ROUTING) { + /* inbound packets */ + ct_tuple_origin = &(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + + protonum = (ct_tuple_origin->dst).protonum; + if (protonum != IPPROTO_UDP) { + return ret; + } + ip = (ct_tuple_origin->dst).u3.ip; + port = be16_to_cpu((ct_tuple_origin->dst).u.udp.port); + + spin_lock_bh(&fullconenat_lock); + + /* find an active mapping based on the inbound port */ + mapping = get_mapping_by_ext_port(port, ip, net, zone); + if (mapping != NULL) { + newrange.flags = NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED; + newrange.min_addr.ip = mapping->int_addr; + newrange.max_addr.ip = mapping->int_addr; + newrange.min_proto.udp.port = cpu_to_be16(mapping->int_port); + newrange.max_proto = newrange.min_proto; + + pr_debug("xt_FULLCONENAT: %s ==> %pI4:%d\n", nf_ct_stringify_tuple(ct_tuple_origin), &mapping->int_addr, mapping->int_port); + + ret = nf_nat_setup_info(ct, &newrange, HOOK2MANIP(xt_hooknum(par))); + + if (ret == NF_ACCEPT) { + add_original_tuple_to_mapping(mapping, ct_tuple_origin); + pr_debug("xt_FULLCONENAT: fullconenat_tg(): INBOUND: refer_count for mapping at ext_port %d is now %d\n", mapping->port, mapping->refer_count); + } + } + spin_unlock_bh(&fullconenat_lock); + return ret; + + + } else if (xt_hooknum(par) == NF_INET_POST_ROUTING) { + /* outbound packets */ + ct_tuple_origin = 
&(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + protonum = (ct_tuple_origin->dst).protonum; + + if(range->flags & NF_NAT_RANGE_MAP_IPS) { + newrange.min_addr.ip = mr->range[0].min_ip; + newrange.max_addr.ip = mr->range[0].max_ip; + } else { + newrange.min_addr.ip = get_device_ip(skb->dev); + if (unlikely(!newrange.min_addr.ip)) + return NF_DROP; + newrange.max_addr.ip = newrange.min_addr.ip; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) + nat = nf_ct_nat_ext_add(ct); +#else + nat = nfct_nat(ct); +#endif + if (likely(nat)) + nat->masq_index = xt_out(par)->ifindex; + + } + + if (protonum == IPPROTO_UDP) { + ip = (ct_tuple_origin->src).u3.ip; + original_port = be16_to_cpu((ct_tuple_origin->src).u.udp.port); + + spin_lock_bh(&fullconenat_lock); + + if (newrange.min_addr.ip != newrange.max_addr.ip) + src_mapping = get_mapping_by_int_src_inrange(ip, original_port, newrange.min_addr.ip, newrange.max_addr.ip); + else + src_mapping = get_mapping_by_int_src(ip, original_port, newrange.min_addr.ip); + + if (src_mapping != NULL && check_mapping(src_mapping, net, zone)) { + + /* outbound nat: if a previously established mapping is active, + * we will reuse that mapping. */ + + newrange.flags = NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED; + newrange.min_proto.udp.port = cpu_to_be16(src_mapping->port); + newrange.max_proto = newrange.min_proto; + if (newrange.min_addr.ip != newrange.max_addr.ip) { + newrange.min_addr.ip = src_mapping->addr; + newrange.max_addr.ip = newrange.min_addr.ip; + } + + } else { + + /* if not, we find a new external IP:port to map to. + * the SNAT may fail so we should re-check the mapped port later. */ + if (newrange.min_addr.ip != newrange.max_addr.ip) { + newrange.min_addr.ip = find_leastused_ip(zone, range, ip, (ct_tuple_origin->dst).u3.ip); + newrange.max_addr.ip = newrange.min_addr.ip; + } + want_port = find_appropriate_port(net, zone, original_port, newrange.min_addr.ip, range); + + newrange.flags = NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED; + newrange.min_proto.udp.port = cpu_to_be16(want_port); + newrange.max_proto = newrange.min_proto; + + src_mapping = NULL; + + } + } + + /* do SNAT now */ + ret = nf_nat_setup_info(ct, &newrange, HOOK2MANIP(xt_hooknum(par))); + + if (protonum != IPPROTO_UDP) { + /* non-UDP packets, bailout */ + return ret; + } + if (ret != NF_ACCEPT) { + /* failed SNAT, bailout */ + spin_unlock_bh(&fullconenat_lock); + return ret; + } + + /* the reply tuple contains the mapped port. */ + ct_tuple = &(ct->tuplehash[IP_CT_DIR_REPLY].tuple); + /* this is the resulted mapped port. 
*/ + port = be16_to_cpu((ct_tuple->dst).u.udp.port); + + pr_debug("xt_FULLCONENAT: %s ==> %d\n", nf_ct_stringify_tuple(ct_tuple_origin), port); + + /* save the mapping information into our mapping table */ + mapping = src_mapping; + if (mapping == NULL) { + mapping = allocate_mapping(ip, original_port, port, (ct_tuple->dst).u3.ip); + } + if (likely(mapping != NULL)) { + add_original_tuple_to_mapping(mapping, ct_tuple_origin); + pr_debug("xt_FULLCONENAT: fullconenat_tg(): OUTBOUND: refer_count for mapping at ext_port %d is now %d\n", mapping->port, mapping->refer_count); + } + + spin_unlock_bh(&fullconenat_lock); + return ret; + } + + return ret; +} + +static int fullconenat_tg_check(const struct xt_tgchk_param *par) +{ + nf_ct_netns_get(par->net, par->family); + + mutex_lock(&nf_ct_net_event_lock); + + tg_refer_count++; + + pr_debug("xt_FULLCONENAT: fullconenat_tg_check(): tg_refer_count is now %d\n", tg_refer_count); + + if (tg_refer_count == 1) { +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS + ct_event_notifier.notifier_call = ct_event_cb; +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) + ct_event_notifier.ct_event = ct_event_cb; + ct_event_notifier.exp_event = exp_event_cb; +#else + ct_event_notifier.fcn = ct_event_cb; +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) && !defined(CONFIG_NF_CONNTRACK_CHAIN_EVENTS) + nf_conntrack_register_notifier(par->net, &ct_event_notifier); + if (true) { +#else + if (nf_conntrack_register_notifier(par->net, &ct_event_notifier) == 0) { +#endif + ct_event_notifier_registered = 1; + pr_debug("xt_FULLCONENAT: fullconenat_tg_check(): ct_event_notifier registered\n"); + } else { + printk("xt_FULLCONENAT: warning: failed to register a conntrack notifier. Disable active GC for mappings.\n"); + } + + } + + mutex_unlock(&nf_ct_net_event_lock); + + return 0; +} + +static void fullconenat_tg_destroy(const struct xt_tgdtor_param *par) +{ + mutex_lock(&nf_ct_net_event_lock); + + tg_refer_count--; + + pr_debug("xt_FULLCONENAT: fullconenat_tg_destroy(): tg_refer_count is now %d\n", tg_refer_count); + + if (tg_refer_count == 0) { + if (ct_event_notifier_registered) { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) && !defined(CONFIG_NF_CONNTRACK_CHAIN_EVENTS) + nf_conntrack_unregister_notifier(par->net); +#else + nf_conntrack_unregister_notifier(par->net, &ct_event_notifier); +#endif + ct_event_notifier_registered = 0; + + pr_debug("xt_FULLCONENAT: fullconenat_tg_destroy(): ct_event_notifier unregistered\n"); + + } + } + + mutex_unlock(&nf_ct_net_event_lock); + + nf_ct_netns_put(par->net, par->family); +} + +static struct xt_target tg_reg[] __read_mostly = { +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) + { + .name = "FULLCONENAT", + .family = NFPROTO_IPV6, + .revision = 0, + .target = fullconenat_tg6, + .targetsize = sizeof(struct nf_nat_range), + .table = "nat", + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING), + .checkentry = fullconenat_tg_check, + .destroy = fullconenat_tg_destroy, + .me = THIS_MODULE, + }, +#endif + { + .name = "FULLCONENAT", + .family = NFPROTO_IPV4, + .revision = 0, + .target = fullconenat_tg, + .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), + .table = "nat", + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING), + .checkentry = fullconenat_tg_check, + .destroy = fullconenat_tg_destroy, + .me = THIS_MODULE, + }, +}; + +static int __init fullconenat_tg_init(void) +{ + int ret; + wq = 
create_singlethread_workqueue("xt_FULLCONENAT"); + if (wq == NULL) { + printk("xt_FULLCONENAT: warning: failed to create workqueue\n"); + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0) + ret = nf_nat_masquerade_inet_register_notifiers(); + if (unlikely(ret)) + return ret; +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) + ret = nf_nat_masquerade_ipv4_register_notifier(); + if (unlikely(ret)) + return ret; +#if IS_ENABLED(CONFIG_NF_NAT_MASQUERADE_IPV6) + ret = nf_nat_masquerade_ipv6_register_notifier(); + if (unlikely(ret)) { + nf_nat_masquerade_ipv4_unregister_notifier(); + return ret; + } +#endif +#else +#if IS_MODULE(CONFIG_IP_NF_TARGET_MASQUERADE) + if (!xt_find_revision(AF_INET, "MASQUERADE", 0, 1, &ret)) + request_module_nowait("ipt_MASQUERADE"); +#endif +#if IS_MODULE(CONFIG_IP6_NF_TARGET_MASQUERADE) + if (!xt_find_revision(AF_INET6, "MASQUERADE", 0, 1, &ret)) + request_module_nowait("ip6t_MASQUERADE"); +#endif +#endif + + return xt_register_targets(tg_reg, ARRAY_SIZE(tg_reg)); +} + +static void fullconenat_tg_exit(void) +{ + xt_unregister_targets(tg_reg, ARRAY_SIZE(tg_reg)); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0) + nf_nat_masquerade_inet_unregister_notifiers(); +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) +#if IS_ENABLED(CONFIG_NF_NAT_MASQUERADE_IPV6) + nf_nat_masquerade_ipv6_unregister_notifier(); +#endif + nf_nat_masquerade_ipv4_unregister_notifier(); +#endif + + if (wq) { + cancel_delayed_work_sync(&gc_worker_wk); + flush_workqueue(wq); + destroy_workqueue(wq); + } + + handle_dying_tuples(); + destroy_mappings(); +} + +module_init(fullconenat_tg_init); +module_exit(fullconenat_tg_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: implementation of RFC3489 full cone NAT"); +MODULE_AUTHOR("Chion Tang "); +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +MODULE_ALIAS("ip6t_FULLCONENAT"); +#endif +MODULE_ALIAS("ipt_FULLCONENAT"); -- 2.36.0
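
The external-port selection in find_appropriate_port()/find_appropriate_port6() above follows a three-step strategy: preserve the flow's original source port when it is free, otherwise scan the configured range for a free port (optionally starting from a random offset), otherwise evict the mapping that holds the port at the starting offset and reuse it. The following is a minimal, standalone userspace sketch of that strategy, not the kernel code itself; port_in_use[] and select_external_port() are hypothetical stand-ins for the module's mapping-table lookup (get_mapping_by_ext_port()), and the NF_NAT_RANGE_PROTO_RANDOM* flags are reduced to a single boolean.

/*
 * Simplified model of the port-selection strategy used by
 * find_appropriate_port()/find_appropriate_port6() in xt_FULLCONENAT.c.
 * port_in_use[] stands in for the in-kernel mapping table.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static bool port_in_use[65536];

static uint16_t select_external_port(uint16_t original_port, uint16_t min,
				     uint16_t max, bool randomize)
{
	uint16_t range_size = max - min + 1;
	uint16_t start = 0;
	uint16_t selected;
	int i;

	if (randomize) {
		/* --random / --random-fully: pick a random starting offset */
		start = (uint16_t)(rand() % range_size);
	} else if (original_port >= min && original_port <= max &&
		   !port_in_use[original_port]) {
		/* 1. preserve the original source port if it is free */
		return original_port;
	}

	/* 2. otherwise scan the range for a free port */
	for (i = 0; i < range_size; i++) {
		selected = min + (uint16_t)((start + i) % range_size);
		if (!port_in_use[selected])
			return selected;
	}

	/* 3. range exhausted: reuse the port at min + start
	 * (the kernel code kills the old mapping that owns it) */
	return min + start;
}

int main(void)
{
	port_in_use[1024] = true;	/* pretend port 1024 is already mapped */
	printf("chosen port: %u\n",
	       select_external_port(1024, 1024, 65535, false));
	return 0;
}

In use, the target is attached from the nat table on both the PREROUTING and POSTROUTING hooks (see the tg_reg[] definitions above), and only UDP flows receive full-cone mappings; other outbound traffic is simply source-NATed without an entry in the mapping table.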