[ { "assumptions": [ "Diagonal Fisher approximation is sufficient.", "Previous-task optima are encoded in \u03b8* and F." ], "authors": [ "James Kirkpatrick", "Razvan Pascanu", "Neil Rabinowitz", "et al." ], "citation": "s1", "claims": [ "Penalizing changes to important parameters mitigates catastrophic forgetting." ], "conclusions": [ "EWC improves retention versus naive fine-tuning in several sequential settings." ], "contributions": [ "EWC regularizer based on Fisher information.", "Sequential training results on Atari and supervised tasks." ], "future_work": [ "Better posterior approximations.", "Task-free variants without explicit task boundaries." ], "key_equations": [ "L(\u03b8)=L_t(\u03b8)+\\frac{\u03bb}{2}\\sum_i F_i(\u03b8_i-\u03b8_i^*)^2" ], "limitations": [ "Requires storing per-parameter importance.", "Diagonal approximation can miss parameter interactions." ], "source_type": "paper", "summary": "Introduces Elastic Weight Consolidation (EWC), which constrains updates on parameters with high Fisher importance to reduce forgetting in sequential learning. It matters because it formalizes the stability-plasticity trade-off as a parameter-space regularization objective.", "title": "Overcoming catastrophic forgetting in neural networks", "url": "https://arxiv.org/abs/1612.00796", "year": 2016 }, { "assumptions": [ "Local gradient-path statistics reflect parameter importance.", "Task transitions can be detected to consolidate importance." ], "authors": [ "Friedemann Zenke", "Ben Poole", "Surya Ganguli" ], "citation": "s2", "claims": [ "SI attains forgetting reduction with lower compute than EWC-like estimates." ], "conclusions": [ "Path-integral importance is a practical CL regularizer." ], "contributions": [ "Online importance accumulation.", "Competitive retention on benchmark streams." ], "future_work": [ "Boundary-free continual consolidation." 
], "key_equations": [ "\\Omega_i^\\mu=\\sum_t \\frac{\\Delta \u03b8_i(t)\\,g_i(t)}{(\\Delta \u03b8_i^\\mu)^2+\u03be}", "L=L_t+c\\sum_i \\Omega_i(\u03b8_i-\\tilde{\u03b8}_i)^2" ], "limitations": [ "Still depends on consolidation points.", "May underperform with heavy distribution drift." ], "source_type": "paper", "summary": "Proposes Synaptic Intelligence (SI), an online path-integral measure of parameter importance that regularizes future updates. It matters for low-overhead continual regularization without needing second-order matrices.", "title": "Continual Learning Through Synaptic Intelligence", "url": "https://arxiv.org/abs/1703.04200", "year": 2017 }, { "assumptions": [ "Stored episodic samples represent prior tasks sufficiently.", "Task identity exists during training for memory partitioning." ], "authors": [ "David Lopez-Paz", "Marc'Aurelio Ranzato" ], "citation": "s3", "claims": [ "Projected gradients reduce forgetting while permitting transfer." ], "conclusions": [ "Memory-constrained optimization is effective but costly." ], "contributions": [ "Inequality-constrained continual optimization.", "Forgetting/transfer metrics formalization." ], "future_work": [ "Cheaper approximations and task-agnostic memory." ], "key_equations": [ "\\min_{\\tilde g} \\frac{1}{2}||g-\\tilde g||_2^2\\;\\text{s.t.}\\;\\langle \\tilde g,g_k\\rangle \\ge 0,\\forall k<t" ], "limitations": [ "Explicit masks/task IDs conflict with task-agnostic requirement." ], "source_type": "paper", "summary": "Represents multiple tasks in a single weight tensor via task-specific binary supermasks. It matters as a high-capacity masking baseline and a direct comparator to mask-free activation adaptation.", "title": "Supermasks in Superposition", "url": "https://arxiv.org/abs/2006.14769", "year": 2020 }, { "assumptions": [ "Standardized APIs improve experimental comparability." ], "authors": [ "Vincenzo Lomonaco", "Lorenzo Pellegrini", "Andrea Cossu", "et al." 
], "citation": "s16", "claims": [ "Reduces engineering friction and evaluation inconsistency." ], "conclusions": [ "Framework support accelerates CL research cycles." ], "contributions": [ "Unified open-source CL framework." ], "future_work": [ "Broader task-free/online RL continual scenarios." ], "key_equations": [], "limitations": [ "Coverage depends on implemented plugins/scenarios." ], "source_type": "paper", "summary": "Presents a unified toolkit for CL scenarios, strategies, metrics, and benchmarks. It matters for reproducible evaluation and fast ablation of activation-function proposals.", "title": "Avalanche: an End-to-End Library for Continual Learning", "url": "https://arxiv.org/abs/2104.00405", "year": 2021 }, { "assumptions": [ "Evaluation protocol materially changes method ranking." ], "authors": [ "Matthias De Lange", "Gido van de Ven", "Tinne Tuytelaars" ], "citation": "s17", "claims": [ "Static evaluation snapshots can overestimate robustness." ], "conclusions": [ "Temporal diagnostics are necessary in lifelong settings." ], "contributions": [ "Stability-gap framing and continual evaluation recommendations." ], "future_work": [ "Standardized continual-evaluation leaderboards." ], "key_equations": [], "limitations": [ "Protocol adoption is not yet universal." ], "source_type": "paper", "summary": "Argues conventional evaluation can hide instability and proposes continual evaluation diagnostics. It matters for assessing online adaptive activations beyond final accuracy.", "title": "Continual evaluation for lifelong learning: Identifying the stability gap", "url": "https://arxiv.org/abs/2205.13452", "year": 2022 }, { "assumptions": [ "Existing CL taxonomy can organize fast-evolving methods." ], "authors": [ "Liyuan Wang", "Xingxing Zhang", "Hang Su", "Jun Zhu" ], "citation": "s18", "claims": [ "No single method dominates across all settings." ], "conclusions": [ "Task-free and scalable CL remain open challenges." ], "contributions": [ "Broad taxonomy and benchmark synthesis." 
], "future_work": [ "Unified protocols and realistic continual streams." ], "key_equations": [], "limitations": [ "Survey lag versus rapidly appearing methods." ], "source_type": "paper", "summary": "Large-scale survey covering CL settings, methods, theory, and applications. It matters for positioning activation-centric CL among replay, regularization, and architecture families.", "title": "A Comprehensive Survey of Continual Learning: Theory, Method and Application", "url": "https://arxiv.org/abs/2302.00487", "year": 2023 }, { "assumptions": [ "Sequence distributions follow defined shift model.", "Hypothesis class supports derived generalization bounds." ], "authors": [ "Gyuhak Kim", "Changnan Xiao", "Tatsuya Konishi", "Bing Liu" ], "citation": "s19", "claims": [ "Certain replay/regularization schemes have provable advantages under shift assumptions." ], "conclusions": [ "Theoretical tools can guide practical CL design." ], "contributions": [ "Formal CL learnability definitions and bounds." ], "future_work": [ "Bridge bounds with large-scale empirical deep CL." ], "key_equations": [ "\\mathcal{R}_{CL}(f)=\\sum_t w_t\\,\\mathbb{E}_{(x,y)\\sim D_t}[\\ell(f_t(x),y)]" ], "limitations": [ "Theory relies on simplified assumptions relative to deep practice." ], "source_type": "paper", "summary": "Provides theoretical framing for CL learnability and algorithmic implications. It matters for grounding activation-update rules in formal conditions.", "title": "Learnability and Algorithm for Continual Learning", "url": "https://arxiv.org/abs/2306.12646", "year": 2023 }, { "assumptions": [ "Search space of compositional nonlinearities contains transferable candidates." ], "authors": [ "Yuanliang Zhang", "et al." ], "citation": "s20", "claims": [ "Discovered activations can outperform fixed hand-designed functions." ], "conclusions": [ "Automated activation design is promising for adaptability." ], "contributions": [ "Activation mining/search pipeline.", "Cross-domain transfer analysis." 
], "future_work": [ "Online/task-free activation adaptation during continual streams." ], "key_equations": [ "f(x)=\\sum_k a_k\\,\\phi_k(b_k x+c_k)", "\\theta^*=\\arg\\max_\\theta \\mathbb{E}_{\\mathcal{T}}[\\mathrm{Perf}(f_\\theta,\\mathcal{T})]" ], "limitations": [ "Search cost and reproducibility sensitivity." ], "source_type": "paper", "summary": "Explores automated discovery of activation functions with stronger cross-task generalization. It matters directly for designing adaptive nonlinearities for continual learning.", "title": "Mining Generalizable Activation Functions", "url": "https://arxiv.org/abs/2602.05688", "year": 2026 }, { "assumptions": [ "Smooth stochastic-like gating improves optimization/generalization." ], "authors": [ "Dan Hendrycks", "Kevin Gimpel" ], "citation": "s21", "claims": [ "GELU often outperforms ReLU/ELU on modern architectures." ], "conclusions": [ "Value-weighted smooth gating is effective." ], "contributions": [ "GELU activation and empirical benchmarking." ], "future_work": [ "Dynamic parameterized GELU variants." ], "key_equations": [ "\\mathrm{GELU}(x)=x\\Phi(x)=0.5x\\left(1+\\mathrm{erf}(x/\\sqrt{2})\\right)" ], "limitations": [ "No continual-adaptation mechanism built in." ], "source_type": "paper", "summary": "Defines GELU, a smooth nonlinearity weighting inputs by value via Gaussian CDF. It matters as a baseline non-saturating activation with stable gradients.", "title": "Gaussian Error Linear Units (GELUs)", "url": "https://arxiv.org/abs/1606.08415", "year": 2016 }, { "assumptions": [ "Inputs satisfy independence/finite variance approximations.", "Network architecture conditions for contraction hold." ], "authors": [ "Gunter Klambauer", "Thomas Unterthiner", "Andreas Mayr", "Sepp Hochreiter" ], "citation": "s22", "claims": [ "Layer statistics converge to stable region without batch norm." ], "conclusions": [ "Activation design can enforce statistical stability." 
], "contributions": [ "SELU activation and self-normalization theory." ], "future_work": [ "Self-normalizing activations for modern deep architectures and CL." ], "key_equations": [ "\\mathrm{SELU}(x)=\\lambda\\begin{cases}x,&x>0\\\\\\alpha(e^x-1),&x\\le0\\end{cases}", "(\\mu',\\sigma'^2)=g(\\mu,\\sigma^2)" ], "limitations": [ "Assumptions can weaken under residual/attention architectures." ], "source_type": "paper", "summary": "Introduces SELU and fixed-point analysis showing mean/variance contraction toward stable values. It matters for the self-stabilization requirement in deep continual networks.", "title": "Self-Normalizing Neural Networks", "url": "https://arxiv.org/abs/1706.02515", "year": 2017 }, { "assumptions": [ "Search spaces include transferable nonlinear forms." ], "authors": [ "Prajit Ramachandran", "Barret Zoph", "Quoc V. Le" ], "citation": "s23", "claims": [ "Swish improves accuracy on multiple benchmarks." ], "conclusions": [ "Automated discovery is effective for activation design." ], "contributions": [ "Activation-function search methodology; Swish." ], "future_work": [ "Task-aware or continual activation search." ], "key_equations": [ "\\mathrm{Swish}(x)=x\\cdot\\sigma(\\beta x)" ], "limitations": [ "Search can be computationally expensive." ], "source_type": "paper", "summary": "Discovers Swish via automatic search and reports improved performance over ReLU in several settings. It matters as evidence that activation search can outperform fixed handcrafted choices.", "title": "Searching for Activation Functions", "url": "https://arxiv.org/abs/1710.05941", "year": 2017 }, { "assumptions": [ "Smooth non-monotonicity helps optimization landscapes." ], "authors": [ "Diganta Misra" ], "citation": "s24", "claims": [ "Competitive or improved performance vs ReLU/Swish in many settings." ], "conclusions": [ "Curved smooth activations can help representation quality." ], "contributions": [ "Mish definition and empirical evaluation." 
], "future_work": [ "Adaptive Mish-like parameterizations for nonstationary streams." ], "key_equations": [ "\\mathrm{Mish}(x)=x\\tanh(\\ln(1+e^x))" ], "limitations": [ "No explicit continual-learning mechanism." ], "source_type": "paper", "summary": "Introduces Mish activation with smooth non-monotonic behavior and empirical gains across tasks. It matters as a non-saturating baseline with richer curvature than ReLU.", "title": "Mish: A Self Regularized Non-Monotonic Activation Function", "url": "https://arxiv.org/abs/1908.08681", "year": 2019 }, { "assumptions": [ "Per-channel activation parameters can be trained stably." ], "authors": [ "Xiangyu Jin", "Chao Xu", "Jiashi Feng", "Yizhou Yu", "Dahua Lin" ], "citation": "s25", "claims": [ "SReLU outperforms common fixed activations in tested CNNs." ], "conclusions": [ "Low-parameter adaptive nonlinearities are effective." ], "contributions": [ "Learnable piecewise activation with few parameters." ], "future_work": [ "Online adaptation of piecewise parameters under shift." ], "key_equations": [ "f(x)=\\begin{cases}t_l+a_l(x-t_l),&x\\le t_l\\\\x,&t_l