[
  {
    "assumptions": [
      "MDP with state/action spaces, transition dynamics, reward function, and discount factor",
      "Static offline dataset collected by a behavior policy",
      "Rewards are bounded as stated in preliminaries"
    ],
    "authors": [
      "Aviral Kumar",
      "Aurick Zhou",
      "George Tucker",
      "Sergey Levine"
    ],
    "citation": "Kumar, A., Zhou, A., Tucker, G., & Levine, S. (2020). Conservative Q-Learning for Offline Reinforcement Learning. NeurIPS 2020. https://doi.org/10.48550/arXiv.2006.04779",
    "claims": [
      "CQL produces lower-bounded policy value estimates that reduce overestimation from out-of-distribution actions",
      "CQL can be implemented as a small modification to standard Q-learning or actor-critic algorithms",
      "CQL substantially outperforms prior offline RL methods on several benchmarks"
    ],
    "conclusions": [
      "Conservative Q-value regularization yields stable offline RL training and competitive performance across benchmark domains"
    ],
    "contributions": [
      "Defines the CQL framework that adds a conservative Q-value regularizer to standard Bellman error objectives",
      "Provides theoretical guarantees that learned Q-values lower-bound the policy value under suitable conditions",
      "Demonstrates strong empirical performance on offline RL benchmarks, including complex and multi-modal datasets"
    ],
    "future_work": [],
    "key_equations": [],
    "limitations": [],
    "source_type": "paper",
    "summary": "Proposes Conservative Q-Learning (CQL), a Q-function regularization framework that learns conservative value estimates to mitigate overestimation from distribution shift in offline RL, with theory showing lower-bounded policy value and strong empirical gains on discrete and continuous control benchmarks including D4RL.",
    "title": "Conservative Q-Learning for Offline Reinforcement Learning",
    "url": "https://doi.org/10.48550/arXiv.2006.04779",
    "year": 2020
  },
  {
    "assumptions": [
      "MDP formulation with discounted returns",
      "Offline dataset of transitions collected by a behavior policy",
      "Policy extraction relies on advantage-weighted behavioral cloning over dataset actions"
    ],
    "authors": [
      "Ilya Kostrikov",
      "Ashvin Nair",
      "Sergey Levine"
    ],
    "citation": "Kostrikov, I., Nair, A., & Levine, S. (2021). Offline Reinforcement Learning with Implicit Q-Learning. ICLR 2022. https://doi.org/10.48550/arXiv.2110.06169",
    "claims": [
      "IQL never queries unseen actions during training while still enabling policy improvement",
      "IQL matches or exceeds prior methods on D4RL benchmarks, including challenging AntMaze tasks",
      "IQL is simple and computationally efficient to implement"
    ],
    "conclusions": [
      "IQL provides an efficient offline RL method that combines in-sample learning with strong benchmark performance"
    ],
    "contributions": [
      "Proposes an offline RL algorithm that performs multi-step dynamic programming without evaluating out-of-distribution actions",
      "Uses expectile regression to approximate high-value actions within dataset support",
      "Demonstrates state-of-the-art performance on D4RL benchmarks and efficient training"
    ],
    "future_work": [],
    "key_equations": [],
    "limitations": [],
    "source_type": "paper",
    "summary": "Introduces Implicit Q-Learning (IQL), which avoids querying unseen actions by using expectile regression for value learning and advantage-weighted behavioral cloning for policy extraction; achieves strong performance on D4RL while remaining computationally efficient.",
    "title": "Offline Reinforcement Learning with Implicit Q-Learning",
    "url": "https://doi.org/10.48550/arXiv.2110.06169",
    "year": 2021
  },
  {
    "assumptions": [
      "Offline dataset of transitions collected by arbitrary behavior policies",
      "Uses TD3 as the base algorithm with deterministic policy gradients",
      "State normalization over the dataset improves stability"
    ],
    "authors": [
      "Scott Fujimoto",
      "Shixiang Shane Gu"
    ],
    "citation": "Fujimoto, S., & Gu, S. S. (2021). A Minimalist Approach to Offline Reinforcement Learning. NeurIPS 2021. https://doi.org/10.48550/arXiv.2106.06860",
    "claims": [
      "TD3+BC can match state-of-the-art offline RL performance with minimal changes to TD3",
      "Normalization and BC regularization are sufficient to stabilize offline learning on D4RL",
      "Simpler algorithms reduce computational overhead and tuning complexity"
    ],
    "conclusions": [
      "Minimal modifications to standard off-policy RL can yield strong offline performance while improving simplicity and efficiency"
    ],
    "contributions": [
      "Shows that a single behavior-cloning regularizer added to TD3 can achieve strong offline RL performance",
      "Highlights the importance of minimal algorithmic changes for reproducibility and tuning",
      "Provides empirical comparisons on D4RL demonstrating competitive performance and faster runtime"
    ],
    "future_work": [],
    "key_equations": [],
    "limitations": [
      "Offline-trained policies can exhibit high variance across episodes and instability across training checkpoints, complicating evaluation and deployment"
    ],
    "source_type": "paper",
    "summary": "Presents TD3+BC, a minimalist offline RL baseline that adds a behavior cloning term to the TD3 policy update and applies dataset state normalization, showing competitive or state-of-the-art results on D4RL with reduced complexity and runtime.",
    "title": "A Minimalist Approach to Offline Reinforcement Learning",
    "url": "https://doi.org/10.48550/arXiv.2106.06860",
    "year": 2021
  },
  {
    "assumptions": [
      "Offline RL is evaluated using static datasets with simulator-based evaluation",
      "Benchmarks should reflect diverse data-collection strategies and task difficulties"
    ],
    "authors": [
      "Justin Fu",
      "Aviral Kumar",
      "Ofir Nachum",
      "George Tucker",
      "Sergey Levine"
    ],
    "citation": "Fu, J., Kumar, A., Nachum, O., Tucker, G., & Levine, S. (2020). D4RL: Datasets for Deep Data-Driven Reinforcement Learning. arXiv. https://doi.org/10.48550/arXiv.2004.07219",
    "claims": [
      "Existing offline RL methods struggle on datasets with properties mirroring real-world data collection",
      "Standardized datasets and evaluation protocols are essential for progress in offline RL"
    ],
    "conclusions": [
      "D4RL supplies an accessible benchmark that exposes offline RL failure modes and supports reproducible comparisons"
    ],
    "contributions": [
      "Defines a broad benchmark suite with datasets collected via multiple behavior policies and task types",
      "Provides evaluation protocols and baseline results to compare offline RL methods",
      "Highlights deficiencies of prior algorithms when evaluated on more realistic offline datasets"
    ],
    "future_work": [
      "Expand benchmarks as simulators mature to cover additional real-world task classes"
    ],
    "key_equations": [],
    "limitations": [
      "Real-world evaluation is difficult; D4RL relies on high-quality simulators as a compromise"
    ],
    "source_type": "paper",
    "summary": "Introduces D4RL, a standardized suite of offline RL datasets, tasks, and evaluation protocols across locomotion, navigation, manipulation, and other domains, revealing limitations of existing algorithms and providing a common benchmark for offline RL research.",
    "title": "D4RL: Datasets for Deep Data-Driven Reinforcement Learning",
    "url": "https://doi.org/10.48550/arXiv.2004.07219",
    "year": 2020
  },
  {
    "assumptions": [
      "MDP with discounted returns",
      "Offline dataset of transitions used for training without environment interaction",
      "Q-ensemble uncertainty correlates with out-of-distribution actions"
    ],
    "authors": [
      "Gaon An",
      "Seungyong Moon",
      "Jang-Hyun Kim",
      "Hyun Oh Song"
    ],
    "citation": "An, G., Moon, S., Kim, J.-H., & Song, H. O. (2021). Uncertainty-Based Offline Reinforcement Learning with Diversified Q-Ensemble. NeurIPS 2021. https://doi.org/10.48550/arXiv.2110.01548",
    "claims": [
      "Clipped Q-learning acts as an uncertainty-aware penalty for out-of-distribution actions in offline RL",
      "EDAC achieves state-of-the-art performance on many D4RL tasks while using fewer Q-networks",
      "EDAC avoids explicit behavior policy estimation or OOD action sampling"
    ],
    "conclusions": [
      "Uncertainty-aware ensemble methods can improve offline RL performance while reducing computational costs via diversification"
    ],
    "contributions": [
      "Shows clipped Q-learning with large ensembles can outperform prior offline RL methods",
      "Introduces gradient diversification regularizer to reduce required ensemble size",
      "Evaluates EDAC on D4RL MuJoCo and Adroit datasets with competitive results"
    ],
    "future_work": [],
    "key_equations": [],
    "limitations": [],
    "source_type": "paper",
    "summary": "Proposes an uncertainty-based offline RL method using clipped Q-learning with large Q-ensembles and introduces Ensemble-Diversified Actor-Critic (EDAC) to reduce ensemble size via gradient diversification, achieving strong performance on D4RL benchmarks without explicit behavior policy estimation.",
    "title": "Uncertainty-Based Offline Reinforcement Learning with Diversified Q-Ensemble",
    "url": "https://doi.org/10.48550/arXiv.2110.01548",
    "year": 2021
  }
]