[ { "arxiv_id": "2511.03913v1", "title": "Evolutionary Optimization Trumps Adam Optimization on Embedding Space Exploration", "authors": [ "Domício Pereira Neto", "João Correia", "Penousal Machado" ], "abstract": "Deep generative models, especially diffusion architectures, have transformed image generation; however, they are challenging to control and optimize for specific goals without expensive retraining. Embedding Space Exploration, especially with Evolutionary Algorithms (EAs), has been shown to be a promising method for optimizing image generation, particularly within Diffusion Models. Therefore, in this work, we study the performance of an evolutionary optimization method, namely Separable Covariance Matrix Adaptation Evolution Strategy (sep-CMA-ES), against the widely adopted Adaptive Moment Estimation (Adam), applied to Stable Diffusion XL Turbo's prompt embedding vector. The evaluation of images combines the LAION Aesthetic Predictor V2 with CLIPScore into a weighted fitness function, allowing flexible trade-offs between visual appeal and adherence to prompts. Experiments on a subset of the Parti Prompts (P2) dataset showcase that sep-CMA-ES consistently yields superior improvements in aesthetic and alignment metrics in comparison to Adam. Results indicate that the evolutionary method provides efficient, gradient-free optimization for diffusion models, enhancing controllability without the need for fine-tuning. 
This study emphasizes the potential of evolutionary methods for embedding space exploration of deep generative models and outlines future research directions.", "categories": [ "cs.NE", "cs.AI" ], "primary_category": "cs.NE", "comment": "22 pages, 7 figures, 3 tables, 6 appendix figures, 1 appendix table", "pdf_url": "https://arxiv.org/pdf/2511.03913v1", "published_date": "2025-11-05 23:31:54 UTC", "updated_date": "2025-11-05 23:31:54 UTC" }, { "arxiv_id": "2511.03912v2", "title": "I Detect What I Don't Know: Incremental Anomaly Learning with Stochastic Weight Averaging-Gaussian for Oracle-Free Medical Imaging", "authors": [ "Nand Kumar Yadav", "Rodrigue Rizk", "William CW Chen", "KC Santosh" ], "abstract": "Unknown anomaly detection in medical imaging remains a fundamental challenge due to the scarcity of labeled anomalies and the high cost of expert supervision. We introduce an unsupervised, oracle-free framework that incrementally expands a trusted set of normal samples without any anomaly labels. Starting from a small, verified seed of normal images, our method alternates between lightweight adapter updates and uncertainty-gated sample admission. A frozen pretrained vision backbone is augmented with tiny convolutional adapters, ensuring rapid domain adaptation with negligible computational overhead. Extracted embeddings are stored in a compact coreset enabling efficient k-nearest neighbor anomaly (k-NN) scoring. Safety during incremental expansion is enforced by dual probabilistic gates, a sample is admitted into the normal memory only if its distance to the existing coreset lies within a calibrated z-score threshold, and its SWAG-based epistemic uncertainty remains below a seed-calibrated bound. This mechanism prevents drift and false inclusions without relying on generative reconstruction or replay buffers. Empirically, our system steadily refines the notion of normality as unlabeled data arrive, producing substantial gains over baselines. 
On COVID-CXR, ROC-AUC improves from 0.9489 to 0.9982 (F1: 0.8048 to 0.9746); on Pneumonia CXR, ROC-AUC rises from 0.6834 to 0.8968; and on Brain MRI ND-5, ROC-AUC increases from 0.6041 to 0.7269 and PR-AUC from 0.7539 to 0.8211. These results highlight the effectiveness and efficiency of the proposed framework for real-world, label-scarce medical imaging applications.", "categories": [ "cs.CV", "cs.AI" ], "primary_category": "cs.CV", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03912v2", "published_date": "2025-11-05 23:28:14 UTC", "updated_date": "2025-11-11 18:34:02 UTC" }, { "arxiv_id": "2511.03907v1", "title": "SnappyMeal: Design and Longitudinal Evaluation of a Multimodal AI Food Logging Application", "authors": [ "Liam Bakar", "Zachary Englhardt", "Vidya Srinivas", "Girish Narayanswamy", "Dilini Nissanka", "Shwetak Patel", "Vikram Iyer" ], "abstract": "Food logging, both self-directed and prescribed, plays a critical role in uncovering correlations between diet, medical, fitness, and health outcomes. Through conversations with nutritional experts and individuals who practice dietary tracking, we find current logging methods, such as handwritten and app-based journaling, are inflexible and result in low adherence and potentially inaccurate nutritional summaries. These findings, corroborated by prior literature, emphasize the urgent need for improved food logging methods. In response, we propose SnappyMeal, an AI-powered dietary tracking system that leverages multimodal inputs to enable users to more flexibly log their food intake. SnappyMeal introduces goal-dependent follow-up questions to intelligently seek missing context from the user and information retrieval from user grocery receipts and nutritional databases to improve accuracy. We evaluate SnappyMeal through publicly available nutrition benchmarks and a multi-user, 3-week, in-the-wild deployment capturing over 500 logged food instances. 
Users strongly praised the multiple available input methods and reported a strong perceived accuracy. These insights suggest that multimodal AI systems can be leveraged to significantly improve dietary tracking flexibility and context-awareness, laying the groundwork for a new class of intelligent self-tracking applications.", "categories": [ "cs.HC", "cs.AI" ], "primary_category": "cs.HC", "comment": "24 pages, 15 figures", "pdf_url": "https://arxiv.org/pdf/2511.03907v1", "published_date": "2025-11-05 23:14:22 UTC", "updated_date": "2025-11-05 23:14:22 UTC" }, { "arxiv_id": "2511.03898v1", "title": "Secure Code Generation at Scale with Reflexion", "authors": [ "Arup Datta", "Ahmed Aljohani", "Hyunsook Do" ], "abstract": "Large language models (LLMs) are now widely used to draft and refactor code, but code that works is not necessarily secure. We evaluate secure code generation using the Instruct Prime, which eliminated compliance-required prompts and cue contamination, and evaluate five instruction-tuned code LLMs using a zero-shot baseline and a three-round reflexion prompting approach. Security is measured using the Insecure Code Detector (ICD), and results are reported by measuring Repair, Regression, and NetGain metrics, considering the programming language and CWE family. Our findings show that insecurity remains common at the first round: roughly 25-33% of programs are insecure at a zero-shot baseline (t0 ). Weak cryptography/config-dependent bugs are the hardest to avoid while templated ones like XSS, code injection, and hard-coded secrets are handled more reliably. Python yields the highest secure rates; C and C# are the lowest, with Java, JS, PHP, and C++ in the middle. Reflexion prompting improves security for all models, improving average accuracy from 70.74% at t0 to 79.43% at t3 , with the largest gains in the first round followed by diminishing returns. 
The trends with Repair, Regression, and NetGain metrics show that applying one to two rounds produces most of the benefits. A replication package is available at https://doi.org/10.5281/zenodo.17065846.", "categories": [ "cs.CR", "cs.AI", "cs.CE", "cs.SE" ], "primary_category": "cs.CR", "comment": "Accepted for publication at the 2nd IEEE International Conference on AI-powered Software (AIware 2025)", "pdf_url": "https://arxiv.org/pdf/2511.03898v1", "published_date": "2025-11-05 22:46:24 UTC", "updated_date": "2025-11-05 22:46:24 UTC" }, { "arxiv_id": "2511.03891v2", "title": "Improving Diagnostic Performance on Small and Imbalanced Datasets Using Class-Based Input Image Composition", "authors": [ "Hlali Azzeddine", "Majid Ben Yakhlef", "Soulaiman El Hazzat" ], "abstract": "Small, imbalanced datasets and poor input image quality can lead to high false prediction rates with deep learning models. This paper introduces Class-Based Image Composition, an approach that allows us to reformulate training inputs through a fusion of multiple images of the same class into combined visual composites, named Composite Input Images (CoImg). That enhances the intra-class variance and improves the valuable information density per training sample and increases the ability of the model to distinguish between subtle disease patterns. Our method was evaluated on the Optical Coherence Tomography Dataset for Image-Based Deep Learning Methods (OCTDL) (Kulyabin et al., 2024), which contains 2,064 high-resolution optical coherence tomography (OCT) scans of the human retina, representing seven distinct diseases with a significant class imbalance. We constructed a perfectly class-balanced version of this dataset, named Co-OCTDL, where each scan is represented as a 3x1 layout composite image. To assess the effectiveness of this new representation, we conducted a comparative analysis between the original dataset and its variant using a VGG16 model. 
A fair comparison was ensured by utilizing the identical model architecture and hyperparameters for all experiments. The proposed approach markedly improved diagnostic results. The enhanced Dataset achieved near-perfect accuracy (99.6%) with F1-score (0.995) and AUC (0.9996), compared to a baseline model trained on raw dataset. The false prediction rate was also significantly lower; this demonstrates that the method can produce high-quality predictions even for weak datasets affected by class imbalance or small sample size.", "categories": [ "cs.CV", "cs.AI", "cs.DB" ], "primary_category": "cs.CV", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03891v2", "published_date": "2025-11-05 22:34:06 UTC", "updated_date": "2025-11-07 06:35:50 UTC" }, { "arxiv_id": "2511.10664v1", "title": "Evaluating Modern Large Language Models on Low-Resource and Morphologically Rich Languages:A Cross-Lingual Benchmark Across Cantonese, Japanese, and Turkish", "authors": [ "Chengxuan Xia", "Qianye Wu", "Hongbin Guan", "Sixuan Tian", "Yilun Hao", "Xiaoyu Wu" ], "abstract": "Large language models (LLMs) have achieved impressive results in high-resource languages like English, yet their effectiveness in low-resource and morphologically rich languages remains underexplored. In this paper, we present a comprehensive evaluation of seven cutting-edge LLMs -- including GPT-4o, GPT-4, Claude~3.5~Sonnet, LLaMA~3.1, Mistral~Large~2, LLaMA-2~Chat~13B, and Mistral~7B~Instruct -- on a new cross-lingual benchmark covering \\textbf{Cantonese, Japanese, and Turkish}. Our benchmark spans four diverse tasks: open-domain question answering, document summarization, English-to-X translation, and culturally grounded dialogue. 
We combine \\textbf{human evaluations} (rating fluency, factual accuracy, and cultural appropriateness) with automated metrics (e.g., BLEU, ROUGE) to assess model performance.\n Our results reveal that while the largest proprietary models (GPT-4o, GPT-4, Claude~3.5) generally lead across languages and tasks, significant gaps persist in culturally nuanced understanding and morphological generalization. Notably, GPT-4o demonstrates robust multilingual performance even on cross-lingual tasks, and Claude~3.5~Sonnet achieves competitive accuracy on knowledge and reasoning benchmarks. However, all models struggle to some extent with the unique linguistic challenges of each language, such as Turkish agglutinative morphology and Cantonese colloquialisms. Smaller open-source models (LLaMA-2~13B, Mistral~7B) lag substantially in fluency and accuracy, highlighting the resource disparity. We provide detailed quantitative results, qualitative error analysis, and discuss implications for developing more culturally aware and linguistically generalizable LLMs. Our benchmark and evaluation data are released to foster reproducibility and further research.", "categories": [ "cs.CL", "cs.AI" ], "primary_category": "cs.CL", "comment": "This paper requires XeLaTeX for proper Unicode rendering of Japanese and Cantonese text", "pdf_url": "https://arxiv.org/pdf/2511.10664v1", "published_date": "2025-11-05 22:09:53 UTC", "updated_date": "2025-11-05 22:09:53 UTC" }, { "arxiv_id": "2511.03882v1", "title": "Investigating Robot Control Policy Learning for Autonomous X-ray-guided Spine Procedures", "authors": [ "Florence Klitzner", "Blanca Inigo", "Benjamin D. Killeen", "Lalithkumar Seenivasan", "Michelle Song", "Axel Krieger", "Mathias Unberath" ], "abstract": "Imitation learning-based robot control policies are enjoying renewed interest in video-based robotics. However, it remains unclear whether this approach applies to X-ray-guided procedures, such as spine instrumentation. 
This is because interpretation of multi-view X-rays is complex. We examine opportunities and challenges for imitation policy learning in bi-plane-guided cannula insertion. We develop an in silico sandbox for scalable, automated simulation of X-ray-guided spine procedures with a high degree of realism. We curate a dataset of correct trajectories and corresponding bi-planar X-ray sequences that emulate the stepwise alignment of providers. We then train imitation learning policies for planning and open-loop control that iteratively align a cannula solely based on visual information. This precisely controlled setup offers insights into limitations and capabilities of this method. Our policy succeeded on the first attempt in 68.5% of cases, maintaining safe intra-pedicular trajectories across diverse vertebral levels. The policy generalized to complex anatomy, including fractures, and remained robust to varied initializations. Rollouts on real bi-planar X-rays further suggest that the model can produce plausible trajectories, despite training exclusively in simulation. While these preliminary results are promising, we also identify limitations, especially in entry point precision. Full closed-loop control will require additional considerations around how to provide sufficiently frequent feedback. 
With more robust priors and domain knowledge, such models may provide a foundation for future efforts toward lightweight and CT-free robotic intra-operative spinal navigation.", "categories": [ "cs.CV", "cs.AI", "cs.LG", "cs.RO" ], "primary_category": "cs.CV", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03882v1", "published_date": "2025-11-05 22:00:48 UTC", "updated_date": "2025-11-05 22:00:48 UTC" }, { "arxiv_id": "2511.03878v1", "title": "KnowThyself: An Agentic Assistant for LLM Interpretability", "authors": [ "Suraj Prasai", "Mengnan Du", "Ying Zhang", "Fan Yang" ], "abstract": "We develop KnowThyself, an agentic assistant that advances large language model (LLM) interpretability. Existing tools provide useful insights but remain fragmented and code-intensive. KnowThyself consolidates these capabilities into a chat-based interface, where users can upload models, pose natural language questions, and obtain interactive visualizations with guided explanations. At its core, an orchestrator LLM first reformulates user queries, an agent router further directs them to specialized modules, and the outputs are finally contextualized into coherent explanations. This design lowers technical barriers and provides an extensible platform for LLM inspection. 
By embedding the whole process into a conversational workflow, KnowThyself offers a robust foundation for accessible LLM interpretability.", "categories": [ "cs.AI", "cs.IR", "cs.LG", "cs.MA" ], "primary_category": "cs.AI", "comment": "5 pages, 1 figure, Accepted for publication at the Demonstration Track of the 40th AAAI Conference on Artificial Intelligence (AAAI 26)", "pdf_url": "https://arxiv.org/pdf/2511.03878v1", "published_date": "2025-11-05 21:48:13 UTC", "updated_date": "2025-11-05 21:48:13 UTC" }, { "arxiv_id": "2511.03866v2", "title": "OMPILOT: Harnessing Transformer Models for Auto Parallelization to Shared Memory Computing Paradigms", "authors": [ "Arijit Bhattacharjee", "Ali TehraniJamsaz", "Le Chen", "Niranjan Hasabnis", "Mihai Capota", "Nesreen Ahmed", "Ali Jannesari" ], "abstract": "Recent advances in large language models (LLMs) have significantly accelerated progress in code translation, enabling more accurate and efficient transformation across programming languages. While originally developed for natural language processing, LLMs have shown strong capabilities in modeling programming language syntax and semantics, outperforming traditional rule-based systems in both accuracy and flexibility. These models have streamlined cross-language conversion, reduced development overhead, and accelerated legacy code migration. In this paper, we introduce OMPILOT, a novel domain-specific encoder-decoder transformer tailored for translating C++ code into OpenMP, enabling effective shared-memory parallelization. OMPILOT leverages custom pre-training objectives that incorporate the semantics of parallel constructs and combines both unsupervised and supervised learning strategies to improve code translation robustness. Unlike previous work that focused primarily on loop-level transformations, OMPILOT operates at the function level to capture a wider semantic context. 
To evaluate our approach, we propose OMPBLEU, a novel composite metric specifically crafted to assess the correctness and quality of OpenMP parallel constructs, addressing limitations in conventional translation metrics.", "categories": [ "cs.DC", "cs.AI", "cs.LG", "cs.PF", "cs.PL" ], "primary_category": "cs.DC", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03866v2", "published_date": "2025-11-05 21:21:15 UTC", "updated_date": "2025-11-11 15:25:21 UTC" }, { "arxiv_id": "2511.03859v1", "title": "Levers of Power in the Field of AI", "authors": [ "Tammy Mackenzie", "Sukriti Punj", "Natalie Perez", "Sreyoshi Bhaduri", "Branislav Radeljic" ], "abstract": "This paper examines how decision makers in academia, government, business, and civil society navigate questions of power in implementations of artificial intelligence. The study explores how individuals experience and exercise levers of power, which are presented as social mechanisms that shape institutional responses to technological change. The study reports on the responses of personalized questionnaires designed to gather insight on a decision maker's institutional purview, based on an institutional governance framework developed from the work of Neo-institutionalists. Findings present the anonymized, real responses and circumstances of respondents in the form of twelve fictional personas of high-level decision makers from North America and Europe. These personas illustrate how personal agency, organizational logics, and institutional infrastructures may intersect in the governance of AI. The decision makers' responses to the questionnaires then inform a discussion of the field-level personal power of decision makers, methods of fostering institutional stability in times of change, and methods of influencing institutional change in the field of AI. 
The final section of the discussion presents a table of the dynamics of the levers of power in the field of AI for change makers and five testable hypotheses for institutional and social movement researchers. In summary, this study provides insight on the means for policymakers within institutions and their counterparts in civil society to personally engage with AI governance.", "categories": [ "cs.CY", "cs.AI" ], "primary_category": "cs.CY", "comment": "18 pages, research submission", "pdf_url": "https://arxiv.org/pdf/2511.03859v1", "published_date": "2025-11-05 21:03:57 UTC", "updated_date": "2025-11-05 21:03:57 UTC" }, { "arxiv_id": "2511.03855v1", "title": "Noise Injection: Improving Out-of-Distribution Generalization for Limited Size Datasets", "authors": [ "Duong Mai", "Lawrence Hall" ], "abstract": "Deep learned (DL) models for image recognition have been shown to fail to generalize to data from different devices, populations, etc. COVID-19 detection from Chest X-rays (CXRs), in particular, has been shown to fail to generalize to out-of-distribution (OOD) data from new clinical sources not covered in the training set. This occurs because models learn to exploit shortcuts - source-specific artifacts that do not translate to new distributions - rather than reasonable biomarkers to maximize performance on in-distribution (ID) data. Rendering the models more robust to distribution shifts, our study investigates the use of fundamental noise injection techniques (Gaussian, Speckle, Poisson, and Salt and Pepper) during training. Our empirical results demonstrate that this technique can significantly reduce the performance gap between ID and OOD evaluation from 0.10-0.20 to 0.01-0.06, based on results averaged over ten random seeds across key metrics such as AUC, F1, accuracy, recall and specificity. 
Our source code is publicly available at https://github.com/Duongmai127/Noisy-ood", "categories": [ "cs.CV", "cs.AI" ], "primary_category": "cs.CV", "comment": "Abstract accepted for oral presentation at SPIE Medical Imaging 2026: Computer-Aided Diagnosis", "pdf_url": "https://arxiv.org/pdf/2511.03855v1", "published_date": "2025-11-05 20:53:59 UTC", "updated_date": "2025-11-05 20:53:59 UTC" }, { "arxiv_id": "2511.03845v1", "title": "To See or To Read: User Behavior Reasoning in Multimodal LLMs", "authors": [ "Tianning Dong", "Luyi Ma", "Varun Vasudevan", "Jason Cho", "Sushant Kumar", "Kannan Achan" ], "abstract": "Multimodal Large Language Models (MLLMs) are reshaping how modern agentic systems reason over sequential user-behavior data. However, whether textual or image representations of user behavior data are more effective for maximizing MLLM performance remains underexplored. We present \\texttt{BehaviorLens}, a systematic benchmarking framework for assessing modality trade-offs in user-behavior reasoning across six MLLMs by representing transaction data as (1) a text paragraph, (2) a scatter plot, and (3) a flowchart. 
Using a real-world purchase-sequence dataset, we find that when data is represented as images, MLLMs' next-purchase prediction accuracy is improved by 87.5% compared with an equivalent textual representation without any additional computational cost.", "categories": [ "cs.AI", "cs.LG" ], "primary_category": "cs.AI", "comment": "Accepted by the 39th Conference on Neural Information Processing Systems (NeurIPS 2025) Workshop: Efficient Reasoning", "pdf_url": "https://arxiv.org/pdf/2511.03845v1", "published_date": "2025-11-05 20:26:40 UTC", "updated_date": "2025-11-05 20:26:40 UTC" }, { "arxiv_id": "2511.03826v3", "title": "CORE -- A Cell-Level Coarse-to-Fine Image Registration Engine for Multi-stain Image Alignment", "authors": [ "Esha Sadia Nasir", "Behnaz Elhaminia", "Mark Eastwood", "Catherine King", "Owen Cain", "Lorraine Harper", "Paul Moss", "Dimitrios Chanouzas", "David Snead", "Nasir Rajpoot", "Adam Shephard", "Shan E Ahmed Raza" ], "abstract": "Accurate and efficient registration of whole slide images (WSIs) is essential for high-resolution, nuclei-level analysis in multi-stained tissue slides. We propose a novel coarse-to-fine framework CORE for accurate nuclei-level registration across diverse multimodal whole-slide image (WSI) datasets. The coarse registration stage leverages prompt-based tissue mask extraction to effectively filter out artefacts and non-tissue regions, followed by global alignment using tissue morphology and accelerated dense feature matching with a pre-trained feature extractor. From the coarsely aligned slides, nuclei centroids are detected and subjected to fine-grained rigid registration using a custom, shape-aware point-set registration model. Finally, non-rigid alignment at the cellular level is achieved by estimating a non-linear displacement field using Coherent Point Drift (CPD). 
Our approach benefits from automatically generated nuclei that enhance the accuracy of deformable registration and ensure precise nuclei-level correspondence across modalities. The proposed model is evaluated on three publicly available WSI registration datasets, and two private datasets. We show that CORE outperforms current state-of-the-art methods in terms of generalisability, precision, and robustness in bright-field and immunofluorescence microscopy WSIs.", "categories": [ "q-bio.QM", "cs.AI" ], "primary_category": "q-bio.QM", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03826v3", "published_date": "2025-11-05 19:47:41 UTC", "updated_date": "2025-11-25 09:52:36 UTC" }, { "arxiv_id": "2511.03825v1", "title": "How Different Tokenization Algorithms Impact LLMs and Transformer Models for Binary Code Analysis", "authors": [ "Ahmed Mostafa", "Raisul Arefin Nahid", "Samuel Mulder" ], "abstract": "Tokenization is fundamental in assembly code analysis, impacting intrinsic characteristics like vocabulary size, semantic coverage, and extrinsic performance in downstream tasks. Despite its significance, tokenization in the context of assembly code remains an underexplored area. This study aims to address this gap by evaluating the intrinsic properties of Natural Language Processing (NLP) tokenization models and parameter choices, such as vocabulary size. We explore preprocessing customization options and pre-tokenization rules tailored to the unique characteristics of assembly code. Additionally, we assess their impact on downstream tasks like function signature prediction -- a critical problem in binary code analysis.\n To this end, we conduct a thorough study on various tokenization models, systematically analyzing their efficiency in encoding assembly instructions and capturing semantic nuances. Through intrinsic evaluations, we compare tokenizers based on tokenization efficiency, vocabulary compression, and representational fidelity for assembly code. 
Using state-of-the-art pre-trained models such as the decoder-only Large Language Model (LLM) Llama 3.2, the encoder-only transformer BERT, and the encoder-decoder model BART, we evaluate the effectiveness of these tokenizers across multiple performance metrics. Preliminary findings indicate that tokenizer choice significantly influences downstream performance, with intrinsic metrics providing partial but incomplete predictability of extrinsic evaluation outcomes. These results reveal complex trade-offs between intrinsic tokenizer properties and their utility in practical assembly code tasks. Ultimately, this study provides valuable insights into optimizing tokenization models for low-level code analysis, contributing to the robustness and scalability of Natural Language Model (NLM)-based binary analysis workflows.", "categories": [ "cs.AI", "cs.CL", "cs.CR", "cs.LG" ], "primary_category": "cs.AI", "comment": "Publication Notice. This paper was published in the BAR 2025 Workshop (with NDSS 2025) and is for research and educational use. Copyright © 2025 Internet Society. All rights reserved. Personal/classroom reproduction is permitted with this notice and full paper citation. 
All other uses, including commercial, require prior written permission from the Internet Society", "pdf_url": "https://arxiv.org/pdf/2511.03825v1", "published_date": "2025-11-05 19:45:26 UTC", "updated_date": "2025-11-05 19:45:26 UTC" }, { "arxiv_id": "2511.03823v1", "title": "PLLuM: A Family of Polish Large Language Models", "authors": [ "Jan Kocoń", "Maciej Piasecki", "Arkadiusz Janz", "Teddy Ferdinan", "Łukasz Radliński", "Bartłomiej Koptyra", "Marcin Oleksy", "Stanisław Woźniak", "Paweł Walkowiak", "Konrad Wojtasik", "Julia Moska", "Tomasz Naskręt", "Bartosz Walkowiak", "Mateusz Gniewkowski", "Kamil Szyc", "Dawid Motyka", "Dawid Banach", "Jonatan Dalasiński", "Ewa Rudnicka", "Bartłomiej Alberski", "Tomasz Walkowiak", "Aleksander Szczęsny", "Maciej Markiewicz", "Tomasz Bernaś", "Hubert Mazur", "Kamil Żyta", "Mateusz Tykierko", "Grzegorz Chodak", "Tomasz Kajdanowicz", "Przemysław Kazienko", "Agnieszka Karlińska", "Karolina Seweryn", "Anna Kołos", "Maciej Chrabąszcz", "Katarzyna Lorenc", "Aleksandra Krasnodębska", "Artur Wilczek", "Katarzyna Dziewulska", "Paula Betscher", "Zofia Cieślińska", "Katarzyna Kowol", "Daria Mikoś", "Maciej Trzciński", "Dawid Krutul", "Marek Kozłowski", "Sławomir Dadas", "Rafał Poświata", "Michał Perełkiewicz", "Małgorzata Grębowiec", "Maciej Kazuła", "Marcin Białas", "Roman Roszko", "Danuta Roszko", "Jurgita Vaičenonienė", "Andrius Utka", "Paweł Levchuk", "Paweł Kowalski", "Irena Prawdzic-Jankowska", "Maciej Ogrodniczuk", "Monika Borys", "Anna Bulińska", "Wiktoria Gumienna", "Witold Kieraś", "Dorota Komosińska", "Katarzyna Krasnowska-Kieraś", "Łukasz Kobyliński", "Martyna Lewandowska", "Marek Łaziński", "Mikołaj Łątkowski", "Dawid Mastalerz", "Beata Milewicz", "Agnieszka Anna Mykowiecka", "Angelika Peljak-Łapińska", "Sandra Penno", "Zuzanna Przybysz", "Michał Rudolf", "Piotr Rybak", "Karolina Saputa", "Aleksandra Tomaszewska", "Aleksander Wawer", "Marcin Woliński", "Joanna Wołoszyn", "Alina Wróblewska", "Bartosz Żuk", "Filip Żarnecki", 
"Konrad Kaczyński", "Anna Cichosz", "Zuzanna Deckert", "Monika Garnys", "Izabela Grabarczyk", "Wojciech Janowski", "Sylwia Karasińska", "Aleksandra Kujawiak", "Piotr Misztela", "Maria Szymańska", "Karolina Walkusz", "Igor Siek", "Jakub Kwiatkowski", "Piotr Pęzik" ], "abstract": "Large Language Models (LLMs) play a central role in modern artificial intelligence, yet their development has been primarily focused on English, resulting in limited support for other languages. We present PLLuM (Polish Large Language Model), the largest open-source family of foundation models tailored specifically for the Polish language. Developed by a consortium of major Polish research institutions, PLLuM addresses the need for high-quality, transparent, and culturally relevant language models beyond the English-centric commercial landscape. We describe the development process, including the construction of a new 140-billion-token Polish text corpus for pre-training, a 77k custom instructions dataset, and a 100k preference optimization dataset. A key component is a Responsible AI framework that incorporates strict data governance and a hybrid module for output correction and safety filtering. We detail the models' architecture, training procedures, and alignment techniques for both base and instruction-tuned variants, and demonstrate their utility in a downstream task within public administration. 
By releasing these models publicly, PLLuM aims to foster open research and strengthen sovereign AI technologies in Poland.", "categories": [ "cs.CL", "cs.AI" ], "primary_category": "cs.CL", "comment": "83 pages, 19 figures", "pdf_url": "https://arxiv.org/pdf/2511.03823v1", "published_date": "2025-11-05 19:41:49 UTC", "updated_date": "2025-11-05 19:41:49 UTC" }, { "arxiv_id": "2511.07451v1", "title": "Exploring the Psychometric Validity of AI-Generated Student Responses: A Study on Virtual Personas' Learning Motivation", "authors": [ "Huanxiao Wang" ], "abstract": "This study explores whether large language models (LLMs) can simulate valid student responses for educational measurement. Using GPT-4o, 2000 virtual student personas were generated. Each persona completed the Academic Motivation Scale (AMS). Factor analyses (EFA and CFA) and clustering showed GPT-4o reproduced the AMS structure and distinct motivational subgroups.", "categories": [ "cs.CY", "cs.AI" ], "primary_category": "cs.CY", "comment": "The paper has been accepted as proceedings of Artificial Intelligence in Measurement and Education Conference (AIME-Con) (2025)", "pdf_url": "https://arxiv.org/pdf/2511.07451v1", "published_date": "2025-11-05 19:22:02 UTC", "updated_date": "2025-11-05 19:22:02 UTC" }, { "arxiv_id": "2511.03808v1", "title": "Optimizing Reasoning Efficiency through Prompt Difficulty Prediction", "authors": [ "Bo Zhao", "Berkcan Kapusuzoglu", "Kartik Balasubramaniam", "Sambit Sahu", "Supriyo Chakraborty", "Genta Indra Winata" ], "abstract": "Reasoning language models perform well on complex tasks but are costly to deploy due to their size and long reasoning traces. We propose a routing approach that assigns each problem to the smallest model likely to solve it, reducing compute without sacrificing accuracy. Using intermediate representations from s1.1-32B, we train lightweight predictors of problem difficulty or model correctness to guide routing across a pool of reasoning models. 
On diverse math benchmarks, routing improves efficiency over random assignment and matches s1.1-32B's performance while using significantly less compute. Our results demonstrate that difficulty-aware routing is effective for cost-efficient deployment of reasoning models.", "categories": [ "cs.LG", "cs.AI" ], "primary_category": "cs.LG", "comment": "NeurIPS 2025 Workshop on Efficient Reasoning", "pdf_url": "https://arxiv.org/pdf/2511.03808v1", "published_date": "2025-11-05 19:14:53 UTC", "updated_date": "2025-11-05 19:14:53 UTC" }, { "arxiv_id": "2511.21695v1", "title": "EvalCards: A Framework for Standardized Evaluation Reporting", "authors": [ "Ruchira Dhar", "Danae Sanchez Villegas", "Antonia Karamolegkou", "Alice Schiavone", "Yifei Yuan", "Xinyi Chen", "Jiaang Li", "Stella Frank", "Laura De Grazia", "Monorama Swain", "Stephanie Brandl", "Daniel Hershcovich", "Anders Søgaard", "Desmond Elliott" ], "abstract": "Evaluation has long been a central concern in NLP, and transparent reporting practices are more critical than ever in today's landscape of rapidly released open-access models. Drawing on a survey of recent work on evaluation and documentation, we identify three persistent shortcomings in current reporting practices: reproducibility, accessibility, and governance. We argue that existing standardization efforts remain insufficient and introduce Evaluation Disclosure Cards (EvalCards) as a path forward. 
EvalCards are designed to enhance transparency for both researchers and practitioners while providing a practical foundation to meet emerging governance requirements.", "categories": [ "cs.CL", "cs.AI", "cs.CY" ], "primary_category": "cs.CL", "comment": "Under review", "pdf_url": "https://arxiv.org/pdf/2511.21695v1", "published_date": "2025-11-05 19:01:48 UTC", "updated_date": "2025-11-05 19:01:48 UTC" }, { "arxiv_id": "2511.03782v1", "title": "Expert Evaluation of LLM World Models: A High-$T_c$ Superconductivity Case Study", "authors": [ "Haoyu Guo", "Maria Tikhanovskaya", "Paul Raccuglia", "Alexey Vlaskin", "Chris Co", "Daniel J. Liebling", "Scott Ellsworth", "Matthew Abraham", "Elizabeth Dorfman", "N. P. Armitage", "Chunhan Feng", "Antoine Georges", "Olivier Gingras", "Dominik Kiese", "Steven A. Kivelson", "Vadim Oganesyan", "B. J. Ramshaw", "Subir Sachdev", "T. Senthil", "J. M. Tranquada", "Michael P. Brenner", "Subhashini Venugopalan", "Eun-Ah Kim" ], "abstract": "Large Language Models (LLMs) show great promise as a powerful tool for scientific literature exploration. However, their effectiveness in providing scientifically accurate and comprehensive answers to complex questions within specialized domains remains an active area of research. Using the field of high-temperature cuprates as an exemplar, we evaluate the ability of LLM systems to understand the literature at the level of an expert. We construct an expert-curated database of 1,726 scientific papers that covers the history of the field, and a set of 67 expert-formulated questions that probe deep understanding of the literature. We then evaluate six different LLM-based systems for answering these questions, including both commercially available closed models and a custom retrieval-augmented generation (RAG) system capable of retrieving images alongside text. 
Experts then evaluate the answers of these systems against a rubric that assesses balanced perspectives, factual comprehensiveness, succinctness, and evidentiary support. Among the six systems, two using RAG on curated literature outperformed existing closed models across key metrics, particularly in providing comprehensive and well-supported answers. We discuss promising aspects of LLM performances as well as critical shortcomings of all the models. The set of expert-formulated questions and the rubric will be valuable for assessing expert-level performance of LLM-based reasoning systems.", "categories": [ "cond-mat.supr-con", "cond-mat.str-el", "cs.AI" ], "primary_category": "cond-mat.supr-con", "comment": "(v1) 9 pages, 4 figures, with 7-page supporting information. Accepted at the ICML 2025 workshop on Assessing World Models and the Explorations in AI Today workshop at ICML'25", "pdf_url": "https://arxiv.org/pdf/2511.03782v1", "published_date": "2025-11-05 19:00:01 UTC", "updated_date": "2025-11-05 19:00:01 UTC" }, { "arxiv_id": "2511.14778v1", "title": "Learning Interestingness in Automated Mathematical Theory Formation", "authors": [ "George Tsoukalas", "Rahul Saha", "Amitayush Thakur", "Sabrina Reguyal", "Swarat Chaudhuri" ], "abstract": "We take two key steps in automating the open-ended discovery of new mathematical theories, a grand challenge in artificial intelligence. First, we introduce $\\emph{FERMAT}$, a reinforcement learning (RL) environment that models concept discovery and theorem-proving using a set of symbolic actions, opening up a range of RL problems relevant to theory discovery. Second, we explore a specific problem through $\\emph{FERMAT}$: automatically scoring the $\\emph{interestingness}$ of mathematical objects. We investigate evolutionary algorithms for synthesizing nontrivial interestingness measures. 
In particular, we introduce an LLM-based evolutionary algorithm that features function abstraction, leading to notable improvements in discovering elementary number theory and finite fields over hard-coded baselines. We open-source the $\\emph{FERMAT}$ environment at this URL (https://github.com/trishullab/Fermat).", "categories": [ "cs.AI" ], "primary_category": "cs.AI", "comment": "NeurIPS 2025 Spotlight", "pdf_url": "https://arxiv.org/pdf/2511.14778v1", "published_date": "2025-11-05 18:59:17 UTC", "updated_date": "2025-11-05 18:59:17 UTC" }, { "arxiv_id": "2511.03773v2", "title": "Scaling Agent Learning via Experience Synthesis", "authors": [ "Zhaorun Chen", "Zhuokai Zhao", "Kai Zhang", "Bo Liu", "Qi Qi", "Yifan Wu", "Tarun Kalluri", "Sara Cao", "Yuanhao Xiong", "Haibo Tong", "Huaxiu Yao", "Hengduo Li", "Jiacheng Zhu", "Xian Li", "Dawn Song", "Bo Li", "Jason Weston", "Dat Huynh" ], "abstract": "While reinforcement learning (RL) can empower autonomous agents by enabling self-improvement through interaction, its practical adoption remains challenging due to costly rollouts, limited task diversity, unreliable reward signals, and infrastructure complexity, all of which obstruct the collection of scalable experience data. To address these challenges, we introduce DreamGym, the first unified framework designed to synthesize diverse experiences with scalability in mind to enable effective online RL training for autonomous agents. Rather than relying on expensive real-environment rollouts, DreamGym distills environment dynamics into a reasoning-based experience model that derives consistent state transitions and feedback signals through step-by-step reasoning, enabling scalable agent rollout collection for RL. To improve the stability and quality of transitions, DreamGym leverages an experience replay buffer initialized with offline real-world data and continuously enriched with fresh interactions to actively support agent training. 
To improve knowledge acquisition, DreamGym adaptively generates new tasks that challenge the current agent policy, enabling more effective online curriculum learning. Experiments across diverse environments and agent backbones demonstrate that DreamGym substantially improves RL training, both in fully synthetic settings and in sim-to-real transfer scenarios. On non-RL-ready tasks like WebArena, DreamGym outperforms all baselines by over 30%. And in RL-ready but costly settings, it matches GRPO and PPO performance using only synthetic interactions. When transferring a policy trained purely on synthetic experiences to real-environment RL, DreamGym yields significant additional performance gains while requiring far fewer real-world interactions, providing a scalable warm-start strategy for general-purpose RL.", "categories": [ "cs.AI" ], "primary_category": "cs.AI", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03773v2", "published_date": "2025-11-05 18:58:48 UTC", "updated_date": "2025-11-10 05:02:36 UTC" }, { "arxiv_id": "2511.03724v2", "title": "Outbidding and Outbluffing Elite Humans: Mastering Liar's Poker via Self-Play and Reinforcement Learning", "authors": [ "Richard Dewey", "Janos Botyanszki", "Ciamac C. Moallemi", "Andrew T. Zheng" ], "abstract": "AI researchers have long focused on poker-like games as a testbed for environments characterized by multi-player dynamics, imperfect information, and reasoning under uncertainty. While recent breakthroughs have matched elite human play at no-limit Texas hold'em, the multi-player dynamics are subdued: most hands converge quickly with only two players engaged through multiple rounds of bidding. In this paper, we present Solly, the first AI agent to achieve elite human play in reduced-format Liar's Poker, a game characterized by extensive multi-player engagement. We trained Solly using self-play with a model-free, actor-critic, deep reinforcement learning algorithm. 
Solly played at an elite human level as measured by win rate (won over 50% of hands) and equity (money won) in heads-up and multi-player Liar's Poker. Solly also outperformed large language models (LLMs), including those with reasoning abilities, on the same metrics. Solly developed novel bidding strategies, randomized play effectively, and was not easily exploitable by world-class human players.", "categories": [ "cs.AI", "cs.MA" ], "primary_category": "cs.AI", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03724v2", "published_date": "2025-11-05 18:58:18 UTC", "updated_date": "2025-11-07 16:11:08 UTC" }, { "arxiv_id": "2511.03771v1", "title": "Climbing the label tree: Hierarchy-preserving contrastive learning for medical imaging", "authors": [ "Alif Elham Khan" ], "abstract": "Medical image labels are often organized by taxonomies (e.g., organ - tissue - subtype), yet standard self-supervised learning (SSL) ignores this structure. We present a hierarchy-preserving contrastive framework that makes the label tree a first-class training signal and an evaluation target. Our approach introduces two plug-in objectives: Hierarchy-Weighted Contrastive (HWC), which scales positive/negative pair strengths by shared ancestors to promote within-parent coherence, and Level-Aware Margin (LAM), a prototype margin that separates ancestor groups across levels. The formulation is geometry-agnostic and applies to Euclidean and hyperbolic embeddings without architectural changes. Across several benchmarks, including breast histopathology, the proposed objectives consistently improve representation quality over strong SSL baselines while better respecting the taxonomy. We evaluate with metrics tailored to hierarchy faithfulness: HF1 (hierarchical F1), H-Acc (tree-distance-weighted accuracy), and parent-distance violation rate. We also report top-1 accuracy for completeness. 
Ablations show that HWC and LAM are effective even without curvature, and combining them yields the most taxonomy-aligned representations. Taken together, these results provide a simple, general recipe for learning medical image representations that respect the label tree and advance both performance and interpretability in hierarchy-rich domains.", "categories": [ "q-bio.QM", "cs.AI", "cs.LG" ], "primary_category": "q-bio.QM", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03771v1", "published_date": "2025-11-05 18:53:25 UTC", "updated_date": "2025-11-05 18:53:25 UTC" }, { "arxiv_id": "2511.03718v1", "title": "Grounded Misunderstandings in Asymmetric Dialogue: A Perspectivist Annotation Scheme for MapTask", "authors": [ "Nan Li", "Albert Gatt", "Massimo Poesio" ], "abstract": "Collaborative dialogue relies on participants incrementally establishing common ground, yet in asymmetric settings they may believe they agree while referring to different entities. We introduce a perspectivist annotation scheme for the HCRC MapTask corpus (Anderson et al., 1991) that separately captures speaker and addressee grounded interpretations for each reference expression, enabling us to trace how understanding emerges, diverges, and repairs over time. Using a scheme-constrained LLM annotation pipeline, we obtain 13k annotated reference expressions with reliability estimates and analyze the resulting understanding states. The results show that full misunderstandings are rare once lexical variants are unified, but multiplicity discrepancies systematically induce divergences, revealing how apparent grounding can mask referential misalignment. 
Our framework provides both a resource and an analytic lens for studying grounded misunderstanding and for evaluating (V)LLMs' capacity to model perspective-dependent grounding in collaborative dialogue.", "categories": [ "cs.CL", "cs.AI" ], "primary_category": "cs.CL", "comment": "11 pages, 3 figures, 5 tables; under review", "pdf_url": "https://arxiv.org/pdf/2511.03718v1", "published_date": "2025-11-05 18:52:28 UTC", "updated_date": "2025-11-05 18:52:28 UTC" }, { "arxiv_id": "2511.14777v1", "title": "The Illusion of Procedural Reasoning: Measuring Long-Horizon FSM Execution in LLMs", "authors": [ "Mahdi Samiei", "Mahdi Mansouri", "Mahdieh Soleymani Baghshah" ], "abstract": "Large language models (LLMs) have achieved remarkable results on tasks framed as reasoning problems, yet their true ability to perform procedural reasoning, executing multi-step, rule-based computations remains unclear. Unlike algorithmic systems, which can deterministically execute long-horizon symbolic procedures, LLMs often degrade under extended reasoning chains, but there is no controlled, interpretable benchmark to isolate and measure this collapse. We introduce Finite-State Machine (FSM) Execution as a minimal, fully interpretable framework for evaluating the procedural reasoning capacity of LLMs. In our setup, the model is given an explicit FSM definition and must execute it step-by-step given input actions, maintaining state consistency over multiple turns. This task requires no world knowledge, only faithful application of deterministic transition rules, making it a direct probe of the model's internal procedural fidelity. We measure both Turn Accuracy and Task Accuracy to disentangle immediate computation from cumulative state maintenance. Empirical results reveal systematic degradation as task horizon or branching complexity increases. Models perform significantly worse when rule retrieval involves high branching factors than when memory span is long. 
Larger models show improved local accuracy but remain brittle under multi-step reasoning unless explicitly prompted to externalize intermediate steps. FSM-based evaluation offers a transparent, complexity-controlled probe for diagnosing this failure mode and guiding the design of inductive biases that enable genuine long-horizon procedural competence. By grounding reasoning in measurable execution fidelity rather than surface correctness, this work helps establish a rigorous experimental foundation for understanding and improving the algorithmic reliability of LLMs.", "categories": [ "cs.AI" ], "primary_category": "cs.AI", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.14777v1", "published_date": "2025-11-05 18:44:47 UTC", "updated_date": "2025-11-05 18:44:47 UTC" }, { "arxiv_id": "2511.03697v1", "title": "AnaFlow: Agentic LLM-based Workflow for Reasoning-Driven Explainable and Sample-Efficient Analog Circuit Sizing", "authors": [ "Mohsen Ahmadzadeh", "Kaichang Chen", "Georges Gielen" ], "abstract": "Analog/mixed-signal circuits are key for interfacing electronics with the physical world. Their design, however, remains a largely handcrafted process, resulting in long and error-prone design cycles. While the recent rise of AI-based reinforcement learning and generative AI has created new techniques to automate this task, the need for many time-consuming simulations is a critical bottleneck hindering the overall efficiency. Furthermore, the lack of explainability of the resulting design solutions hampers widespread adoption of the tools. To address these issues, a novel agentic AI framework for sample-efficient and explainable analog circuit sizing is presented. It employs a multi-agent workflow where specialized Large Language Model (LLM)-based agents collaborate to interpret the circuit topology, to understand the design goals, and to iteratively refine the circuit's design parameters towards the target goals with human-interpretable reasoning. 
The adaptive simulation strategy creates an intelligent control that yields a high sample efficiency. The AnaFlow framework is demonstrated for two circuits of varying complexity and is able to complete the sizing task fully automatically, differently from pure Bayesian optimization and reinforcement learning approaches. The system learns from its optimization history to avoid past mistakes and to accelerate convergence. The inherent explainability makes this a powerful tool for analog design space exploration and a new paradigm in analog EDA, where AI agents serve as transparent design assistants.", "categories": [ "cs.LG", "cs.AI", "cs.AR" ], "primary_category": "cs.LG", "comment": "This article was accepted by 2025 International Conference on Computer-Aided Design (ICCAD 2025) and was presented in Munich, October 2025", "pdf_url": "https://arxiv.org/pdf/2511.03697v1", "published_date": "2025-11-05 18:24:01 UTC", "updated_date": "2025-11-05 18:24:01 UTC" }, { "arxiv_id": "2511.03690v1", "title": "The OpenHands Software Agent SDK: A Composable and Extensible Foundation for Production Agents", "authors": [ "Xingyao Wang", "Simon Rosenberg", "Juan Michelini", "Calvin Smith", "Hoang Tran", "Engel Nyst", "Rohit Malhotra", "Xuhui Zhou", "Valerie Chen", "Robert Brennan", "Graham Neubig" ], "abstract": "Agents are now used widely in the process of software development, but building production-ready software engineering agents is a complex task. Deploying software agents effectively requires flexibility in implementation and experimentation, reliable and secure execution, and interfaces for users to interact with agents. In this paper, we present the OpenHands Software Agent SDK, a toolkit for implementing software development agents that satisfy these desiderata. This toolkit is a complete architectural redesign of the agent components of the popular OpenHands framework for software development agents, which has 64k+ GitHub stars. 
To achieve flexibility, we design a simple interface for implementing agents that requires only a few lines of code in the default case, but is easily extensible to more complex, full-featured agents with features such as custom tools, memory management, and more. For security and reliability, it delivers seamless local-to-remote execution portability, integrated REST/WebSocket services. For interaction with human users, it can connect directly to a variety of interfaces, such as visual workspaces (VS Code, VNC, browser), command-line interfaces, and APIs. Compared with existing SDKs from OpenAI, Claude, and Google, OpenHands uniquely integrates native sandboxed execution, lifecycle control, model-agnostic multi-LLM routing, and built-in security analysis. Empirical results on SWE-Bench Verified and GAIA benchmarks demonstrate strong performance. Put together, these elements allow the OpenHands Software Agent SDK to provide a practical foundation for prototyping, unlocking new classes of custom applications, and reliably deploying agents at scale.", "categories": [ "cs.SE", "cs.AI" ], "primary_category": "cs.SE", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03690v1", "published_date": "2025-11-05 18:16:44 UTC", "updated_date": "2025-11-05 18:16:44 UTC" }, { "arxiv_id": "2511.03685v1", "title": "Structured Matrix Scaling for Multi-Class Calibration", "authors": [ "Eugène Berta", "David Holzmüller", "Michael I. Jordan", "Francis Bach" ], "abstract": "Post-hoc recalibration methods are widely used to ensure that classifiers provide faithful probability estimates. We argue that parametric recalibration functions based on logistic regression can be motivated from a simple theoretical setting for both binary and multiclass classification. This insight motivates the use of more expressive calibration methods beyond standard temperature scaling. 
For multi-class calibration however, a key challenge lies in the increasing number of parameters introduced by more complex models, often coupled with limited calibration data, which can lead to overfitting. Through extensive experiments, we demonstrate that the resulting bias-variance tradeoff can be effectively managed by structured regularization, robust preprocessing and efficient optimization. The resulting methods lead to substantial gains over existing logistic-based calibration techniques. We provide efficient and easy-to-use open-source implementations of our methods, making them an attractive alternative to common temperature, vector, and matrix scaling implementations.", "categories": [ "cs.LG", "cs.AI" ], "primary_category": "cs.LG", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03685v1", "published_date": "2025-11-05 18:09:14 UTC", "updated_date": "2025-11-05 18:09:14 UTC" }, { "arxiv_id": "2511.03684v1", "title": "Simulation-Based Validation of an Integrated 4D/5D Digital-Twin Framework for Predictive Construction Control", "authors": [ "Atena Khoshkonesh", "Mohsen Mohammadagha", "Navid Ebrahimi" ], "abstract": "Persistent cost and schedule deviations remain a major challenge in the U.S. construction industry, revealing the limitations of deterministic CPM and static document-based estimating. This study presents an integrated 4D/5D digital-twin framework that couples Building Information Modeling (BIM) with natural-language processing (NLP)-based cost mapping, computer-vision (CV)-driven progress measurement, Bayesian probabilistic CPM updating, and deep-reinforcement-learning (DRL) resource-leveling. A nine-month case implementation on a Dallas-Fort Worth mid-rise project demonstrated measurable gains in accuracy and efficiency: 43% reduction in estimating labor, 6% reduction in overtime, and 30% project-buffer utilization, while maintaining an on-time finish at 128 days within P50-P80 confidence bounds. 
The digital-twin sandbox also enabled real-time \"what-if\" forecasting and traceable cost-schedule alignment through a 5D knowledge graph. Findings confirm that integrating AI-based analytics with probabilistic CPM and DRL enhances forecasting precision, transparency, and control resilience. The validated workflow establishes a practical pathway toward predictive, adaptive, and auditable construction management.", "categories": [ "cs.CE", "cs.AI", "cs.LG", "eess.SY" ], "primary_category": "cs.CE", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03684v1", "published_date": "2025-11-05 18:07:03 UTC", "updated_date": "2025-11-05 18:07:03 UTC" }, { "arxiv_id": "2511.03675v1", "title": "Whisper Leak: a side-channel attack on Large Language Models", "authors": [ "Geoff McDonald", "Jonathan Bar Or" ], "abstract": "Large Language Models (LLMs) are increasingly deployed in sensitive domains including healthcare, legal services, and confidential communications, where privacy is paramount. This paper introduces Whisper Leak, a side-channel attack that infers user prompt topics from encrypted LLM traffic by analyzing packet size and timing patterns in streaming responses. Despite TLS encryption protecting content, these metadata patterns leak sufficient information to enable topic classification. We demonstrate the attack across 28 popular LLMs from major providers, achieving near-perfect classification (often >98% AUPRC) and high precision even at extreme class imbalance (10,000:1 noise-to-target ratio). For many models, we achieve 100% precision in identifying sensitive topics like \"money laundering\" while recovering 5-20% of target conversations. This industry-wide vulnerability poses significant risks for users under network surveillance by ISPs, governments, or local adversaries. We evaluate three mitigation strategies - random padding, token batching, and packet injection - finding that while each reduces attack effectiveness, none provides complete protection. 
Through responsible disclosure, we have collaborated with providers to implement initial countermeasures. Our findings underscore the need for LLM providers to address metadata leakage as AI systems handle increasingly sensitive information.", "categories": [ "cs.CR", "cs.AI" ], "primary_category": "cs.CR", "comment": "14 pages, 7 figures", "pdf_url": "https://arxiv.org/pdf/2511.03675v1", "published_date": "2025-11-05 17:47:46 UTC", "updated_date": "2025-11-05 17:47:46 UTC" }, { "arxiv_id": "2511.03670v1", "title": "DQN Performance with Epsilon Greedy Policies and Prioritized Experience Replay", "authors": [ "Daniel Perkins", "Oscar J. Escobar", "Luke Green" ], "abstract": "We present a detailed study of Deep Q-Networks in finite environments, emphasizing the impact of epsilon-greedy exploration schedules and prioritized experience replay. Through systematic experimentation, we evaluate how variations in epsilon decay schedules affect learning efficiency, convergence behavior, and reward optimization. We investigate how prioritized experience replay leads to faster convergence and higher returns and show empirical results comparing uniform, no replay, and prioritized strategies across multiple simulations. 
Our findings illuminate the trade-offs and interactions between exploration strategies and memory management in DQN training, offering practical recommendations for robust reinforcement learning in resource-constrained settings.", "categories": [ "cs.LG", "cs.AI" ], "primary_category": "cs.LG", "comment": "10 pages, 8 figures", "pdf_url": "https://arxiv.org/pdf/2511.03670v1", "published_date": "2025-11-05 17:36:30 UTC", "updated_date": "2025-11-05 17:36:30 UTC" }, { "arxiv_id": "2511.03656v1", "title": "ChiMDQA: Towards Comprehensive Chinese Document QA with Fine-grained Evaluation", "authors": [ "Jing Gao", "Shutiao Luo", "Yumeng Liu", "Yuanming Li", "Hongji Zeng" ], "abstract": "With the rapid advancement of natural language processing (NLP) technologies, the demand for high-quality Chinese document question-answering datasets is steadily growing. To address this issue, we present the Chinese Multi-Document Question Answering Dataset (ChiMDQA), specifically designed for downstream business scenarios across prevalent domains including academic, education, finance, law, medical treatment, and news. ChiMDQA encompasses long-form documents from six distinct fields, consisting of 6,068 rigorously curated, high-quality question-answer (QA) pairs further classified into ten fine-grained categories. Through meticulous document screening and a systematic question-design methodology, the dataset guarantees both diversity and high quality, rendering it applicable to various NLP tasks such as document comprehension, knowledge extraction, and intelligent QA systems. Additionally, this paper offers a comprehensive overview of the dataset's design objectives, construction methodologies, and fine-grained evaluation system, supplying a substantial foundation for future research and practical applications in Chinese QA. 
The code and data are available at: https://anonymous.4open.science/r/Foxit-CHiMDQA/.", "categories": [ "cs.CL", "cs.AI" ], "primary_category": "cs.CL", "comment": "13 pages, 6 tables, 4 figures, accepted by ICANN 2025", "pdf_url": "https://arxiv.org/pdf/2511.03656v1", "published_date": "2025-11-05 17:13:14 UTC", "updated_date": "2025-11-05 17:13:14 UTC" }, { "arxiv_id": "2511.03643v2", "title": "Explaining Human Choice Probabilities with Simple Vector Representations", "authors": [ "Peter DiBerardino", "Britt Anderson" ], "abstract": "When people pursue rewards in stochastic environments, they often match their choice frequencies to the observed target frequencies, even when this policy is demonstrably sub-optimal. We used a ``hide and seek'' task to evaluate this behavior under conditions where pursuit (seeking) could be toggled to avoidance (hiding), while leaving the probability distribution fixed, or varying complexity by changing the number of possible choices. We developed a model for participant choice built from choice frequency histograms treated as vectors. We posited the existence of a probability antimatching strategy for avoidance (hiding) rounds, and formalized this as a vector reflection of probability matching. We found that only two basis policies: matching/antimatching and maximizing/minimizing were sufficient to account for participant choices across a range of room numbers and opponent probability distributions. This schema requires only that people have the ability to remember the relative frequency of the different outcomes. With this knowledge simple operations can construct the maximizing and minimizing policies as well as matching and antimatching strategies. 
A mixture of these two policies captures human choice patterns in a stochastic environment.", "categories": [ "q-bio.NC", "cs.AI" ], "primary_category": "q-bio.NC", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03643v2", "published_date": "2025-11-05 17:03:03 UTC", "updated_date": "2025-11-10 18:36:37 UTC" }, { "arxiv_id": "2511.03641v1", "title": "Watermarking Large Language Models in Europe: Interpreting the AI Act in Light of Technology", "authors": [ "Thomas Souverain" ], "abstract": "To foster trustworthy Artificial Intelligence (AI) within the European Union, the AI Act requires providers to mark and detect the outputs of their general-purpose models. The Article 50 and Recital 133 call for marking methods that are ''sufficiently reliable, interoperable, effective and robust''. Yet, the rapidly evolving and heterogeneous landscape of watermarks for Large Language Models (LLMs) makes it difficult to determine how these four standards can be translated into concrete and measurable evaluations. Our paper addresses this challenge, anchoring the normativity of European requirements in the multiplicity of watermarking techniques. Introducing clear and distinct concepts on LLM watermarking, our contribution is threefold. (1) Watermarking Categorisation: We propose an accessible taxonomy of watermarking methods according to the stage of the LLM lifecycle at which they are applied - before, during, or after training, and during next-token distribution or sampling. (2) Watermarking Evaluation: We interpret the EU AI Act's requirements by mapping each criterion with state-of-the-art evaluations on robustness and detectability of the watermark, and of quality of the LLM. Since interoperability remains largely untheorised in LLM watermarking research, we propose three normative dimensions to frame its assessment. 
(3) Watermarking Comparison: We compare current watermarking methods for LLMs against the operationalised European criteria and show that no approach yet satisfies all four standards. Encouraged by emerging empirical tests, we recommend further research into watermarking directly embedded within the low-level architecture of LLMs.", "categories": [ "cs.CR", "cs.AI", "cs.CL", "cs.CY" ], "primary_category": "cs.CR", "comment": "17 pages, 2 Tables and 2 Pictures", "pdf_url": "https://arxiv.org/pdf/2511.03641v1", "published_date": "2025-11-05 17:00:39 UTC", "updated_date": "2025-11-05 17:00:39 UTC" }, { "arxiv_id": "2511.03628v1", "title": "LiveTradeBench: Seeking Real-World Alpha with Large Language Models", "authors": [ "Haofei Yu", "Fenghai Li", "Jiaxuan You" ], "abstract": "Large language models (LLMs) achieve strong performance across benchmarks--from knowledge quizzes and math reasoning to web-agent tasks--but these tests occur in static settings, lacking real dynamics and uncertainty. Consequently, they evaluate isolated reasoning or problem-solving rather than decision-making under uncertainty. To address this, we introduce LiveTradeBench, a live trading environment for evaluating LLM agents in realistic and evolving markets. LiveTradeBench follows three design principles: (i) Live data streaming of market prices and news, eliminating dependence on offline backtesting and preventing information leakage while capturing real-time uncertainty; (ii) a portfolio-management abstraction that extends control from single-asset actions to multi-asset allocation, integrating risk management and cross-asset reasoning; and (iii) multi-market evaluation across structurally distinct environments--U.S. stocks and Polymarket prediction markets--differing in volatility, liquidity, and information flow. At each step, an agent observes prices, news, and its portfolio, then outputs percentage allocations that balance risk and return. 
Using LiveTradeBench, we run 50-day live evaluations of 21 LLMs across families. Results show that (1) high LMArena scores do not imply superior trading outcomes; (2) models display distinct portfolio styles reflecting risk appetite and reasoning dynamics; and (3) some LLMs effectively leverage live signals to adapt decisions. These findings expose a gap between static evaluation and real-world competence, motivating benchmarks that test sequential decision making and consistency under live uncertainty.", "categories": [ "q-fin.TR", "cs.AI", "cs.CE", "cs.CL" ], "primary_category": "q-fin.TR", "comment": "16 pages", "pdf_url": "https://arxiv.org/pdf/2511.03628v1", "published_date": "2025-11-05 16:47:26 UTC", "updated_date": "2025-11-05 16:47:26 UTC" }, { "arxiv_id": "2511.03617v1", "title": "Visualization Biases MLLM's Decision Making in Network Data Tasks", "authors": [ "Timo Brand", "Henry Förster", "Stephen G. Kobourov", "Jacob Miller" ], "abstract": "We evaluate how visualizations can influence the judgment of MLLMs about the presence or absence of bridges in a network. We show that the inclusion of visualization improves confidence over a structured text-based input that could theoretically be helpful for answering the question. On the other hand, we observe that standard visualization techniques create a strong bias towards accepting or refuting the presence of a bridge -- independently of whether or not a bridge actually exists in the network. 
While our results indicate that the inclusion of visualization techniques can effectively influence the MLLM's judgment without compromising its self-reported confidence, they also imply that practitioners must be careful of allowing users to include visualizations in generative AI applications so as to avoid undesired hallucinations.", "categories": [ "cs.GR", "cs.AI" ], "primary_category": "cs.GR", "comment": "This manuscript was presented at VIS x GenAI, a workshop co-located with IEEE VIS 2025", "pdf_url": "https://arxiv.org/pdf/2511.03617v1", "published_date": "2025-11-05 16:34:12 UTC", "updated_date": "2025-11-05 16:34:12 UTC" }, { "arxiv_id": "2511.03601v2", "title": "Step-Audio-EditX Technical Report", "authors": [ "Chao Yan", "Boyong Wu", "Peng Yang", "Pengfei Tan", "Guoqiang Hu", "Li Xie", "Yuxin Zhang", "Xiangyu", "Zhang", "Fei Tian", "Xuerui Yang", "Xiangyu Zhang", "Daxin Jiang", "Shuchang Zhou", "Gang Yu" ], "abstract": "We present Step-Audio-EditX, the first open-source LLM-based audio model excelling at expressive and iterative audio editing encompassing emotion, speaking style, and paralinguistics alongside robust zero-shot text-to-speech (TTS) capabilities. Our core innovation lies in leveraging only large-margin synthetic data, which circumvents the need for embedding-based priors or auxiliary modules. This large-margin learning approach enables both iterative control and high expressivity across voices, and represents a fundamental pivot from the conventional focus on representation-level disentanglement. 
Evaluation results demonstrate that Step-Audio-EditX surpasses both MiniMax-2.6-hd and Doubao-Seed-TTS-2.0 in emotion editing and other fine-grained control tasks.", "categories": [ "cs.CL", "cs.AI", "cs.HC", "cs.SD", "eess.AS" ], "primary_category": "cs.CL", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03601v2", "published_date": "2025-11-05 16:22:19 UTC", "updated_date": "2025-11-19 04:56:09 UTC" }, { "arxiv_id": "2511.03586v1", "title": "PerfDojo: Automated ML Library Generation for Heterogeneous Architectures", "authors": [ "Andrei Ivanov", "Siyuan Shen", "Gioele Gottardo", "Marcin Chrapek", "Afif Boudaoud", "Timo Schneider", "Luca Benini", "Torsten Hoefler" ], "abstract": "The increasing complexity of machine learning models and the proliferation of diverse hardware architectures (CPUs, GPUs, accelerators) make achieving optimal performance a significant challenge. Heterogeneity in instruction sets, specialized kernel requirements for different data types and model features (e.g., sparsity, quantization), and architecture-specific optimizations complicate performance tuning. Manual optimization is resource-intensive, while existing automatic approaches often rely on complex hardware-specific heuristics and uninterpretable intermediate representations, hindering performance portability. We introduce PerfLLM, a novel automatic optimization methodology leveraging Large Language Models (LLMs) and Reinforcement Learning (RL). Central to this is PerfDojo, an environment framing optimization as an RL game using a human-readable, mathematically-inspired code representation that guarantees semantic validity through transformations. This allows effective optimization without prior hardware knowledge, facilitating both human analysis and RL agent training. 
We demonstrate PerfLLM's ability to achieve significant performance gains across diverse CPU (x86, Arm, RISC-V) and GPU architectures.", "categories": [ "cs.PF", "cs.AI" ], "primary_category": "cs.PF", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03586v1", "published_date": "2025-11-05 16:05:26 UTC", "updated_date": "2025-11-05 16:05:26 UTC" }, { "arxiv_id": "2511.03578v1", "title": "Learning Under Laws: A Constraint-Projected Neural PDE Solver that Eliminates Hallucinations", "authors": [ "Mainak Singha" ], "abstract": "Neural networks can approximate solutions to partial differential equations, but they often break the very laws they are meant to model-creating mass from nowhere, drifting shocks, or violating conservation and entropy. We address this by training within the laws of physics rather than beside them. Our framework, called Constraint-Projected Learning (CPL), keeps every update physically admissible by projecting network outputs onto the intersection of constraint sets defined by conservation, Rankine-Hugoniot balance, entropy, and positivity. The projection is differentiable and adds only about 10% computational overhead, making it fully compatible with back-propagation. We further stabilize training with total-variation damping (TVD) to suppress small oscillations and a rollout curriculum that enforces consistency over long prediction horizons. Together, these mechanisms eliminate both hard and soft violations: conservation holds at machine precision, total-variation growth vanishes, and entropy and error remain bounded. On Burgers and Euler systems, CPL produces stable, physically lawful solutions without loss of accuracy. Instead of hoping neural solvers will respect physics, CPL makes that behavior an intrinsic property of the learning process.", "categories": [ "cs.LG", "cs.AI" ], "primary_category": "cs.LG", "comment": "25 pages, 2 figures. 
This work introduces Constraint-Projected Learning (CPL)- a framework for neural PDE solvers that enforces physical conservation laws during training to eliminate hallucinated, non-physical solutions. Feedback is welcome. Not under review elsewhere", "pdf_url": "https://arxiv.org/pdf/2511.03578v1", "published_date": "2025-11-05 16:01:19 UTC", "updated_date": "2025-11-05 16:01:19 UTC" }, { "arxiv_id": "2511.03576v3", "title": "Multi-User Personalisation in Human-Robot Interaction: Resolving Preference Conflicts Using Gradual Argumentation", "authors": [ "Aniol Civit", "Antonio Andriella", "Carles Sierra", "Guillem Alenyà" ], "abstract": "While personalisation in Human-Robot Interaction (HRI) has advanced significantly, most existing approaches focus on single-user adaptation, overlooking scenarios involving multiple stakeholders with potentially conflicting preferences. To address this, we propose the Multi-User Preferences Quantitative Bipolar Argumentation Framework (MUP-QBAF), a novel multi-user personalisation framework based on Quantitative Bipolar Argumentation Frameworks (QBAFs) that explicitly models and resolves multi-user preference conflicts. Unlike prior work in Argumentation Frameworks, which typically assumes static inputs, our approach is tailored to robotics: it incorporates both users' arguments and the robot's dynamic observations of the environment, allowing the system to adapt over time and respond to changing contexts. Preferences, both positive and negative, are represented as arguments whose strength is recalculated iteratively based on new information. The framework's properties and capabilities are presented and validated through a realistic case study, where an assistive robot mediates between the conflicting preferences of a caregiver and a care recipient during a frailty assessment task. 
This evaluation further includes a sensitivity analysis of argument base scores, demonstrating how preference outcomes can be shaped by user input and contextual observations. By offering a transparent, structured, and context-sensitive approach to resolving competing user preferences, this work advances the field of multi-user HRI. It provides a principled alternative to data-driven methods, enabling robots to navigate conflicts in real-world environments.", "categories": [ "cs.RO", "cs.AI" ], "primary_category": "cs.RO", "comment": "Preprint submitted to a journal", "pdf_url": "https://arxiv.org/pdf/2511.03576v3", "published_date": "2025-11-05 15:59:30 UTC", "updated_date": "2026-01-12 11:53:27 UTC" }, { "arxiv_id": "2511.03565v1", "title": "Imitation Learning in the Deep Learning Era: A Novel Taxonomy and Recent Advances", "authors": [ "Iason Chrysomallis", "Georgios Chalkiadakis" ], "abstract": "Imitation learning (IL) enables agents to acquire skills by observing and replicating the behavior of one or multiple experts. In recent years, advances in deep learning have significantly expanded the capabilities and scalability of imitation learning across a range of domains, where expert data can range from full state-action trajectories to partial observations or unlabeled sequences. Alongside this growth, novel approaches have emerged, with new methodologies being developed to address longstanding challenges such as generalization, covariate shift, and demonstration quality. In this survey, we review the latest advances in imitation learning research, highlighting recent trends, methodological innovations, and practical applications. We propose a novel taxonomy that is distinct from existing categorizations to better reflect the current state of the IL research stratum and its trends. 
Throughout the survey, we critically examine the strengths, limitations, and evaluation practices of representative works, and we outline key challenges and open directions for future research.", "categories": [ "cs.LG", "cs.AI" ], "primary_category": "cs.LG", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03565v1", "published_date": "2025-11-05 15:47:29 UTC", "updated_date": "2025-11-05 15:47:29 UTC" }, { "arxiv_id": "2511.03559v1", "title": "AILA--First Experiments with Localist Language Models", "authors": [ "Joachim Diederich" ], "abstract": "This paper presents the first empirical demonstration of controllable locality in transformer language models, a novel architectural framework that enables continuous control over the degree of representation localization through a tunable locality dial parameter. Unlike traditional language models that rely exclusively on distributed representations, our approach allows dynamic interpolation between highly interpretable localist encodings and efficient distributed representations without requiring model retraining. We conducted experiments on the WikiText corpus using a two-layer transformer architecture, systematically varying the locality parameter λ across the full spectrum from 1.0 (fully localist) to 0.0 (fully distributed). Our results demonstrate that localist configurations achieve dramatically lower attention entropy, with λ = 1.0 yielding 5.36 bits compared to 7.18 bits at λ = 0.0, while maintaining substantially higher pointer fidelity scores reflecting stronger alignment with rule-specified targets. Prediction experiments reveal that intermediate locality values optimize the tradeoff between interpretability and performance, with λ = 0.6 achieving test perplexity of 4.65 and accuracy of 84.7%. 
These findings establish that localist language models provide a practical framework for applications in regulated domains requiring both transparency and capability, offering precise mathematical control over the interpretability-performance spectrum through explicit penalty thresholds and information-theoretic design principles.", "categories": [ "cs.CL", "cs.AI" ], "primary_category": "cs.CL", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03559v1", "published_date": "2025-11-05 15:43:54 UTC", "updated_date": "2025-11-05 15:43:54 UTC" }, { "arxiv_id": "2511.03553v1", "title": "MultiZebraLogic: A Multilingual Logical Reasoning Benchmark", "authors": [ "Sofie Helene Bruun", "Dan Saattrup Smart" ], "abstract": "Measuring the full abilities of large language models (LLMs) requires benchmarks representing multiple tasks. We aim to create large, high-quality datasets for comparison of logical reasoning skills across several languages and of suitable difficulty for LLMs of various reasoning ability. We explore multiple ways of increasing difficulty. We generate zebra puzzles in multiple languages, themes, sizes and including 14 different clue types and 8 red herring types (uninformative clues). We find puzzle sizes 2x3 and 4x5 are sufficiently challenging for GPT-4o mini (a non-reasoning model) and o3-mini (a reasoning model), respectively. Including 5 red herrings decreases o3-mini puzzle-level accuracy on 4x5 puzzles by 15$\\pm$7 %. Scores of o3-mini on 4x5 puzzles are not significantly affected by use of English vs. Danish or the common houses theme vs. the country-specific smoerrebroed theme. We find no correlation between difficulty and the selected clue types. Datasets of 128+1024 puzzles are published as MultiZebraLogic in each of nine Germanic languages for sizes 2x3 and 4x5. 
We publish code for puzzle generation, designed for adaptablity into more languages and themes.", "categories": [ "cs.CL", "cs.AI" ], "primary_category": "cs.CL", "comment": "Submitted to LREC 2026", "pdf_url": "https://arxiv.org/pdf/2511.03553v1", "published_date": "2025-11-05 15:34:48 UTC", "updated_date": "2025-11-05 15:34:48 UTC" }, { "arxiv_id": "2511.03549v1", "title": "Uncovering Code Insights: Leveraging GitHub Artifacts for Deeper Code Understanding", "authors": [ "Ziv Nevo", "Orna Raz", "Karen Yorav" ], "abstract": "Understanding the purpose of source code is a critical task in software maintenance, onboarding, and modernization. While large language models (LLMs) have shown promise in generating code explanations, they often lack grounding in the broader software engineering context. We propose a novel approach that leverages natural language artifacts from GitHub -- such as pull request descriptions, issue descriptions and discussions, and commit messages -- to enhance LLM-based code understanding. Our system consists of three components: one that extracts and structures relevant GitHub context, another that uses this context to generate high-level explanations of the code's purpose, and a third that validates the explanation. We implemented this as a standalone tool, as well as a server within the Model Context Protocol (MCP), enabling integration with other AI-assisted development tools. Our main use case is that of enhancing a standard LLM-based code explanation with code insights that our system generates. To evaluate explanations' quality, we conducted a small scale user study, with developers of several open projects, as well as developers of proprietary projects. 
Our user study indicates that when insights are generated they often are helpful and non trivial, and are free from hallucinations.", "categories": [ "cs.SE", "cs.AI" ], "primary_category": "cs.SE", "comment": "7 pages, 6 figures, to be published in AISM 2025, see https://aism25.github.io/aism25/", "pdf_url": "https://arxiv.org/pdf/2511.03549v1", "published_date": "2025-11-05 15:31:42 UTC", "updated_date": "2025-11-05 15:31:42 UTC" }, { "arxiv_id": "2511.03545v1", "title": "Explaining Decisions in ML Models: a Parameterized Complexity Analysis (Part I)", "authors": [ "Sebastian Ordyniak", "Giacomo Paesani", "Mateusz Rychlicki", "Stefan Szeider" ], "abstract": "This paper presents a comprehensive theoretical investigation into the parameterized complexity of explanation problems in various machine learning (ML) models. Contrary to the prevalent black-box perception, our study focuses on models with transparent internal mechanisms. We address two principal types of explanation problems: abductive and contrastive, both in their local and global variants. Our analysis encompasses diverse ML models, including Decision Trees, Decision Sets, Decision Lists, Boolean Circuits, and ensembles thereof, each offering unique explanatory challenges. This research fills a significant gap in explainable AI (XAI) by providing a foundational understanding of the complexities of generating explanations for these models. 
This work provides insights vital for further research in the domain of XAI, contributing to the broader discourse on the necessity of transparency and accountability in AI systems.", "categories": [ "cs.AI" ], "primary_category": "cs.AI", "comment": "Part I of a greatly enhanced version of https://doi.org/10.24963/kr.2024/53, whose full version is available on arXiv under https://doi.org/10.48550/arXiv.2407.15780", "pdf_url": "https://arxiv.org/pdf/2511.03545v1", "published_date": "2025-11-05 15:25:07 UTC", "updated_date": "2025-11-05 15:25:07 UTC" }, { "arxiv_id": "2511.03542v1", "title": "SOLVE-Med: Specialized Orchestration for Leading Vertical Experts across Medical Specialties", "authors": [ "Roberta Di Marino", "Giovanni Dioguardi", "Antonio Romano", "Giuseppe Riccio", "Mariano Barone", "Marco Postiglione", "Flora Amato", "Vincenzo Moscato" ], "abstract": "Medical question answering systems face deployment challenges including hallucinations, bias, computational demands, privacy concerns, and the need for specialized expertise across diverse domains. Here, we present SOLVE-Med, a multi-agent architecture combining domain-specialized small language models for complex medical queries. The system employs a Router Agent for dynamic specialist selection, ten specialized models (1B parameters each) fine-tuned on specific medical domains, and an Orchestrator Agent that synthesizes responses. Evaluated on Italian medical forum data across ten specialties, SOLVE-Med achieves superior performance with ROUGE-1 of 0.301 and BERTScore F1 of 0.697, outperforming standalone models up to 14B parameters while enabling local deployment. 
Our code is publicly available on GitHub: https://github.com/PRAISELab-PicusLab/SOLVE-Med.", "categories": [ "cs.CL", "cs.AI" ], "primary_category": "cs.CL", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03542v1", "published_date": "2025-11-05 15:15:35 UTC", "updated_date": "2025-11-05 15:15:35 UTC" }, { "arxiv_id": "2511.05597v1", "title": "From Prompts to Power: Measuring the Energy Footprint of LLM Inference", "authors": [ "Francisco Caravaca", "Ángel Cuevas", "Rubén Cuevas" ], "abstract": "The rapid expansion of Large Language Models (LLMs) has introduced unprecedented energy demands, extending beyond training to large-scale inference workloads that often dominate total lifecycle consumption. Deploying these models requires energy-intensive GPU infrastructure, and in some cases has even prompted plans to power data centers with nuclear energy. Despite this growing relevance, systematic analyses of inference energy consumption remain limited. In this work, we present a large-scale measurement-based study comprising over 32,500 measurements across 21 GPU configurations and 155 model architectures, from small open-source models to frontier systems. Using the vLLM inference engine, we quantify energy usage at the prompt level and identify how architectural and operational factors shape energy demand. 
Building on these insights, we develop a predictive model that accurately estimates inference energy consumption across unseen architectures and hardware, and implement it as a browser extension to raise awareness of the environmental impact of generative AI.", "categories": [ "cs.AI", "cs.LG" ], "primary_category": "cs.AI", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.05597v1", "published_date": "2025-11-05 15:06:46 UTC", "updated_date": "2025-11-05 15:06:46 UTC" }, { "arxiv_id": "2511.03531v1", "title": "Efficient Neural Networks with Discrete Cosine Transform Activations", "authors": [ "Marc Martinez-Gost", "Sara Pepe", "Ana Pérez-Neira", "Miguel Ángel Lagunas" ], "abstract": "In this paper, we extend our previous work on the Expressive Neural Network (ENN), a multilayer perceptron with adaptive activation functions parametrized using the Discrete Cosine Transform (DCT). Building upon previous work that demonstrated the strong expressiveness of ENNs with compact architectures, we now emphasize their efficiency, interpretability and pruning capabilities. The DCT-based parameterization provides a structured and decorrelated representation that reveals the functional role of each neuron and allows direct identification of redundant components. Leveraging this property, we propose an efficient pruning strategy that removes unnecessary DCT coefficients with negligible or no loss in performance. Experimental results across classification and implicit neural representation tasks confirm that ENNs achieve state-of-the-art accuracy while maintaining a low number of parameters. Furthermore, up to 40% of the activation coefficients can be safely pruned, thanks to the orthogonality and bounded nature of the DCT basis. 
Overall, these findings demonstrate that the ENN framework offers a principled integration of signal processing concepts into neural network design, achieving a balanced trade-off between expressiveness, compactness, and interpretability.", "categories": [ "cs.LG", "cs.AI" ], "primary_category": "cs.LG", "comment": "Paper submitted to WSEAS Signal Processing Journal", "pdf_url": "https://arxiv.org/pdf/2511.03531v1", "published_date": "2025-11-05 15:02:58 UTC", "updated_date": "2025-11-05 15:02:58 UTC" }, { "arxiv_id": "2511.03499v2", "title": "A Theoretical Framework for Environmental Similarity and Vessel Mobility as Coupled Predictors of Marine Invasive Species Pathways", "authors": [ "Gabriel Spadon", "Vaishnav Vaidheeswaran", "Claudio DiBacco" ], "abstract": "Marine invasive species spread through global shipping and generate substantial ecological and economic impacts. Traditional risk assessments require detailed records of ballast water and traffic patterns, which are often incomplete, limiting global coverage. This work advances a theoretical framework that quantifies invasion risk by combining environmental similarity across ports with observed and forecasted maritime mobility. Climate-based feature representations characterize each port's marine conditions, while mobility networks derived from Automatic Identification System data capture vessel flows and potential transfer pathways. Clustering and metric learning reveal climate analogues and enable the estimation of species survival likelihood along shipping routes. A temporal link prediction model captures how traffic patterns may change under shifting environmental conditions. 
The resulting fusion of environmental similarity and predicted mobility provides exposure estimates at the port and voyage levels, supporting targeted monitoring, routing adjustments, and management interventions.", "categories": [ "cs.CE", "cs.AI" ], "primary_category": "cs.CE", "comment": "Abstract Submitted to the 46th Canadian Conference on Remote Sensing", "pdf_url": "https://arxiv.org/pdf/2511.03499v2", "published_date": "2025-11-05 14:31:39 UTC", "updated_date": "2025-11-06 13:02:53 UTC" }, { "arxiv_id": "2511.03497v1", "title": "ROSBag MCP Server: Analyzing Robot Data with LLMs for Agentic Embodied AI Applications", "authors": [ "Lei Fu", "Sahar Salimpour", "Leonardo Militano", "Harry Edelman", "Jorge Peña Queralta", "Giovanni Toffetti" ], "abstract": "Agentic AI systems and Physical or Embodied AI systems have been two key research verticals at the forefront of Artificial Intelligence and Robotics, with Model Context Protocol (MCP) increasingly becoming a key component and enabler of agentic applications. However, the literature at the intersection of these verticals, i.e., Agentic Embodied AI, remains scarce. This paper introduces an MCP server for analyzing ROS and ROS 2 bags, allowing for analyzing, visualizing and processing robot data with natural language through LLMs and VLMs. We describe specific tooling built with robotics domain knowledge, with our initial release focused on mobile robotics and supporting natively the analysis of trajectories, laser scan data, transforms, or time series data. This is in addition to providing an interface to standard ROS 2 CLI tools (\"ros2 bag list\" or \"ros2 bag info\"), as well as the ability to filter bags with a subset of topics or trimmed in time. Coupled with the MCP server, we provide a lightweight UI that allows the benchmarking of the tooling with different LLMs, both proprietary (Anthropic, OpenAI) and open-source (through Groq). 
Our experimental results include the analysis of tool calling capabilities of eight different state-of-the-art LLM/VLM models, both proprietary and open-source, large and small. Our experiments indicate that there is a large divide in tool calling capabilities, with Kimi K2 and Claude Sonnet 4 demonstrating clearly superior performance. We also conclude that there are multiple factors affecting the success rates, from the tool description schema to the number of arguments, as well as the number of tools available to the models. The code is available with a permissive license at https://github.com/binabik-ai/mcp-rosbags.", "categories": [ "cs.RO", "cs.AI", "cs.SE" ], "primary_category": "cs.RO", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03497v1", "published_date": "2025-11-05 14:27:58 UTC", "updated_date": "2025-11-05 14:27:58 UTC" }, { "arxiv_id": "2511.03481v2", "title": "Development of the Bioinspired Tendon-Driven DexHand 021 with Proprioceptive Compliance Control", "authors": [ "Jianbo Yuan", "Haohua Zhu", "Jing Dai", "Sheng Yi" ], "abstract": "The human hand plays a vital role in daily life and industrial applications, yet replicating its multifunctional capabilities-including motion, sensing, and coordinated manipulation with robotic systems remains a formidable challenge. Developing a dexterous robotic hand requires balancing human-like agility with engineering constraints such as complexity, size-to-weight ratio, durability, and force-sensing performance. This letter presents Dex-Hand 021, a high-performance, cable-driven five-finger robotic hand with 12 active and 7 passive degrees of freedom (DoFs), achieving 19 DoFs dexterity in a lightweight 1 kg design. We propose a proprioceptive force-sensing-based admittance control method to enhance manipulation. Experimental results demonstrate its superior performance: a single-finger load capacity exceeding 10 N, fingertip repeatability under 0.001 m, and force estimation errors below 0.2 N. 
Compared to PID control, joint torques in multi-object grasping are reduced by 31.19%, significantly improves force-sensing capability while preventing overload during collisions. The hand excels in both power and precision grasps, successfully executing 33 GRASP taxonomy motions and complex manipulation tasks. This work advances the design of lightweight, industrial-grade dexterous hands and enhances proprioceptive control, contributing to robotic manipulation and intelligent manufacturing.", "categories": [ "cs.RO", "cs.AI" ], "primary_category": "cs.RO", "comment": "8 pages 18 fogures, IEEE RAL accept", "pdf_url": "https://arxiv.org/pdf/2511.03481v2", "published_date": "2025-11-05 14:07:03 UTC", "updated_date": "2025-11-09 02:49:13 UTC" }, { "arxiv_id": "2511.05595v1", "title": "FlowNet: Modeling Dynamic Spatio-Temporal Systems via Flow Propagation", "authors": [ "Yutong Feng", "Xu Liu", "Yutong Xia", "Yuxuan Liang" ], "abstract": "Accurately modeling complex dynamic spatio-temporal systems requires capturing flow-mediated interdependencies and context-sensitive interaction dynamics. Existing methods, predominantly graph-based or attention-driven, rely on similarity-driven connectivity assumptions, neglecting asymmetric flow exchanges that govern system evolution. We propose Spatio-Temporal Flow, a physics-inspired paradigm that explicitly models dynamic node couplings through quantifiable flow transfers governed by conservation principles. Building on this, we design FlowNet, a novel architecture leveraging flow tokens as information carriers to simulate source-to-destination transfers via Flow Allocation Modules, ensuring state redistribution aligns with conservation laws. FlowNet dynamically adjusts the interaction radius through an Adaptive Spatial Masking module, suppressing irrelevant noise while enabling context-aware propagation. A cascaded architecture enhances scalability and nonlinear representation capacity. 
Experiments demonstrate that FlowNet significantly outperforms existing state-of-the-art approaches on seven metrics in the modeling of three real-world systems, validating its efficiency and physical interpretability. We establish a principled methodology for modeling complex systems through spatio-temporal flow interactions.", "categories": [ "cs.LG", "cs.AI" ], "primary_category": "cs.LG", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.05595v1", "published_date": "2025-11-05 14:06:19 UTC", "updated_date": "2025-11-05 14:06:19 UTC" }, { "arxiv_id": "2511.03471v1", "title": "Towards Scalable Web Accessibility Audit with MLLMs as Copilots", "authors": [ "Ming Gu", "Ziwei Wang", "Sicen Lai", "Zirui Gao", "Sheng Zhou", "Jiajun Bu" ], "abstract": "Ensuring web accessibility is crucial for advancing social welfare, justice, and equality in digital spaces, yet the vast majority of website user interfaces remain non-compliant, due in part to the resource-intensive and unscalable nature of current auditing practices. While WCAG-EM offers a structured methodology for site-wise conformance evaluation, it involves great human efforts and lacks practical support for execution at scale. In this work, we present an auditing framework, AAA, which operationalizes WCAG-EM through a human-AI partnership model. AAA is anchored by two key innovations: GRASP, a graph-based multimodal sampling method that ensures representative page coverage via learned embeddings of visual, textual, and relational cues; and MaC, a multimodal large language model-based copilot that supports auditors through cross-modal reasoning and intelligent assistance in high-effort tasks. Together, these components enable scalable, end-to-end web accessibility auditing, empowering human auditors with AI-enhanced assistance for real-world impact. We further contribute four novel datasets designed for benchmarking core stages of the audit pipeline. 
Extensive experiments demonstrate the effectiveness of our methods, providing insights that small-scale language models can serve as capable experts when fine-tuned.", "categories": [ "cs.AI", "cs.HC" ], "primary_category": "cs.AI", "comment": "15 pages. Accepted by AAAI 2026 AISI", "pdf_url": "https://arxiv.org/pdf/2511.03471v1", "published_date": "2025-11-05 13:50:19 UTC", "updated_date": "2025-11-05 13:50:19 UTC" }, { "arxiv_id": "2511.04711v1", "title": "SWAP: Towards Copyright Auditing of Soft Prompts via Sequential Watermarking", "authors": [ "Wenyuan Yang", "Yichen Sun", "Changzheng Chen", "Zhixuan Chu", "Jiaheng Zhang", "Yiming Li", "Dacheng Tao" ], "abstract": "Large-scale vision-language models, especially CLIP, have demonstrated remarkable performance across diverse downstream tasks. Soft prompts, as carefully crafted modules that efficiently adapt vision-language models to specific tasks, necessitate effective copyright protection. In this paper, we investigate model copyright protection by auditing whether suspicious third-party models incorporate protected soft prompts. While this can be viewed as a special case of model ownership auditing, our analysis shows that existing techniques are ineffective due to prompt learning's unique characteristics. Non-intrusive auditing is inherently prone to false positives when independent models share similar data distributions with victim models. Intrusive approaches also fail: backdoor methods designed for CLIP cannot embed functional triggers, while extending traditional DNN backdoor techniques to prompt learning suffers from harmfulness and ambiguity challenges. We find that these failures in intrusive auditing stem from the same fundamental reason: watermarking operates within the same decision space as the primary task yet pursues opposing objectives. Motivated by these findings, we propose sequential watermarking for soft prompts (SWAP), which implants watermarks into a different and more complex space. 
SWAP encodes watermarks through a specific order of defender-specified out-of-distribution classes, inspired by the zero-shot prediction capability of CLIP. This watermark, which is embedded in a more complex space, keeps the original prediction label unchanged, making it less opposed to the primary task. We further design a hypothesis-test-guided verification protocol for SWAP and provide theoretical analyses of success conditions. Extensive experiments on 11 datasets demonstrate SWAP's effectiveness, harmlessness, and robustness against potential adaptive attacks.", "categories": [ "cs.CR", "cs.AI", "cs.LG" ], "primary_category": "cs.CR", "comment": "The first two authors contributed equally to this work. 27 pages", "pdf_url": "https://arxiv.org/pdf/2511.04711v1", "published_date": "2025-11-05 13:48:48 UTC", "updated_date": "2025-11-05 13:48:48 UTC" }, { "arxiv_id": "2511.03441v2", "title": "CareMedEval dataset: Evaluating Critical Appraisal and Reasoning in the Biomedical Field", "authors": [ "Doria Bonzi", "Alexandre Guiggi", "Frédéric Béchet", "Carlos Ramisch", "Benoit Favre" ], "abstract": "Critical appraisal of scientific literature is an essential skill in the biomedical field. While large language models (LLMs) can offer promising support in this task, their reliability remains limited, particularly for critical reasoning in specialized domains. We introduce CareMedEval, an original dataset designed to evaluate LLMs on biomedical critical appraisal and reasoning tasks. Derived from authentic exams taken by French medical students, the dataset contains 534 questions based on 37 scientific articles. Unlike existing benchmarks, CareMedEval explicitly evaluates critical reading and reasoning grounded in scientific papers. 
Benchmarking state-of-the-art generalist and biomedical-specialized LLMs under various context conditions reveals the difficulty of the task: open and commercial models fail to exceed an Exact Match Rate of 0.5 even though generating intermediate reasoning tokens considerably improves the results. Yet, models remain challenged especially on questions about study limitations and statistical analysis. CareMedEval provides a challenging benchmark for grounded reasoning, exposing current LLM limitations and paving the way for future development of automated support for critical appraisal.", "categories": [ "cs.CL", "cs.AI" ], "primary_category": "cs.CL", "comment": "Preprint submitted to LREC 2026 (under review) To access the dataset, see https://github.com/bonzid/CareMedEval", "pdf_url": "https://arxiv.org/pdf/2511.03441v2", "published_date": "2025-11-05 13:02:06 UTC", "updated_date": "2025-11-06 11:06:10 UTC" }, { "arxiv_id": "2511.03434v1", "title": "Inter-Agent Trust Models: A Comparative Study of Brief, Claim, Proof, Stake, Reputation and Constraint in Agentic Web Protocol Design-A2A, AP2, ERC-8004, and Beyond", "authors": [ "Botao 'Amber' Hu", "Helena Rong" ], "abstract": "As the \"agentic web\" takes shape-billions of AI agents (often LLM-powered) autonomously transacting and collaborating-trust shifts from human oversight to protocol design. In 2025, several inter-agent protocols crystallized this shift, including Google's Agent-to-Agent (A2A), Agent Payments Protocol (AP2), and Ethereum's ERC-8004 \"Trustless Agents,\" yet their underlying trust assumptions remain under-examined. This paper presents a comparative study of trust models in inter-agent protocol design: Brief (self- or third-party verifiable claims), Claim (self-proclaimed capabilities and identity, e.g. 
AgentCard), Proof (cryptographic verification, including zero-knowledge proofs and trusted execution environment attestations), Stake (bonded collateral with slashing and insurance), Reputation (crowd feedback and graph-based trust signals), and Constraint (sandboxing and capability bounding). For each, we analyze assumptions, attack surfaces, and design trade-offs, with particular emphasis on LLM-specific fragilities-prompt injection, sycophancy/nudge-susceptibility, hallucination, deception, and misalignment-that render purely reputational or claim-only approaches brittle. Our findings indicate no single mechanism suffices. We argue for trustless-by-default architectures anchored in Proof and Stake to gate high-impact actions, augmented by Brief for identity and discovery and Reputation overlays for flexibility and social signals. We comparatively evaluate A2A, AP2, ERC-8004 and related historical variations in academic research under metrics spanning security, privacy, latency/cost, and social robustness (Sybil/collusion/whitewashing resistance). We conclude with hybrid trust model recommendations that mitigate reputation gaming and misinformed LLM behavior, and we distill actionable design guidelines for safer, interoperable, and scalable agent economies.", "categories": [ "cs.HC", "cs.AI", "cs.MA", "cs.NI", "cs.SI" ], "primary_category": "cs.HC", "comment": "Submitted to AAAI 2026 Workshop on Trust and Control in Agentic AI (TrustAgent)", "pdf_url": "https://arxiv.org/pdf/2511.03434v1", "published_date": "2025-11-05 12:50:06 UTC", "updated_date": "2025-11-05 12:50:06 UTC" }, { "arxiv_id": "2511.03421v1", "title": "Light over Heavy: Automated Performance Requirements Quantification with Linguistic Inducement", "authors": [ "Shihai Wang", "Tao Chen" ], "abstract": "Elicited performance requirements need to be quantified for compliance in different engineering tasks, e.g., configuration tuning and performance testing. 
Much existing work has relied on manual quantification, which is expensive and error-prone due to the imprecision. In this paper, we present LQPR, a highly efficient automatic approach for performance requirements quantification. LQPR relies on a new theoretical framework that converts quantification as a classification problem. Despite the prevalent applications of Large Language Models (LLMs) for requirement analytics, LQPR takes a different perspective to address the classification: we observed that performance requirements can exhibit strong patterns and are often short/concise, therefore we design a lightweight linguistically induced matching mechanism. We compare LQPR against nine state-of-the-art learning-based approaches over diverse datasets, demonstrating that it is ranked as the sole best for 75% or more cases with two orders less cost. Our work proves that, at least for performance requirement quantification, specialized methods can be more suitable than the general LLM-driven approaches.", "categories": [ "cs.SE", "cs.AI" ], "primary_category": "cs.SE", "comment": "accepted by ICSE 2026", "pdf_url": "https://arxiv.org/pdf/2511.03421v1", "published_date": "2025-11-05 12:38:11 UTC", "updated_date": "2025-11-05 12:38:11 UTC" }, { "arxiv_id": "2511.03405v1", "title": "Adaptable Hindsight Experience Replay for Search-Based Learning", "authors": [ "Alexandros Vazaios", "Jannis Brugger", "Cedric Derstroff", "Kristian Kersting", "Mira Mezini" ], "abstract": "AlphaZero-like Monte Carlo Tree Search systems, originally introduced for two-player games, dynamically balance exploration and exploitation using neural network guidance. This combination makes them also suitable for classical search problems. However, the original method of training the network with simulation results is limited in sparse reward settings, especially in the early stages, where the network cannot yet give guidance. 
Hindsight Experience Replay (HER) addresses this issue by relabeling unsuccessful trajectories from the search tree as supervised learning signals. We introduce Adaptable HER (\\ours{}), a flexible framework that integrates HER with AlphaZero, allowing easy adjustments to HER properties such as relabeled goals, policy targets, and trajectory selection. Our experiments, including equation discovery, show that the possibility of modifying HER is beneficial and surpasses the performance of pure supervised or reinforcement learning.", "categories": [ "cs.LG", "cs.AI" ], "primary_category": "cs.LG", "comment": "8 pages, 2 figures, Presented at the 9th International Workshop on Interactive Adaptive Learning", "pdf_url": "https://arxiv.org/pdf/2511.03405v1", "published_date": "2025-11-05 12:13:23 UTC", "updated_date": "2025-11-05 12:13:23 UTC" }, { "arxiv_id": "2511.11616v1", "title": "Hierarchical Federated Graph Attention Networks for Scalable and Resilient UAV Collision Avoidance", "authors": [ "Rathin Chandra Shit", "Sharmila Subudhi" ], "abstract": "The real-time performance, adversarial resiliency, and privacy preservation are the most important metrics that need to be balanced to practice collision avoidance in large-scale multi-UAV (Unmanned Aerial Vehicle) systems. Current frameworks tend to prescribe monolithic solutions that are not only prohibitively computationally complex with a scaling cost of $O(n^2)$ but simply do not offer Byzantine fault tolerance. The proposed hierarchical framework presented in this paper tries to eliminate such trade-offs by stratifying a three-layered architecture. 
We spread the intelligence into three layers: an immediate collision avoiding local layer running on dense graph attention with latency of $<10 ms$, a regional layer using sparse attention with $O(nk)$ computational complexity and asynchronous federated learning with coordinate-wise trimmed mean aggregation, and lastly, a global layer using a lightweight Hashgraph-inspired protocol. We have proposed an adaptive differential privacy mechanism, wherein the noise level $(ε\\in [0.1, 1.0])$ is dynamically reduced based on an evaluation of the measured real-time threat that in turn maximized the privacy-utility tradeoff. Through the use of Distributed Hash Table (DHT)-based lightweight audit logging instead of heavyweight blockchain consensus, the median cost of getting a $95^{th}$ percentile decision within 50ms is observed across all tested swarm sizes. This architecture provides a scalable scenario of 500 UAVs with a collision rate of $< 2.0\\%$ and the Byzantine fault tolerance of $f < n/3$.", "categories": [ "cs.RO", "cs.AI", "cs.LG", "cs.MA" ], "primary_category": "cs.RO", "comment": "Accepted and scheduled for conference presentation", "pdf_url": "https://arxiv.org/pdf/2511.11616v1", "published_date": "2025-11-05 12:01:00 UTC", "updated_date": "2025-11-05 12:01:00 UTC" }, { "arxiv_id": "2511.05589v1", "title": "CoPRIS: Efficient and Stable Reinforcement Learning via Concurrency-Controlled Partial Rollout with Importance Sampling", "authors": [ "Zekai Qu", "Yinxu Pan", "Ao Sun", "Chaojun Xiao", "Xu Han" ], "abstract": "Reinforcement learning (RL) post-training has become a trending paradigm for enhancing the capabilities of large language models (LLMs). Most existing RL systems for LLMs operate in a fully synchronous manner, where training must wait for the rollout of an entire batch to complete. This design leads to severe inefficiencies, as extremely long trajectories can stall the entire rollout process and leave many GPUs idle. 
To address this issue, we propose Concurrency-Controlled Partial Rollout with Importance Sampling (CoPRIS), which mitigates long-tail inefficiencies by maintaining a fixed number of concurrent rollouts, early-terminating once sufficient samples are collected, and reusing unfinished trajectories in subsequent rollouts. To mitigate the impact of off-policy trajectories, we introduce Cross-stage Importance Sampling Correction, which concatenates buffered log probabilities from the previous policy with those recomputed under the current policy for importance sampling correction. Experiments on challenging mathematical reasoning benchmarks show that CoPRIS achieves up to 1.94x faster training while maintaining comparable or superior performance to synchronous RL systems. The code of CoPRIS is available at https://github.com/777pomingzi/CoPRIS.", "categories": [ "cs.LG", "cs.AI" ], "primary_category": "cs.LG", "comment": "13 pages, 4 figures", "pdf_url": "https://arxiv.org/pdf/2511.05589v1", "published_date": "2025-11-05 11:39:32 UTC", "updated_date": "2025-11-05 11:39:32 UTC" }, { "arxiv_id": "2511.03376v1", "title": "Computational Imaging Meets LLMs: Zero-Shot IDH Mutation Prediction in Brain Gliomas", "authors": [ "Syed Muqeem Mahmood", "Hassan Mohy-ud-Din" ], "abstract": "We present a framework that combines Large Language Models with computational image analytics for non-invasive, zero-shot prediction of IDH mutation status in brain gliomas. For each subject, coregistered multi-parametric MRI scans and multi-class tumor segmentation maps were processed to extract interpretable semantic (visual) attributes and quantitative features, serialized in a standardized JSON file, and used to query GPT 4o and GPT 5 without fine-tuning. We evaluated this framework on six publicly available datasets (N = 1427) and results showcased high accuracy and balanced classification performance across heterogeneous cohorts, even in the absence of manual annotations. 
GPT 5 outperformed GPT 4o in context-driven phenotype interpretation. Volumetric features emerged as the most important predictors, supplemented by subtype-specific imaging markers and clinical information. Our results demonstrate the potential of integrating LLM-based reasoning with computational image analytics for precise, non-invasive tumor genotyping, advancing diagnostic strategies in neuro-oncology. The code is available at https://github.com/ATPLab-LUMS/CIM-LLM.", "categories": [ "eess.IV", "cs.AI", "q-bio.QM" ], "primary_category": "eess.IV", "comment": "5 pages, 1 figure, 3 tables", "pdf_url": "https://arxiv.org/pdf/2511.03376v1", "published_date": "2025-11-05 11:31:08 UTC", "updated_date": "2025-11-05 11:31:08 UTC" }, { "arxiv_id": "2511.03367v1", "title": "Decoupling Augmentation Bias in Prompt Learning for Vision-Language Models", "authors": [ "Gahyeon Kim", "Sohee Kim", "Seokju Lee" ], "abstract": "Recent advances in large-scale vision and language models have led to significant progress in zero-shot learning tasks. Methods such as CoOp and CoCoOp have shown that replacing handcrafted prompts with learnable vectors, known as prompt learning, can result in improved performance. However, these models often struggle to generalize to entirely unseen categories. While traditional zero-shot learning techniques benefit from various data augmentation strategies, prompt learning has primarily focused on text-based modifications, leaving the potential of image-based augmentation largely unexplored. In this work, we explore how image-level augmentations, particularly those that introduce attribute-specific variations, can support and enhance prompt learning. Our analysis examines the interaction between these augmentations and soft prompt frameworks, revealing their potential to improve generalization. 
We also identify a limitation in existing methods, such as CoCoOp, which do not provide explicit guidance for learning prompts that focus on semantically meaningful visual features. To address this, we propose Adding Attributes to Prompt Learning, AAPL, a novel method that introduces adversarial token embeddings to decouple superficial visual variations introduced by augmentation from class-relevant semantic representations. This decoupling enables the learned prompts to concentrate on visually discriminative features that align with the target categories. We conduct comprehensive experiments on eleven benchmark datasets, and AAPL consistently outperforms existing methods across few-shot, zero-shot, cross-dataset, and domain generalization settings. Our source code is publicly available at: https://github.com/Gahyeonkim09/AAPL", "categories": [ "cs.CV", "cs.AI", "cs.LG" ], "primary_category": "cs.CV", "comment": "Accepted in Pattern Recognition", "pdf_url": "https://arxiv.org/pdf/2511.03367v1", "published_date": "2025-11-05 11:15:16 UTC", "updated_date": "2025-11-05 11:15:16 UTC" }, { "arxiv_id": "2511.03361v1", "title": "Open Source State-Of-the-Art Solution for Romanian Speech Recognition", "authors": [ "Gabriel Pirlogeanu", "Alexandru-Lucian Georgescu", "Horia Cucu" ], "abstract": "In this work, we present a new state-of-the-art Romanian Automatic Speech Recognition (ASR) system based on NVIDIA's FastConformer architecture--explored here for the first time in the context of Romanian. We train our model on a large corpus of, mostly, weakly supervised transcriptions, totaling over 2,600 hours of speech. Leveraging a hybrid decoder with both Connectionist Temporal Classification (CTC) and Token-Duration Transducer (TDT) branches, we evaluate a range of decoding strategies including greedy, ALSD, and CTC beam search with a 6-gram token-level language model. 
Our system achieves state-of-the-art performance across all Romanian evaluation benchmarks, including read, spontaneous, and domain-specific speech, with up to 27% relative WER reduction compared to previous best-performing systems. In addition to improved transcription accuracy, our approach demonstrates practical decoding efficiency, making it suitable for both research and deployment in low-latency ASR applications.", "categories": [ "eess.AS", "cs.AI" ], "primary_category": "eess.AS", "comment": "13th Conference on Speech Technology and Human-Computer Dialogue (SpeD 2025), Cluj-Napoca, Romania", "pdf_url": "https://arxiv.org/pdf/2511.03361v1", "published_date": "2025-11-05 11:02:16 UTC", "updated_date": "2025-11-05 11:02:16 UTC" }, { "arxiv_id": "2511.03354v1", "title": "Generative Artificial Intelligence in Bioinformatics: A Systematic Review of Models, Applications, and Methodological Advances", "authors": [ "Riasad Alvi", "Sayeem Been Zaman", "Wasimul Karim", "Arefin Ittesafun Abian", "Mohaimenul Azam Khan Raiaan", "Saddam Mukta", "Md Rafi Ur Rashid", "Md Rafiqul Islam", "Yakub Sebastian", "Sami Azam" ], "abstract": "Generative artificial intelligence (GenAI) has become a transformative approach in bioinformatics that often enables advancements in genomics, proteomics, transcriptomics, structural biology, and drug discovery. To systematically identify and evaluate these growing developments, this review proposed six research questions (RQs), according to the preferred reporting items for systematic reviews and meta-analysis methods. The objective is to evaluate impactful GenAI strategies in methodological advancement, predictive performance, and specialization, and to identify promising approaches for advanced modeling, data-intensive discovery, and integrative biological analysis. 
RQ1 highlights diverse applications across multiple bioinformatics subfields (sequence analysis, molecular design, and integrative data modeling), which demonstrate superior performance over traditional methods through pattern recognition and output generation. RQ2 reveals that adapted specialized model architectures outperformed general-purpose models, an advantage attributed to targeted pretraining and context-aware strategies. RQ3 identifies significant benefits in the bioinformatics domains, focusing on molecular analysis and data integration, which improves accuracy and reduces errors in complex analysis. RQ4 indicates improvements in structural modeling, functional prediction, and synthetic data generation, validated by established benchmarks. RQ5 suggests the main constraints, such as the lack of scalability and biases in data that impact generalizability, and proposes future directions focused on robust evaluation and biologically grounded modeling. RQ6 examines that molecular datasets (such as UniProtKB and ProteinNet12), cellular datasets (such as CELLxGENE and GTEx) and textual resources (such as PubMedQA and OMIM) broadly support the training and generalization of GenAI models.", "categories": [ "cs.CL", "cs.AI" ], "primary_category": "cs.CL", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03354v1", "published_date": "2025-11-05 10:48:36 UTC", "updated_date": "2025-11-05 10:48:36 UTC" }, { "arxiv_id": "2511.03761v1", "title": "OptiMA: A Transaction-Based Framework with Throughput Optimization for Very Complex Multi-Agent Systems", "authors": [ "Umut Çalıkyılmaz", "Nitin Nayak", "Jinghua Groppe", "Sven Groppe" ], "abstract": "In recent years, the research of multi-agent systems has taken a direction to explore larger and more complex models to fulfill sophisticated tasks. We point out two possible pitfalls that might be caused by increasing complexity; susceptibilities to faults, and performance bottlenecks. 
To prevent the former threat, we propose a transaction-based framework to design very complex multi-agent systems (VCMAS). To address the second threat, we offer to integrate transaction scheduling into the proposed framework. We implemented both of these ideas to develop the OptiMA framework and show that it is able to facilitate the execution of VCMAS with more than a hundred agents. We also demonstrate the effect of transaction scheduling on such a system by showing improvements up to more than 16\\%. Furthermore, we also performed a theoretical analysis on the transaction scheduling problem and provided practical tools that can be used for future research on it.", "categories": [ "cs.MA", "cs.AI", "cs.DB" ], "primary_category": "cs.MA", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03761v1", "published_date": "2025-11-05 10:23:24 UTC", "updated_date": "2025-11-05 10:23:24 UTC" }, { "arxiv_id": "2511.08609v1", "title": "Case Study: Transformer-Based Solution for the Automatic Digitization of Gas Plants", "authors": [ "I. Bailo", "F. Buonora", "G. Ciarfaglia", "L. T. Consoli", "A. Evangelista", "M. Gabusi", "M. Ghiani", "C. Petracca Ciavarella", "F. Picariello", "F. Sarcina", "F. Tuosto", "V. Zullo", "L. Airoldi", "G. Bruno", "D. D. Gobbo", "S. Pezzenati", "G. A. Tona" ], "abstract": "The energy transition is a key theme of the last decades to determine a future of eco-sustainability, and an area of such importance cannot disregard digitization, innovation and the new technological tools available. This is the context in which the Generative Artificial Intelligence models described in this paper are positioned, developed by Engineering Ingegneria Informatica SpA in order to automate the plant structures acquisition of SNAM energy infrastructure, a leading gas transportation company in Italy and Europe. The digitization of a gas plant consists in registering all its relevant information through the interpretation of the related documentation. 
The aim of this work is therefore to design an effective solution based on Artificial Intelligence techniques to automate the extraction of the information necessary for the digitization of a plant, in order to streamline the daily work of MGM users. The solution received the P&ID of the plant as input, each one in pdf format, and uses OCR, Vision LLM, Object Detection, Relational Reasoning and optimization algorithms to return an output consisting of two sets of information: a structured overview of the relevant design data and the hierarchical framework of the plant. To achieve convincing results, we extend a state-of-the-art model for Scene Graph Generation introducing a brand new Transformer architecture with the aim of deepening the analysis of the complex relations between the plant's components. The synergistic use of the listed AI-based technologies allowed to overcome many obstacles arising from the high variety of data, due to the lack of standardization. An accuracy of 91\\% has been achieved in the extraction of textual information relating to design data. Regarding the plants topology, 93\\% of components are correctly identified and the hierarchical structure is extracted with an accuracy around 80\\%.", "categories": [ "cs.CV", "cs.AI", "cs.LG" ], "primary_category": "cs.CV", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.08609v1", "published_date": "2025-11-05 10:19:12 UTC", "updated_date": "2025-11-05 10:19:12 UTC" }, { "arxiv_id": "2511.03330v1", "title": "Discourse-Aware Scientific Paper Recommendation via QA-Style Summarization and Multi-Level Contrastive Learning", "authors": [ "Shenghua Wang", "Zhen Yin" ], "abstract": "The rapid growth of open-access (OA) publications has intensified the challenge of identifying relevant scientific papers. Due to privacy constraints and limited access to user interaction data, recent efforts have shifted toward content-based recommendation, which relies solely on textual information. 
However, existing models typically treat papers as unstructured text, neglecting their discourse organization and thereby limiting semantic completeness and interpretability. To address these limitations, we propose OMRC-MR, a hierarchical framework that integrates QA-style OMRC (Objective, Method, Result, Conclusion) summarization, multi-level contrastive learning, and structure-aware re-ranking for scholarly recommendation. The QA-style summarization module converts raw papers into structured and discourse-consistent representations, while multi-level contrastive objectives align semantic representations across metadata, section, and document levels. The final re-ranking stage further refines retrieval precision through contextual similarity calibration. Experiments on DBLP, S2ORC, and the newly constructed Sci-OMRC dataset demonstrate that OMRC-MR consistently surpasses state-of-the-art baselines, achieving up to 7.2% and 3.8% improvements in Precision@10 and Recall@10, respectively. Additional evaluations confirm that QA-style summarization produces more coherent and factually complete representations. 
Overall, OMRC-MR provides a unified and interpretable content-based paradigm for scientific paper recommendation, advancing trustworthy and privacy-aware scholarly information retrieval.", "categories": [ "cs.IR", "cs.AI" ], "primary_category": "cs.IR", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03330v1", "published_date": "2025-11-05 09:55:12 UTC", "updated_date": "2025-11-05 09:55:12 UTC" }, { "arxiv_id": "2511.03328v1", "title": "Benchmarking the Thinking Mode of Multimodal Large Language Models in Clinical Tasks", "authors": [ "Jindong Hong", "Tianjie Chen", "Lingjie Luo", "Chuanyang Zheng", "Ting Xu", "Haibao Yu", "Jianing Qiu", "Qianzhong Chen", "Suning Huang", "Yan Xu", "Yong Gui", "Yijun He", "Jiankai Sun" ], "abstract": "A recent advancement in Multimodal Large Language Models (MLLMs) research is the emergence of \"reasoning MLLMs\" that offer explicit control over their internal thinking processes (normally referred as the \"thinking mode\") alongside the standard \"non-thinking mode\". This capability allows these models to engage in a step-by-step process of internal deliberation before generating a final response. With the rapid transition to and adoption of these \"dual-state\" MLLMs, this work rigorously evaluated how the enhanced reasoning processes of these MLLMs impact model performance and reliability in clinical tasks. This paper evaluates the active \"thinking mode\" capabilities of two leading MLLMs, Seed1.5-VL and Gemini-2.5-Flash, for medical applications. We assessed their performance on four visual medical tasks using VQA-RAD and ROCOv2 datasets. Our findings reveal that the improvement from activating the thinking mode remains marginal compared to the standard non-thinking mode for the majority of the tasks. 
Their performance on complex medical tasks such as open-ended VQA and medical image interpretation remains suboptimal, highlighting the need for domain-specific medical data and more advanced methods for medical knowledge integration.", "categories": [ "cs.CL", "cs.AI", "cs.CV", "cs.LG" ], "primary_category": "cs.CL", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03328v1", "published_date": "2025-11-05 09:47:15 UTC", "updated_date": "2025-11-05 09:47:15 UTC" }, { "arxiv_id": "2511.03304v1", "title": "Extending Fair Null-Space Projections for Continuous Attributes to Kernel Methods", "authors": [ "Felix Störck", "Fabian Hinder", "Barbara Hammer" ], "abstract": "With the on-going integration of machine learning systems into the everyday social life of millions the notion of fairness becomes an ever increasing priority in their development. Fairness notions commonly rely on protected attributes to assess potential biases. Here, the majority of literature focuses on discrete setups regarding both target and protected attributes. The literature on continuous attributes especially in conjunction with regression -- we refer to this as \\emph{continuous fairness} -- is scarce. A common strategy is iterative null-space projection which as of now has only been explored for linear models or embeddings such as obtained by a non-linear encoder. We improve on this by generalizing to kernel methods, significantly extending the scope. This yields a model and fairness-score agnostic method for kernel embeddings applicable to continuous protected attributes. 
We demonstrate that our novel approach in conjunction with Support Vector Regression (SVR) provides competitive or improved performance across multiple datasets in comparisons to other contemporary methods.", "categories": [ "cs.LG", "cs.AI" ], "primary_category": "cs.LG", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03304v1", "published_date": "2025-11-05 09:17:08 UTC", "updated_date": "2025-11-05 09:17:08 UTC" }, { "arxiv_id": "2511.03295v2", "title": "How to Evaluate Speech Translation with Source-Aware Neural MT Metrics", "authors": [ "Mauro Cettolo", "Marco Gaido", "Matteo Negri", "Sara Papi", "Luisa Bentivogli" ], "abstract": "Automatic evaluation of speech-to-text translation (ST) systems is typically performed by comparing translation hypotheses with one or more reference translations. While effective to some extent, this approach inherits the limitation of reference-based evaluation that ignores valuable information from the source input. In machine translation (MT), recent progress has shown that neural metrics incorporating the source text achieve stronger correlation with human judgments. Extending this idea to ST, however, is not trivial because the source is audio rather than text, and reliable transcripts or alignments between source and references are often unavailable. In this work, we conduct the first systematic study of source-aware metrics for ST, with a particular focus on real-world operating conditions where source transcripts are not available. We explore two complementary strategies for generating textual proxies of the input audio, automatic speech recognition (ASR) transcripts, and back-translations of the reference translation, and introduce a novel two-step cross-lingual re-segmentation algorithm to address the alignment mismatch between synthetic sources and reference translations. 
Our experiments, carried out on two ST benchmarks covering 79 language pairs and six ST systems with diverse architectures and performance levels, show that ASR transcripts constitute a more reliable synthetic source than back-translations when word error rate is below 20%, while back-translations always represent a computationally cheaper but still effective alternative. Furthermore, our cross-lingual re-segmentation algorithm enables robust use of source-aware MT metrics in ST evaluation, paving the way toward more accurate and principled evaluation methodologies for speech translation.", "categories": [ "cs.CL", "cs.AI" ], "primary_category": "cs.CL", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03295v2", "published_date": "2025-11-05 08:49:22 UTC", "updated_date": "2025-11-11 14:48:53 UTC" }, { "arxiv_id": "2511.03758v3", "title": "Leveraging LLM-based agents for social science research: insights from citation network simulations", "authors": [ "Jiarui Ji", "Runlin Lei", "Xuchen Pan", "Zhewei Wei", "Hao Sun", "Yankai Lin", "Xu Chen", "Yongzheng Yang", "Yaliang Li", "Bolin Ding", "Ji-Rong Wen" ], "abstract": "The emergence of Large Language Models (LLMs) demonstrates their potential to encapsulate the logic and patterns inherent in human behavior simulation by leveraging extensive web data pre-training. However, the boundaries of LLM capabilities in social simulation remain unclear. To further explore the social attributes of LLMs, we introduce the CiteAgent framework, designed to generate citation networks based on human-behavior simulation with LLM-based agents. CiteAgent successfully captures predominant phenomena in real-world citation networks, including power-law distribution, citational distortion, and shrinking diameter. Building on this realistic simulation, we establish two LLM-based research paradigms in social science: LLM-SE (LLM-based Survey Experiment) and LLM-LE (LLM-based Laboratory Experiment). 
These paradigms facilitate rigorous analyses of citation network phenomena, allowing us to validate and challenge existing theories. Additionally, we extend the research scope of traditional science of science studies through idealized social experiments, with the simulation experiment results providing valuable insights for real-world academic environments. Our work demonstrates the potential of LLMs for advancing science of science research in social science.", "categories": [ "physics.soc-ph", "cs.AI", "cs.CY", "cs.MA", "cs.SI" ], "primary_category": "physics.soc-ph", "comment": "accepted by HSSCOMMS'25", "pdf_url": "https://arxiv.org/pdf/2511.03758v3", "published_date": "2025-11-05 08:47:04 UTC", "updated_date": "2025-11-18 08:10:15 UTC" }, { "arxiv_id": "2511.03282v1", "title": "When Generative Artificial Intelligence meets Extended Reality: A Systematic Review", "authors": [ "Xinyu Ning", "Yan Zhuo", "Xian Wang", "Chan-In Devin Sio", "Lik-Hang Lee" ], "abstract": "With the continuous advancement of technology, the application of generative artificial intelligence (AI) in various fields is gradually demonstrating great potential, particularly when combined with Extended Reality (XR), creating unprecedented possibilities. This survey article systematically reviews the applications of generative AI in XR, covering as much relevant literature as possible from 2023 to 2025. The application areas of generative AI in XR and its key technology implementations are summarised through PRISMA screening and analysis of the final 26 articles. The survey highlights existing articles from the last three years related to how XR utilises generative AI, providing insights into current trends and research gaps. 
We also explore potential opportunities for future research to further empower XR through generative AI, providing guidance and information for future generative XR research.", "categories": [ "cs.HC", "cs.AI", "cs.LG" ], "primary_category": "cs.HC", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03282v1", "published_date": "2025-11-05 08:24:48 UTC", "updated_date": "2025-11-05 08:24:48 UTC" }, { "arxiv_id": "2511.03261v1", "title": "Comparing the Performance of LLMs in RAG-based Question-Answering: A Case Study in Computer Science Literature", "authors": [ "Ranul Dayarathne", "Uvini Ranaweera", "Upeksha Ganegoda" ], "abstract": "Retrieval Augmented Generation (RAG) is emerging as a powerful technique to enhance the capabilities of Generative AI models by reducing hallucination. Thus, the increasing prominence of RAG alongside Large Language Models (LLMs) has sparked interest in comparing the performance of different LLMs in question-answering (QA) in diverse domains. This study compares the performance of four open-source LLMs, Mistral-7b-instruct, LLaMa2-7b-chat, Falcon-7b-instruct and Orca-mini-v3-7b, and OpenAI's trending GPT-3.5 over QA tasks within the computer science literature leveraging RAG support. Evaluation metrics employed in the study include accuracy and precision for binary questions and ranking by a human expert, ranking by Google's AI model Gemini, alongside cosine similarity for long-answer questions. GPT-3.5, when paired with RAG, effectively answers binary and long-answer questions, reaffirming its status as an advanced LLM. Regarding open-source LLMs, Mistral AI's Mistral-7b-instruct paired with RAG surpasses the rest in answering both binary and long-answer questions. However, among the open-source LLMs, Orca-mini-v3-7b reports the shortest average latency in generating responses, whereas LLaMa2-7b-chat by Meta reports the highest average latency. 
This research underscores the fact that open-source LLMs, too, can go hand in hand with proprietary models like GPT-3.5 with better infrastructure.", "categories": [ "cs.CL", "cs.AI" ], "primary_category": "cs.CL", "comment": "18 pages, 4 figures, 5 tables, presented at the 5th International Conference on Artificial Intelligence in Education Technology", "pdf_url": "https://arxiv.org/pdf/2511.03261v1", "published_date": "2025-11-05 07:45:53 UTC", "updated_date": "2025-11-05 07:45:53 UTC" }, { "arxiv_id": "2511.03255v1", "title": "Generative deep learning for foundational video translation in ultrasound", "authors": [ "Nikolina Tomic Roshni Bhatnagar", "Sarthak Jain", "Connor Lau", "Tien-Yu Liu", "Laura Gambini", "Rima Arnaout" ], "abstract": "Deep learning (DL) has the potential to revolutionize image acquisition and interpretation across medicine, however, attention to data imbalance and missingness is required. Ultrasound data presents a particular challenge because in addition to different views and structures, it includes several sub-modalities-such as greyscale and color flow doppler (CFD)-that are often imbalanced in clinical studies. Image translation can help balance datasets but is challenging for ultrasound sub-modalities to date. Here, we present a generative method for ultrasound CFD-greyscale video translation, trained on 54,975 videos and tested on 8,368. The method developed leveraged pixel-wise, adversarial, and perceptual loses and utilized two networks: one for reconstructing anatomic structures and one for denoising to achieve realistic ultrasound imaging. Average pairwise SSIM between synthetic videos and ground truth was 0.91+/-0.04. Synthetic videos performed indistinguishably from real ones in DL classification and segmentation tasks and when evaluated by blinded clinical experts: F1 score was 0.9 for real and 0.89 for synthetic videos; Dice score between real and synthetic segmentation was 0.97. 
Overall clinician accuracy in distinguishing real vs synthetic videos was 54+/-6% (42-61%), indicating realistic synthetic videos. Although trained only on heart videos, the model worked well on ultrasound spanning several clinical domains (average SSIM 0.91+/-0.05), demonstrating foundational abilities. Together, these data expand the utility of retrospectively collected imaging and augment the dataset design toolbox for medical imaging.", "categories": [ "cs.CV", "cs.AI" ], "primary_category": "cs.CV", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03255v1", "published_date": "2025-11-05 07:32:43 UTC", "updated_date": "2025-11-05 07:32:43 UTC" }, { "arxiv_id": "2511.03251v1", "title": "GMoPE:A Prompt-Expert Mixture Framework for Graph Foundation Models", "authors": [ "Zhibin Wang", "Zhixing Zhang", "Shuqi Wang", "Xuanting Xie", "Zhao Kang" ], "abstract": "Graph Neural Networks (GNNs) have demonstrated impressive performance on task-specific benchmarks, yet their ability to generalize across diverse domains and tasks remains limited. Existing approaches often struggle with negative transfer, scalability issues, and high adaptation costs. To address these challenges, we propose GMoPE (Graph Mixture of Prompt-Experts), a novel framework that seamlessly integrates the Mixture-of-Experts (MoE) architecture with prompt-based learning for graphs. GMoPE leverages expert-specific prompt vectors and structure-aware MoE routing to enable each expert to specialize in distinct subdomains and dynamically contribute to predictions. To promote diversity and prevent expert collapse, we introduce a soft orthogonality constraint across prompt vectors, encouraging expert specialization and facilitating a more balanced expert utilization. Additionally, we adopt a prompt-only fine-tuning strategy that significantly reduces spatiotemporal complexity during transfer. 
We validate GMoPE through extensive experiments under various pretraining strategies and multiple downstream tasks. Results show that GMoPE consistently outperforms state-of-the-art baselines and achieves performance comparable to full parameter fine-tuning-while requiring only a fraction of the adaptation overhead. Our work provides a principled and scalable framework for advancing generalizable and efficient graph foundation models.", "categories": [ "cs.LG", "cs.AI", "cs.SI" ], "primary_category": "cs.LG", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03251v1", "published_date": "2025-11-05 07:28:51 UTC", "updated_date": "2025-11-05 07:28:51 UTC" }, { "arxiv_id": "2511.03757v1", "title": "Laugh, Relate, Engage: Stylized Comment Generation for Short Videos", "authors": [ "Xuan Ouyang", "Senan Wang", "Bouzhou Wang", "Siyuan Xiahou", "Jinrong Zhou", "Yuekang Li" ], "abstract": "Short-video platforms have become a central medium in the modern Internet landscape, where efficient information delivery and strong interactivity are reshaping user engagement and cultural dissemination. Among the various forms of user interaction, comments play a vital role in fostering community participation and enabling content re-creation. However, generating comments that are both compliant with platform guidelines and capable of exhibiting stylistic diversity and contextual awareness remains a significant challenge. We introduce LOLGORITHM, a modular multi-agent system (MAS) designed for controllable short-video comment generation. The system integrates video segmentation, contextual and affective analysis, and style-aware prompt construction. It supports six distinct comment styles: puns (homophones), rhyming, meme application, sarcasm (irony), plain humor, and content extraction. Powered by a multimodal large language model (MLLM), LOLGORITHM directly processes video inputs and achieves fine-grained style control through explicit prompt markers and few-shot examples. 
To support development and evaluation, we construct a bilingual dataset using official APIs from Douyin (Chinese) and YouTube (English), covering five popular video genres: comedy skits, daily life jokes, funny animal clips, humorous commentary, and talk shows. Evaluation combines automated metrics originality, relevance, and style conformity with a large-scale human preference study involving 40 videos and 105 participants. Results show that LOLGORITHM significantly outperforms baseline models, achieving preference rates of over 90% on Douyin and 87.55% on YouTube. This work presents a scalable and culturally adaptive framework for stylized comment generation on short-video platforms, offering a promising path to enhance user engagement and creative interaction.", "categories": [ "cs.LG", "cs.AI" ], "primary_category": "cs.LG", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03757v1", "published_date": "2025-11-05 07:00:22 UTC", "updated_date": "2025-11-05 07:00:22 UTC" }, { "arxiv_id": "2511.03235v1", "title": "From Five Dimensions to Many: Large Language Models as Precise and Interpretable Psychological Profilers", "authors": [ "Yi-Fei Liu", "Yi-Long Lu", "Di He", "Hang Zhang" ], "abstract": "Psychological constructs within individuals are widely believed to be interconnected. We investigated whether and how Large Language Models (LLMs) can model the correlational structure of human psychological traits from minimal quantitative inputs. We prompted various LLMs with Big Five Personality Scale responses from 816 human individuals to role-play their responses on nine other psychological scales. LLMs demonstrated remarkable accuracy in capturing human psychological structure, with the inter-scale correlation patterns from LLM-generated responses strongly aligning with those from human data $(R^2 > 0.89)$. 
This zero-shot performance substantially exceeded predictions based on semantic similarity and approached the accuracy of machine learning algorithms trained directly on the dataset. Analysis of reasoning traces revealed that LLMs use a systematic two-stage process: First, they transform raw Big Five responses into natural language personality summaries through information selection and compression, analogous to generating sufficient statistics. Second, they generate target scale responses based on reasoning from these summaries. For information selection, LLMs identify the same key personality factors as trained algorithms, though they fail to differentiate item importance within factors. The resulting compressed summaries are not merely redundant representations but capture synergistic information--adding them to original scores enhances prediction alignment, suggesting they encode emergent, second-order patterns of trait interplay. Our findings demonstrate that LLMs can precisely predict individual participants' psychological traits from minimal data through a process of abstraction and reasoning, offering both a powerful tool for psychological simulation and valuable insights into their emergent reasoning capabilities.", "categories": [ "cs.AI" ], "primary_category": "cs.AI", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03235v1", "published_date": "2025-11-05 06:51:13 UTC", "updated_date": "2025-11-05 06:51:13 UTC" }, { "arxiv_id": "2511.03227v2", "title": "Node-Based Editing for Multimodal Generation of Text, Audio, Image, and Video", "authors": [ "Alexander Htet Kyaw", "Lenin Ravindranath Sivalingam" ], "abstract": "We present a node-based storytelling system for multimodal content generation. The system represents stories as graphs of nodes that can be expanded, edited, and iteratively refined through direct user edits and natural-language prompts. 
Each node can integrate text, images, audio, and video, allowing creators to compose multimodal narratives. A task selection agent routes between specialized generative tasks that handle story generation, node structure reasoning, node diagram formatting, and context generation. The interface supports targeted editing of individual nodes, automatic branching for parallel storylines, and node-based iterative refinement. Our results demonstrate that node-based editing supports control over narrative structure and iterative generation of text, images, audio, and video. We report quantitative outcomes on automatic story outline generation and qualitative observations of editing workflows. Finally, we discuss current limitations such as scalability to longer narratives and consistency across multiple nodes, and outline future work toward human-in-the-loop and user-centered creative AI tools.", "categories": [ "cs.HC", "cs.AI", "cs.MM" ], "primary_category": "cs.HC", "comment": "Accepted to NeurIPS 2025, Conference on Neural Information Processing Systems, Workshop on Generative and Protective AI for Content Creation", "pdf_url": "https://arxiv.org/pdf/2511.03227v2", "published_date": "2025-11-05 06:35:10 UTC", "updated_date": "2025-11-06 01:45:32 UTC" }, { "arxiv_id": "2511.03217v1", "title": "Hybrid Fact-Checking that Integrates Knowledge Graphs, Large Language Models, and Search-Based Retrieval Agents Improves Interpretable Claim Verification", "authors": [ "Shaghayegh Kolli", "Richard Rosenbaum", "Timo Cavelius", "Lasse Strothe", "Andrii Lata", "Jana Diesner" ], "abstract": "Large language models (LLMs) excel in generating fluent utterances but can lack reliable grounding in verified information. At the same time, knowledge-graph-based fact-checkers deliver precise and interpretable evidence, yet suffer from limited coverage or latency. 
By integrating LLMs with knowledge graphs and real-time search agents, we introduce a hybrid fact-checking approach that leverages the individual strengths of each component. Our system comprises three autonomous steps: 1) a Knowledge Graph (KG) Retrieval for rapid one-hop lookups in DBpedia, 2) an LM-based classification guided by a task-specific labeling prompt, producing outputs with internal rule-based logic, and 3) a Web Search Agent invoked only when KG coverage is insufficient. Our pipeline achieves an F1 score of 0.93 on the FEVER benchmark on the Supported/Refuted split without task-specific fine-tuning. To address Not enough information cases, we conduct a targeted reannotation study showing that our approach frequently uncovers valid evidence for claims originally labeled as Not Enough Information (NEI), as confirmed by both expert annotators and LLM reviewers. With this paper, we present a modular, opensource fact-checking pipeline with fallback strategies and generalization across datasets.", "categories": [ "cs.CL", "cs.AI", "cs.CY", "cs.IR" ], "primary_category": "cs.CL", "comment": "Paper has been accepted at 9th wiNLP workshop at EMNLP", "pdf_url": "https://arxiv.org/pdf/2511.03217v1", "published_date": "2025-11-05 06:10:05 UTC", "updated_date": "2025-11-05 06:10:05 UTC" }, { "arxiv_id": "2511.03214v1", "title": "LGM: Enhancing Large Language Models with Conceptual Meta-Relations and Iterative Retrieval", "authors": [ "Wenchang Lei", "Ping Zou", "Yue Wang", "Feng Sun", "Lei Zhao" ], "abstract": "Large language models (LLMs) exhibit strong semantic understanding, yet struggle when user instructions involve ambiguous or conceptually misaligned terms. We propose the Language Graph Model (LGM) to enhance conceptual clarity by extracting meta-relations-inheritance, alias, and composition-from natural language. The model further employs a reflection mechanism to validate these meta-relations. 
Leveraging a Concept Iterative Retrieval Algorithm, these relations and related descriptions are dynamically supplied to the LLM, improving its ability to interpret concepts and generate accurate responses. Unlike conventional Retrieval-Augmented Generation (RAG) approaches that rely on extended context windows, our method enables large language models to process texts of any length without the need for truncation. Experiments on standard benchmarks demonstrate that the LGM consistently outperforms existing RAG baselines.", "categories": [ "cs.CL", "cs.AI" ], "primary_category": "cs.CL", "comment": "30 pages, 5 figures", "pdf_url": "https://arxiv.org/pdf/2511.03214v1", "published_date": "2025-11-05 06:04:38 UTC", "updated_date": "2025-11-05 06:04:38 UTC" }, { "arxiv_id": "2511.03211v2", "title": "Retrofitters, pragmatists and activists: Public interest litigation for accountable automated decision-making", "authors": [ "Henry Fraser", "Zahra Stardust" ], "abstract": "This paper examines the role of public interest litigation in promoting accountability for AI and automated decision-making (ADM) in Australia. Since ADM regulation faces geopolitical headwinds, effective governance will have to rely at least in part on the enforcement of existing laws. Drawing on interviews with Australian public interest litigators, technology policy activists, and technology law scholars, the paper positions public interest litigation as part of a larger ecosystem for transparency, accountability and justice with respect to ADM. It builds on one participant's characterisation of litigation about ADM as an exercise in legal retrofitting: adapting old laws to new circumstances. The paper's primary contribution is to aggregate, organise and present original insights on pragmatic strategies and tactics for effective public interest litigation about ADM. Naturally, it also contends with the limits of these strategies, and of the Australian legal system. 
Where limits are, however, capable of being overcome, the paper presents findings on urgent needs: the enabling institutional arrangements without which effective litigation and accountability will falter. The paper is relevant to law and technology scholars; individuals and groups harmed by ADM; public interest litigators and technology lawyers; civil society and advocacy organisations; and policymakers.", "categories": [ "cs.CY", "cs.AI" ], "primary_category": "cs.CY", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03211v2", "published_date": "2025-11-05 05:55:04 UTC", "updated_date": "2025-11-07 01:37:46 UTC" }, { "arxiv_id": "2511.03206v1", "title": "QG-CoC: Question-Guided Chain-of-Captions for Large Multimodal Models", "authors": [ "Kuei-Chun Kao", "Hsu Tzu-Yin", "Yunqi Hong", "Ruochen Wang", "Cho-Jui Hsieh" ], "abstract": "Recently, Multimodal Large Language Models (MLLMs) encounter two key issues in multi-image contexts: (1) a lack of fine-grained perception across disparate images, and (2) a diminished capability to effectively reason over and synthesize information from multiple visual inputs. However, while various prompting methods aim to describe visual content, many existing studies focus primarily on single-image settings or specific, constrained scenarios. This leaves a critical gap in understanding and addressing how MLLMs tackle more general and complex multi-image reasoning tasks. Thus, we first extensively investigate how current prompting methods perceive fine-grained visual details and process visual information when dealing with multiple images. Our findings reveal that existing prompting methods fall short in attending to needed clues and seamlessly integrating perception and reasoning. Inspired by the findings, we propose a new zero-shot prompting method, Question-Guided Chain-of-Captions (QG-CoC), a generalized prompting approach that effectively handles problems with an arbitrary number of images. 
We evaluate our method on various open-source and closed-source MLLMs for multi-image and single-image benchmarks. Experimental results indicate that QG-CoC demonstrates competitive performance across tasks and exhibits robust improvements in the challenging scenarios where existing prompting methods fail.", "categories": [ "cs.CV", "cs.AI", "cs.LG" ], "primary_category": "cs.CV", "comment": "16 pages", "pdf_url": "https://arxiv.org/pdf/2511.03206v1", "published_date": "2025-11-05 05:49:48 UTC", "updated_date": "2025-11-05 05:49:48 UTC" }, { "arxiv_id": "2511.03201v1", "title": "A Quantized VAE-MLP Botnet Detection Model: A Systematic Evaluation of Quantization-Aware Training and Post-Training Quantization Strategies", "authors": [ "Hassan Wasswa", "Hussein Abbass", "Timothy Lynar" ], "abstract": "In an effort to counter the increasing IoT botnet-based attacks, state-of-the-art deep learning methods have been proposed and have achieved impressive detection accuracy. However, their computational intensity restricts deployment on resource-constrained IoT devices, creating a critical need for lightweight detection models. A common solution to this challenge is model compression via quantization. This study proposes a VAE-MLP model framework where an MLP-based classifier is trained on 8-dimensional latent vectors derived from the high-dimensional train data using the encoder component of a pretrained variational autoencoder (VAE). Two widely used quantization strategies--Quantization-Aware Training (QAT) and Post-Training Quantization (PTQ)--are then systematically evaluated in terms of their impact on detection performance, storage efficiency, and inference latency using two benchmark IoT botnet datasets--N-BaIoT and CICIoT2022. The results revealed that, with respect to detection accuracy, the QAT strategy experienced a more noticeable decline,whereas PTQ incurred only a marginal reduction compared to the original unquantized model. 
Furthermore, PTQ yielded a 6x speedup and 21x reduction in size, while QAT achieved a 3x speedup and 24x compression, demonstrating the practicality of quantization for device-level IoT botnet detection.", "categories": [ "cs.LG", "cs.AI" ], "primary_category": "cs.LG", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03201v1", "published_date": "2025-11-05 05:33:27 UTC", "updated_date": "2025-11-05 05:33:27 UTC" }, { "arxiv_id": "2511.03190v1", "title": "Efficient Linear Attention for Multivariate Time Series Modeling via Entropy Equality", "authors": [ "Mingtao Zhang", "Guoli Yang", "Zhanxing Zhu", "Mengzhu Wang", "Xiaoying Bai" ], "abstract": "Attention mechanisms have been extensively employed in various applications, including time series modeling, owing to their capacity to capture intricate dependencies; however, their utility is often constrained by quadratic computational complexity, which impedes scalability for long sequences. In this work, we propose a novel linear attention mechanism designed to overcome these limitations. Our approach is grounded in a theoretical demonstration that entropy, as a strictly concave function on the probability simplex, implies that distributions with aligned probability rankings and similar entropy values exhibit structural resemblance. Building on this insight, we develop an efficient approximation algorithm that computes the entropy of dot-product-derived distributions with only linear complexity, enabling the implementation of a linear attention mechanism based on entropy equality. Through rigorous analysis, we reveal that the effectiveness of attention in spatio-temporal time series modeling may not primarily stem from the non-linearity of softmax but rather from the attainment of a moderate and well-balanced weight distribution. 
Extensive experiments on four spatio-temporal datasets validate our method, demonstrating competitive or superior forecasting performance while achieving substantial reductions in both memory usage and computational time.", "categories": [ "cs.LG", "cs.AI" ], "primary_category": "cs.LG", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03190v1", "published_date": "2025-11-05 05:07:55 UTC", "updated_date": "2025-11-05 05:07:55 UTC" }, { "arxiv_id": "2511.03186v1", "title": "Adobe Summit Concierge Evaluation with Human in the Loop", "authors": [ "Yiru Chen", "Sally Fang", "Sai Sree Harsha", "Dan Luo", "Vaishnavi Muppala", "Fei Wu", "Shun Jiang", "Kun Qian", "Yunyao Li" ], "abstract": "Generative AI assistants offer significant potential to enhance productivity, streamline information access, and improve user experience in enterprise contexts. In this work, we present Summit Concierge, a domain-specific AI assistant developed for Adobe Summit. The assistant handles a wide range of event-related queries and operates under real-world constraints such as data sparsity, quality assurance, and rapid deployment. To address these challenges, we adopt a human-in-the-loop development workflow that combines prompt engineering, retrieval grounding, and lightweight human validation. We describe the system architecture, development process, and real-world deployment outcomes. 
Our experience shows that agile, feedback-driven development enables scalable and reliable AI assistants, even in cold-start scenarios.", "categories": [ "cs.AI" ], "primary_category": "cs.AI", "comment": "Accepted by 6th Workshop on Data Science with Human in the Loop @ VLDB 2025", "pdf_url": "https://arxiv.org/pdf/2511.03186v1", "published_date": "2025-11-05 05:05:24 UTC", "updated_date": "2025-11-05 05:05:24 UTC" }, { "arxiv_id": "2511.03179v3", "title": "Toward Autonomous Engineering Design: A Knowledge-Guided Multi-Agent Framework", "authors": [ "Varun Kumar", "George Em Karniadakis" ], "abstract": "The engineering design process often demands expertise from multiple domains, leading to complex collaborations and iterative refinements. Traditional methods can be resource-intensive and prone to inefficiencies. To address this, we formalize the engineering design process through a multi-agent AI framework that integrates structured design and review loops. The framework introduces specialized knowledge-driven agents that collaborate to generate and refine design candidates. As an exemplar, we demonstrate its application to the aerodynamic optimization of 4-digit NACA airfoils. The framework consists of three key AI agents: a Graph Ontologist, a Design Engineer, and a Systems Engineer. The Graph Ontologist employs a Large Language Model (LLM) to construct two domain-specific knowledge graphs from airfoil design literature. The Systems Engineer, informed by a human manager, formulates technical requirements that guide design generation and evaluation. The Design Engineer leverages the design knowledge graph and computational tools to propose candidate airfoils meeting these requirements. The Systems Engineer reviews and provides feedback both qualitative and quantitative using its own knowledge graph, forming an iterative feedback loop until a design is validated by the manager. 
The final design is then optimized to maximize performance metrics such as the lift-to-drag ratio. Overall, this work demonstrates how collaborative AI agents equipped with structured knowledge representations can enhance efficiency, consistency, and quality in the engineering design process.", "categories": [ "cs.AI", "cs.LG", "cs.MA" ], "primary_category": "cs.AI", "comment": "Added appendices and updated literature review", "pdf_url": "https://arxiv.org/pdf/2511.03179v3", "published_date": "2025-11-05 04:55:25 UTC", "updated_date": "2025-12-30 03:15:01 UTC" }, { "arxiv_id": "2511.03173v1", "title": "Optimizing Earth-Moon Transfer and Cislunar Navigation: Integrating Low-Energy Trajectories, AI Techniques and GNSS-R Technologies", "authors": [ "Arsalan Muhammad", "Wasiu Akande Ahmed", "Omada Friday Ojonugwa", "Paul Puspendu Biswas" ], "abstract": "The rapid growth of cislunar activities, including lunar landings, the Lunar Gateway, and in-space refueling stations, requires advances in cost-efficient trajectory design and reliable integration of navigation and remote sensing. Traditional Earth-Moon transfers suffer from rigid launch windows and high propellant demands, while Earth-based GNSS systems provide little to no coverage beyond geostationary orbit. This limits autonomy and environmental awareness in cislunar space. This review compares four major transfer strategies by evaluating velocity requirements, flight durations, and fuel efficiency, and by identifying their suitability for both crewed and robotic missions. The emerging role of artificial intelligence and machine learning is highlighted: convolutional neural networks support automated crater recognition and digital terrain model generation, while deep reinforcement learning enables adaptive trajectory refinement during descent and landing to reduce risk and decision latency. 
The study also examines how GNSS-Reflectometry and advanced Positioning, Navigation, and Timing architectures can extend navigation capabilities beyond current limits. GNSS-R can act as a bistatic radar for mapping lunar ice, soil properties, and surface topography, while PNT systems support autonomous rendezvous, Lagrange point station-keeping, and coordinated satellite swarm operations. Combining these developments establishes a scalable framework for sustainable cislunar exploration and long-term human and robotic presence.", "categories": [ "astro-ph.EP", "cs.AI", "cs.LG", "cs.RO" ], "primary_category": "astro-ph.EP", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03173v1", "published_date": "2025-11-05 04:41:43 UTC", "updated_date": "2025-11-05 04:41:43 UTC" }, { "arxiv_id": "2511.03170v2", "title": "GraphCliff: Short-Long Range Gating for Subtle Differences but Critical Changes", "authors": [ "Hajung Kim", "Jueon Park", "Junseok Choe", "Sheunheun Baek", "Hyeon Hwang", "Jaewoo Kang" ], "abstract": "Quantitative structure-activity relationship assumes a smooth relationship between molecular structure and biological activity. However, activity cliffs defined as pairs of structurally similar compounds with large potency differences break this continuity. Recent benchmarks targeting activity cliffs have revealed that classical machine learning models with extended connectivity fingerprints outperform graph neural networks. Our analysis shows that graph embeddings fail to adequately separate structurally similar molecules in the embedding space, making it difficult to distinguish between structurally similar but functionally different molecules. Despite this limitation, molecular graph structures are inherently expressive and attractive, as they preserve molecular topology. To preserve the structural representation of molecules as graphs, we propose a new model, GraphCliff, which integrates short- and long-range information through a gating mechanism. 
Experimental results demonstrate that GraphCliff consistently improves performance on both non-cliff and cliff compounds. Furthermore, layer-wise node embedding analyses reveal reduced over-smoothing and enhanced discriminative power relative to strong baseline graph models.", "categories": [ "cs.CE", "cs.AI" ], "primary_category": "cs.CE", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03170v2", "published_date": "2025-11-05 04:38:32 UTC", "updated_date": "2025-11-08 02:05:57 UTC" }, { "arxiv_id": "2511.03169v1", "title": "Uncovering Bugs in Formal Explainers: A Case Study with PyXAI", "authors": [ "Xuanxiang Huang", "Yacine Izza", "Alexey Ignatiev", "Joao Marques-Silva" ], "abstract": "Formal explainable artificial intelligence (XAI) offers unique theoretical guarantees of rigor when compared to other non-formal methods of explainability. However, little attention has been given to the validation of practical implementations of formal explainers. This paper develops a novel methodology for validating formal explainers and reports on the assessment of the publicly available formal explainer PyXAI. The paper documents the existence of incorrect explanations computed by PyXAI on most of the datasets analyzed in the experiments, thereby confirming the importance of the proposed novel methodology for the validation of formal explainers.", "categories": [ "cs.AI" ], "primary_category": "cs.AI", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03169v1", "published_date": "2025-11-05 04:35:20 UTC", "updated_date": "2025-11-05 04:35:20 UTC" }, { "arxiv_id": "2511.03153v1", "title": "RefAgent: A Multi-agent LLM-based Framework for Automatic Software Refactoring", "authors": [ "Khouloud Oueslati", "Maxime Lamothe", "Foutse Khomh" ], "abstract": "Large Language Models (LLMs) have substantially influenced various software engineering tasks. 
Indeed, in the case of software refactoring, traditional LLMs have shown the ability to reduce development time and enhance code quality. However, these LLMs often rely on static, detailed instructions for specific tasks. In contrast, LLM-based agents can dynamically adapt to evolving contexts and autonomously make decisions by interacting with software tools and executing workflows. In this paper, we explore the potential of LLM-based agents in supporting refactoring activities. Specifically, we introduce RefAgent, a multi-agent LLM-based framework for end-to-end software refactoring. RefAgent consists of specialized agents responsible for planning, executing, testing, and iteratively refining refactorings using self-reflection and tool-calling capabilities. We evaluate RefAgent on eight open-source Java projects, comparing its effectiveness against a single-agent approach, a search-based refactoring tool, and historical developer refactorings. Our assessment focuses on: (1) the impact of generated refactorings on software quality, (2) the ability to identify refactoring opportunities, and (3) the contribution of each LLM agent through an ablation study. Our results show that RefAgent achieves a median unit test pass rate of 90%, reduces code smells by a median of 52.5%, and improves key quality attributes (e.g., reusability) by a median of 8.6%. Additionally, it closely aligns with developer refactorings and the search-based tool in identifying refactoring opportunities, attaining a median F1-score of 79.15% and 72.7%, respectively. Compared to single-agent approaches, RefAgent improves the median unit test pass rate by 64.7% and the median compilation success rate by 40.1%. 
These findings highlight the promise of multi-agent architectures in advancing automated software refactoring.", "categories": [ "cs.SE", "cs.AI" ], "primary_category": "cs.SE", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03153v1", "published_date": "2025-11-05 03:20:58 UTC", "updated_date": "2025-11-05 03:20:58 UTC" }, { "arxiv_id": "2511.03152v1", "title": "Who Sees the Risk? Stakeholder Conflicts and Explanatory Policies in LLM-based Risk Assessment", "authors": [ "Srishti Yadav", "Jasmina Gajcin", "Erik Miehling", "Elizabeth Daly" ], "abstract": "Understanding how different stakeholders perceive risks in AI systems is essential for their responsible deployment. This paper presents a framework for stakeholder-grounded risk assessment by using LLMs, acting as judges to predict and explain risks. Using the Risk Atlas Nexus and GloVE explanation method, our framework generates stakeholder-specific, interpretable policies that shows how different stakeholders agree or disagree about the same risks. We demonstrate our method using three real-world AI use cases of medical AI, autonomous vehicles, and fraud detection domain. We further propose an interactive visualization that reveals how and why conflicts emerge across stakeholder perspectives, enhancing transparency in conflict reasoning. Our results show that stakeholder perspectives significantly influence risk perception and conflict patterns. 
Our work emphasizes the importance of these stakeholder-aware explanations needed to make LLM-based evaluations more transparent, interpretable, and aligned with human-centered AI governance goals.", "categories": [ "cs.CL", "cs.AI" ], "primary_category": "cs.CL", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03152v1", "published_date": "2025-11-05 03:19:21 UTC", "updated_date": "2025-11-05 03:19:21 UTC" }, { "arxiv_id": "2511.03149v1", "title": "Forecast2Anomaly (F2A): Adapting Multivariate Time Series Foundation Models for Anomaly Prediction", "authors": [ "Atif Hassan", "Tarun Kumar", "Ashish Mishra", "Sergey Serebryakov", "Satish Kumar Mopur", "Phanidhar Koganti", "Murthy Chelankuri", "Ramanagopal Vogety", "Suparna Bhattacharya", "Martin Foltin" ], "abstract": "Forecasting anomalies (anomaly prediction) in multivariate time series from different real-world, dynamic, and complex systems is vital for preempting critical failures, leading to a substantial minimization in operational costs and human labor. Yet, existing methods are limited to specific systems while failing to generalize to evolving anomaly patterns over time. In contrast, pretrained Time Series Foundation Models (TSFMs) have recently demonstrated strong generalization and zero-shot forecasting capabilities. However, their potential remains untapped for anomaly prediction, a task fundamentally different from forecasting normal behavior. Thus, we present Forecast2Anomaly (F2A), a novel framework that empowers TSFMs with anomaly prediction abilities through two key innovations. First, we propose a joint forecast-anomaly loss that fine-tunes TSFMs to accurately forecast future signals even at anomalous time points. Second, we introduce a Retrieval-Augmented Generation (RAG) module that retrieves historically relevant horizons and conditions predictions on them. 
This component dynamically adapts to distributional shifts at inference time, enabling F2A to track evolving anomalies without requiring model updates. By combining targeted fine-tuning with dynamic retrieval, F2A bridges the gap between robust TSFM zero-shot forecasting and zero-shot anomaly prediction. Extensive experiments across 16 diverse datasets and multiple TSFM backbones show that F2A consistently outperforms state-of-the-art methods, offering a scalable, zero-shot anomaly prediction solution for real-world applications.", "categories": [ "cs.LG", "cs.AI" ], "primary_category": "cs.LG", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03149v1", "published_date": "2025-11-05 03:13:26 UTC", "updated_date": "2025-11-05 03:13:26 UTC" }, { "arxiv_id": "2511.03143v1", "title": "From Measurement to Expertise: Empathetic Expert Adapters for Context-Based Empathy in Conversational AI Agents", "authors": [ "Erfan Shayegani", "Jina Suh", "Andy Wilson", "Nagu Rangan", "Javier Hernandez" ], "abstract": "Empathy is a critical factor in fostering positive user experiences in conversational AI. While models can display empathy, it is often generic rather than tailored to specific tasks and contexts. In this work, we introduce a novel framework for developing and evaluating context-specific empathetic large language models (LLMs). We first analyze a real-world conversational dataset consisting of 672 multi-turn conversations across 8 tasks, revealing significant differences in terms of expected and experienced empathy before and after the conversations, respectively. To help minimize this gap, we develop a synthetic multi-turn conversational generation pipeline and steer responses toward our defined empathy patterns based on the context that more closely matches users' expectations. We then train empathetic expert adapters for context-specific empathy that specialize in varying empathy levels based on the recognized task. 
Our empirical results demonstrate a significant gap reduction of 72.66% between perceived and desired empathy with scores increasing by an average factor of 2.43 as measured by our metrics and reward models. Additionally, our trained empathetic expert adapters demonstrate superior effectiveness in preserving empathy patterns throughout conversation turns, outperforming system prompts, which tend to dramatically diminish in impact as conversations lengthen.", "categories": [ "cs.HC", "cs.AI", "cs.CL", "cs.CY", "cs.LG" ], "primary_category": "cs.HC", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03143v1", "published_date": "2025-11-05 03:07:27 UTC", "updated_date": "2025-11-05 03:07:27 UTC" }, { "arxiv_id": "2511.03138v4", "title": "DeepKnown-Guard: A Proprietary Model-Based Safety Response Framework for AI Agents", "authors": [ "Qi Li", "Jianjun Xu", "Pingtao Wei", "Jiu Li", "Peiqiang Zhao", "Jiwei Shi", "Xuan Zhang", "Yanhui Yang", "Xiaodong Hui", "Peng Xu", "Wenqin Shao" ], "abstract": "With the widespread application of Large Language Models (LLMs), their associated security issues have become increasingly prominent, severely constraining their trustworthy deployment in critical domains. This paper proposes a novel safety response framework designed to systematically safeguard LLMs at both the input and output levels. At the input level, the framework employs a supervised fine-tuning-based safety classification model. Through a fine-grained four-tier taxonomy (Safe, Unsafe, Conditionally Safe, Focused Attention), it performs precise risk identification and differentiated handling of user queries, significantly enhancing risk coverage and business scenario adaptability, and achieving a risk recall rate of 99.3%. At the output level, the framework integrates Retrieval-Augmented Generation (RAG) with a specifically fine-tuned interpretation model, ensuring all responses are grounded in a real-time, trustworthy knowledge base. 
This approach eliminates information fabrication and enables result traceability. Experimental results demonstrate that our proposed safety control model achieves a significantly higher safety score on public safety evaluation benchmarks compared to the baseline model, TinyR1-Safety-8B. Furthermore, on our proprietary high-risk test set, the framework's components attained a perfect 100% safety score, validating their exceptional protective capabilities in complex risk scenarios. This research provides an effective engineering pathway for building high-security, high-trust LLM applications.", "categories": [ "cs.AI" ], "primary_category": "cs.AI", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03138v4", "published_date": "2025-11-05 03:04:35 UTC", "updated_date": "2025-11-17 08:32:02 UTC" }, { "arxiv_id": "2511.03137v1", "title": "Using Multi-modal Large Language Model to Boost Fireworks Algorithm's Ability in Settling Challenging Optimization Tasks", "authors": [ "Shipeng Cen", "Ying Tan" ], "abstract": "As optimization problems grow increasingly complex and diverse, advancements in optimization techniques and paradigm innovations hold significant importance. The challenges posed by optimization problems are primarily manifested in their non-convexity, high-dimensionality, black-box nature, and other unfavorable characteristics. Traditional zero-order or first-order methods, which are often characterized by low efficiency, inaccurate gradient information, and insufficient utilization of optimization information, are ill-equipped to address these challenges effectively. In recent years, the rapid development of large language models (LLM) has led to substantial improvements in their language understanding and code generation capabilities. Consequently, the design of optimization algorithms leveraging large language models has garnered increasing attention from researchers. 
In this study, we choose the fireworks algorithm(FWA) as the basic optimizer and propose a novel approach to assist the design of the FWA by incorporating multi-modal large language model(MLLM). To put it simply, we propose the concept of Critical Part(CP), which extends FWA to complex high-dimensional tasks, and further utilizes the information in the optimization process with the help of the multi-modal characteristics of large language models. We focus on two specific tasks: the \\textit{traveling salesman problem }(TSP) and \\textit{electronic design automation problem} (EDA). The experimental results show that FWAs generated under our new framework have achieved or surpassed SOTA results on many problem instances.", "categories": [ "cs.AI" ], "primary_category": "cs.AI", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03137v1", "published_date": "2025-11-05 03:01:54 UTC", "updated_date": "2025-11-05 03:01:54 UTC" }, { "arxiv_id": "2511.03132v3", "title": "Deploying Rapid Damage Assessments from sUAS Imagery for Disaster Response", "authors": [ "Thomas Manzini", "Priyankari Perali", "Robin R. Murphy" ], "abstract": "This paper presents the first AI/ML system for automating building damage assessment in uncrewed aerial systems (sUAS) imagery to be deployed operationally during federally declared disasters (Hurricanes Debby and Helene). In response to major disasters, sUAS teams are dispatched to collect imagery of the affected areas to assess damage; however, at recent disasters, teams collectively delivered between 47GB and 369GB of imagery per day, representing more imagery than can reasonably be transmitted or interpreted by subject matter experts in the disaster scene, thus delaying response efforts. To alleviate this data avalanche encountered in practice, computer vision and machine learning techniques are necessary. 
While prior work has been deployed to automatically assess damage in satellite imagery, there is no current state of practice for sUAS-based damage assessment systems, as all known work has been confined to academic settings. This work establishes the state of practice via the development and deployment of models for building damage assessment with sUAS imagery. The model development involved training on the largest known dataset of post-disaster sUAS aerial imagery, containing 21,716 building damage labels, and the operational training of 91 disaster practitioners. The best performing model was deployed during the responses to Hurricanes Debby and Helene, where it assessed a combined 415 buildings in approximately 18 minutes. This work contributes documentation of the actual use of AI/ML for damage assessment during a disaster and lessons learned to the benefit of the AI/ML research and user communities.", "categories": [ "cs.CV", "cs.AI", "cs.CY" ], "primary_category": "cs.CV", "comment": "6 pages, 4 figures, 1 table. Appearing in IAAI'26", "pdf_url": "https://arxiv.org/pdf/2511.03132v3", "published_date": "2025-11-05 02:49:15 UTC", "updated_date": "2025-12-13 00:20:17 UTC" }, { "arxiv_id": "2511.03129v1", "title": "Optimal Boundary Control of Diffusion on Graphs via Linear Programming", "authors": [ "Harbir Antil", "Rainald Löhner", "Felipe Pérez" ], "abstract": "We propose a linear programming (LP) framework for steady-state diffusion and flux optimization on geometric networks. The state variable satisfies a discrete diffusion law on a weighted, oriented graph, where conductances are scaled by edge lengths to preserve geometric fidelity. Boundary potentials act as controls that drive interior fluxes according to a linear network Laplacian. The optimization problem enforces physically meaningful sign and flux-cap constraints at all boundary edges, derived directly from a gradient bound. 
This yields a finite-dimensional LP whose feasible set is polyhedral, and whose boundedness and solvability follow from simple geometric or algebraic conditions on the network data.\n We prove that under the absence of negative recession directions--automatically satisfied in the presence of finite box bounds, flux caps, or sign restrictions--the LP admits a global minimizer. Several sufficient conditions guaranteeing boundedness of the feasible region are identified, covering both full-rank and rank-deficient flux maps. The analysis connects classical results such as the Minkowski--Weyl decomposition, Hoffman's bound, and the fundamental theorem of linear programming with modern network-based diffusion modeling.\n Two large-scale examples illustrate the framework: (i) A typical large stadium in a major modern city, which forms a single connected component with relatively uniform corridor widths, and a (ii) A complex street network emanating from a large, historical city center, which forms a multi-component system.", "categories": [ "math.OC", "cs.AI", "physics.comp-ph" ], "primary_category": "math.OC", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03129v1", "published_date": "2025-11-05 02:41:07 UTC", "updated_date": "2025-11-05 02:41:07 UTC" }, { "arxiv_id": "2511.03122v1", "title": "EGMOF: Efficient Generation of Metal-Organic Frameworks Using a Hybrid Diffusion-Transformer Architecture", "authors": [ "Seunghee Han", "Yeonghun Kang", "Taeun Bae", "Varinia Bernales", "Alan Aspuru-Guzik", "Jihan Kim" ], "abstract": "Designing materials with targeted properties remains challenging due to the vastness of chemical space and the scarcity of property-labeled data. While recent advances in generative models offer a promising way for inverse design, most approaches require large datasets and must be retrained for every new target property. 
Here, we introduce the EGMOF (Efficient Generation of MOFs), a hybrid diffusion-transformer framework that overcomes these limitations through a modular, descriptor-mediated workflow. EGMOF decomposes inverse design into two steps: (1) a one-dimensional diffusion model (Prop2Desc) that maps desired properties to chemically meaningful descriptors followed by (2) a transformer model (Desc2MOF) that generates structures from these descriptors. This modular hybrid design enables minimal retraining and maintains high accuracy even under small-data conditions. On a hydrogen uptake dataset, EGMOF achieved over 95% validity and 84% hit rate, representing significant improvements of up to 57% in validity and 14% in hit rate compared to existing methods, while remaining effective with only 1,000 training samples. Moreover, our model successfully performed conditional generation across 29 diverse property datasets, including CoREMOF, QMOF, and text-mined experimental datasets, whereas previous models have not. This work presents a data-efficient, generalizable approach to the inverse design of diverse MOFs and highlights the potential of modular inverse design workflows for broader materials discovery.", "categories": [ "cond-mat.mtrl-sci", "cs.AI", "cs.LG" ], "primary_category": "cond-mat.mtrl-sci", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03122v1", "published_date": "2025-11-05 02:14:13 UTC", "updated_date": "2025-11-05 02:14:13 UTC" }, { "arxiv_id": "2511.03121v2", "title": "Control Barrier Function for Aligning Large Language Models", "authors": [ "Yuya Miyaoka", "Masaki Inoue" ], "abstract": "This paper proposes a control-based framework for aligning large language models (LLMs) by leveraging a control barrier function (CBF) to ensure user-desirable text generation. The presented framework applies the CBF safety filter to the predicted token generated from the baseline LLM, to intervene in the generated text. 
The safety filter includes two significant advantages: this safety filter is an add-on type, allowing it to be used for alignment purposes without fine-tuning the baseline LLM, and if there is an evaluation model regarding the desired alignment, it can be directly applied to the filter design. The overall text-generation system is implemented with open-source language models, aiming to generate positive text.", "categories": [ "cs.CL", "cs.AI", "eess.SY" ], "primary_category": "cs.CL", "comment": "This work is an extenede version of arXiv:2408.15625 and has been submitted to the IEEE for possible publication", "pdf_url": "https://arxiv.org/pdf/2511.03121v2", "published_date": "2025-11-05 02:12:59 UTC", "updated_date": "2025-11-06 03:06:07 UTC" }, { "arxiv_id": "2511.03120v1", "title": "Image-Intrinsic Priors for Integrated Circuit Defect Detection and Novel Class Discovery via Self-Supervised Learning", "authors": [ "Botong. Zhao", "Xubin. Wang", "Shujing. Lyu", "Yue. Lu" ], "abstract": "Integrated circuit manufacturing is highly complex, comprising hundreds of process steps. Defects can arise at any stage, causing yield loss and ultimately degrading product reliability. Supervised methods require extensive human annotation and struggle with emergent categories and rare, data scarce defects. Clustering-based unsupervised methods often exhibit unstable performance due to missing priors. We propose IC DefectNCD, a support set free framework that leverages Image Intrinsic Priors in IC SEM images for defect detection and novel class discovery. We first develop Self Normal Information Guided IC Defect Detection, aggregating representative normal features via a learnable normal information extractor and using reconstruction residuals to coarsely localize defect regions. To handle saliency variations across defects, we introduce an adaptive binarization strategy that produces stable subimages focused on core defective areas. 
Finally, we design Self Defect Information Guided IC Defect Classification, which incorporates a soft mask guided attention mechanism to inject spatial defect priors into the teacher student model. This enhances sensitivity to defective regions, suppresses background interference, and enables recognition and classification of unseen defects. We validate the approach on a real world dataset spanning three key fabrication stages and covering 15 defect types. Experiments demonstrate robust performance on both defect detection and unseen defect classification.", "categories": [ "cs.CV", "cs.AI" ], "primary_category": "cs.CV", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03120v1", "published_date": "2025-11-05 02:10:08 UTC", "updated_date": "2025-11-05 02:10:08 UTC" }, { "arxiv_id": "2511.08606v1", "title": "Data-driven Feynman-Kac Discovery with Applications to Prediction and Data Generation", "authors": [ "Qi Feng", "Guang Lin", "Purav Matlia", "Denny Serdarevic" ], "abstract": "In this paper, we propose a novel data-driven framework for discovering probabilistic laws underlying the Feynman-Kac formula. Specifically, we introduce the first stochastic SINDy method formulated under the risk-neutral probability measure to recover the backward stochastic differential equation (BSDE) from a single pair of stock and option trajectories. Unlike existing approaches to identifying stochastic differential equations-which typically require ergodicity-our framework leverages the risk-neutral measure, thereby eliminating the ergodicity assumption and enabling BSDE recovery from limited financial time series data. 
Using this algorithm, we are able not only to make forward-looking predictions but also to generate new synthetic data paths consistent with the underlying probabilistic law.", "categories": [ "q-fin.MF", "cs.AI", "q-fin.CP" ], "primary_category": "q-fin.MF", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.08606v1", "published_date": "2025-11-05 01:57:01 UTC", "updated_date": "2025-11-05 01:57:01 UTC" }, { "arxiv_id": "2511.03114v1", "title": "An Augmentation Overlap Theory of Contrastive Learning", "authors": [ "Qi Zhang", "Yifei Wang", "Yisen Wang" ], "abstract": "Recently, self-supervised contrastive learning has achieved great success on various tasks. However, its underlying working mechanism is yet unclear. In this paper, we first provide the tightest bounds based on the widely adopted assumption of conditional independence. Further, we relax the conditional independence assumption to a more practical assumption of augmentation overlap and derive the asymptotically closed bounds for the downstream performance. Our proposed augmentation overlap theory hinges on the insight that the support of different intra-class samples will become more overlapped under aggressive data augmentations, thus simply aligning the positive samples (augmented views of the same sample) could make contrastive learning cluster intra-class samples together. Moreover, from the newly derived augmentation overlap perspective, we develop an unsupervised metric for the representation evaluation of contrastive learning, which aligns well with the downstream performance almost without relying on additional modules. 
Code is available at https://github.com/PKU-ML/GARC.", "categories": [ "cs.LG", "cs.AI" ], "primary_category": "cs.LG", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03114v1", "published_date": "2025-11-05 01:45:48 UTC", "updated_date": "2025-11-05 01:45:48 UTC" }, { "arxiv_id": "2511.03113v1", "title": "FP-AbDiff: Improving Score-based Antibody Design by Capturing Nonequilibrium Dynamics through the Underlying Fokker-Planck Equation", "authors": [ "Jiameng Chen", "Yida Xiong", "Kun Li", "Hongzhi Zhang", "Xiantao Cai", "Wenbin Hu", "Jia Wu" ], "abstract": "Computational antibody design holds immense promise for therapeutic discovery, yet existing generative models are fundamentally limited by two core challenges: (i) a lack of dynamical consistency, which yields physically implausible structures, and (ii) poor generalization due to data scarcity and structural bias. We introduce FP-AbDiff, the first antibody generator to enforce Fokker-Planck Equation (FPE) physics along the entire generative trajectory. Our method minimizes a novel FPE residual loss over the mixed manifold of CDR geometries (R^3 x SO(3)), compelling locally-learned denoising scores to assemble into a globally coherent probability flow. This physics-informed regularizer is synergistically integrated with deep biological priors within a state-of-the-art SE(3)-equivariant diffusion framework. Rigorous evaluation on the RAbD benchmark confirms that FP-AbDiff establishes a new state-of-the-art. In de novo CDR-H3 design, it achieves a mean Root Mean Square Deviation of 0.99 Å when superposing on the variable region, a 25% improvement over the previous state-of-the-art model, AbX, and the highest reported Contact Amino Acid Recovery of 39.91%. 
This superiority is underscored in the more challenging six-CDR co-design task, where our model delivers consistently superior geometric precision, cutting the average full-chain Root Mean Square Deviation by ~15%, and crucially, achieves the highest full-chain Amino Acid Recovery on the functionally dominant CDR-H3 loop (45.67%). By aligning generative dynamics with physical laws, FP-AbDiff enhances robustness and generalizability, establishing a principled approach for physically faithful and functionally viable antibody design.", "categories": [ "cs.LG", "cs.AI", "q-bio.QM" ], "primary_category": "cs.LG", "comment": "9 pages, 3 figures", "pdf_url": "https://arxiv.org/pdf/2511.03113v1", "published_date": "2025-11-05 01:44:37 UTC", "updated_date": "2025-11-05 01:44:37 UTC" }, { "arxiv_id": "2511.03108v1", "title": "miniF2F-Lean Revisited: Reviewing Limitations and Charting a Path Forward", "authors": [ "Azim Ospanov", "Farzan Farnia", "Roozbeh Yousefzadeh" ], "abstract": "We perform a thorough analysis of the formal and informal statements in the miniF2F benchmark from the perspective of an AI system that is tasked to participate in a math Olympiad consisting of the problems in miniF2F. In such setting, the model has to read and comprehend the problems in natural language, formalize them in Lean language, then proceed with proving the problems, and it will get credit for each problem if the formal proof corresponds to the original informal statement presented to the model. Our evaluation results reveal that the best accuracy of such pipeline can be about 36% using the SoTA models in the literature, considerably lower than the individual SoTA accuracies, 97% and 69% reported in the autoformalization and theorem proving literature. Analyzing the failure modes, we trace back a considerable portion of this drop to discrepancies between the formal and informal statements for more than half of the problems in miniF2F. 
We proceed with correcting all the errors, discrepancies and simplifications in formal and informal statements, and present the miniF2F-v2 with fully verified formal and informal statements and proofs. Evaluating the full theorem proving pipeline on miniF2F-v2 leads to the best accuracy of 70%, a significant improvement from the 40% on the original miniF2F, yet indicating considerable misalignment between the autoformalization models and theorem provers. Our deep analysis suggests that a higher quality benchmark can help the community better evaluate progress in the field of formal reasoning and also better diagnose the failure and success modes of autoformalization and theorem proving models. Our dataset is available at https://github.com/roozbeh-yz/miniF2F_v2.", "categories": [ "cs.AI" ], "primary_category": "cs.AI", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03108v1", "published_date": "2025-11-05 01:27:49 UTC", "updated_date": "2025-11-05 01:27:49 UTC" }, { "arxiv_id": "2511.03106v1", "title": "Large language models require a new form of oversight: capability-based monitoring", "authors": [ "Katherine C. Kellogg", "Bingyang Ye", "Yifan Hu", "Guergana K. Savova", "Byron Wallace", "Danielle S. Bitterman" ], "abstract": "The rapid adoption of large language models (LLMs) in healthcare has been accompanied by scrutiny of their oversight. Existing monitoring approaches, inherited from traditional machine learning (ML), are task-based and founded on assumed performance degradation arising from dataset drift. In contrast, with LLMs, inevitable model degradation due to changes in populations compared to the training dataset cannot be assumed, because LLMs were not trained for any specific task in any given population. We therefore propose a new organizing principle guiding generalist LLM monitoring that is scalable and grounded in how these models are developed and used in practice: capability-based monitoring. 
Capability-based monitoring is motivated by the fact that LLMs are generalist systems whose overlapping internal capabilities are reused across numerous downstream tasks. Instead of evaluating each downstream task independently, this approach organizes monitoring around shared model capabilities, such as summarization, reasoning, translation, or safety guardrails, in order to enable cross-task detection of systemic weaknesses, long-tail errors, and emergent behaviors that task-based monitoring may miss. We describe considerations for developers, organizational leaders, and professional societies for implementing a capability-based monitoring approach. Ultimately, capability-based monitoring will provide a scalable foundation for safe, adaptive, and collaborative monitoring of LLMs and future generalist artificial intelligence models in healthcare.", "categories": [ "cs.AI" ], "primary_category": "cs.AI", "comment": "Under review", "pdf_url": "https://arxiv.org/pdf/2511.03106v1", "published_date": "2025-11-05 01:20:28 UTC", "updated_date": "2025-11-05 01:20:28 UTC" }, { "arxiv_id": "2511.03103v2", "title": "Adaptive Detection of Software Aging under Workload Shift", "authors": [ "Rafael Jose Moura Silva", "Maria Gizele Nascimento", "Fumio Machida", "Ermeson Andrade" ], "abstract": "Software aging is a phenomenon that affects long-running systems, leading to progressive performance degradation and increasing the risk of failures. To mitigate this problem, this work proposes an adaptive approach based on machine learning for software aging detection in environments subject to dynamic workload conditions. We evaluate and compare a static model with adaptive models that incorporate adaptive detectors, specifically the Drift Detection Method (DDM) and Adaptive Windowing (ADWIN), originally developed for concept drift scenarios and applied in this work to handle workload shifts. 
Experiments with simulated sudden, gradual, and recurring workload transitions show that static models suffer a notable performance drop when applied to unseen workload profiles, whereas the adaptive model with ADWIN maintains high accuracy, achieving an F1-Score above 0.93 in all analyzed scenarios.", "categories": [ "cs.SE", "cs.AI", "cs.LG" ], "primary_category": "cs.SE", "comment": "Simpósio em Sistemas Computacionais de Alto Desempenho (SSCAD), 242-253 (2025)", "pdf_url": "https://arxiv.org/pdf/2511.03103v2", "published_date": "2025-11-05 01:19:55 UTC", "updated_date": "2025-11-14 04:15:39 UTC" }, { "arxiv_id": "2511.03102v1", "title": "CARMA: Comprehensive Automatically-annotated Reddit Mental Health Dataset for Arabic", "authors": [ "Saad Mankarious", "Ayah Zirikly" ], "abstract": "Mental health disorders affect millions worldwide, yet early detection remains a major challenge, particularly for Arabic-speaking populations where resources are limited and mental health discourse is often discouraged due to cultural stigma. While substantial research has focused on English-language mental health detection, Arabic remains significantly underexplored, partly due to the scarcity of annotated datasets. We present CARMA, the first automatically annotated large-scale dataset of Arabic Reddit posts. The dataset encompasses six mental health conditions, such as Anxiety, Autism, and Depression, and a control group. CARMA surpasses existing resources in both scale and diversity. We conduct qualitative and quantitative analyses of lexical and semantic differences between users, providing insights into the linguistic markers of specific mental health conditions. To demonstrate the dataset's potential for further mental health analysis, we perform classification experiments using a range of models, from shallow classifiers to large language models. 
Our results highlight the promise of advancing mental health detection in underrepresented languages such as Arabic.", "categories": [ "cs.CL", "cs.AI" ], "primary_category": "cs.CL", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03102v1", "published_date": "2025-11-05 01:17:43 UTC", "updated_date": "2025-11-05 01:17:43 UTC" }, { "arxiv_id": "2511.04707v1", "title": "Jailbreaking in the Haystack", "authors": [ "Rishi Rajesh Shah", "Chen Henry Wu", "Shashwat Saxena", "Ziqian Zhong", "Alexander Robey", "Aditi Raghunathan" ], "abstract": "Recent advances in long-context language models (LMs) have enabled million-token inputs, expanding their capabilities across complex tasks like computer-use agents. Yet, the safety implications of these extended contexts remain unclear. To bridge this gap, we introduce NINJA (short for Needle-in-haystack jailbreak attack), a method that jailbreaks aligned LMs by appending benign, model-generated content to harmful user goals. Critical to our method is the observation that the position of harmful goals plays an important role in safety. Experiments on a standard safety benchmark, HarmBench, show that NINJA significantly increases attack success rates across state-of-the-art open and proprietary models, including LLaMA, Qwen, Mistral, and Gemini. Unlike prior jailbreaking methods, our approach is low-resource, transferable, and less detectable. Moreover, we show that NINJA is compute-optimal -- under a fixed compute budget, increasing context length can outperform increasing the number of trials in best-of-N jailbreak. 
These findings reveal that even benign long contexts -- when crafted with careful goal positioning -- introduce fundamental vulnerabilities in modern LMs.", "categories": [ "cs.CR", "cs.AI", "cs.CL", "cs.LG" ], "primary_category": "cs.CR", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.04707v1", "published_date": "2025-11-05 01:12:50 UTC", "updated_date": "2025-11-05 01:12:50 UTC" }, { "arxiv_id": "2511.03100v1", "title": "Scaling Multi-Agent Environment Co-Design with Diffusion Models", "authors": [ "Hao Xiang Li", "Michael Amir", "Amanda Prorok" ], "abstract": "The agent-environment co-design paradigm jointly optimises agent policies and environment configurations in search of improved system performance. With application domains ranging from warehouse logistics to windfarm management, co-design promises to fundamentally change how we deploy multi-agent systems. However, current co-design methods struggle to scale. They collapse under high-dimensional environment design spaces and suffer from sample inefficiency when addressing moving targets inherent to joint optimisation. We address these challenges by developing Diffusion Co-Design (DiCoDe), a scalable and sample-efficient co-design framework pushing co-design towards practically relevant settings. DiCoDe incorporates two core innovations. First, we introduce Projected Universal Guidance (PUG), a sampling technique that enables DiCoDe to explore a distribution of reward-maximising environments while satisfying hard constraints such as spatial separation between obstacles. Second, we devise a critic distillation mechanism to share knowledge from the reinforcement learning critic, ensuring that the guided diffusion model adapts to evolving agent policies using a dense and up-to-date learning signal. 
Together, these improvements lead to superior environment-policy pairs when validated on challenging multi-agent environment co-design benchmarks including warehouse automation, multi-agent pathfinding and wind farm optimisation. Our method consistently exceeds the state-of-the-art, achieving, for example, 39% higher rewards in the warehouse setting with 66% fewer simulation samples. This sets a new standard in agent-environment co-design, and is a stepping stone towards reaping the rewards of co-design in real world domains.", "categories": [ "cs.LG", "cs.AI", "cs.MA" ], "primary_category": "cs.LG", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03100v1", "published_date": "2025-11-05 01:09:40 UTC", "updated_date": "2025-11-05 01:09:40 UTC" }, { "arxiv_id": "2511.03095v1", "title": "Sparse, self-organizing ensembles of local kernels detect rare statistical anomalies", "authors": [ "Gaia Grosso", "Sai Sumedh R. Hindupur", "Thomas Fel", "Samuel Bright-Thonney", "Philip Harris", "Demba Ba" ], "abstract": "Modern artificial intelligence has revolutionized our ability to extract rich and versatile data representations across scientific disciplines. Yet, the statistical properties of these representations remain poorly controlled, causing misspecified anomaly detection (AD) methods to falter. Weak or rare signals can remain hidden within the apparent regularity of normal data, creating a gap in our ability to detect and interpret anomalies. We examine this gap and identify a set of structural desiderata for detection methods operating under minimal prior information: sparsity, to enforce parsimony; locality, to preserve geometric sensitivity; and competition, to promote efficient allocation of model capacity. These principles define a class of self-organizing local kernels that adaptively partition the representation space around regions of statistical imbalance. 
As an instantiation of these principles, we introduce SparKer, a sparse ensemble of Gaussian kernels trained within a semi-supervised Neyman--Pearson framework to locally model the likelihood ratio between a sample that may contain anomalies and a nominal, anomaly-free reference. We provide theoretical insights into the mechanisms that drive detection and self-organization in the proposed model, and demonstrate the effectiveness of this approach on realistic high-dimensional problems of scientific discovery, open-world novelty detection, intrusion detection, and generative-model validation. Our applications span both the natural- and computer-science domains. We demonstrate that ensembles containing only a handful of kernels can identify statistically significant anomalous locations within representation spaces of thousands of dimensions, underscoring the interpretability, efficiency, and scalability of the proposed approach.", "categories": [ "cs.LG", "cs.AI" ], "primary_category": "cs.LG", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03095v1", "published_date": "2025-11-05 00:55:56 UTC", "updated_date": "2025-11-05 00:55:56 UTC" }, { "arxiv_id": "2511.03092v5", "title": "SnapStream: Efficient Long Sequence Decoding on Dataflow Accelerators", "authors": [ "Jonathan Li", "Nasim Farahini", "Evgenii Iuliugin", "Magnus Vesterlund", "Christian Häggström", "Guangtao Wang", "Shubhangi Upasani", "Ayush Sachdeva", "Rui Li", "Faline Fu", "Chen Wu", "Ayesha Siddiqua", "John Long", "Tuowen Zhao", "Matheen Musaddiq", "Håkan Zeffer", "Yun Du", "Mingran Wang", "Qinghua Li", "Bo Li", "Urmish Thakker", "Raghu Prabhakar" ], "abstract": "The proliferation of 100B+ parameter Large Language Models (LLMs) with 100k+ context length support has resulted in increasing demands for on-chip memory to support large KV caches. Techniques such as StreamingLLM and SnapKV demonstrate how to control KV cache size while maintaining model accuracy. 
Yet, these techniques are not commonly used within industrial deployments using frameworks like vLLM or SGLang. The reason is twofold: on one hand, the static graphs and continuous batching methodology employed by these frameworks make it difficult to admit modifications to the standard multi-head attention algorithm, while on the other hand, the accuracy implications of such techniques on modern instruction-following and reasoning models are not well understood, obfuscating the need for implementing these techniques. In this paper, we explore these accuracy implications on Llama-3.1-8B-Instruct and DeepSeek-R1, and develop SnapStream, a KV cache compression method that can be deployed at scale. We demonstrate the efficacy of SnapStream in a 16-way tensor-parallel deployment of DeepSeek-671B on SambaNova SN40L accelerators running at 128k context length and up to 1832 tokens per second in a real production setting. SnapStream enables $4\\times$ improved on-chip memory usage and introduces minimal accuracy degradation on LongBench-v2, AIME24 and LiveCodeBench. To the best of our knowledge, this is the first implementation of sparse KV attention techniques deployed in a production inference system with static graphs and continuous batching.", "categories": [ "cs.AI", "cs.AR", "cs.DC" ], "primary_category": "cs.AI", "comment": "", "pdf_url": "https://arxiv.org/pdf/2511.03092v5", "published_date": "2025-11-05 00:38:31 UTC", "updated_date": "2025-12-10 00:29:21 UTC" } ]