[ { "assumptions": [], "authors": [ "Yao, Shunyu", "Zhao, Jeffrey", "Yu, Dian", "Du, Nan", "Shafran, Izhak", "Narasimhan, Karthik", "Cao, Yuan" ], "baseline_details": [], "bibtex": "", "citation": "Yao, Shunyu; Zhao, Jeffrey; Yu, Dian; Du, Nan; Shafran, Izhak; Narasimhan, Karthik; Cao, Yuan (2022). ReAct: Synergizing Reasoning and Acting in Language Models. https://arxiv.org/abs/2210.03629", "claims": [ "we present \\model , a general paradigm to combine reasoning and acting with language models for solving diverse language reasoning and decision making tasks (Figure 1 )." ], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "we present \\model , a general paradigm to combine reasoning and acting with language models for solving diverse language reasoning and decision making tasks (Figure 1 )." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "we present \\model , a general paradigm to combine reasoning and acting with language models for solving diverse language reasoning and decision making tasks (Figure 1 ).", "locator": "abstract/full-text", "provenance_snippet": "we present \\model , a general paradigm to combine reasoning and acting with language models for solving diverse language reasoning and decision making tasks (Figure 1 ).", "source_ref": "https://arxiv.org/abs/2210.03629" } ], "extraction_completeness": "substantial", "extraction_confidence": "medium", "future_work": [], "id": "src_0baffed4a75c", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [ "Method details extracted from arXiv abs page and ar5iv full text where available." ], "provenance_notes": [ "arXiv abs page: https://arxiv.org/abs/2210.03629", "ar5iv full-text snapshot: knowledge/papers/arxiv_2210.03629.html" ], "reusable_limitations": [], "source_type": "paper", "summary": "Abstract page for arXiv paper 2210.03629: ReAct: Synergizing Reasoning and Acting in Language Models", "theorem_proof_scaffolds": [], "title": "ReAct: Synergizing Reasoning and Acting in Language Models", "url": "https://arxiv.org/abs/2210.03629", "venue": "arXiv", "year": 2022 }, { "assumptions": [], "authors": [ "Gao, Luyu", "Madaan, Aman", "Zhou, Shuyan", "Alon, Uri", "Liu, Pengfei", "Yang, Yiming", "Callan, Jamie", "Neubig, Graham" ], "baseline_details": [], "bibtex": "", "citation": "Gao, Luyu; Madaan, Aman; Zhou, Shuyan; Alon, Uri; Liu, Pengfei; Yang, Yiming; Callan, Jamie; Neubig, Graham (2022). PAL: Program-aided Language Models. https://arxiv.org/abs/2211.10435", "claims": [ "we present Program-Aided Language models ( PaL ): a novel approach that uses the LLM to read natural language problems and generate programs as the intermediate reasoning steps, but offloads the solution step to a runtime such as a Python interpreter." ], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "we present Program-Aided Language models ( PaL ): a novel approach that uses the LLM to read natural language problems and generate programs as the intermediate reasoning steps, but offloads the solution step to a runtime such as a Python interpreter." 
], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "we present Program-Aided Language models ( PaL ): a novel approach that uses the LLM to read natural language problems and generate programs as the intermediate reasoning steps, but offloads the solution step to a runtime such as a Python interpreter.", "locator": "abstract/full-text", "provenance_snippet": "we present Program-Aided Language models ( PaL ): a novel approach that uses the LLM to read natural language problems and generate programs as the intermediate reasoning steps, but offloads the solution step to a runtime such as a Python i", "source_ref": "https://arxiv.org/abs/2211.10435" } ], "extraction_completeness": "substantial", "extraction_confidence": "medium", "future_work": [], "id": "src_b7887b2b5ae2", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [ "Method details extracted from arXiv abs page and ar5iv full text where available." ], "provenance_notes": [ "arXiv abs page: https://arxiv.org/abs/2211.10435", "ar5iv full-text snapshot: knowledge/papers/arxiv_2211.10435.html" ], "reusable_limitations": [], "source_type": "paper", "summary": "Abstract page for arXiv paper 2211.10435: PAL: Program-aided Language Models", "theorem_proof_scaffolds": [], "title": "PAL: Program-aided Language Models", "url": "https://arxiv.org/abs/2211.10435", "venue": "arXiv", "year": 2022 }, { "assumptions": [], "authors": [ "Schick, Timo", "Dwivedi-Yu, Jane", "Dess\u00ec, Roberto", "Raileanu, Roberta", "Lomeli, Maria", "Zettlemoyer, Luke", "Cancedda, Nicola", "Scialom, Thomas" ], "baseline_details": [], "bibtex": "", "citation": "Schick, Timo; Dwivedi-Yu, Jane; Dess\u00ec, Roberto; Raileanu, Roberta; Lomeli, Maria; Zettlemoyer, Luke; Cancedda, Nicola; Scialom, Thomas (2023). Toolformer: Language Models Can Teach Themselves to Use Tools. https://arxiv.org/abs/2302.04761", "claims": [ "We introduce Toolformer , a model trained to decide which APIs to call, when to call them, what arguments to pass, and how to best incorporate the results into future token prediction." ], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "We introduce Toolformer , a model trained to decide which APIs to call, when to call them, what arguments to pass, and how to best incorporate the results into future token prediction." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "We introduce Toolformer , a model trained to decide which APIs to call, when to call them, what arguments to pass, and how to best incorporate the results into future token prediction.", "locator": "abstract/full-text", "provenance_snippet": "We introduce Toolformer , a model trained to decide which APIs to call, when to call them, what arguments to pass, and how to best incorporate the results into future token prediction.", "source_ref": "https://arxiv.org/abs/2302.04761" } ], "extraction_completeness": "substantial", "extraction_confidence": "medium", "future_work": [], "id": "src_e2afe39a9d1b", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [ "Method details extracted from arXiv abs page and ar5iv full text where available." 
], "provenance_notes": [ "arXiv abs page: https://arxiv.org/abs/2302.04761", "ar5iv full-text snapshot: knowledge/papers/arxiv_2302.04761.html" ], "reusable_limitations": [], "source_type": "paper", "summary": "Abstract page for arXiv paper 2302.04761: Toolformer: Language Models Can Teach Themselves to Use Tools", "theorem_proof_scaffolds": [], "title": "Toolformer: Language Models Can Teach Themselves to Use Tools", "url": "https://arxiv.org/abs/2302.04761", "venue": "arXiv", "year": 2023 }, { "assumptions": [], "authors": [ "Shen, Yongliang", "Song, Kaitao", "Tan, Xu", "Li, Dongsheng", "Lu, Weiming", "Zhuang, Yueting" ], "baseline_details": [], "bibtex": "", "citation": "Shen, Yongliang; Song, Kaitao; Tan, Xu; Li, Dongsheng; Lu, Weiming; Zhuang, Yueting (2023). HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face. https://arxiv.org/abs/2303.17580", "claims": [ "we present HuggingGPT, an LLM-powered agent that leverages LLMs (e." ], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "we present HuggingGPT, an LLM-powered agent that leverages LLMs (e." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "we present HuggingGPT, an LLM-powered agent that leverages LLMs (e.", "locator": "abstract/full-text", "provenance_snippet": "we present HuggingGPT, an LLM-powered agent that leverages LLMs (e.", "source_ref": "https://arxiv.org/abs/2303.17580" } ], "extraction_completeness": "substantial", "extraction_confidence": "medium", "future_work": [], "id": "src_57a6127f412a", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [ "Method details extracted from arXiv abs page and ar5iv full text where available." ], "provenance_notes": [ "arXiv abs page: https://arxiv.org/abs/2303.17580", "ar5iv full-text snapshot: knowledge/papers/arxiv_2303.17580.html" ], "reusable_limitations": [], "source_type": "paper", "summary": "Abstract page for arXiv paper 2303.17580: HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face", "theorem_proof_scaffolds": [], "title": "HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face", "url": "https://arxiv.org/abs/2303.17580", "venue": "arXiv", "year": 2023 }, { "assumptions": [], "authors": [ "Liu, Bo", "Jiang, Yuqian", "Zhang, Xiaohan", "Liu, Qiang", "Zhang, Shiqi", "Biswas, Joydeep", "Stone, Peter" ], "baseline_details": [], "citation": "Liu, Bo; Jiang, Yuqian; Zhang, Xiaohan; Liu, Qiang; Zhang, Shiqi; Biswas, Joydeep; Stone, Peter (2023). LLM+P: Empowering Large Language Models with Optimal Planning Proficiency. https://arxiv.org/abs/2304.11477", "claims": [ "Large language models (LLMs) have demonstrated remarkable zero-shot generalization abilities: state-of-the-art chatbots can provide plausible answers to many common questions that arise in daily life. However, so far, LLMs cannot reliably solve long-horizon planning problems. By contrast, classical planners, once a problem is given in a formatted way, can use efficient search algorithms to quickly identify correct, or even optimal, plans. In an effort to get the best of both worlds, this paper introduces LLM+P, the first framework that incorporates the strengths of classical planners into LLMs. LLM+P takes in a natural language description of a planning problem, then returns a correct (or optimal) plan for solving that problem in natural language. 
LLM+P does so by first converting the language description into a file written in the planning domain definition language (PDDL), then leveraging classical planners to quickly find a solution, and then translating the found solution back into natural language. Along with LLM+P, we define a diverse set of different benchmark problems taken from common planning scenarios. Via a comprehensive set of experiments on these benchmark problems, we find that LLM+P is able to provide optimal solutions for most problems, while LLMs fail to provide even feasible plans for most problems. The code and results are publicly available at https://github.com/Cranial-XIX/llm-pddl.git." ], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "Large language models (LLMs) have demonstrated remarkable zero-shot generalization abilities: state-of-the-art chatbots can provide plausible answers to many common questions that arise in daily life. However, so far, LLMs cannot reliably solve long-horizon planning problems. By contrast, classical planners, once a problem is given in a formatted way, can use efficient search algorithms to quickly identify correct, or even optimal, plans. In an effort to get the best of both worlds, this paper introduces LLM+P, the first framework that incorporates the strengths of classical planners into LLMs. LLM+P takes in a natural language description of a planning problem, then returns a correct (or optimal) plan for solving that problem in natural language. LLM+P does so by first converting the language description into a file written in the planning domain definition language (PDDL), then leveraging classical planners to quickly find a solution, and then translating the found solution back into natural language. Along with LLM+P, we define a diverse set of different benchmark problems taken from common planning scenarios. Via a comprehensive set of experiments on these benchmark problems, we find that LLM+P is able to provide optimal solutions for most problems, while LLMs fail to provide even feasible plans for most problems. The code and results are publicly available at https://github.com/Cranial-XIX/llm-pddl.git." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "Large language models (LLMs) have demonstrated remarkable zero-shot generalization abilities: state-of-the-art chatbots can provide plausible answers to many common questions that arise in daily life. However, so far, LLMs cannot reliably solve long-horizon planning problems. By contrast, classical planners, once a problem is given in a formatted way, can use efficient search algorithms to quickly identify correct, o", "locator": "abstract", "provenance_snippet": "Large language models (LLMs) have demonstrated remarkable zero-shot generalization abilities: state-of-the-art chatbots can provide plausible answers to many common questions that arise in daily life. However, so far, LL", "source_ref": "https://arxiv.org/abs/2304.11477" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_fab8a7e15294", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [ "Introduces or evaluates on a benchmark with interactive tasks.", "Analyzes planning or long-horizon reasoning for agent tasks."
], "provenance_notes": [ "full_text_path: knowledge/papers/2304.11477_full.html" ], "reusable_limitations": [], "source_type": "paper", "summary": "Large language models (LLMs) have demonstrated remarkable zero-shot generalization abilities: state-of-the-art chatbots can provide plausible answers to many common questions that arise in daily life. However, so far, LLMs cannot reliably solve long-horizon planning problems. By contrast, classical planners, once a problem is given in a formatted way, can use efficient search algorithms to quickly identify correct, or even optimal, plans. In an effort to get the best of both worlds, this paper introduces LLM+P, the first framework that incorporates the strengths of classical planners into LLMs. LLM+P takes in a natural language description of a planning problem, then returns a correct (or optimal) plan for solving that problem in natural language. LLM+P does so by first converting the language description into a file written in the planning domain definition language (PDDL), then leveraging classical planners to quickly find a solution, and then translating the found solution back into natural language. Along with LLM+P, we define a diverse set of different benchmark problems taken from common planning scenarios. Via a comprehensive set of experiments on these benchmark problems, we find that LLM+P is able to provide optimal solutions for most problems, while LLMs fail to provide even feasible plans for most problems.\\footnote{The code and results are publicly available at https://github.com/Cranial-XIX/llm-pddl.git.", "theorem_proof_scaffolds": [], "title": "LLM+P: Empowering Large Language Models with Optimal Planning Proficiency", "url": "https://arxiv.org/abs/2304.11477", "venue": "arXiv", "year": 2023 }, { "assumptions": [], "authors": [ "Yao, Shunyu", "Yu, Dian", "Zhao, Jeffrey", "Shafran, Izhak", "Griffiths, Thomas L.", "Cao, Yuan", "Narasimhan, Karthik" ], "baseline_details": [], "bibtex": "", "citation": "Yao, Shunyu; Yu, Dian; Zhao, Jeffrey; Shafran, Izhak; Griffiths, Thomas L.; Cao, Yuan; Narasimhan, Karthik (2023). Tree of Thoughts: Deliberate Problem Solving with Large Language Models. https://arxiv.org/abs/2305.10601", "claims": [ "we propose three new problems that challenge existing LM inference methods even with the state-of-the-art language model, GPT-4 [ 23 ] : Game of 24, Creative Writing, and Crosswords (Table 1 )." ], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "we propose three new problems that challenge existing LM inference methods even with the state-of-the-art language model, GPT-4 [ 23 ] : Game of 24, Creative Writing, and Crosswords (Table 1 )." 
], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "we propose three new problems that challenge existing LM inference methods even with the state-of-the-art language model, GPT-4 [ 23 ] : Game of 24, Creative Writing, and Crosswords (Table 1 ).", "locator": "abstract/full-text", "provenance_snippet": "we propose three new problems that challenge existing LM inference methods even with the state-of-the-art language model, GPT-4 [ 23 ] : Game of 24, Creative Writing, and Crosswords (Table 1 ).", "source_ref": "https://arxiv.org/abs/2305.10601" } ], "extraction_completeness": "substantial", "extraction_confidence": "medium", "future_work": [], "id": "src_d2c123e2329d", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [ "Method details extracted from arXiv abs page and ar5iv full text where available." ], "provenance_notes": [ "arXiv abs page: https://arxiv.org/abs/2305.10601", "ar5iv full-text snapshot: knowledge/papers/arxiv_2305.10601.html" ], "reusable_limitations": [], "source_type": "paper", "summary": "Abstract page for arXiv paper 2305.10601: Tree of Thoughts: Deliberate Problem Solving with Large Language Models", "theorem_proof_scaffolds": [], "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models", "url": "https://arxiv.org/abs/2305.10601", "venue": "arXiv", "year": 2023 }, { "assumptions": [], "authors": [ "Patil, Shishir G.", "Zhang, Tianjun", "Wang, Xin", "Gonzalez, Joseph E." ], "baseline_details": [], "citation": "Patil, Shishir G.; Zhang, Tianjun; Wang, Xin; Gonzalez, Joseph E. (2023). Gorilla: Large Language Model Connected with Massive APIs. https://arxiv.org/abs/2305.15334", "claims": [ "Large Language Models (LLMs) have seen an impressive wave of advances recently, with models now excelling in a variety of tasks, such as mathematical reasoning and program synthesis. However, their potential to effectively use tools via API calls remains unfulfilled. This is a challenging task even for today's state-of-the-art LLMs such as GPT-4, largely due to their inability to generate accurate input arguments and their tendency to hallucinate the wrong usage of an API call. We release Gorilla, a finetuned LLaMA-based model that surpasses the performance of GPT-4 on writing API calls. When combined with a document retriever, Gorilla demonstrates a strong capability to adapt to test-time document changes, enabling flexible user updates or version changes. It also substantially mitigates the issue of hallucination, commonly encountered when prompting LLMs directly. To evaluate the model's ability, we introduce APIBench, a comprehensive dataset consisting of HuggingFace, TorchHub, and TensorHub APIs. The successful integration of the retrieval system with Gorilla demonstrates the potential for LLMs to use tools more accurately, keep up with frequently updated documentation, and consequently increase the reliability and applicability of their outputs. Gorilla's code, model, data, and demo are available at https://gorilla.cs.berkeley.edu" ], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "Large Language Models (LLMs) have seen an impressive wave of advances recently, with models now excelling in a variety of tasks, such as mathematical reasoning and program synthesis. However, their potential to effectively use tools via API calls remains unfulfilled. 
This is a challenging task even for today's state-of-the-art LLMs such as GPT-4, largely due to their inability to generate accurate input arguments and their tendency to hallucinate the wrong usage of an API call. We release Gorilla, a finetuned LLaMA-based model that surpasses the performance of GPT-4 on writing API calls. When combined with a document retriever, Gorilla demonstrates a strong capability to adapt to test-time document changes, enabling flexible user updates or version changes. It also substantially mitigates the issue of hallucination, commonly encountered when prompting LLMs directly. To evaluate the model's ability, we introduce APIBench, a comprehensive dataset consisting of HuggingFace, TorchHub, and TensorHub APIs. The successful integration of the retrieval system with Gorilla demonstrates the potential for LLMs to use tools more accurately, keep up with frequently updated documentation, and consequently increase the reliability and applicability of their outputs. Gorilla's code, model, data, and demo are available at https://gorilla.cs.berkeley.edu" ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "Large Language Models (LLMs) have seen an impressive wave of advances recently, with models now excelling in a variety of tasks, such as mathematical reasoning and program synthesis. However, their potential to effectively use tools via API calls remains unfulfilled. This is a challenging task even for today's state-of-the-art LLMs such as GPT-4, largely due to their inability to generate accurate input arguments and", "locator": "abstract", "provenance_snippet": "Large Language Models (LLMs) have seen an impressive wave of advances recently, with models now excelling in a variety of tasks, such as mathematical reasoning and program synthesis. However, their potential to effective", "source_ref": "https://arxiv.org/abs/2305.15334" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_11400d800724", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [ "Studies tool-use behavior for LLM agents." ], "provenance_notes": [ "full_text_path: knowledge/papers/2305.15334_full.html" ], "reusable_limitations": [], "source_type": "paper", "summary": "Large Language Models (LLMs) have seen an impressive wave of advances recently, with models now excelling in a variety of tasks, such as mathematical reasoning and program synthesis. However, their potential to effectively use tools via API calls remains unfulfilled. This is a challenging task even for today's state-of-the-art LLMs such as GPT-4, largely due to their inability to generate accurate input arguments and their tendency to hallucinate the wrong usage of an API call. We release Gorilla, a finetuned LLaMA-based model that surpasses the performance of GPT-4 on writing API calls. When combined with a document retriever, Gorilla demonstrates a strong capability to adapt to test-time document changes, enabling flexible user updates or version changes. It also substantially mitigates the issue of hallucination, commonly encountered when prompting LLMs directly. To evaluate the model's ability, we introduce APIBench, a comprehensive dataset consisting of HuggingFace, TorchHub, and TensorHub APIs. 
The successful integration of the retrieval system with Gorilla demonstrates the potential for LLMs to use tools more accurately, keep up with frequently updated documentation, and consequently increase the reliability and applicability of their outputs. Gorilla's code, model, data, and demo are available at https://gorilla.cs.berkeley.edu", "theorem_proof_scaffolds": [], "title": "Gorilla: Large Language Model Connected with Massive APIs", "url": "https://arxiv.org/abs/2305.15334", "venue": "arXiv", "year": 2023 }, { "assumptions": [], "authors": [ "Wang, Guanzhi", "Xie, Yuqi", "Jiang, Yunfan", "Mandlekar, Ajay", "Xiao, Chaowei", "Zhu, Yuke", "Fan, Linxi", "Anandkumar, Anima" ], "baseline_details": [], "bibtex": "", "citation": "Wang, Guanzhi; Xie, Yuqi; Jiang, Yunfan; Mandlekar, Ajay; Xiao, Chaowei; Zhu, Yuke; Fan, Linxi; Anandkumar, Anima (2023). Voyager: An Open-Ended Embodied Agent with Large Language Models. https://arxiv.org/abs/2305.16291", "claims": [ "We introduce Voyager, the first LLM-powered embodied lifelong learning agent in Minecraft that continuously explores the world, acquires diverse skills, and makes novel discoveries without human intervention." ], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "We introduce Voyager, the first LLM-powered embodied lifelong learning agent in Minecraft that continuously explores the world, acquires diverse skills, and makes novel discoveries without human intervention." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "We introduce Voyager, the first LLM-powered embodied lifelong learning agent in Minecraft that continuously explores the world, acquires diverse skills, and makes novel discoveries without human intervention.", "locator": "abstract/full-text", "provenance_snippet": "We introduce Voyager, the first LLM-powered embodied lifelong learning agent in Minecraft that continuously explores the world, acquires diverse skills, and makes novel discoveries without human intervention.", "source_ref": "https://arxiv.org/abs/2305.16291" } ], "extraction_completeness": "substantial", "extraction_confidence": "medium", "future_work": [], "id": "src_1977ace3e0de", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [ "Method details extracted from arXiv abs page and ar5iv full text where available."
], "provenance_notes": [ "arXiv abs page: https://arxiv.org/abs/2305.16291", "ar5iv full-text snapshot: knowledge/papers/arxiv_2305.16291.html" ], "reusable_limitations": [], "source_type": "paper", "summary": "Abstract page for arXiv paper 2305.16291: Voyager: An Open-Ended Embodied Agent with Large Language Models", "theorem_proof_scaffolds": [], "title": "Voyager: An Open-Ended Embodied Agent with Large Language Models", "url": "https://arxiv.org/abs/2305.16291", "venue": "arXiv", "year": 2023 }, { "assumptions": [], "authors": [ "Liu, Xiao", "Yu, Hao", "Zhang, Hanchen", "Xu, Yifan", "Lei, Xuanyu", "Lai, Hanyu", "Gu, Yu", "Ding, Hangliang", "Men, Kaiwen", "Yang, Kejuan", "Zhang, Shudan", "Deng, Xiang", "Zeng, Aohan", "Du, Zhengxiao", "Zhang, Chenhui", "Shen, Sheng", "Zhang, Tianjun", "Su, Yu", "Sun, Huan", "Huang, Minlie", "Dong, Yuxiao", "Tang, Jie" ], "baseline_details": [], "citation": "Liu, Xiao; Yu, Hao; Zhang, Hanchen; Xu, Yifan; Lei, Xuanyu; Lai, Hanyu; Gu, Yu; Ding, Hangliang; Men, Kaiwen; Yang, Kejuan; Zhang, Shudan; Deng, Xiang; Zeng, Aohan; Du, Zhengxiao; Zhang, Chenhui; Shen, Sheng; Zhang, Tianjun; Su, Yu; Sun, Huan; Huang, Minlie; Dong, Yuxiao; Tang, Jie (2023). AgentBench: Evaluating LLMs as Agents. https://arxiv.org/abs/2308.03688", "claims": [ "The potential of Large Language Model (LLM) as agents has been widely acknowledged recently. Thus, there is an urgent need to quantitatively \\textit{evaluate LLMs as agents} on challenging tasks in interactive environments. We present AgentBench, a multi-dimensional benchmark that consists of 8 distinct environments to assess LLM-as-Agent's reasoning and decision-making abilities. Our extensive test over \\num API-based and open-sourced (OSS) LLMs shows that, while top commercial LLMs present a strong ability of acting as agents in complex environments, there is a significant disparity in performance between them and many OSS competitors that are no larger than 70B. We identify the typical reasons of failures in environments and LLMs, showing that poor long-term reasoning, decision-making, and instruction following abilities are the main obstacles for developing usable LLM agents. Improving instruction following and training on high quality multi-round alignment data could improve agent performance. And different from existing assumptions, training on code present ambivalent impacts on different agent tasks. Datasets, environments, and an integrated evaluation package for AgentBench are released at https://github.com/THUDM/AgentBench." ], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "The potential of Large Language Model (LLM) as agents has been widely acknowledged recently. Thus, there is an urgent need to quantitatively \\textit{evaluate LLMs as agents} on challenging tasks in interactive environments. We present AgentBench, a multi-dimensional benchmark that consists of 8 distinct environments to assess LLM-as-Agent's reasoning and decision-making abilities. Our extensive test over \\num API-based and open-sourced (OSS) LLMs shows that, while top commercial LLMs present a strong ability of acting as agents in complex environments, there is a significant disparity in performance between them and many OSS competitors that are no larger than 70B. We identify the typical reasons of failures in environments and LLMs, showing that poor long-term reasoning, decision-making, and instruction following abilities are the main obstacles for developing usable LLM agents. 
Improving instruction following and training on high quality multi-round alignment data could improve agent performance. And different from existing assumptions, training on code present ambivalent impacts on different agent tasks. Datasets, environments, and an integrated evaluation package for AgentBench are released at https://github.com/THUDM/AgentBench." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "The potential of Large Language Model (LLM) as agents has been widely acknowledged recently. Thus, there is an urgent need to quantitatively evaluate LLMs as agents on challenging tasks in interactive environments. We present AgentBench, a multi-dimensional benchmark that consists of 8 distinct environments to assess LLM-as-Agent's reasoning and decision-making abilities. Our extensive test over \num API-bas", "locator": "abstract", "provenance_snippet": "The potential of Large Language Model (LLM) as agents has been widely acknowledged recently. Thus, there is an urgent need to quantitatively evaluate LLMs as agents on challenging tasks in interactive environmen", "source_ref": "https://arxiv.org/abs/2308.03688" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_1bb96f4eca55", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [ "Introduces or evaluates on a benchmark with interactive tasks.", "Uses interactive environments for evaluation." ], "provenance_notes": [ "full_text_path: knowledge/papers/2308.03688_full.html" ], "reusable_limitations": [], "source_type": "paper", "summary": "The potential of Large Language Model (LLM) as agents has been widely acknowledged recently. Thus, there is an urgent need to quantitatively evaluate LLMs as agents on challenging tasks in interactive environments. We present AgentBench, a multi-dimensional benchmark that consists of 8 distinct environments to assess LLM-as-Agent's reasoning and decision-making abilities. Our extensive test over \num API-based and open-sourced (OSS) LLMs shows that, while top commercial LLMs present a strong ability of acting as agents in complex environments, there is a significant disparity in performance between them and many OSS competitors that are no larger than 70B. We identify the typical reasons of failures in environments and LLMs, showing that poor long-term reasoning, decision-making, and instruction following abilities are the main obstacles for developing usable LLM agents. Improving instruction following and training on high quality multi-round alignment data could improve agent performance. And different from existing assumptions, training on code present ambivalent impacts on different agent tasks.
Datasets, environments, and an integrated evaluation package for AgentBench are released at https://github.com/THUDM/AgentBench.", "theorem_proof_scaffolds": [], "title": "AgentBench: Evaluating LLMs as Agents", "url": "https://arxiv.org/abs/2308.03688", "venue": "arXiv", "year": 2023 }, { "assumptions": [], "authors": [ "Wu, Qingyun", "Bansal, Gagan", "Zhang, Jieyu", "Wu, Yiran", "Li, Beibin", "Zhu, Erkang", "Jiang, Li", "Zhang, Xiaoyun", "Zhang, Shaokun", "Liu, Jiale", "Awadallah, Ahmed Hassan", "White, Ryen W", "Burger, Doug", "Wang, Chi" ], "baseline_details": [], "bibtex": "", "citation": "Wu, Qingyun; Bansal, Gagan; Zhang, Jieyu; Wu, Yiran; Li, Beibin; Zhu, Erkang; Jiang, Li; Zhang, Xiaoyun; Zhang, Shaokun; Liu, Jiale; Awadallah, Ahmed Hassan; White, Ryen W; Burger, Doug; Wang, Chi (2023). AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation. https://arxiv.org/abs/2308.08155", "claims": [ "we present AutoGen, a generalized multi-agent conversation framework (Figure 1), based on the following new concepts." ], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "we present AutoGen, a generalized multi-agent conversation framework (Figure 1), based on the following new concepts." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "we present AutoGen, a generalized multi-agent conversation framework (Figure 1), based on the following new concepts.", "locator": "abstract/full-text", "provenance_snippet": "we present AutoGen, a generalized multi-agent conversation framework (Figure 1), based on the following new concepts.", "source_ref": "https://arxiv.org/abs/2308.08155" } ], "extraction_completeness": "substantial", "extraction_confidence": "medium", "future_work": [], "id": "src_7be0501cb1dc", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [ "Method details extracted from arXiv abs page and ar5iv full text where available." ], "provenance_notes": [ "arXiv abs page: https://arxiv.org/abs/2308.08155", "ar5iv full-text snapshot: knowledge/papers/arxiv_2308.08155.html" ], "reusable_limitations": [], "source_type": "paper", "summary": "Abstract page for arXiv paper 2308.08155: AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", "theorem_proof_scaffolds": [], "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", "url": "https://arxiv.org/abs/2308.08155", "venue": "arXiv", "year": 2023 }, { "assumptions": [], "authors": [ "Besta, Maciej", "Blach, Nils", "Kubicek, Ales", "Gerstenberger, Robert", "Podstawski, Michal", "Gianinazzi, Lukas", "Gajda, Joanna", "Lehmann, Tomasz", "Niewiadomski, Hubert", "Nyczyk, Piotr", "Hoefler, Torsten" ], "baseline_details": [], "bibtex": "", "citation": "Besta, Maciej; Blach, Nils; Kubicek, Ales; Gerstenberger, Robert; Podstawski, Michal; Gianinazzi, Lukas; Gajda, Joanna; Lehmann, Tomasz; Niewiadomski, Hubert; Nyczyk, Piotr; Hoefler, Torsten (2023). Graph of Thoughts: Solving Elaborate Problems with Large Language Models. https://arxiv.org/abs/2308.09687", "claims": [ "We introduce Graph of Thoughts (GoT): a framework that advances prompting capabilities in large language models (LLMs) beyond those offered by paradigms such as Chain-of-Thought or Tree of Thoughts (ToT)."
], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "We introduce Graph of Thoughts (GoT): a framework that advances prompting capabilities in large language models (LLMs) beyond those offered by paradigms such as Chain-of-Thought or Tree of Thoughts (ToT)." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "We introduce Graph of Thoughts (GoT): a framework that advances prompting capabilities in large language models (LLMs) beyond those offered by paradigms such as Chain-of-Thought or Tree of Thoughts (ToT).", "locator": "abstract/full-text", "provenance_snippet": "We introduce Graph of Thoughts (GoT): a framework that advances prompting capabilities in large language models (LLMs) beyond those offered by paradigms such as Chain-of-Thought or Tree of Thoughts (ToT).", "source_ref": "https://arxiv.org/abs/2308.09687" } ], "extraction_completeness": "substantial", "extraction_confidence": "medium", "future_work": [], "id": "src_3f9105492cf9", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [ "Method details extracted from arXiv abs page and ar5iv full text where available." ], "provenance_notes": [ "arXiv abs page: https://arxiv.org/abs/2308.09687", "ar5iv full-text snapshot: knowledge/papers/arxiv_2308.09687.html" ], "reusable_limitations": [], "source_type": "paper", "summary": "Abstract page for arXiv paper 2308.09687: Graph of Thoughts: Solving Elaborate Problems with Large Language Models", "theorem_proof_scaffolds": [], "title": "Graph of Thoughts: Solving Elaborate Problems with Large Language Models", "url": "https://arxiv.org/abs/2308.09687", "venue": "arXiv", "year": 2023 }, { "assumptions": [], "authors": [ "Khattab, Omar", "Singhvi, Arnav", "Maheshwari, Paridhi", "Zhang, Zhiyuan", "Santhanam, Keshav", "Vardhamanan, Sri", "Haq, Saiful", "Sharma, Ashutosh", "Joshi, Thomas T.", "Moazam, Hanna", "Miller, Heather", "Zaharia, Matei", "Potts, Christopher" ], "baseline_details": [], "bibtex": "", "citation": "Khattab, Omar; Singhvi, Arnav; Maheshwari, Paridhi; Zhang, Zhiyuan; Santhanam, Keshav; Vardhamanan, Sri; Haq, Saiful; Sharma, Ashutosh; Joshi, Thomas T.; Moazam, Hanna; Miller, Heather; Zaharia, Matei; Potts, Christopher (2023). DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines. https://arxiv.org/abs/2310.03714", "claims": [ "we introduce DSPy, a programming model that abstracts LM pipelines as text transformation graphs , i." ], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "we introduce DSPy, a programming model that abstracts LM pipelines as text transformation graphs , i." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "we introduce DSPy, a programming model that abstracts LM pipelines as text transformation graphs , i.", "locator": "abstract/full-text", "provenance_snippet": "we introduce DSPy, a programming model that abstracts LM pipelines as text transformation graphs , i.", "source_ref": "https://arxiv.org/abs/2310.03714" } ], "extraction_completeness": "substantial", "extraction_confidence": "medium", "future_work": [], "id": "src_23021c8a5f6c", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [ "Method details extracted from arXiv abs page and ar5iv full text where available." 
], "provenance_notes": [ "arXiv abs page: https://arxiv.org/abs/2310.03714", "ar5iv full-text snapshot: knowledge/papers/arxiv_2310.03714.html" ], "reusable_limitations": [], "source_type": "paper", "summary": "Abstract page for arXiv paper 2310.03714: DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines", "theorem_proof_scaffolds": [], "title": "DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines", "url": "https://arxiv.org/abs/2310.03714", "venue": "arXiv", "year": 2023 }, { "assumptions": [], "authors": [ "Zhou, Andy", "Yan, Kai", "Shlapentokh-Rothman, Michal", "Wang, Haohan", "Wang, Yu-Xiong" ], "baseline_details": [], "citation": "Zhou, Andy; Yan, Kai; Shlapentokh-Rothman, Michal; Wang, Haohan; Wang, Yu-Xiong (2023). Language Agent Tree Search Unifies Reasoning Acting and Planning in Language Models. https://arxiv.org/abs/2310.04406", "claims": [ "While language models (LMs) have shown potential across a range of decision-making tasks, their reliance on simple acting processes limits their broad deployment as autonomous agents. In this paper, we introduce Language Agent Tree Search (LATS) -- the first general framework that synergizes the capabilities of LMs in reasoning, acting, and planning. By leveraging the in-context learning ability of LMs, we integrate Monte Carlo Tree Search into LATS to enable LMs as agents, along with LM-powered value functions and self-reflections for proficient exploration and enhanced decision-making. A key feature of our approach is the incorporation of an environment for external feedback, which offers a more deliberate and adaptive problem-solving mechanism that surpasses the constraints of existing techniques. Our experimental evaluation across diverse domains, including programming, interactive question-answering (QA), web navigation, and math, validates the effectiveness and generality of LATS in decision-making while maintaining competitive or improved reasoning performance. Notably, LATS achieves state-of-the-art pass@1 accuracy (92.7%) for programming on HumanEval with GPT-4 and demonstrates gradient-free performance (average score of 75.9) comparable to gradient-based fine-tuning for web navigation on WebShop with GPT-3.5. Code can be found at https://github.com/lapisrocks/LanguageAgentTreeSearch" ], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "While language models (LMs) have shown potential across a range of decision-making tasks, their reliance on simple acting processes limits their broad deployment as autonomous agents. In this paper, we introduce Language Agent Tree Search (LATS) -- the first general framework that synergizes the capabilities of LMs in reasoning, acting, and planning. By leveraging the in-context learning ability of LMs, we integrate Monte Carlo Tree Search into LATS to enable LMs as agents, along with LM-powered value functions and self-reflections for proficient exploration and enhanced decision-making. A key feature of our approach is the incorporation of an environment for external feedback, which offers a more deliberate and adaptive problem-solving mechanism that surpasses the constraints of existing techniques. Our experimental evaluation across diverse domains, including programming, interactive question-answering (QA), web navigation, and math, validates the effectiveness and generality of LATS in decision-making while maintaining competitive or improved reasoning performance. 
Notably, LATS achieves state-of-the-art pass@1 accuracy (92.7%) for programming on HumanEval with GPT-4 and demonstrates gradient-free performance (average score of 75.9) comparable to gradient-based fine-tuning for web navigation on WebShop with GPT-3.5. Code can be found at https://github.com/lapisrocks/LanguageAgentTreeSearch" ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "While language models (LMs) have shown potential across a range of decision-making tasks, their reliance on simple acting processes limits their broad deployment as autonomous agents. In this paper, we introduce Language Agent Tree Search (LATS) -- the first general framework that synergizes the capabilities of LMs in reasoning, acting, and planning. By leveraging the in-context learning ability of LMs, we integrate ", "locator": "abstract", "provenance_snippet": "While language models (LMs) have shown potential across a range of decision-making tasks, their reliance on simple acting processes limits their broad deployment as autonomous agents. In this paper, we introduce Language", "source_ref": "https://arxiv.org/abs/2310.04406" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_a8ab719bfa87", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [ "Analyzes planning or long-horizon reasoning for agent tasks.", "Uses interactive environments for evaluation." ], "provenance_notes": [ "full_text_path: knowledge/papers/2310.04406_full.html" ], "reusable_limitations": [], "source_type": "paper", "summary": "While language models (LMs) have shown potential across a range of decision-making tasks, their reliance on simple acting processes limits their broad deployment as autonomous agents. In this paper, we introduce Language Agent Tree Search (LATS) -- the first general framework that synergizes the capabilities of LMs in reasoning, acting, and planning. By leveraging the in-context learning ability of LMs, we integrate Monte Carlo Tree Search into LATS to enable LMs as agents, along with LM-powered value functions and self-reflections for proficient exploration and enhanced decision-making. A key feature of our approach is the incorporation of an environment for external feedback, which offers a more deliberate and adaptive problem-solving mechanism that surpasses the constraints of existing techniques. Our experimental evaluation across diverse domains, including programming, interactive question-answering (QA), web navigation, and math, validates the effectiveness and generality of LATS in decision-making while maintaining competitive or improved reasoning performance. Notably, LATS achieves state-of-the-art pass@1 accuracy (92.7%) for programming on HumanEval with GPT-4 and demonstrates gradient-free performance (average score of 75.9) comparable to gradient-based fine-tuning for web navigation on WebShop with GPT-3.5. 
Code can be found at https://github.com/lapisrocks/LanguageAgentTreeSearch", "theorem_proof_scaffolds": [], "title": "Language Agent Tree Search Unifies Reasoning Acting and Planning in Language Models", "url": "https://arxiv.org/abs/2310.04406", "venue": "arXiv", "year": 2023 }, { "assumptions": [], "authors": [ "Jimenez, Carlos E.", "Yang, John", "Wettig, Alexander", "Yao, Shunyu", "Pei, Kexin", "Press, Ofir", "Narasimhan, Karthik" ], "baseline_details": [], "citation": "Jimenez, Carlos E.; Yang, John; Wettig, Alexander; Yao, Shunyu; Pei, Kexin; Press, Ofir; Narasimhan, Karthik (2023). SWE-bench: Can Language Models Resolve Real-World GitHub Issues?. https://arxiv.org/abs/2310.06770", "claims": [ "Language models have outpaced our ability to evaluate them effectively, but for their future development it is essential to study the frontier of their capabilities. We find real-world software engineering to be a rich, sustainable, and challenging testbed for evaluating the next generation of language models. To this end, we introduce SWE-bench, an evaluation framework consisting of 2,294 software engineering problems drawn from real GitHub issues and corresponding pull requests across 12 popular Python repositories. Given a codebase along with a description of an issue to be resolved, a language model is tasked with editing the codebase to address the issue. Resolving issues in SWE-bench frequently requires understanding and coordinating changes across multiple functions, classes, and even files simultaneously, calling for models to interact with execution environments, process extremely long contexts and perform complex reasoning that goes far beyond traditional code generation tasks. Our evaluations show that both state-of-the-art proprietary models and our fine-tuned model SWE-Llama can resolve only the simplest issues. The best-performing model, Claude 2, is able to solve a mere 1.96% of the issues. Advances on SWE-bench represent steps towards LMs that are more practical, intelligent, and autonomous." ], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "Language models have outpaced our ability to evaluate them effectively, but for their future development it is essential to study the frontier of their capabilities. We find real-world software engineering to be a rich, sustainable, and challenging testbed for evaluating the next generation of language models. To this end, we introduce SWE-bench, an evaluation framework consisting of 2,294 software engineering problems drawn from real GitHub issues and corresponding pull requests across 12 popular Python repositories. Given a codebase along with a description of an issue to be resolved, a language model is tasked with editing the codebase to address the issue. Resolving issues in SWE-bench frequently requires understanding and coordinating changes across multiple functions, classes, and even files simultaneously, calling for models to interact with execution environments, process extremely long contexts and perform complex reasoning that goes far beyond traditional code generation tasks. Our evaluations show that both state-of-the-art proprietary models and our fine-tuned model SWE-Llama can resolve only the simplest issues. The best-performing model, Claude 2, is able to solve a mere 1.96% of the issues. Advances on SWE-bench represent steps towards LMs that are more practical, intelligent, and autonomous."
], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "Language models have outpaced our ability to evaluate them effectively, but for their future development it is essential to study the frontier of their capabilities. We find real-world software engineering to be a rich, sustainable, and challenging testbed for evaluating the next generation of language models. To this end, we introduce SWE-bench, an evaluation framework consisting of $2,294$ software engineering prob", "locator": "abstract", "provenance_snippet": "Language models have outpaced our ability to evaluate them effectively, but for their future development it is essential to study the frontier of their capabilities. We find real-world software engineering to be a rich, ", "source_ref": "https://arxiv.org/abs/2310.06770" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_e0cb42ccf284", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [ "Uses interactive environments for evaluation." ], "provenance_notes": [ "full_text_path: knowledge/papers/2310.06770_full.html" ], "reusable_limitations": [], "source_type": "paper", "summary": "Language models have outpaced our ability to evaluate them effectively, but for their future development it is essential to study the frontier of their capabilities. We find real-world software engineering to be a rich, sustainable, and challenging testbed for evaluating the next generation of language models. To this end, we introduce SWE-bench, an evaluation framework consisting of $2,294$ software engineering problems drawn from real GitHub issues and corresponding pull requests across $12$ popular Python repositories. Given a codebase along with a description of an issue to be resolved, a language model is tasked with editing the codebase to address the issue. Resolving issues in SWE-bench frequently requires understanding and coordinating changes across multiple functions, classes, and even files simultaneously, calling for models to interact with execution environments, process extremely long contexts and perform complex reasoning that goes far beyond traditional code generation tasks. Our evaluations show that both state-of-the-art proprietary models and our fine-tuned model SWE-Llama can resolve only the simplest issues. The best-performing model, Claude 2, is able to solve a mere $1.96$% of the issues. Advances on SWE-bench represent steps towards LMs that are more practical, intelligent, and autonomous.", "theorem_proof_scaffolds": [], "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", "url": "https://arxiv.org/abs/2310.06770", "venue": "arXiv", "year": 2023 }, { "assumptions": [], "authors": [ "Weizhou Shen", "Chenliang Li", "Hongzhan Chen", "Ming Yan", "Xiaojun Quan", "Hehong Chen", "Ji Zhang", "Fei Huang" ], "baseline_details": [], "bibtex": "", "citation": "Weizhou Shen; Chenliang Li; Hongzhan Chen; Ming Yan; Xiaojun Quan; Hehong Chen; Ji Zhang; Fei Huang (2024). Small LLMs Are Weak Tool Learners: A Multi-LLM Agent. https://arxiv.org/abs/2401.07324", "claims": [], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "To overcome these challenges, we propose a novel approach that decomposes the aforementioned capabilities into a planner, caller, and summarizer.", "To effectively train this framework, we introduce a two-stage training paradigm." 
], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "To overcome these challenges, we propose a novel approach that decomposes the aforementioned capabilities into a planner, caller, and summarizer.", "locator": "abstract", "provenance_snippet": "To overcome these challenges, we propose a novel approach that decomposes the aforementioned capabilities into a planner, caller, and summarizer.", "source_ref": "https://doi.org/10.48550/arxiv.2401.07324" }, { "atom_type": "procedure", "confidence": "medium", "content": "Evaluation across various tool-use benchmarks illustrates that our proposed multi-LLM framework surpasses the traditional single-LLM approach, highlighting its efficacy and advantages in tool learning.", "locator": "abstract", "provenance_snippet": "Evaluation across various tool-use benchmarks illustrates that our proposed multi-LLM framework surpasses the traditional single-LLM approach, highlighting its efficacy and advanta", "source_ref": "https://doi.org/10.48550/arxiv.2401.07324" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_82d98f4e68f9", "key_equations": [], "limitations": [ "The challenge of tool use demands that LLMs not only understand user queries and generate answers accurately but also excel in task planning, tool invocation, and result summarization.", "While traditional works focus on training a single LLM with all these capabilities, performance limitations become apparent, particularly with smaller models.", "To overcome these challenges, we propose a novel approach that decomposes the aforementioned capabilities into a planner, caller, and summarizer." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Large Language Model (LLM) agents significantly extend the capabilities of standalone LLMs, empowering them to interact with external tools (e.g., APIs, functions) and complete various tasks in a self-directed fashion. The challenge of tool use demands that LLMs not only understand user queries and generate answers accurately but also excel in task planning, tool invocation, and result summarization.", "theorem_proof_scaffolds": [], "title": "Small LLMs Are Weak Tool Learners: A Multi-LLM Agent", "url": "https://arxiv.org/abs/2401.07324", "venue": "", "year": 2024 }, { "assumptions": [ "LLMs can operate as semantic parsers for natural and formal instructions.", "Trajectory-level node distribution comparisons can evaluate workflow quality." ], "authors": [ "Marius-Constantin Dinu", "Claudiu Leoveanu-Condrei", "Markus Holzleitner", "Werner Zellinger", "Sepp Hochreiter" ], "baseline_details": [], "bibtex": "", "citation": "Marius-Constantin Dinu; Claudiu Leoveanu-Condrei; Markus Holzleitner; Werner Zellinger; Sepp Hochreiter (2024). SymbolicAI: A framework for logic-based approaches combining generative models and solvers. https://arxiv.org/abs/2402.00854", "claims": [ "Solver-augmented compositional workflows can improve explainability and task alignment." ], "comparator_lineage": [], "conclusions": [ "Symbolic and generative components can be unified in practical computational graphs." ], "contradiction_pairs": [], "contributions": [ "Defines SymbolicAI operations for compositional multi-modal workflows.", "Introduces VERTEX score for trajectory-aware quality evaluation.", "Presents benchmark workflows and links framework and benchmark code." 
], "evidence_atoms": [ { "atom_type": "equation", "confidence": "medium", "content": "\\mathcal{D}(\\mathbb{P}_{gen},\\mathbb{P}_{ref})", "locator": "Section 6", "provenance_snippet": "cumulative distance between generated and reference distributions", "source_ref": "https://doi.org/10.48550/arXiv.2402.00854" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [ "Extend benchmark tasks and solver integrations across broader domains." ], "id": "src_b2199aad751f", "key_equations": [ "\\mathcal{D}(\\mathbb{P}_{gen},\\mathbb{P}_{ref})" ], "limitations": [ "Chunked stream processing can lose shared context across chunks." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Introduces SymbolicAI as a modular neuro-symbolic framework and VERTEX trajectory cross-similarity score for multi-step workflow evaluation.", "theorem_proof_scaffolds": [], "title": "SymbolicAI: A framework for logic-based approaches combining generative models and solvers", "url": "https://arxiv.org/abs/2402.00854", "venue": "", "year": 2024 }, { "assumptions": [], "authors": [ "Wang, Xingyao", "Chen, Yangyi", "Yuan, Lifan", "Zhang, Yizhe", "Li, Yunzhu", "Peng, Hao", "Ji, Heng" ], "baseline_details": [], "bibtex": "", "citation": "Wang, Xingyao; Chen, Yangyi; Yuan, Lifan; Zhang, Yizhe; Li, Yunzhu; Peng, Hao; Ji, Heng (2024). Executable Code Actions Elicit Better LLM Agents. https://arxiv.org/abs/2402.01030", "claims": [ "We present results in Tab." ], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "We present results in Tab." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "We present results in Tab.", "locator": "abstract/full-text", "provenance_snippet": "We present results in Tab.", "source_ref": "https://arxiv.org/abs/2402.01030" } ], "extraction_completeness": "substantial", "extraction_confidence": "medium", "future_work": [], "id": "src_a12ff844ce1b", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [ "Method details extracted from arXiv abs page and ar5iv full text where available." ], "provenance_notes": [ "arXiv abs page: https://arxiv.org/abs/2402.01030", "ar5iv full-text snapshot: knowledge/papers/arxiv_2402.01030.html" ], "reusable_limitations": [], "source_type": "paper", "summary": "Abstract page for arXiv paper 2402.01030: Executable Code Actions Elicit Better LLM Agents", "theorem_proof_scaffolds": [], "title": "Executable Code Actions Elicit Better LLM Agents", "url": "https://arxiv.org/abs/2402.01030", "venue": "arXiv", "year": 2024 }, { "assumptions": [], "authors": [ "Jian Xie", "Kai Zhang", "Jiangjie Chen", "Tinghui Zhu", "Renze Lou", "Yuandong Tian", "Yanghua Xiao", "Yu Su" ], "baseline_details": [], "bibtex": "", "citation": "Jian Xie; Kai Zhang; Jiangjie Chen; Tinghui Zhu; Renze Lou; Yuandong Tian; Yanghua Xiao; Yu Su (2024). TravelPlanner: A Benchmark for Real-World Planning with Language Agents. https://arxiv.org/abs/2402.01622", "claims": [ "Recently, language agents powered by large language models (LLMs) have shown interesting capabilities such as tool use and reasoning.", "Comprehensive evaluations show that the current language agents are not yet capable of handling such complex planning tasks-even GPT-4 only achieves a success rate of 0.6%." 
], "comparator_lineage": [], "conclusions": [ "Recently, language agents powered by large language models (LLMs) have shown interesting capabilities such as tool use and reasoning.", "Comprehensive evaluations show that the current language agents are not yet capable of handling such complex planning tasks-even GPT-4 only achieves a success rate of 0.6%." ], "contradiction_pairs": [], "contributions": [ "To advance this investigation, we propose TravelPlanner, a new planning benchmark that focuses on travel planning, a common real-world planning scenario." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "To advance this investigation, we propose TravelPlanner, a new planning benchmark that focuses on travel planning, a common real-world planning scenario.", "locator": "abstract", "provenance_snippet": "To advance this investigation, we propose TravelPlanner, a new planning benchmark that focuses on travel planning, a common real-world planning scenario.", "source_ref": "https://doi.org/10.48550/arxiv.2402.01622" }, { "atom_type": "procedure", "confidence": "medium", "content": "To advance this investigation, we propose TravelPlanner, a new planning benchmark that focuses on travel planning, a common real-world planning scenario.", "locator": "abstract", "provenance_snippet": "To advance this investigation, we propose TravelPlanner, a new planning benchmark that focuses on travel planning, a common real-world planning scenario.", "source_ref": "https://doi.org/10.48550/arxiv.2402.01622" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_01fdaa0fb2ad", "key_equations": [], "limitations": [ "However, we note that the mere possibility for language agents to tackle such a complex problem is in itself non-trivial progress." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Planning has been part of the core pursuit for artificial intelligence since its conception, but earlier AI agents mostly focused on constrained settings because many of the cognitive substrates necessary for human-level planning have been lacking. Recently, language agents powered by large language models (LLMs) have shown interesting capabilities such as tool use and reasoning.", "theorem_proof_scaffolds": [], "title": "TravelPlanner: A Benchmark for Real-World Planning with Language Agents", "url": "https://arxiv.org/abs/2402.01622", "venue": "", "year": 2024 }, { "assumptions": [], "authors": [ "Pei Zhou", "Jay Pujara", "Xiang Ren", "Xinyun Chen", "Heng-Tze Cheng", "Quoc V. Le", "Ed H.", "Denny Zhou", "Swaroop Mishra", "Huaixiu Zheng" ], "baseline_details": [], "bibtex": "", "citation": "Pei Zhou; Jay Pujara; Xiang Ren; Xinyun Chen; Heng-Tze Cheng; Quoc V. Le; Ed H.; Denny Zhou; Swaroop Mishra; Huaixiu Zheng (2024). Self-Discover: Large Language Models Self-Compose Reasoning Structures. 
https://arxiv.org/abs/2402.03620", "claims": [ "SELF-DISCOVER substantially improves GPT-4 and PaLM 2's performance on challenging reasoning benchmarks such as BigBench-Hard, grounded agent reasoning, and MATH, by as much as 32% compared to Chain of Thought (CoT).", "Furthermore, SELF-DISCOVER outperforms inference-intensive methods such as CoT-Self-Consistency by more than 20%, while requiring 10-40x fewer inference compute.", "Finally, we show that the self-discovered reasoning structures are universally applicable across model families: from PaLM 2-L to GPT-4, and from GPT-4 to Llama2, and share commonalities with human reasoning patterns." ], "comparator_lineage": [], "conclusions": [ "SELF-DISCOVER substantially improves GPT-4 and PaLM 2's performance on challenging reasoning benchmarks such as BigBench-Hard, grounded agent reasoning, and MATH, by as much as 32% compared to Chain of Thought (CoT).", "Furthermore, SELF-DISCOVER outperforms inference-intensive methods such as CoT-Self-Consistency by more than 20%, while requiring 10-40x fewer inference compute." ], "contradiction_pairs": [], "contributions": [ "We introduce SELF-DISCOVER, a general framework for LLMs to self-discover the task-intrinsic reasoning structures to tackle complex reasoning problems that are challenging for typical prompting methods." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "We introduce SELF-DISCOVER, a general framework for LLMs to self-discover the task-intrinsic reasoning structures to tackle complex reasoning problems that are challenging for typical prompting methods.", "locator": "abstract", "provenance_snippet": "We introduce SELF-DISCOVER, a general framework for LLMs to self-discover the task-intrinsic reasoning structures to tackle complex reasoning problems that are challenging for typi", "source_ref": "https://doi.org/10.48550/arxiv.2402.03620" }, { "atom_type": "procedure", "confidence": "medium", "content": "SELF-DISCOVER substantially improves GPT-4 and PaLM 2's performance on challenging reasoning benchmarks such as BigBench-Hard, grounded agent reasoning, and MATH, by as much as 32% compared to Chain of Thought (CoT).", "locator": "abstract", "provenance_snippet": "SELF-DISCOVER substantially improves GPT-4 and PaLM 2's performance on challenging reasoning benchmarks such as BigBench-Hard, grounded agent reasoning, and MATH, by as much as 32%", "source_ref": "https://doi.org/10.48550/arxiv.2402.03620" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_2f97a367b96e", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "We introduce SELF-DISCOVER, a general framework for LLMs to self-discover the task-intrinsic reasoning structures to tackle complex reasoning problems that are challenging for typical prompting methods. 
Core to the framework is a self-discovery process where LLMs select multiple atomic reasoning modules such as critical thinking and step-by-step thinking, and compose them into an explicit reasoning structure for LLMs to follow during decoding.", "theorem_proof_scaffolds": [], "title": "Self-Discover: Large Language Models Self-Compose Reasoning Structures", "url": "https://arxiv.org/abs/2402.03620", "venue": "", "year": 2024 }, { "assumptions": [], "authors": [ "Hong, Sirui", "Lin, Yizhang", "Liu, Bang", "Liu, Bangbang", "Wu, Binhao", "Zhang, Ceyao", "Wei, Chenxing", "Li, Danyang", "Chen, Jiaqi", "Zhang, Jiayi", "Wang, Jinlin", "Zhang, Li", "Zhang, Lingyao", "Yang, Min", "Zhuge, Mingchen", "Guo, Taicheng", "Zhou, Tuo", "Tao, Wei", "Tang, Xiangru", "Lu, Xiangtao", "Zheng, Xiawu", "Liang, Xinbing", "Fei, Yaying", "Cheng, Yuheng", "Gou, Zhibin", "Xu, Zongze", "Wu, Chenglin" ], "baseline_details": [], "citation": "Hong, Sirui; Lin, Yizhang; Liu, Bang; Liu, Bangbang; Wu, Binhao; Zhang, Ceyao; Wei, Chenxing; Li, Danyang; Chen, Jiaqi; Zhang, Jiayi; Wang, Jinlin; Zhang, Li; Zhang, Lingyao; Yang, Min; Zhuge, Mingchen; Guo, Taicheng; Zhou, Tuo; Tao, Wei; Tang, Xiangru; Lu, Xiangtao; Zheng, Xiawu; Liang, Xinbing; Fei, Yaying; Cheng, Yuheng; Gou, Zhibin; Xu, Zongze; Wu, Chenglin (2024). Data Interpreter: An LLM Agent For Data Science. https://arxiv.org/abs/2402.18679", "claims": [ "Large Language Model (LLM)-based agents have shown effectiveness across many applications. However, their use in data science scenarios requiring solving long-term interconnected tasks, dynamic data adjustments and domain expertise remains challenging. Previous approaches primarily focus on individual tasks, making it difficult to assess the complete data science workflow. Moreover, they struggle to handle real-time changes in intermediate data and fail to adapt dynamically to evolving task dependencies inherent to data science problems. In this paper, we present Data Interpreter, an LLM-based agent designed to automatically solve various data science problems end-to-end. Our Data Interpreter incorporates two key modules: 1) Hierarchical Graph Modeling, which breaks down complex problems into manageable subproblems, enabling dynamic node generation and graph optimization; and 2) Programmable Node Generation, a technique that refines and verifies each subproblem to iteratively improve code generation results and robustness. Extensive experiments consistently demonstrate the superiority of Data Interpreter. On InfiAgent-DABench, it achieves a 25% performance boost, raising accuracy from 75.9% to 94.9%. For machine learning and open-ended tasks, it improves performance from 88% to 95%, and from 60% to 97%, respectively. Moreover, on the MATH dataset, Data Interpreter achieves remarkable performance with a 26% improvement compared to state-of-the-art baselines. The code is available at https://github.com/geekan/MetaGPT." ], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "Large Language Model (LLM)-based agents have shown effectiveness across many applications. However, their use in data science scenarios requiring solving long-term interconnected tasks, dynamic data adjustments and domain expertise remains challenging. Previous approaches primarily focus on individual tasks, making it difficult to assess the complete data science workflow. 
Moreover, they struggle to handle real-time changes in intermediate data and fail to adapt dynamically to evolving task dependencies inherent to data science problems. In this paper, we present Data Interpreter, an LLM-based agent designed to automatically solve various data science problems end-to-end. Our Data Interpreter incorporates two key modules: 1) Hierarchical Graph Modeling, which breaks down complex problems into manageable subproblems, enabling dynamic node generation and graph optimization; and 2) Programmable Node Generation, a technique that refines and verifies each subproblem to iteratively improve code generation results and robustness. Extensive experiments consistently demonstrate the superiority of Data Interpreter. On InfiAgent-DABench, it achieves a 25% performance boost, raising accuracy from 75.9% to 94.9%. For machine learning and open-ended tasks, it improves performance from 88% to 95%, and from 60% to 97%, respectively. Moreover, on the MATH dataset, Data Interpreter achieves remarkable performance with a 26% improvement compared to state-of-the-art baselines. The code is available at https://github.com/geekan/MetaGPT." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "Large Language Model (LLM)-based agents have shown effectiveness across many applications. However, their use in data science scenarios requiring solving long-term interconnected tasks, dynamic data adjustments and domain expertise remains challenging. Previous approaches primarily focus on individual tasks, making it difficult to assess the complete data science workflow. Moreover, they struggle to handle real-time ", "locator": "abstract", "provenance_snippet": "Large Language Model (LLM)-based agents have shown effectiveness across many applications. However, their use in data science scenarios requiring solving long-term interconnected tasks, dynamic data adjustments and domai", "source_ref": "https://arxiv.org/abs/2402.18679" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_9288c207bf56", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [ "full_text_path: knowledge/papers/2402.18679_full.html" ], "reusable_limitations": [], "source_type": "paper", "summary": "Large Language Model (LLM)-based agents have shown effectiveness across many applications. However, their use in data science scenarios requiring solving long-term interconnected tasks, dynamic data adjustments and domain expertise remains challenging. Previous approaches primarily focus on individual tasks, making it difficult to assess the complete data science workflow. Moreover, they struggle to handle real-time changes in intermediate data and fail to adapt dynamically to evolving task dependencies inherent to data science problems. In this paper, we present Data Interpreter, an LLM-based agent designed to automatically solve various data science problems end-to-end. Our Data Interpreter incorporates two key modules: 1) Hierarchical Graph Modeling, which breaks down complex problems into manageable subproblems, enabling dynamic node generation and graph optimization; and 2) Programmable Node Generation, a technique that refines and verifies each subproblem to iteratively improve code generation results and robustness. Extensive experiments consistently demonstrate the superiority of Data Interpreter. 
On InfiAgent-DABench, it achieves a 25% performance boost, raising accuracy from 75.9% to 94.9%. For machine learning and open-ended tasks, it improves performance from 88% to 95%, and from 60% to 97%, respectively. Moreover, on the MATH dataset, Data Interpreter achieves remarkable performance with a 26% improvement compared to state-of-the-art baselines. The code is available at https://github.com/geekan/MetaGPT.", "theorem_proof_scaffolds": [], "title": "Data Interpreter: An LLM Agent For Data Science", "url": "https://arxiv.org/abs/2402.18679", "venue": "arXiv", "year": 2024 }, { "assumptions": [], "authors": [ "Zixian Ma", "Weikai Huang", "Jieyu Zhang", "Tanmay Gupta", "Ranjay Krishna" ], "baseline_details": [], "bibtex": "", "citation": "Zixian Ma; Weikai Huang; Jieyu Zhang; Tanmay Gupta; Ranjay Krishna (2024). m&m's: A Benchmark to Evaluate Tool-Use for multi-step multi-modal Tasks. https://arxiv.org/abs/2403.11085", "claims": [ "Does feedback improve planning?" ], "comparator_lineage": [], "conclusions": [ "Does feedback improve planning?" ], "contradiction_pairs": [], "contributions": [ "To answer these questions and more, we introduce m&m's: a benchmark containing 4K+ multi-step multi-modal tasks involving 33 tools that include multi-modal models, (free) public APIs, and image processing modules." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "To answer these questions and more, we introduce m&m's: a benchmark containing 4K+ multi-step multi-modal tasks involving 33 tools that include multi-modal models, (free) public APIs, and image processing modules.", "locator": "abstract", "provenance_snippet": "To answer these questions and more, we introduce m&m's: a benchmark containing 4K+ multi-step multi-modal tasks involving 33 tools that include multi-modal models, (free) publi", "source_ref": "https://doi.org/10.48550/arxiv.2403.11085" }, { "atom_type": "procedure", "confidence": "medium", "content": "However, the lack of standardized benchmarks for evaluating LLMs as planners for multi-step multi-modal tasks has prevented a systematic study of planner design decisions.", "locator": "abstract", "provenance_snippet": "However, the lack of standardized benchmarks for evaluating LLMs as planners for multi-step multi-modal tasks has prevented a systematic study of planner design decisions.", "source_ref": "https://doi.org/10.48550/arxiv.2403.11085" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_4483461cffa2", "key_equations": [], "limitations": [ "However, the lack of standardized benchmarks for evaluating LLMs as planners for multi-step multi-modal tasks has prevented a systematic study of planner design decisions." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Real-world multi-modal problems are rarely solved by a single machine learning model, and often require multi-step computational plans that involve stitching several models. Tool-augmented LLMs hold tremendous promise for automating the generation of such computational plans.", "theorem_proof_scaffolds": [], "title": "m&m's: A Benchmark to Evaluate Tool-Use for multi-step multi-modal Tasks", "url": "https://arxiv.org/abs/2403.11085", "venue": "", "year": 2024 }, { "assumptions": [], "authors": [ "Yixing Jiang", "Kameron Collin Black", "Gloria Geng", "Daniel J. Park", "James Zou", "Andrew Y. Ng", "Jonathan Chen" ], "baseline_details": [], "bibtex": "",
"citation": "Yixing Jiang; Kameron Collin Black; Gloria Geng; Daniel J. Park; James Zou; Andrew Y. Ng; Jonathan Chen (2025). MedAgentBench: A Realistic Virtual EHR Environment to Benchmark Medical LLM Agents. https://arxiv.org/abs/2501.14654", "claims": [ "Recent large language models (LLMs) have demonstrated significant advancements, particularly in their ability to serve as agents thereby surpassing their traditional role as chatbots.", "However, there is still substantial space for improvement which gives the community a next direction to optimize.", "MedAgentBench establishes this and is publicly available at https://github.com/stanfordmlgroup/MedAgentBench , offering a valuable framework for model developers to track progress and drive continuous improvements in the agent capabilities of large language models within the medical domain." ], "comparator_lineage": [], "conclusions": [ "Recent large language models (LLMs) have demonstrated significant advancements, particularly in their ability to serve as agents thereby surpassing their traditional role as chatbots.", "However, there is still substantial space for improvement which gives the community a next direction to optimize." ], "contradiction_pairs": [], "contributions": [ "To address this gap, we introduce MedAgentBench, a broad evaluation suite designed to assess the agent capabilities of large language models within medical records contexts." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "To address this gap, we introduce MedAgentBench, a broad evaluation suite designed to assess the agent capabilities of large language models within medical records contexts.", "locator": "abstract", "provenance_snippet": "To address this gap, we introduce MedAgentBench, a broad evaluation suite designed to assess the agent capabilities of large language models within medical records contexts.", "source_ref": "https://doi.org/10.48550/arxiv.2501.14654" }, { "atom_type": "procedure", "confidence": "medium", "content": "However, a standardized dataset to benchmark the agent capabilities of LLMs in medical applications is currently lacking, making the evaluation of LLMs on complex tasks in interactive healthcare environments challenging.", "locator": "abstract", "provenance_snippet": "However, a standardized dataset to benchmark the agent capabilities of LLMs in medical applications is currently lacking, making the evaluation of LLMs on complex tasks in interact", "source_ref": "https://doi.org/10.48550/arxiv.2501.14654" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_139ff817eb0e", "key_equations": [], "limitations": [ "However, a standardized dataset to benchmark the agent capabilities of LLMs in medical applications is currently lacking, making the evaluation of LLMs on complex tasks in interactive healthcare environments challenging.", "However, there is still substantial space for improvement which gives the community a next direction to optimize." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Recent large language models (LLMs) have demonstrated significant advancements, particularly in their ability to serve as agents thereby surpassing their traditional role as chatbots. 
These agents can leverage their planning and tool utilization capabilities to address tasks specified at a high level.", "theorem_proof_scaffolds": [], "title": "MedAgentBench: A Realistic Virtual EHR Environment to Benchmark Medical LLM Agents", "url": "https://arxiv.org/abs/2501.14654", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Avinash Patil", "Aryan Jadon" ], "baseline_details": [], "bibtex": "", "citation": "Avinash Patil; Aryan Jadon (2025). Advancing Reasoning in Large Language Models: Promising Methods and Approaches. https://arxiv.org/abs/2502.03671", "claims": [], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "Large Language Models (LLMs) have succeeded remarkably in various natural language processing (NLP) tasks, yet their reasoning capabilities remain a fundamental challenge." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "Large Language Models (LLMs) have succeeded remarkably in various natural language processing (NLP) tasks, yet their reasoning capabilities remain a fundamental challenge.", "locator": "abstract", "provenance_snippet": "Large Language Models (LLMs) have succeeded remarkably in various natural language processing (NLP) tasks, yet their reasoning capabilities remain a fundamental challenge.", "source_ref": "https://doi.org/10.48550/arxiv.2502.03671" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_932349758ddc", "key_equations": [], "limitations": [ "Large Language Models (LLMs) have succeeded remarkably in various natural language processing (NLP) tasks, yet their reasoning capabilities remain a fundamental challenge.", "Additionally, we explore evaluation frameworks used to assess reasoning in LLMs and highlight open challenges, such as hallucinations, robustness, and reasoning generalization across diverse tasks." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Large Language Models (LLMs) have succeeded remarkably in various natural language processing (NLP) tasks, yet their reasoning capabilities remain a fundamental challenge. While LLMs exhibit impressive fluency and factual recall, their ability to perform complex reasoning, spanning logical deduction, mathematical problem-solving, commonsense inference, and multi-step reasoning, often falls short of human expectations.", "theorem_proof_scaffolds": [], "title": "Advancing Reasoning in Large Language Models: Promising Methods and Approaches", "url": "https://arxiv.org/abs/2502.03671", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Hanmeng Liu", "Zhizhang Fu", "Mengru Ding", "Ruoxi Ning", "Chaoli Zhang", "Xiaozhang Liu", "Yue Zhang" ], "baseline_details": [], "bibtex": "", "citation": "Hanmeng Liu; Zhizhang Fu; Mengru Ding; Ruoxi Ning; Chaoli Zhang; Xiaozhang Liu; Yue Zhang (2025). Logical Reasoning in Large Language Models: A Survey. https://arxiv.org/abs/2502.09100", "claims": [ "With the emergence of advanced reasoning models like OpenAI o3 and DeepSeek-R1, large language models (LLMs) have demonstrated remarkable reasoning capabilities." ], "comparator_lineage": [], "conclusions": [ "With the emergence of advanced reasoning models like OpenAI o3 and DeepSeek-R1, large language models (LLMs) have demonstrated remarkable reasoning capabilities." 
], "contradiction_pairs": [], "contributions": [ "With the emergence of advanced reasoning models like OpenAI o3 and DeepSeek-R1, large language models (LLMs) have demonstrated remarkable reasoning capabilities." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "With the emergence of advanced reasoning models like OpenAI o3 and DeepSeek-R1, large language models (LLMs) have demonstrated remarkable reasoning capabilities.", "locator": "abstract", "provenance_snippet": "With the emergence of advanced reasoning models like OpenAI o3 and DeepSeek-R1, large language models (LLMs) have demonstrated remarkable reasoning capabilities.", "source_ref": "https://doi.org/10.48550/arxiv.2502.09100" }, { "atom_type": "procedure", "confidence": "medium", "content": "It outlines the scope of logical reasoning in LLMs, its theoretical foundations, and the benchmarks used to evaluate reasoning proficiency.", "locator": "abstract", "provenance_snippet": "It outlines the scope of logical reasoning in LLMs, its theoretical foundations, and the benchmarks used to evaluate reasoning proficiency.", "source_ref": "https://doi.org/10.48550/arxiv.2502.09100" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_865b0aa37ef4", "key_equations": [], "limitations": [ "However, their ability to perform rigorous logical reasoning remains an open question." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "With the emergence of advanced reasoning models like OpenAI o3 and DeepSeek-R1, large language models (LLMs) have demonstrated remarkable reasoning capabilities. However, their ability to perform rigorous logical reasoning remains an open question.", "theorem_proof_scaffolds": [], "title": "Logical Reasoning in Large Language Models: A Survey", "url": "https://arxiv.org/abs/2502.09100", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Bo Lin", "Shangwen Wang", "Yihao Qin", "Liqian Chen", "Xiaoguang Mao" ], "baseline_details": [], "bibtex": "", "citation": "Bo Lin; Shangwen Wang; Yihao Qin; Liqian Chen; Xiaoguang Mao (2025). Large Language Models-Aided Program Debloating. https://arxiv.org/abs/2503.08969", "claims": [ "Extensive evaluations on widely used benchmarks demonstrate the efficacy of LEADER.", "These results demonstrate that LEADER surpasses the state-of-the-art tool CovA in functionality and security." ], "comparator_lineage": [], "conclusions": [ "Extensive evaluations on widely used benchmarks demonstrate the efficacy of LEADER.", "These results demonstrate that LEADER surpasses the state-of-the-art tool CovA in functionality and security." ], "contradiction_pairs": [], "contributions": [ "To address these limitations, we propose LEADER, a program debloating framework enhanced by Large Language Models (LLMs), which leverages their semantic understanding, generative capabilities, and decision-making strengths." 
], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "To address these limitations, we propose LEADER, a program debloating framework enhanced by Large Language Models (LLMs), which leverages their semantic understanding, generative capabilities, and decision-making strengths.", "locator": "abstract", "provenance_snippet": "To address these limitations, we propose LEADER, a program debloating framework enhanced by Large Language Models (LLMs), which leverages their semantic understanding, generative c", "source_ref": "https://doi.org/10.48550/arxiv.2503.08969" }, { "atom_type": "procedure", "confidence": "medium", "content": "Extensive evaluations on widely used benchmarks demonstrate the efficacy of LEADER.", "locator": "abstract", "provenance_snippet": "Extensive evaluations on widely used benchmarks demonstrate the efficacy of LEADER.", "source_ref": "https://doi.org/10.48550/arxiv.2503.08969" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_28680141c40f", "key_equations": [], "limitations": [ "As software grows in complexity to accommodate diverse features and platforms, software bloating has emerged as a significant challenge, adversely affecting performance and security.", "However, existing approaches inadequately address the dual objectives of debloating: maintaining functionality by preserving essential features and enhancing security by reducing security issues.", "However, these approaches frequently overfit provided inputs, leading to functionality loss and potential security vulnerabilities." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "As software grows in complexity to accommodate diverse features and platforms, software bloating has emerged as a significant challenge, adversely affecting performance and security. However, existing approaches inadequately address the dual objectives of debloating: maintaining functionality by preserving essential features and enhancing security by reducing security issues.", "theorem_proof_scaffolds": [], "title": "Large Language Models-Aided Program Debloating", "url": "https://arxiv.org/abs/2503.08969", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Asaf Yehudai", "Lilach Eden", "Alan Li", "Guy Uziel", "Yilun Zhao", "Roy Bar-Haim", "Arman Cohan", "Michal Shmueli-Scheuer" ], "baseline_details": [], "bibtex": "", "citation": "Asaf Yehudai; Lilach Eden; Alan Li; Guy Uziel; Yilun Zhao; Roy Bar-Haim; Arman Cohan; Michal Shmueli-Scheuer (2025). Survey on Evaluation of LLM-based Agents. https://arxiv.org/abs/2503.16416", "claims": [], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "This paper provides the first comprehensive survey of evaluation methodologies for these increasingly capable agents." 
], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "This paper provides the first comprehensive survey of evaluation methodologies for these increasingly capable agents.", "locator": "abstract", "provenance_snippet": "This paper provides the first comprehensive survey of evaluation methodologies for these increasingly capable agents.", "source_ref": "https://doi.org/10.48550/arxiv.2503.16416" }, { "atom_type": "procedure", "confidence": "medium", "content": "We systematically analyze evaluation benchmarks and frameworks across four critical dimensions: (1) fundamental agent capabilities, including planning, tool use, self-reflection, and memory; (2) application-specific benchmarks for web, software engineering, scientific, and conversational agents; (3) benchmarks for generalist agents; and (4) frameworks for evaluating agents.", "locator": "abstract", "provenance_snippet": "We systematically analyze evaluation benchmarks and frameworks across four critical dimensions: (1) fundamental agent capabilities, including planning, tool use, self-reflection, a", "source_ref": "https://doi.org/10.48550/arxiv.2503.16416" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_2dddcffecec1", "key_equations": [], "limitations": [ "This survey maps the rapidly evolving landscape of agent evaluation, reveals the emerging trends in the field, identifies current limitations, and proposes directions for future research." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "The emergence of LLM-based agents represents a paradigm shift in AI, enabling autonomous systems to plan, reason, use tools, and maintain memory while interacting with dynamic environments. This paper provides the first comprehensive survey of evaluation methodologies for these increasingly capable agents.", "theorem_proof_scaffolds": [], "title": "Survey on Evaluation of LLM-based Agents", "url": "https://arxiv.org/abs/2503.16416", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Jindong Zhang", "Jialong Zhou", "Chuang Liu" ], "baseline_details": [], "bibtex": "", "citation": "Jindong Zhang; Jialong Zhou; Chuang Liu (2025). OR-Toolformer: Modeling and Solving Operations Research Problems with Tool Augmented Large Language Models. https://arxiv.org/abs/2510.01253", "claims": [ "Large language models (LLMs) demonstrate strong mathematical reasoning, but reliance on closed-source APIs for OR tasks raises privacy concerns, and training open-source models from scratch incurs high compute costs.", "In zero-shot evaluation on two unseen OR problem types, it attains 54% average accuracy, a 21 percentage-point improvement over the strongest baseline." ], "comparator_lineage": [], "conclusions": [ "Large language models (LLMs) demonstrate strong mathematical reasoning, but reliance on closed-source APIs for OR tasks raises privacy concerns, and training open-source models from scratch incurs high compute costs.", "In zero-shot evaluation on two unseen OR problem types, it attains 54% average accuracy, a 21 percentage-point improvement over the strongest baseline." ], "contradiction_pairs": [], "contributions": [ "We introduce OR-Toolformer, which fine-tunes Llama-3.1-8B-Instruct with a semi-automatic data synthesis pipeline that generates diverse OR problem-answer pairs and augments the model with external solvers to produce API calls." 
], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "We introduce OR-Toolformer, which fine-tunes Llama-3.1-8B-Instruct with a semi-automatic data synthesis pipeline that generates diverse OR problem-answer pairs and augments the model with external solvers to produce API calls.", "locator": "abstract", "provenance_snippet": "We introduce OR-Toolformer, which fine-tunes Llama-3.1-8B-Instruct with a semi-automatic data synthesis pipeline that generates diverse OR problem-answer pairs and augments the mod", "source_ref": "https://doi.org/10.48550/arxiv.2510.01253" }, { "atom_type": "procedure", "confidence": "medium", "content": "On three of four standard benchmarks, OR-Toolformer achieves up to 80.1% execution accuracy, exceeding size-matched baselines by over 4.3%.", "locator": "abstract", "provenance_snippet": "On three of four standard benchmarks, OR-Toolformer achieves up to 80.1% execution accuracy, exceeding size-matched baselines by over 4.3%.", "source_ref": "https://doi.org/10.48550/arxiv.2510.01253" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_4a4f020dcb77", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Large language models (LLMs) demonstrate strong mathematical reasoning, but reliance on closed-source APIs for OR tasks raises privacy concerns, and training open-source models from scratch incurs high compute costs. We introduce OR-Toolformer, which fine-tunes Llama-3.1-8B-Instruct with a semi-automatic data synthesis pipeline that generates diverse OR problem-answer pairs and augments the model with external solvers to produce API calls.", "theorem_proof_scaffolds": [], "title": "OR-Toolformer: Modeling and Solving Operations Research Problems with Tool Augmented Large Language Models", "url": "https://arxiv.org/abs/2510.01253", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Daniel Cunnington", "Mark Law", "Jorge M. Lobo", "Alessandra Russo" ], "baseline_details": [], "bibtex": "", "citation": "Daniel Cunnington; Mark Law; Jorge M. Lobo; Alessandra Russo (2024). The Role of Foundation Models in Neuro-Symbolic Learning and Reasoning. 
https://doi.org/10.1007/978-3-031-71167-1_5", "claims": [], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "The Role of Foundation Models in Neuro-Symbolic Learning and Reasoning studies LLM reasoning/agent workflows.", "locator": null, "provenance_snippet": null, "source_ref": "https://doi.org/10.1007/978-3-031-71167-1_5" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_a2e0d276b567", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "The Role of Foundation Models in Neuro-Symbolic Learning and Reasoning studies LLM reasoning/agent workflows.", "theorem_proof_scaffolds": [], "title": "The Role of Foundation Models in Neuro-Symbolic Learning and Reasoning", "url": "https://doi.org/10.1007/978-3-031-71167-1_5", "venue": "", "year": 2024 }, { "assumptions": [], "authors": [ "Sadia Sultana Chowa", "Riasad Alvi", "S M Asif Ur Rahman", "Md Abdur Rahman", "Mohaimenul Azam Khan Raiaan", "Md Rafiqul Islam", "Mukhtar Hussain", "Sami Azam" ], "baseline_details": [], "bibtex": "", "citation": "Sadia Sultana Chowa; Riasad Alvi; S M Asif Ur Rahman; Md Abdur Rahman; Mohaimenul Azam Khan Raiaan; Md Rafiqul Islam; Mukhtar Hussain; Sami Azam (2026). From language to action: a review of large language models as autonomous agents and tool users. https://doi.org/10.1007/s10462-025-11471-9", "claims": [ "In conducting this review, we have identified critical findings on verifiable reasoning of LLMs, the capacity for self-improvement, and the personalization of LLM-based agents." ], "comparator_lineage": [], "conclusions": [ "In conducting this review, we have identified critical findings on verifiable reasoning of LLMs, the capacity for self-improvement, and the personalization of LLM-based agents." ], "contradiction_pairs": [], "contributions": [ "The pursuit of human-level artificial intelligence (AI) has significantly advanced the development of autonomous agents and Large Language Models (LLMs)." 
], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "The pursuit of human-level artificial intelligence (AI) has significantly advanced the development of autonomous agents and Large Language Models (LLMs).", "locator": "abstract", "provenance_snippet": "The pursuit of human-level artificial intelligence (AI) has significantly advanced the development of autonomous agents and Large Language Models (LLMs).", "source_ref": "https://doi.org/10.1007/s10462-025-11471-9" }, { "atom_type": "procedure", "confidence": "medium", "content": "Furthermore, we have evaluated current benchmarks and assessment protocols and provided an analysis of 68 publicly available datasets to assess the performance of LLM-based agents in various tasks.", "locator": "abstract", "provenance_snippet": "Furthermore, we have evaluated current benchmarks and assessment protocols and provided an analysis of 68 publicly available datasets to assess the performance of LLM-based agents ", "source_ref": "https://doi.org/10.1007/s10462-025-11471-9" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_a9aaad226309", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "The pursuit of human-level artificial intelligence (AI) has significantly advanced the development of autonomous agents and Large Language Models (LLMs). LLMs are now widely utilized as decision-making agents for their ability to interpret instructions, manage sequential tasks, and adapt through feedback.", "theorem_proof_scaffolds": [], "title": "From language to action: a review of large language models as autonomous agents and tool users", "url": "https://doi.org/10.1007/s10462-025-11471-9", "venue": "", "year": 2026 }, { "assumptions": [], "authors": [ "Jing Chen", "Zheng Liu", "Xu Huang", "Chenwang Wu", "Qi Liu", "Gangwei Jiang", "Yuanhao Pu", "Yuxuan Lei", "Xiaolong Chen", "Xingmei Wang", "Kai Zheng", "Defu Lian", "Enhong Chen" ], "baseline_details": [], "bibtex": "", "citation": "Jing Chen; Zheng Liu; Xu Huang; Chenwang Wu; Qi Liu; Gangwei Jiang; Yuanhao Pu; Yuxuan Lei; Xiaolong Chen; Xingmei Wang; Kai Zheng; Defu Lian; Enhong Chen (2024). When large language models meet personalization: perspectives of challenges and opportunities. https://doi.org/10.1007/s11280-024-01276-1", "claims": [ "With the unprecedented scale of training and model parameters, the capability of large language models has been dramatically improved, leading to human-like performances in understanding, language synthesizing, common-sense reasoning, etc." ], "comparator_lineage": [], "conclusions": [ "With the unprecedented scale of training and model parameters, the capability of large language models has been dramatically improved, leading to human-like performances in understanding, language synthesizing, common-sense reasoning, etc." ], "contradiction_pairs": [], "contributions": [ "Abstract The advent of large language models marks a revolutionary breakthrough in artificial intelligence." 
], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "Abstract The advent of large language models marks a revolutionary breakthrough in artificial intelligence.", "locator": "abstract", "provenance_snippet": "Abstract The advent of large language models marks a revolutionary breakthrough in artificial intelligence.", "source_ref": "https://doi.org/10.1007/s11280-024-01276-1" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_182f1f480a7d", "key_equations": [], "limitations": [ "Therefore, we consider it to be right the time to review the challenges in personalization and the opportunities to address them with large language models.", "In particular, we dedicate this perspective paper to the discussion of the following aspects: the development and challenges for the existing personalization system, the newly emerged capabilities of large language models, and the potential ways of making use of large language models for personalization." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Abstract The advent of large language models marks a revolutionary breakthrough in artificial intelligence. With the unprecedented scale of training and model parameters, the capability of large language models has been dramatically improved, leading to human-like performances in understanding, language synthesizing, common-sense reasoning, etc.", "theorem_proof_scaffolds": [], "title": "When large language models meet personalization: perspectives of challenges and opportunities", "url": "https://doi.org/10.1007/s11280-024-01276-1", "venue": "", "year": 2024 }, { "assumptions": [], "authors": [ "Weikai Xu", "Chengrui Huang", "Shen Gao", "Shuo Shang" ], "baseline_details": [], "bibtex": "", "citation": "Weikai Xu; Chengrui Huang; Shen Gao; Shuo Shang (2025). LLM-Based Agents for Tool Learning: A Survey. https://doi.org/10.1007/s41019-025-00296-9", "claims": [ "Recently, the large language model (LLM) has demonstrated immense potential across various fields with its unique planning and reasoning abilities." ], "comparator_lineage": [], "conclusions": [ "Recently, the large language model (LLM) has demonstrated immense potential across various fields with its unique planning and reasoning abilities." ], "contradiction_pairs": [], "contributions": [ "To this end, we present a systematic investigation and comprehensive review of tool-learning agents in this paper.", "Following that, we introduce the tool planning methods and organize these works by whether they rely on the model\u2019s inherent reasoning capabilities for planning or utilize external reasoning tools.", "Next, we introduce several application scenarios for the LLM-based tool learning methods." 
], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "To this end, we present a systematic investigation and comprehensive review of tool-learning agents in this paper.", "locator": "abstract", "provenance_snippet": "To this end, we present a systematic investigation and comprehensive review of tool-learning agents in this paper.", "source_ref": "https://doi.org/10.1007/s41019-025-00296-9" }, { "atom_type": "procedure", "confidence": "medium", "content": "In addition, we compile current open-source benchmarks and evaluation metrics, focusing on their scale, composition, calculation methods, and assessment dimensions.", "locator": "abstract", "provenance_snippet": "In addition, we compile current open-source benchmarks and evaluation metrics, focusing on their scale, composition, calculation methods, and assessment dimensions.", "source_ref": "https://doi.org/10.1007/s41019-025-00296-9" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_a572df70e757", "key_equations": [], "limitations": [ "However, there are still many challenges beyond its capabilities due to deficiencies in its training data and inherent illusions." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Abstract Human beings capable of making and using tools can accomplish tasks far beyond their innate abilities, and this paradigm of integration with tools may not be limited to humans themselves. Recently, the large language model (LLM) has demonstrated immense potential across various fields with its unique planning and reasoning abilities.", "theorem_proof_scaffolds": [], "title": "LLM-Based Agents for Tool Learning: A Survey", "url": "https://doi.org/10.1007/s41019-025-00296-9", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "B.\u2010C. CHEN", "Zhaofeng Zhang", "Nicolas Langren\u00e9", "Shengxin Zhu" ], "baseline_details": [], "bibtex": "", "citation": "B.\u2010C. CHEN; Zhaofeng Zhang; Nicolas Langren\u00e9; Shengxin Zhu (2025). Unleashing the potential of prompt engineering for large language models. https://doi.org/10.1016/j.patter.2025.101260", "claims": [ "Strategies for minimizing these risks and improving the robustness of models are thoroughly reviewed." ], "comparator_lineage": [], "conclusions": [ "Strategies for minimizing these risks and improving the robustness of models are thoroughly reviewed." ], "contradiction_pairs": [], "contributions": [ "Both foundational and advanced prompt engineering methodologies-including techniques such as self-consistency, chain of thought, and generated knowledge, which can significantly enhance the performance of models-are explored in this paper." 
], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "Both foundational and advanced prompt engineering methodologies-including techniques such as self-consistency, chain of thought, and generated knowledge, which can significantly enhance the performance of models-are explored in this paper.", "locator": "abstract", "provenance_snippet": "Both foundational and advanced prompt engineering methodologies-including techniques such as self-consistency, chain of thought, and generated knowledge, which can significantly en", "source_ref": "https://doi.org/10.1016/j.patter.2025.101260" }, { "atom_type": "procedure", "confidence": "medium", "content": "Prompt methods are evaluated with subjective and objective metrics, ensuring a robust analysis of their efficacy.", "locator": "abstract", "provenance_snippet": "Prompt methods are evaluated with subjective and objective metrics, ensuring a robust analysis of their efficacy.", "source_ref": "https://doi.org/10.1016/j.patter.2025.101260" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_d15b7c36e760", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "This review explores the role of prompt engineering in unleashing the capabilities of large language models (LLMs). Prompt engineering is the process of structuring inputs, and it has emerged as a crucial technique for maximizing the utility and accuracy of these models.", "theorem_proof_scaffolds": [], "title": "Unleashing the potential of prompt engineering for large language models", "url": "https://doi.org/10.1016/j.patter.2025.101260", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Diogo Santos", "F.A.M.M. Gon\u00e7alves", "G.C. Reis", "Miguel Borges Santos", "Miguel Saraiva", "Pedro Filipe Nunes Dur\u00e3es", "Marisa Maximiano", "Ricardo Gomes", "V\u00edtor T\u00e1vora", "Orlando Rem\u00e9dios" ], "baseline_details": [], "bibtex": "", "citation": "Diogo Santos; F.A.M.M. Gon\u00e7alves; G.C. Reis; Miguel Borges Santos; Miguel Saraiva; Pedro Filipe Nunes Dur\u00e3es; Marisa Maximiano; Ricardo Gomes; V\u00edtor T\u00e1vora; Orlando Rem\u00e9dios (2025). Using LLMs to bridge the gap between consumers and Blockchain on a Agro-food traceability platform: an architectural proposal. https://doi.org/10.1016/j.procs.2025.02.126", "claims": [ "Our primary objective is to propose an innovative architectural approach that addresses the challenges of implementing comprehensive traceability systems while improving consumer engagement." ], "comparator_lineage": [], "conclusions": [ "Our primary objective is to propose an innovative architectural approach that addresses the challenges of implementing comprehensive traceability systems while improving consumer engagement." ], "contradiction_pairs": [], "contributions": [ "This paper explores architectural options for enhancing farm-to-fork traceability in the agro-food industry through the integration of blockchain, Web technologies, and Large Language Models (LLMs)." 
], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "This paper explores architectural options for enhancing farm-to-fork traceability in the agro-food industry through the integration of blockchain, Web technologies, and Large Language Models (LLMs).", "locator": "abstract", "provenance_snippet": "This paper explores architectural options for enhancing farm-to-fork traceability in the agro-food industry through the integration of blockchain, Web technologies, and Large Langu", "source_ref": "https://doi.org/10.1016/j.procs.2025.02.126" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_9f86e6e12c0c", "key_equations": [], "limitations": [ "Our primary objective is to propose an innovative architectural approach that addresses the challenges of implementing comprehensive traceability systems while improving consumer engagement." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "This paper explores architectural options for enhancing farm-to-fork traceability in the agro-food industry through the integration of blockchain, Web technologies, and Large Language Models (LLMs). Our primary objective is to propose an innovative architectural approach that addresses the challenges of implementing comprehensive traceability systems while improving consumer engagement.", "theorem_proof_scaffolds": [], "title": "Using LLMs to bridge the gap between consumers and Blockchain on a Agro-food traceability platform: an architectural proposal", "url": "https://doi.org/10.1016/j.procs.2025.02.126", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Daya Guo", "Dejian Yang", "Haowei Zhang", "Junxiao Song", "Peiyi Wang", "Qihao Zhu", "Runxin Xu", "Ruoyu Zhang", "Shirong Ma", "Xiao Bi", "Xiaokang Zhang", "Xingkai Yu", "Yu Wu", "Zhenhua Wu", "Zhibin Gou", "Zhihong Shao", "Zhuoshu Li", "Ziyi Gao", "Aixin Liu", "Bing Xue", "Bingxuan Wang", "Bowen Wu", "Bei Feng", "Chengda Lu", "Chenggang Zhao", "Chengqi Deng", "Chong Ruan", "Damai Dai", "Deli Chen", "Dongjie Ji", "Erhang Li", "Fangyun Lin", "Fengze Dai", "Fuli Luo", "Guangbo Hao", "Guan-Ting Chen", "Guowei Li", "Hongjun Zhang", "Hanwei Xu", "Honghui Ding", "Huazuo Gao", "Hui Qu", "Hui Li", "Jianzhong Guo", "Jiashi Li", "Jingchang Chen", "Jingyang Yuan", "Jiagang Tu", "Junjie Qiu", "Junlong Li", "Jiali Cai", "Jiaqi Ni", "Jian Liang", "Jing Chen", "Kai Dong", "Kai Hu", "Kaichao You", "Kaige Gao", "Kang Guan", "Kexin Huang", "Kuai Yu", "Lean Wang", "Lecong Zhang", "Liang Zhao", "Litong Wang", "Liyue Zhang", "Lei Xu", "L. Xia", "Mingchuan Zhang", "Minghua Zhang", "Minghui Tang", "Mingxu Zhou", "Meng Li", "Miaojun Wang", "Mingming Li", "Ning Tian", "Panpan Huang", "Peng Zhang", "Qiancheng Wang", "Qinyu Chen", "Qiushi Du", "Ruiqi Ge", "Ruisong Zhang", "Rui\u2010Le Pan", "Runji Wang", "R. J. 
Chen", "Rong Jin", "Ruyi Chen", "Shanghao Lu", "Shangyan Zhou", "Shanhuang Chen", "Shengfeng Ye", "Shiyu Wang", "Shuiping Yu", "Shunfeng Zhou", "Shuting Pan", "Sansan Li", "Shuang Zhou", "Shaoqing Wu", "Tao Yun" ], "baseline_details": [], "bibtex": "", "citation": "Daya Guo; Dejian Yang; Haowei Zhang; Junxiao Song; Peiyi Wang; Qihao Zhu; Runxin Xu; Ruoyu Zhang; Shirong Ma; Xiao Bi; Xiaokang Zhang; Xingkai Yu; Yu Wu; Zhenhua Wu; Zhibin Gou; Zhihong Shao; Zhuoshu Li; Ziyi Gao; Aixin Liu; Bing Xue; Bingxuan Wang; Bowen Wu; Bei Feng; Chengda Lu; Chenggang Zhao; Chengqi Deng; Chong Ruan; Damai Dai; Deli Chen; Dongjie Ji; Erhang Li; Fangyun Lin; Fengze Dai; Fuli Luo; Guangbo Hao; Guan-Ting Chen; Guowei Li; Hongjun Zhang; Hanwei Xu; Honghui Ding; Huazuo Gao; Hui Qu; Hui Li; Jianzhong Guo; Jiashi Li; Jingchang Chen; Jingyang Yuan; Jiagang Tu; Junjie Qiu; Junlong Li; Jiali Cai; Jiaqi Ni; Jian Liang; Jing Chen; Kai Dong; Kai Hu; Kaichao You; Kaige Gao; Kang Guan; Kexin Huang; Kuai Yu; Lean Wang; Lecong Zhang; Liang Zhao; Litong Wang; Liyue Zhang; Lei Xu; L. Xia; Mingchuan Zhang; Minghua Zhang; Minghui Tang; Mingxu Zhou; Meng Li; Miaojun Wang; Mingming Li; Ning Tian; Panpan Huang; Peng Zhang; Qiancheng Wang; Qinyu Chen; Qiushi Du; Ruiqi Ge; Ruisong Zhang; Rui\u2010Le Pan; Runji Wang; R. J. Chen; Rong Jin; Ruyi Chen; Shanghao Lu; Shangyan Zhou; Shanhuang Chen; Shengfeng Ye; Shiyu Wang; Shuiping Yu; Shunfeng Zhou; Shuting Pan; Sansan Li; Shuang Zhou; Shaoqing Wu; Tao Yun (2025). DeepSeek-R1 incentivizes reasoning in LLMs through reinforcement learning. https://doi.org/10.1038/s41586-025-09422-z", "claims": [ "Here we show that the reasoning abilities of LLMs can be incentivized through pure reinforcement learning (RL), obviating the need for human-labelled reasoning trajectories." ], "comparator_lineage": [], "conclusions": [ "Here we show that the reasoning abilities of LLMs can be incentivized through pure reinforcement learning (RL), obviating the need for human-labelled reasoning trajectories." ], "contradiction_pairs": [], "contributions": [ "General reasoning represents a long-standing and formidable challenge in artificial intelligence (AI)." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "General reasoning represents a long-standing and formidable challenge in artificial intelligence (AI).", "locator": "abstract", "provenance_snippet": "General reasoning represents a long-standing and formidable challenge in artificial intelligence (AI).", "source_ref": "https://doi.org/10.1038/s41586-025-09422-z" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_288cc512741e", "key_equations": [], "limitations": [ "General reasoning represents a long-standing and formidable challenge in artificial intelligence (AI).", "However, this success is heavily contingent on extensive human-annotated demonstrations and the capabilities of models are still insufficient for more complex problems." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "General reasoning represents a long-standing and formidable challenge in artificial intelligence (AI). 
Recent breakthroughs, exemplified by large language models (LLMs) and chain-of-thought (CoT) prompting, have achieved considerable success on foundational reasoning tasks.", "theorem_proof_scaffolds": [], "title": "DeepSeek-R1 incentivizes reasoning in LLMs through reinforcement learning", "url": "https://doi.org/10.1038/s41586-025-09422-z", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Iman Azimi", "Meng Qi", "Wang Li", "Amir M. Rahmani", "Youlin Li" ], "baseline_details": [], "bibtex": "", "citation": "Iman Azimi; Meng Qi; Wang Li; Amir M. Rahmani; Youlin Li (2025). Evaluation of LLMs accuracy and consistency in the registered dietitian exam through prompt engineering and knowledge retrieval. https://doi.org/10.1038/s41598-024-85003-w", "claims": [ "Although state-of-the-art LLMs have shown superior performance in several conversational applications, evaluations within nutrition and diet applications are still insufficient.", "GPT-4o with CoT-SC prompting outperformed the other approaches, whereas Gemini 1.5 Pro with ZS recorded the highest consistency.", "For GPT-4o and Claude 3.5, CoT improved the accuracy, and CoT-SC improved both accuracy and consistency." ], "comparator_lineage": [], "conclusions": [ "Although state-of-the-art LLMs have shown superior performance in several conversational applications, evaluations within nutrition and diet applications are still insufficient.", "GPT-4o with CoT-SC prompting outperformed the other approaches, whereas Gemini 1.5 Pro with ZS recorded the highest consistency." ], "contradiction_pairs": [], "contributions": [ "In this paper, we propose to employ the Registered Dietitian (RD) exam to conduct a standard and comprehensive evaluation of state-of-the-art LLMs, GPT-4o, Claude 3.5 Sonnet, and Gemini 1.5 Pro, assessing both accuracy and consistency in nutrition queries." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "In this paper, we propose to employ the Registered Dietitian (RD) exam to conduct a standard and comprehensive evaluation of state-of-the-art LLMs, GPT-4o, Claude 3.5 Sonnet, and Gemini 1.5 Pro, assessing both accuracy and consistency in nutrition queries.", "locator": "abstract", "provenance_snippet": "In this paper, we propose to employ the Registered Dietitian (RD) exam to conduct a standard and comprehensive evaluation of state-of-the-art LLMs, GPT-4o, Claude 3.5 Sonnet, and G", "source_ref": "https://doi.org/10.1038/s41598-024-85003-w" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_840da66ba21c", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Large language models (LLMs) are fundamentally transforming human-facing applications in the health and well-being domains: boosting patient engagement, accelerating clinical decision-making, and facilitating medical education. Although state-of-the-art LLMs have shown superior performance in several conversational applications, evaluations within nutrition and diet applications are still insufficient.", "theorem_proof_scaffolds": [], "title": "Evaluation of LLMs accuracy and consistency in the registered dietitian exam through prompt engineering and knowledge retrieval", "url": "https://doi.org/10.1038/s41598-024-85003-w", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Chin Siang Ong", "Nicholas T. 
Obey", "Yanan Zheng", "Arman Cohan", "Eric B. Schneider" ], "baseline_details": [], "bibtex": "", "citation": "Chin Siang Ong; Nicholas T. Obey; Yanan Zheng; Arman Cohan; Eric B. Schneider (2024). SurgeryLLM: a retrieval-augmented generation large language model framework for surgical decision support and workflow enhancement. https://doi.org/10.1038/s41746-024-01391-3", "claims": [ "The successful incorporation of guideline-based information represents a substantial step toward enabling greater surgeon efficiency, improving patient safety, and optimizing surgical outcomes." ], "comparator_lineage": [], "conclusions": [ "The successful incorporation of guideline-based information represents a substantial step toward enabling greater surgeon efficiency, improving patient safety, and optimizing surgical outcomes." ], "contradiction_pairs": [], "contributions": [ "SurgeryLLM, a large language model framework using Retrieval Augmented Generation demonstrably incorporated domain-specific knowledge from current evidence-based surgical guidelines when presented with patient-specific data." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "SurgeryLLM, a large language model framework using Retrieval Augmented Generation demonstrably incorporated domain-specific knowledge from current evidence-based surgical guidelines when presented with patient-specific data.", "locator": "abstract", "provenance_snippet": "SurgeryLLM, a large language model framework using Retrieval Augmented Generation demonstrably incorporated domain-specific knowledge from current evidence-based surgical guideline", "source_ref": "https://doi.org/10.1038/s41746-024-01391-3" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_888b1eb13315", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "SurgeryLLM, a large language model framework using Retrieval Augmented Generation demonstrably incorporated domain-specific knowledge from current evidence-based surgical guidelines when presented with patient-specific data. The successful incorporation of guideline-based information represents a substantial step toward enabling greater surgeon efficiency, improving patient safety, and optimizing surgical outcomes.", "theorem_proof_scaffolds": [], "title": "SurgeryLLM: a retrieval-augmented generation large language model framework for surgical decision support and workflow enhancement", "url": "https://doi.org/10.1038/s41746-024-01391-3", "venue": "", "year": 2024 }, { "assumptions": [], "authors": [ "Yu He Ke", "Liyuan Jin", "Kabilan Elangovan", "Hairil Rizal Abdullah", "Nan Liu", "Alex Tiong Heng Sia", "Chai Rick Soh", "Joshua Yi Min Tung", "Jasmine Chiat Ling Ong", "Chang\u2010Fu Kuo", "Shaochun Wu", "Vesela Kovacheva", "Daniel Shu Wei Ting" ], "baseline_details": [], "bibtex": "", "citation": "Yu He Ke; Liyuan Jin; Kabilan Elangovan; Hairil Rizal Abdullah; Nan Liu; Alex Tiong Heng Sia; Chai Rick Soh; Joshua Yi Min Tung; Jasmine Chiat Ling Ong; Chang\u2010Fu Kuo; Shaochun Wu; Vesela Kovacheva; Daniel Shu Wei Ting (2025). Retrieval augmented generation for 10 large language models and its generalizability in assessing medical fitness. 
https://doi.org/10.1038/s41746-025-01519-z", "claims": [], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "Large Language Models (LLMs) hold promise for medical applications but often lack domain-specific expertise." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "Large Language Models (LLMs) hold promise for medical applications but often lack domain-specific expertise.", "locator": "abstract", "provenance_snippet": "Large Language Models (LLMs) hold promise for medical applications but often lack domain-specific expertise.", "source_ref": "https://doi.org/10.1038/s41746-025-01519-z" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_653d8bcfbc2e", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Large Language Models (LLMs) hold promise for medical applications but often lack domain-specific expertise. Retrieval Augmented Generation (RAG) enables customization by integrating specialized knowledge.", "theorem_proof_scaffolds": [], "title": "Retrieval augmented generation for 10 large language models and its generalizability in assessing medical fitness", "url": "https://doi.org/10.1038/s41746-025-01519-z", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Thang D. Pham", "Aditya Tanikanti", "Murat Ke\u00e7eli" ], "baseline_details": [], "bibtex": "", "citation": "Thang D. Pham; Aditya Tanikanti; Murat Ke\u00e7eli (2026). ChemGraph as an agentic framework for computational chemistry workflows. https://doi.org/10.1038/s42004-025-01776-9", "claims": [ "We evaluate ChemGraph across 13 benchmark tasks and demonstrate that smaller LLMs (GPT-4o-mini, Claude-3.5-haiku, Qwen-2.5-14B) perform well on simple workflows, while more complex tasks benefit from using larger models.", "Importantly, we show that decomposing complex tasks into smaller subtasks through a multi-agent framework enables GPT-4o to reach perfect accuracy and smaller LLMs to match or exceed single-agent GPT-4o's performance in these benchmarks." ], "comparator_lineage": [], "conclusions": [ "We evaluate ChemGraph across 13 benchmark tasks and demonstrate that smaller LLMs (GPT-4o-mini, Claude-3.5-haiku, Qwen-2.5-14B) perform well on simple workflows, while more complex tasks benefit from using larger models.", "Importantly, we show that decomposing complex tasks into smaller subtasks through a multi-agent framework enables GPT-4o to reach perfect accuracy and smaller LLMs to match or exceed single-agent GPT-4o's performance in these benchmarks." ], "contradiction_pairs": [], "contributions": [ "We present ChemGraph, an agentic framework powered by artificial intelligence and state-of-the-art simulation tools to streamline and automate computational chemistry and materials science workflows." 
], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "We present ChemGraph, an agentic framework powered by artificial intelligence and state-of-the-art simulation tools to streamline and automate computational chemistry and materials science workflows.", "locator": "abstract", "provenance_snippet": "We present ChemGraph, an agentic framework powered by artificial intelligence and state-of-the-art simulation tools to streamline and automate computational chemistry and materials", "source_ref": "https://doi.org/10.1038/s42004-025-01776-9" }, { "atom_type": "procedure", "confidence": "medium", "content": "We evaluate ChemGraph across 13 benchmark tasks and demonstrate that smaller LLMs (GPT-4o-mini, Claude-3.5-haiku, Qwen-2.5-14B) perform well on simple workflows, while more complex tasks benefit from using larger models.", "locator": "abstract", "provenance_snippet": "We evaluate ChemGraph across 13 benchmark tasks and demonstrate that smaller LLMs (GPT-4o-mini, Claude-3.5-haiku, Qwen-2.5-14B) perform well on simple workflows, while more complex", "source_ref": "https://doi.org/10.1038/s42004-025-01776-9" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_d695132510c4", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Atomistic simulations are essential in chemistry and materials science but remain challenging to run due to the expert knowledge required for the setup, execution, and validation stages of these calculations. We present ChemGraph, an agentic framework powered by artificial intelligence and state-of-the-art simulation tools to streamline and automate computational chemistry and materials science workflows.", "theorem_proof_scaffolds": [], "title": "ChemGraph as an agentic framework for computational chemistry workflows", "url": "https://doi.org/10.1038/s42004-025-01776-9", "venue": "", "year": 2026 }, { "assumptions": [], "authors": [ "Yixing Jiang", "Kameron Collin Black", "Gloria Geng", "Dae-Gyun Park", "James Zou", "Andrew Y. Ng", "Jonathan H. Chen" ], "baseline_details": [], "bibtex": "", "citation": "Yixing Jiang; Kameron Collin Black; Gloria Geng; Dae-Gyun Park; James Zou; Andrew Y. Ng; Jonathan H. Chen (2025). MedAgentBench: A Virtual EHR Environment to Benchmark Medical LLM Agents. 
https://doi.org/10.1056/aidbp2500144", "claims": [ "BACKGROUND Recent large language models (LLMs) have demonstrated significant advancements, particularly in their ability to serve as agents, thereby surpassing their traditional role as chatbots. These agents can leverage their planning and tool utilization capabilities to address tasks specified at a high level. This suggests new potential to reduce the burden of administrative tasks and address current health care staff shortages. However, a standardized dataset to benchmark the agent capabilities of LLMs in medical applications is currently lacking, making it difficult to evaluate their performance on complex tasks in interactive health care environments.", "RESULTS MedAgentBench presents an unsaturated agent-oriented benchmark at which current state-of-the-art LLMs exhibit some ability to succeed. The best model (Claude 3.5 Sonnet v2) achieves a success rate of 69.67%. However, there is still substantial room for improvement, which gives the community a clear direction for future optimization efforts. Furthermore, there is significant variation in performance across task categories. CONCLUSIONS Agent-based task frameworks and benchmarks are the necessary next step to advance the potential and capabilities for effectively improving and integrating AI systems into clinical workflows. MedAgentBench establishes this and is publicly available at https://github.com/stanfordmlgroup/MedAgentBench, offering a valuable framework for model developers to track progress and drive continuous improvements in the agent capabilities of LLMs within the medical domain." ], "comparator_lineage": [], "conclusions": [ "BACKGROUND Recent large language models (LLMs) have demonstrated significant advancements, particularly in their ability to serve as agents, thereby surpassing their traditional role as chatbots. These agents can leverage their planning and tool utilization capabilities to address tasks specified at a high level. This suggests new potential to reduce the burden of administrative tasks and address current health care staff shortages. However, a standardized dataset to benchmark the agent capabilities of LLMs in medical applications is currently lacking, making it difficult to evaluate their performance on complex tasks in interactive health care environments.", "RESULTS MedAgentBench presents an unsaturated agent-oriented benchmark at which current state-of-the-art LLMs exhibit some ability to succeed. The best model (Claude 3.5 Sonnet v2) achieves a success rate of 69.67%. However, there is still substantial room for improvement, which gives the community a clear direction for future optimization efforts. Furthermore, there is significant variation in performance across task categories. CONCLUSIONS Agent-based task frameworks and benchmarks are the necessary next step to advance the potential and capabilities for effectively improving and integrating AI systems into clinical workflows. MedAgentBench establishes this and is publicly available at https://github.com/stanfordmlgroup/MedAgentBench, offering a valuable framework for model developers to track progress and drive continuous improvements in the agent capabilities of LLMs within the medical domain."
], "contradiction_pairs": [], "contributions": [ "METHODSTo address this gap in the deployment of agentic artificial intelligence (AI) in health care, we introduce MedAgentBench, a broad evaluation suite designed to assess the agent capabilities of LLMs within medical records contexts.MedAgentBench encompasses 300 patient-specific clinically derived tasks from 10 categories written by human physicians, realistic profiles of 100 patients with over 700,000 data elements, a Fast Healthcare Interoperability Resources-compliant interactive environment, and an accompanying codebase.The environment uses standard application programming interfaces and communication infrastructure used in modern electronic health record (EHR) systems so that it can be easily migrated into live EHR systems." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "METHODSTo address this gap in the deployment of agentic artificial intelligence (AI) in health care, we introduce MedAgentBench, a broad evaluation suite designed to assess the agent capabilities of LLMs within medical records contexts.MedAgentBench encompasses 300 patient-specific clinically derived tasks from 10 categories written by human physicians, realistic profiles of 100 patients with over 700,000 data elements, a Fast Healthcare Interoperability Resources-compliant interactive environment, and an accompanying codebase.The environment uses standard application programming interfaces and communication infrastructure used in modern electronic health record (EHR) systems so that it can be easily migrated into live EHR systems.", "locator": "abstract", "provenance_snippet": "METHODSTo address this gap in the deployment of agentic artificial intelligence (AI) in health care, we introduce MedAgentBench, a broad evaluation suite designed to assess the age", "source_ref": "https://doi.org/10.1056/aidbp2500144" }, { "atom_type": "procedure", "confidence": "medium", "content": "BACKGROUND Recent large language models (LLMs) have demonstrated significant advancements, particularly in their ability to serve as agents, thereby surpassing their traditional role as chatbots.These agents can leverage their planning and tool utilization capabilities to address tasks specified at a high level.This suggests new potential to reduce the burden of administrative tasks and address current health care staff shortages.However, a standardized dataset to benchmark the agent capabilities of LLMs in medical applications is currently lacking, making it difficult to evaluate their performance on complex tasks in interactive health care environments.", "locator": "abstract", "provenance_snippet": "BACKGROUND Recent large language models (LLMs) have demonstrated significant advancements, particularly in their ability to serve as agents, thereby surpassing their traditional ro", "source_ref": "https://doi.org/10.1056/aidbp2500144" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_5636f52395f6", "key_equations": [], "limitations": [ "BACKGROUND Recent large language models (LLMs) have demonstrated significant advancements, particularly in their ability to serve as agents, thereby surpassing their traditional role as chatbots.These agents can leverage their planning and tool utilization capabilities to address tasks specified at a high level.This suggests new potential to reduce the burden of administrative tasks and address current health care staff shortages.However, a standardized dataset to 
benchmark the agent capabilities of LLMs in medical applications is currently lacking, making it difficult to evaluate their performance on complex tasks in interactive health care environments.", "RESULTS MedAgentBench presents an unsaturated agent-oriented benchmark at which current state-of-the-art LLMs exhibit some ability to succeed. The best model (Claude 3.5 Sonnet v2) achieves a success rate of 69.67%. However, there is still substantial room for improvement, which gives the community a clear direction for future optimization efforts. Furthermore, there is significant variation in performance across task categories. CONCLUSIONS Agent-based task frameworks and benchmarks are the necessary next step to advance the potential and capabilities for effectively improving and integrating AI systems into clinical workflows. MedAgentBench establishes this and is publicly available at https://github.com/stanfordmlgroup/MedAgentBench, offering a valuable framework for model developers to track progress and drive continuous improvements in the agent capabilities of LLMs within the medical domain." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "BACKGROUND Recent large language models (LLMs) have demonstrated significant advancements, particularly in their ability to serve as agents, thereby surpassing their traditional role as chatbots. These agents can leverage their planning and tool utilization capabilities to address tasks specified at a high level. This suggests new potential to reduce the burden of administrative tasks and address current health care staff shortages. However, a standardized dataset to benchmark the agent capabilities of LLMs in medical applications is currently lacking, making it difficult to evaluate their performance on complex tasks in interactive health care environments. METHODS To address this gap in the deployment of agentic artificial intelligence (AI) in health care, we introduce MedAgentBench, a broad evaluation suite designed to assess the agent capabilities of LLMs within medical records contexts. MedAgentBench encompasses 300 patient-specific clinically derived tasks from 10 categories written by human physicians, realistic profiles of 100 patients with over 700,000 data elements, a Fast Healthcare Interoperability Resources-compliant interactive environment, and an accompanying codebase. The environment uses standard application programming interfaces and communication infrastructure used in modern electronic health record (EHR) systems so that it can be easily migrated into live EHR systems.", "theorem_proof_scaffolds": [], "title": "MedAgentBench: A Virtual EHR Environment to Benchmark Medical LLM Agents", "url": "https://doi.org/10.1056/aidbp2500144", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Siru Liu", "Allison B. McCoy", "Adam Wright" ], "baseline_details": [], "bibtex": "", "citation": "Siru Liu; Allison B. McCoy; Adam Wright (2025). Improving large language model applications in biomedicine with retrieval-augmented generation: a systematic review, meta-analysis, and clinical development guidelines. https://doi.org/10.1093/jamia/ocaf008", "claims": [ "Overall, RAG implementation showed a 1.35 odds ratio increase in performance compared to baseline LLMs." ], "comparator_lineage": [], "conclusions": [ "Overall, RAG implementation showed a 1.35 odds ratio increase in performance compared to baseline LLMs."
], "contradiction_pairs": [], "contributions": [ "Overall, RAG implementation showed a 1.35 odds ratio increase in performance compared to baseline LLMs." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "Overall, RAG implementation showed a 1.35 odds ratio increase in performance compared to baseline LLMs.", "locator": "abstract", "provenance_snippet": "Overall, RAG implementation showed a 1.35 odds ratio increase in performance compared to baseline LLMs.", "source_ref": "https://doi.org/10.1093/jamia/ocaf008" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_d784c0f0951e", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Overall, RAG implementation showed a 1.35 odds ratio increase in performance compared to baseline LLMs. Future research should focus on (1) system-level enhancement: the combination of RAG and agent, (2) knowledge-level enhancement: deep integration of knowledge into LLM, and (3) integration-level enhancement: integrating RAG systems within electronic health records.", "theorem_proof_scaffolds": [], "title": "Improving large language model applications in biomedicine with retrieval-augmented generation: a systematic review, meta-analysis, and clinical development guidelines", "url": "https://doi.org/10.1093/jamia/ocaf008", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Kexin Huang", "Serena Zhang", "Hanchen Wang", "Yuanhao Qu", "Yingzhou Lu", "Yusuf Roohani", "Ryan J. Li", "Lin Qiu", "G. L. Li", "Junze Zhang", "Di Yin", "Shruti Marwaha", "Jennefer N. Carter", "Xin\u2010Fu Zhou", "Matthew T. Wheeler", "Jonathan A. Bernstein", "Mengdi Wang", "Peng He", "Jingtian Zhou", "M Snyder", "Le Cong", "Aviv Regev", "Jure Leskovec" ], "baseline_details": [], "bibtex": "", "citation": "Kexin Huang; Serena Zhang; Hanchen Wang; Yuanhao Qu; Yingzhou Lu; Yusuf Roohani; Ryan J. Li; Lin Qiu; G. L. Li; Junze Zhang; Di Yin; Shruti Marwaha; Jennefer N. Carter; Xin\u2010Fu Zhou; Matthew T. Wheeler; Jonathan A. Bernstein; Mengdi Wang; Peng He; Jingtian Zhou; M Snyder; Le Cong; Aviv Regev; Jure Leskovec (2025). Biomni: A General-Purpose Biomedical AI Agent. https://doi.org/10.1101/2025.05.30.656746", "claims": [ "Systematic benchmarking demonstrates that Biomni achieves strong generalization across heterogeneous biomedical tasks - including causal gene prioritization, drug repurposing, rare disease diagnosis, microbiome analysis, and molecular cloning - without any task-specific prompt tuning.", "Real-world case studies further showcase Biomni's ability to interpret complex, multi-modal biomedical datasets and autonomously generate experimentally testable protocols." ], "comparator_lineage": [], "conclusions": [ "Systematic benchmarking demonstrates that Biomni achieves strong generalization across heterogeneous biomedical tasks - including causal gene prioritization, drug repurposing, rare disease diagnosis, microbiome analysis, and molecular cloning - without any task-specific prompt tuning.", "Real-world case studies further showcase Biomni's ability to interpret complex, multi-modal biomedical datasets and autonomously generate experimentally testable protocols." 
], "contradiction_pairs": [], "contributions": [ "Here, we introduce Biomni, a general-purpose biomedical AI agent designed to autonomously execute a wide spectrum of research tasks across diverse biomedical subfields." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "Here, we introduce Biomni, a general-purpose biomedical AI agent designed to autonomously execute a wide spectrum of research tasks across diverse biomedical subfields.", "locator": "abstract", "provenance_snippet": "Here, we introduce Biomni, a general-purpose biomedical AI agent designed to autonomously execute a wide spectrum of research tasks across diverse biomedical subfields.", "source_ref": "https://doi.org/10.1101/2025.05.30.656746" }, { "atom_type": "procedure", "confidence": "medium", "content": "However, with the growth of complex lab experiments, large datasets, many analytical tools, and expansive literature, biomedical research is increasingly constrained by repetitive and fragmented workflows that slow discovery and limit innovation, underscoring the need for a fundamentally new way to scale scientific expertise.", "locator": "abstract", "provenance_snippet": "However, with the growth of complex lab experiments, large datasets, many analytical tools, and expansive literature, biomedical research is increasingly constrained by repetitive ", "source_ref": "https://doi.org/10.1101/2025.05.30.656746" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_5a2ba9a512a5", "key_equations": [], "limitations": [ "However, with the growth of complex lab experiments, large datasets, many analytical tools, and expansive literature, biomedical research is increasingly constrained by repetitive and fragmented workflows that slow discovery and limit innovation, underscoring the need for a fundamentally new way to scale scientific expertise." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Biomedical research underpins progress in our understanding of human health and disease, drug discovery, and clinical care. However, with the growth of complex lab experiments, large datasets, many analytical tools, and expansive literature, biomedical research is increasingly constrained by repetitive and fragmented workflows that slow discovery and limit innovation, underscoring the need for a fundamentally new way to scale scientific expertise.", "theorem_proof_scaffolds": [], "title": "Biomni: A General-Purpose Biomedical AI Agent", "url": "https://doi.org/10.1101/2025.05.30.656746", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Zhangyin Feng", "Xiaocheng Feng", "Dezhi Zhao", "Maojin Yang", "Bing Qin" ], "baseline_details": [], "bibtex": "", "citation": "Zhangyin Feng; Xiaocheng Feng; Dezhi Zhao; Maojin Yang; Bing Qin (2024). Retrieval-Generation Synergy Augmented Large Language Models. https://doi.org/10.1109/icassp48485.2024.10448015", "claims": [ "Large language models augmented with task-relevant documents have demonstrated impressive performance on knowledge-intensive tasks.", "Empirical results show that our method significantly improves the reasoning ability of large language models and outperforms previous baselines." 
], "comparator_lineage": [], "conclusions": [ "Large language models augmented with task-relevant documents have demonstrated impressive performance on knowledge-intensive tasks.", "Empirical results show that our method significantly improves the reasoning ability of large language models and outperforms previous baselines." ], "contradiction_pairs": [], "contributions": [ "We propose an iterative retrieval-generation collaborative framework." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "We propose an iterative retrieval-generation collaborative framework.", "locator": "abstract", "provenance_snippet": "We propose an iterative retrieval-generation collaborative framework.", "source_ref": "https://doi.org/10.1109/icassp48485.2024.10448015" }, { "atom_type": "procedure", "confidence": "medium", "content": "We conduct experiments on four question answering datasets, including single-hop QA and multi-hop QA tasks.", "locator": "abstract", "provenance_snippet": "We conduct experiments on four question answering datasets, including single-hop QA and multi-hop QA tasks.", "source_ref": "https://doi.org/10.1109/icassp48485.2024.10448015" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_745c8e50b70a", "key_equations": [], "limitations": [ "However, regarding how to obtain effective documents, the existing methods are mainly divided into two categories." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Large language models augmented with task-relevant documents have demonstrated impressive performance on knowledge-intensive tasks. However, regarding how to obtain effective documents, the existing methods are mainly divided into two categories.", "theorem_proof_scaffolds": [], "title": "Retrieval-Generation Synergy Augmented Large Language Models", "url": "https://doi.org/10.1109/icassp48485.2024.10448015", "venue": "", "year": 2024 }, { "assumptions": [], "authors": [ "Bo Lin", "Shangwen Wang", "Yihao Qin", "Liqian Chen", "Xiaoguang Mao" ], "baseline_details": [], "bibtex": "", "citation": "Bo Lin; Shangwen Wang; Yihao Qin; Liqian Chen; Xiaoguang Mao (2025). Large Language Models-Aided Program Debloating. https://doi.org/10.1109/tse.2025.3594673", "claims": [ "Extensive evaluations on widely used benchmarks demonstrate the efficacy of LEADER.", "These results demonstrate that LEADER surpasses the state-of-the-art tool CovA in functionality and security." ], "comparator_lineage": [], "conclusions": [ "Extensive evaluations on widely used benchmarks demonstrate the efficacy of LEADER.", "These results demonstrate that LEADER surpasses the state-of-the-art tool CovA in functionality and security." ], "contradiction_pairs": [], "contributions": [ "To address these limitations, we propose LEADER, a program debloating framework enhanced by Large Language Models (LLMs), which leverages their semantic understanding, generative capabilities, and decision-making strengths." 
], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "To address these limitations, we propose LEADER, a program debloating framework enhanced by Large Language Models (LLMs), which leverages their semantic understanding, generative capabilities, and decision-making strengths.", "locator": "abstract", "provenance_snippet": "To address these limitations, we propose LEADER, a program debloati", "source_ref": "https://doi.org/10.1109/tse.2025.3594673" }, { "atom_type": "procedure", "confidence": "medium", "content": "Extensive evaluations on widely used benchmarks demonstrate the efficacy of LEADER.", "locator": "abstract", "provenance_snippet": "Extensive evaluations on widely used benchmarks demonstrate the efficacy of LEA", "source_ref": "https://doi.org/10.1109/tse.2025.3594673" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_c147ee3973f1", "key_equations": [], "limitations": [ "As software grows in complexity to accommodate diverse features and platforms, software bloating has emerged as a significant challenge, adversely affecting performance and security.", "However, existing approaches inadequately address the dual objectives of debloating: maintaining functionality by preserving essential features and enhancing security by reducing security issues.", "However, these approaches frequently overfit provided inputs, leading to functionality loss and potential security vulnerabilities." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "As software grows in complexity to accommodate diverse features and platforms, software bloating has emerged as a significant challenge, adversely affecting performance and security. However, existing approaches inadequately address the dual objectives of debloating: maintaining functionality by preserving essential features and enhancing security by reducing security issues.", "theorem_proof_scaffolds": [], "title": "Large Language Models-Aided Program Debloating", "url": "https://doi.org/10.1109/tse.2025.3594673", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Daye Nam", "Andrew Macvean", "Vincent J. Hellendoorn", "Bogdan Vasilescu", "Brad A. Myers" ], "baseline_details": [], "bibtex": "", "citation": "Daye Nam; Andrew Macvean; Vincent J. Hellendoorn; Bogdan Vasilescu; Brad A. Myers (2024). Using an LLM to Help With Code Understanding. https://doi.org/10.1145/3597503.3639187", "claims": [], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "Understanding code is challenging, especially when working in new and complex development environments." 
], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "Understanding code is challenging, especially when working in new and complex development environments.", "locator": "abstract", "provenance_snippet": "Understanding code is challenging, especially when working in new and complex development environments.", "source_ref": "https://doi.org/10.1145/3597503.3639187" }, { "atom_type": "procedure", "confidence": "medium", "content": "We evaluate this system in a user study with 32 participants, which confirms that using our plugin can aid task completion more than web search.", "locator": "abstract", "provenance_snippet": "We evaluate this system in a user study with 32 participants, which confirms that using our plugin can aid task completion more than web search.", "source_ref": "https://doi.org/10.1145/3597503.3639187" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_2cf28785637a", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Understanding code is challenging, especially when working in new and complex development environments. Code comments and documentation can help, but are typically scarce or hard to navigate.", "theorem_proof_scaffolds": [], "title": "Using an LLM to Help With Code Understanding", "url": "https://doi.org/10.1145/3597503.3639187", "venue": "", "year": 2024 }, { "assumptions": [], "authors": [ "Murray Shanahan" ], "baseline_details": [], "bibtex": "", "citation": "Murray Shanahan (2024). Talking about Large Language Models. https://doi.org/10.1145/3624724", "claims": [], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "Interacting with a contemporary LLM-based conversational agent can create an illusion of being in the presence of a thinking creature." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "Interacting with a contemporary LLM-based conversational agent can create an illusion of being in the presence of a thinking creature.", "locator": "abstract", "provenance_snippet": "Interacting with a contemporary LLM-based conversational agent can create an illusion of being in the presence of a thinking creature.", "source_ref": "https://doi.org/10.1145/3624724" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_6b49268441f3", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Interacting with a contemporary LLM-based conversational agent can create an illusion of being in the presence of a thinking creature. Yet, in their very nature, such systems are fundamentally not like us.", "theorem_proof_scaffolds": [], "title": "Talking about Large Language Models", "url": "https://doi.org/10.1145/3624724", "venue": "", "year": 2024 }, { "assumptions": [], "authors": [ "Chunqiu Steven Xia", "Yinlin Deng", "S.M. Dunn", "Lingming Zhang" ], "baseline_details": [], "bibtex": "", "citation": "Chunqiu Steven Xia; Yinlin Deng; S.M. Dunn; Lingming Zhang (2025). Demystifying LLM-Based Software Engineering Agents. 
https://doi.org/10.1145/3715754", "claims": [ "Our results on the popular SWE-bench Lite benchmark show that surprisingly the simplistic Agentless is able to achieve both the highest performance (32.00%, 96 correct fixes) and low cost ($0.70) compared with all existing open-source software agents at the time of paper submission!", "In fact, Agentless has already been adopted by OpenAI as the go-to approach to showcase the real-world coding performance of both GPT-4o and the new o1 models; more recently, Agentless has also been used by DeepSeek to evaluate their newest DeepSeek V3 and R1 models." ], "comparator_lineage": [], "conclusions": [ "Our results on the popular SWE-bench Lite benchmark show that surprisingly the simplistic Agentless is able to achieve both the highest performance (32.00%, 96 correct fixes) and low cost ($0.70) compared with all existing open-source software agents at the time of paper submission!", "In fact, Agentless has already been adopted by OpenAI as the go-to approach to showcase the real-world coding performance of both GPT-4o and the new o1 models; more recently, Agentless has also been used by DeepSeek to evaluate their newest DeepSeek V3 and R1 models." ], "contradiction_pairs": [], "contributions": [ "Recent advancements in large language models (LLMs) have significantly advanced the automation of software development tasks, including code synthesis, program repair, and test generation." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "Recent advancements in large language models (LLMs) have significantly advanced the automation of software development tasks, including code synthesis, program repair, and test generation.", "locator": "abstract", "provenance_snippet": "Recent advancements in large language models (LLMs) have significantly advanced the automation of software development tasks, including code synthesis, program repair, and test gen", "source_ref": "https://doi.org/10.1145/3715754" }, { "atom_type": "procedure", "confidence": "medium", "content": "Our results on the popular SWE-bench Lite benchmark show that surprisingly the simplistic Agentless is able to achieve both the highest performance (32.00%, 96 correct fixes) and low cost ($0.70) compared with all existing open-source software agents at the time of paper submission!", "locator": "abstract", "provenance_snippet": "Our results on the popular SWE-bench Lite benchmark show that surprisingly the simplistic Agentless is able to achieve both the highest performance (32.00%, 96 correct fixes) and l", "source_ref": "https://doi.org/10.1145/3715754" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [ "We hope Agentless will help reset the baseline, starting point, and horizon for autonomous software agents, and inspire future work along this crucial direction." ], "id": "src_7276cb3f4e25", "key_equations": [], "limitations": [ "However, the complexity of these agent-based approaches, together with the limited abilities of current LLMs, raises the following question: Do we really have to employ complex autonomous software agents?" ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Recent advancements in large language models (LLMs) have significantly advanced the automation of software development tasks, including code synthesis, program repair, and test generation. 
More recently, researchers and industry practitioners have developed various autonomous LLM agents to perform end-to-end software development tasks.", "theorem_proof_scaffolds": [], "title": "Demystifying LLM-Based Software Engineering Agents", "url": "https://doi.org/10.1145/3715754", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Lameck Mbangula Amugongo", "Pietro Mascheroni", "Steven E. Brooks", "Stefan Doering", "Jan Seidel" ], "baseline_details": [], "bibtex": "", "citation": "Lameck Mbangula Amugongo; Pietro Mascheroni; Steven E. Brooks; Stefan Doering; Jan Seidel (2025). Retrieval augmented generation for large language models in healthcare: A systematic review. https://doi.org/10.1371/journal.pdig.0000877", "claims": [ "Large Language Models (LLMs) have demonstrated promising capabilities to solve complex tasks in critical sectors such as healthcare.", "Our synthesis shows that 78.9% of studies used English datasets and 21.1% of the datasets are in Chinese." ], "comparator_lineage": [], "conclusions": [ "Large Language Models (LLMs) have demonstrated promising capabilities to solve complex tasks in critical sectors such as healthcare.", "Our synthesis shows that 78.9% of studies used English datasets and 21.1% of the datasets are in Chinese." ], "contradiction_pairs": [], "contributions": [ "Large Language Models (LLMs) have demonstrated promising capabilities to solve complex tasks in critical sectors such as healthcare." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "Large Language Models (LLMs) have demonstrated promising capabilities to solve complex tasks in critical sectors such as healthcare.", "locator": "abstract", "provenance_snippet": "Large Language Models (LLMs) have demonstrated promising capabilities to solve complex tasks in critical sectors such as healthcare.", "source_ref": "https://doi.org/10.1371/journal.pdig.0000877" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_bf6ff4aab9fe", "key_equations": [], "limitations": [ "However, LLMs are limited by their training data which is often outdated, the tendency to generate inaccurate (\"hallucinated\") content and a lack of transparency in the content they generate.", "To address these limitations, retrieval augmented generation (RAG) grounds the responses of LLMs by exposing them to external knowledge sources.", "However, in the healthcare domain there is currently a lack of systematic understanding of which datasets, RAG methodologies and evaluation frameworks are available." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Large Language Models (LLMs) have demonstrated promising capabilities to solve complex tasks in critical sectors such as healthcare. However, LLMs are limited by their training data which is often outdated, the tendency to generate inaccurate (\"hallucinated\") content and a lack of transparency in the content they generate.", "theorem_proof_scaffolds": [], "title": "Retrieval augmented generation for large language models in healthcare: A systematic review", "url": "https://doi.org/10.1371/journal.pdig.0000877", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Kurnia Muludi", "Kaira Milani Fitria", "Joko Triloka", "Sutedi Sutedi" ], "baseline_details": [], "bibtex": "", "citation": "Kurnia Muludi; Kaira Milani Fitria; Joko Triloka; Sutedi Sutedi (2024). 
Retrieval-Augmented Generation Approach: Document Question Answering using Large Language Model. https://doi.org/10.14569/ijacsa.2024.0150379", "claims": [ "This study introduces the Retrieval Augmented Generation (RAG) method to improve Question-Answering (QA) systems by addressing document processing in Natural Language Processing problems.", "Results highlight RAG's superiority: achieving a precision of 0.74 in Recall-Oriented Understudy for Gisting Evaluation (ROUGE) testing, outperforming others at 0.5; obtaining an F1 score of 0.88 in BERTScore, surpassing other QA apps at 0.81; attaining a precision of 0.28 in Bilingual Evaluation Understudy (BLEU) testing, surpassing others with a precision of 0.09; and scoring 0.33 in Jaccard Similarity, outshining others at 0.04." ], "comparator_lineage": [], "conclusions": [ "This study introduces the Retrieval Augmented Generation (RAG) method to improve Question-Answering (QA) systems by addressing document processing in Natural Language Processing problems.", "Results highlight RAG's superiority: achieving a precision of 0.74 in Recall-Oriented Understudy for Gisting Evaluation (ROUGE) testing, outperforming others at 0.5; obtaining an F1 score of 0.88 in BERTScore, surpassing other QA apps at 0.81; attaining a precision of 0.28 in Bilingual Evaluation Understudy (BLEU) testing, surpassing others with a precision of 0.09; and scoring 0.33 in Jaccard Similarity, outshining others at 0.04." ], "contradiction_pairs": [], "contributions": [ "This study introduces the Retrieval Augmented Generation (RAG) method to improve Question-Answering (QA) systems by addressing document processing in Natural Language Processing problems." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "This study introduces the Retrieval Augmented Generation (RAG) method to improve Question-Answering (QA) systems by addressing document processing in Natural Language Processing problems.", "locator": "abstract", "provenance_snippet": "This study introduces the Retrieval Augmented Generation (RAG) method to improve Question-Answering (QA) systems by addressing document processing in Natural Language Processing pr", "source_ref": "https://doi.org/10.14569/ijacsa.2024.0150379" }, { "atom_type": "procedure", "confidence": "medium", "content": "The research evaluates RAG's that use Generative Pre-trained Transformer 3.5 or GPT-3.5-turbo from the ChatGPT model and its impact on document data processing, comparing it with other applications.", "locator": "abstract", "provenance_snippet": "The research evaluates RAG's that use Generative Pre-trained Transformer 3.5 or GPT-3.5-turbo from the ChatGPT model and its impact on document data processing, comparing it with o", "source_ref": "https://doi.org/10.14569/ijacsa.2024.0150379" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_5e2fa6bd5e8f", "key_equations": [], "limitations": [ "RAG combines search techniques in vector store and text generation mechanism developed by Large Language Models, offering a time-efficient alternative to manual reading limitations." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "This study introduces the Retrieval Augmented Generation (RAG) method to improve Question-Answering (QA) systems by addressing document processing in Natural Language Processing problems. 
It represents the latest breakthrough in applying RAG to document question and answer applications, overcoming previous QA system obstacles.", "theorem_proof_scaffolds": [], "title": "Retrieval-Augmented Generation Approach: Document Question Answering using Large Language Model", "url": "https://doi.org/10.14569/ijacsa.2024.0150379", "venue": "", "year": 2024 }, { "assumptions": [], "authors": [ "Yufeng Zhang", "Xuepeng Wang", "Lingxiang Wu", "Jinqiao Wang" ], "baseline_details": [], "bibtex": "", "citation": "Yufeng Zhang; Xuepeng Wang; Lingxiang Wu; Jinqiao Wang (2025). Enhancing Chain of Thought Prompting in Large Language Models via Reasoning Patterns. https://doi.org/10.1609/aaai.v39i24.34793", "claims": [ "Extensive experiments demonstrate that our method is more robust and consistently leads to improvements across various reasoning tasks." ], "comparator_lineage": [], "conclusions": [ "Extensive experiments demonstrate that our method is more robust and consistently leads to improvements across various reasoning tasks." ], "contradiction_pairs": [], "contributions": [ "In this paper, we propose leveraging reasoning patterns to enhance CoT prompting effectiveness." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "In this paper, we propose leveraging reasoning patterns to enhance CoT prompting effectiveness.", "locator": "abstract", "provenance_snippet": "In this paper, we propose leveraging reasoning patterns to enhance CoT prompting effectiveness.", "source_ref": "https://doi.org/10.1609/aaai.v39i24.34793" }, { "atom_type": "procedure", "confidence": "medium", "content": "Extensive experiments demonstrate that our method is more robust and consistently leads to improvements across various reasoning tasks.", "locator": "abstract", "provenance_snippet": "Extensive experiments demonstrate that our method is more robust and consistently leads to improvements across various reasoning tasks.", "source_ref": "https://doi.org/10.1609/aaai.v39i24.34793" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_8b0217b0cbdb", "key_equations": [], "limitations": [], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Chain of Thought (CoT) prompting can encourage language models to engage in multi-step logical reasoning. The quality of the provided demonstrations significantly influences the success of downstream inference tasks.", "theorem_proof_scaffolds": [], "title": "Enhancing Chain of Thought Prompting in Large Language Models via Reasoning Patterns", "url": "https://doi.org/10.1609/aaai.v39i24.34793", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Danial Kamali", "Elham J. Barezi", "Parisa Kordjamshidi" ], "baseline_details": [], "bibtex": "", "citation": "Danial Kamali; Elham J. Barezi; Parisa Kordjamshidi (2025). NeSyCoCo: A Neuro-Symbolic Concept Composer for Compositional Generalization. 
https://doi.org/10.1609/aaai.v39i4.32439", "claims": [ "Neuro-symbolic approaches have demonstrated promise in capturing compositional structures, but they face critical challenges: (a) reliance on predefined predicates for symbolic representations that limit adaptability, (b) difficulty in extracting predicates from raw data, and (c) using non-differentiable operations for combining primitive concepts.", "Our framework achieves state-of-the-art results on the ReaSCAN and CLEVR-CoGenT compositional generalization benchmarks and demonstrates robust performance with novel concepts in the CLEVR-SYN benchmark." ], "comparator_lineage": [], "conclusions": [ "Neuro-symbolic approaches have demonstrated promise in capturing compositional structures, but they face critical challenges: (a) reliance on predefined predicates for symbolic representations that limit adaptability, (b) difficulty in extracting predicates from raw data, and (c) using non-differentiable operations for combining primitive concepts.", "Our framework achieves state-of-the-art results on the ReaSCAN and CLEVR-CoGenT compositional generalization benchmarks and demonstrates robust performance with novel concepts in the CLEVR-SYN benchmark." ], "contradiction_pairs": [], "contributions": [ "To address these issues, we propose NeSyCoCo, a neuro-symbolic framework that leverages large language models (LLMs) to generate symbolic representations and map them to differentiable neural computations." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "To address these issues, we propose NeSyCoCo, a neuro-symbolic framework that leverages large language models (LLMs) to generate symbolic representations and map them to differentiable neural computations.", "locator": "abstract", "provenance_snippet": "To address these issues, we propose NeSyCoCo, a neuro-symbolic framework that leverages large language models (LLMs) to generate symbolic representations and map them to differenti", "source_ref": "https://doi.org/10.1609/aaai.v39i4.32439" }, { "atom_type": "procedure", "confidence": "medium", "content": "Our framework achieves state-of-the-art results on the ReaSCAN and CLEVR-CoGenT compositional generalization benchmarks and demonstrates robust performance with novel concepts in the CLEVR-SYN benchmark.", "locator": "abstract", "provenance_snippet": "Our framework achieves state-of-the-art results on the ReaSCAN and CLEVR-CoGenT compositional generalization benchmarks and demonstrates robust performance with novel concepts in t", "source_ref": "https://doi.org/10.1609/aaai.v39i4.32439" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_db5eb4358891", "key_equations": [], "limitations": [ "Neuro-symbolic approaches have demonstrated promise in capturing compositional structures, but they face critical challenges: (a) reliance on predefined predicates for symbolic representations that limit adaptability, (b) difficulty in extracting predicates from raw data, and (c) using non-differentiable operations for combining primitive concepts." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Compositional generalization is crucial for artificial intelligence agents to solve complex vision-language reasoning tasks. 
Neuro-symbolic approaches have demonstrated promise in capturing compositional structures, but they face critical challenges: (a) reliance on predefined predicates for symbolic representations that limit adaptability, (b) difficulty in extracting predicates from raw data, and (c) using non-differentiable operations for combining primitive concepts.", "theorem_proof_scaffolds": [], "title": "NeSyCoCo: A Neuro-Symbolic Concept Composer for Compositional Generalization", "url": "https://doi.org/10.1609/aaai.v39i4.32439", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Xin Quan", "Marco Valentino", "Louise A. Dennis", "Andr\u00e9 Freitas" ], "baseline_details": [], "bibtex": "", "citation": "Xin Quan; Marco Valentino; Louise A. Dennis; Andr\u00e9 Freitas (2024). Verification and Refinement of Natural Language Explanations through LLM-Symbolic Theorem Proving. https://doi.org/10.18653/v1/2024.emnlp-main.172", "claims": [ "Natural language explanations represent a proxy for evaluating explanation-based and multi-step Natural Language Inference (NLI) models. However, assessing the validity of explanations for NLI is challenging as it typically involves the crowd-sourcing of apposite datasets, a process that is time-consuming and prone to logical errors. To address existing limitations, this paper investigates the verification and refinement of natural language explanations through the integration of Large Language Models (LLMs) and Theorem Provers (TPs). Specifically, we present a neuro-symbolic framework, named Explanation-Refiner, that integrates TPs with LLMs to generate and formalise explanatory sentences and suggest potential inference strategies for NLI. In turn, the TP is employed to provide formal guarantees on the logical validity of the explanations and to generate feedback for subsequent improvements. We demonstrate how Explanation-Refiner can be jointly used to evaluate explanatory reasoning, autoformalisation, and error correction mechanisms of state-of-the-art LLMs as well as to automatically enhance the quality of explanations of variable complexity in different domains." ], "comparator_lineage": [], "conclusions": [ "Natural language explanations represent a proxy for evaluating explanation-based and multi-step Natural Language Inference (NLI) models. However, assessing the validity of explanations for NLI is challenging as it typically involves the crowd-sourcing of apposite datasets, a process that is time-consuming and prone to logical errors. To address existing limitations, this paper investigates the verification and refinement of natural language explanations through the integration of Large Language Models (LLMs) and Theorem Provers (TPs). Specifically, we present a neuro-symbolic framework, named Explanation-Refiner, that integrates TPs with LLMs to generate and formalise explanatory sentences and suggest potential inference strategies for NLI. In turn, the TP is employed to provide formal guarantees on the logical validity of the explanations and to generate feedback for subsequent improvements. We demonstrate how Explanation-Refiner can be jointly used to evaluate explanatory reasoning, autoformalisation, and error correction mechanisms of state-of-the-art LLMs as well as to automatically enhance the quality of explanations of variable complexity in different domains."
], "contradiction_pairs": [], "contributions": [ "Natural language explanations represent a proxy for evaluating explanation-based and multi-step Natural Language Inference (NLI) models.However, assessing the validity of explanations for NLI is challenging as it typically involves the crowd-sourcing of apposite datasets, a process that is time-consuming and prone to logical errors.To address existing limitations, this paper investigates the verification and refinement of natural language explanations through the integration of Large Language Models (LLMs) and Theorem Provers (TPs).Specifically, we present a neuro-symbolic framework, named Explanation-Refiner, that integrates TPs with LLMs to generate and formalise explanatory sentences and suggest potential inference strategies for NLI.In turn, the TP is employed to provide formal guarantees on the logical validity of the explanations and to generate feedback for subsequent improvements.We demonstrate how Explanation-Refiner can be jointly used to evaluate explanatory reasoning, autoformalisation, and error correction mechanisms of state-of-the-art LLMs as well as to automatically enhance the quality of explanations of variable complexity in different domains." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "Natural language explanations represent a proxy for evaluating explanation-based and multi-step Natural Language Inference (NLI) models.However, assessing the validity of explanations for NLI is challenging as it typically involves the crowd-sourcing of apposite datasets, a process that is time-consuming and prone to logical errors.To address existing limitations, this paper investigates the verification and refinement of natural language explanations through the integration of Large Language Models (LLMs) and Theorem Provers (TPs).Specifically, we present a neuro-symbolic framework, named Explanation-Refiner, that integrates TPs with LLMs to generate and formalise explanatory sentences and suggest potential inference strategies for NLI.In turn, the TP is employed to provide formal guarantees on the logical validity of the explanations and to generate feedback for subsequent improvements.We demonstrate how Explanation-Refiner can be jointly used to evaluate explanatory reasoning, autoformalisation, and error correction mechanisms of state-of-the-art LLMs as well as to automatically enhance the quality of explanations of variable complexity in different domains.", "locator": "abstract", "provenance_snippet": "Natural language explanations represent a proxy for evaluating explanation-based and multi-step Natural Language Inference (NLI) models.However, assessing the validity of explanati", "source_ref": "https://doi.org/10.18653/v1/2024.emnlp-main.172" }, { "atom_type": "procedure", "confidence": "medium", "content": "Natural language explanations represent a proxy for evaluating explanation-based and multi-step Natural Language Inference (NLI) models.However, assessing the validity of explanations for NLI is challenging as it typically involves the crowd-sourcing of apposite datasets, a process that is time-consuming and prone to logical errors.To address existing limitations, this paper investigates the verification and refinement of natural language explanations through the integration of Large Language Models (LLMs) and Theorem Provers (TPs).Specifically, we present a neuro-symbolic framework, named Explanation-Refiner, that integrates TPs with LLMs to generate and formalise explanatory sentences 
and suggest potential inference strategies for NLI.In turn, the TP is employed to provide formal guarantees on the logical validity of the explanations and to generate feedback for subsequent improvements.We demonstrate how Explanation-Refiner can be jointly used to evaluate explanatory reasoning, autoformalisation, and error correction mechanisms of state-of-the-art LLMs as well as to automatically enhance the quality of explanations of variable complexity in different domains.", "locator": "abstract", "provenance_snippet": "Natural language explanations represent a proxy for evaluating explanation-based and multi-step Natural Language Inference (NLI) models.However, assessing the validity of explanati", "source_ref": "https://doi.org/10.18653/v1/2024.emnlp-main.172" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_b757186159f8", "key_equations": [], "limitations": [ "Natural language explanations represent a proxy for evaluating explanation-based and multi-step Natural Language Inference (NLI) models.However, assessing the validity of explanations for NLI is challenging as it typically involves the crowd-sourcing of apposite datasets, a process that is time-consuming and prone to logical errors.To address existing limitations, this paper investigates the verification and refinement of natural language explanations through the integration of Large Language Models (LLMs) and Theorem Provers (TPs).Specifically, we present a neuro-symbolic framework, named Explanation-Refiner, that integrates TPs with LLMs to generate and formalise explanatory sentences and suggest potential inference strategies for NLI.In turn, the TP is employed to provide formal guarantees on the logical validity of the explanations and to generate feedback for subsequent improvements.We demonstrate how Explanation-Refiner can be jointly used to evaluate explanatory reasoning, autoformalisation, and error correction mechanisms of state-of-the-art LLMs as well as to automatically enhance the quality of explanations of variable complexity in different domains." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Natural language explanations represent a proxy for evaluating explanation-based and multi-step Natural Language Inference (NLI) models.However, assessing the validity of explanations for NLI is challenging as it typically involves the crowd-sourcing of apposite datasets, a process that is time-consuming and prone to logical errors.To address existing limitations, this paper investigates the verification and refinement of natural language explanations through the integration of Large Language Models (LLMs) and Theorem Provers (TPs).Specifically, we present a neuro-symbolic framework, named Explanation-Refiner, that integrates TPs with LLMs to generate and formalise explanatory sentences and suggest potential inference strategies for NLI.In turn, the TP is employed to provide formal guarantees on the logical validity of the explanations and to generate feedback for subsequent improvements.We demonstrate how Explanation-Refiner can be jointly used to evaluate explanatory reasoning, autoformalisation, and error correction mechanisms of state-of-the-art LLMs as well as to automatically enhance the quality of explanations of variable complexity in different domains. 
1", "theorem_proof_scaffolds": [], "title": "Verification and Refinement of Natural Language Explanations through LLM-Symbolic Theorem Proving", "url": "https://doi.org/10.18653/v1/2024.emnlp-main.172", "venue": "", "year": 2024 }, { "assumptions": [], "authors": [ "Weizhou Shen", "Chenliang Li", "Hongzhan Chen", "Ming Yan", "Xiaojun Quan", "Hehong Chen", "Ji Zhang", "Fei Huang" ], "baseline_details": [], "bibtex": "", "citation": "Weizhou Shen; Chenliang Li; Hongzhan Chen; Ming Yan; Xiaojun Quan; Hehong Chen; Ji Zhang; Fei Huang (2024). Small LLMs Are Weak Tool Learners: A Multi-LLM Agent. https://doi.org/10.18653/v1/2024.emnlp-main.929", "claims": [], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "Large Language Model (LLM) agents significantly extend the capabilities of standalone LLMs, empowering them to interact with external tools (e.g., APIs, functions) and complete various tasks in a self-directed fashion.The challenge of tool use demands that LLMs not only understand user queries and generate answers accurately but also excel in task planning, tool invocation, and result summarization.While traditional works focus on training a single LLM with all these capabilities, performance limitations become apparent, particularly with smaller models.To overcome these challenges, we propose a novel approach that decomposes the aforementioned capabilities into a planner, caller, and summarizer.Each component is implemented by a single LLM that focuses on a specific capability and collaborates with others to accomplish the task.This modular framework facilitates individual updates and the potential use of smaller LLMs for building each capability.To effectively train this framework, we introduce a two-stage training paradigm.First, we fine-tune a backbone LLM on the entire dataset without discriminating sub-tasks, providing the model with a comprehensive understanding of the task.Second, the fine-tuned LLM is used to instantiate the planner, caller, and summarizer respectively, which are continually fine-tuned on respective sub-tasks.Evaluation across various tool-use benchmarks illustrates that our proposed multi-LLM framework surpasses the traditional single-LLM approach, highlighting its efficacy and advantages in tool learning." 
], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "Large Language Model (LLM) agents significantly extend the capabilities of standalone LLMs, empowering them to interact with external tools (e.g., APIs, functions) and complete various tasks in a self-directed fashion.The challenge of tool use demands that LLMs not only understand user queries and generate answers accurately but also excel in task planning, tool invocation, and result summarization.While traditional works focus on training a single LLM with all these capabilities, performance limitations become apparent, particularly with smaller models.To overcome these challenges, we propose a novel approach that decomposes the aforementioned capabilities into a planner, caller, and summarizer.Each component is implemented by a single LLM that focuses on a specific capability and collaborates with others to accomplish the task.This modular framework facilitates individual updates and the potential use of smaller LLMs for building each capability.To effectively train this framework, we introduce a two-stage training paradigm.First, we fine-tune a backbone LLM on the entire dataset without discriminating sub-tasks, providing the model with a comprehensive understanding of the task.Second, the fine-tuned LLM is used to instantiate the planner, caller, and summarizer respectively, which are continually fine-tuned on respective sub-tasks.Evaluation across various tool-use benchmarks illustrates that our proposed multi-LLM framework surpasses the traditional single-LLM approach, highlighting its efficacy and advantages in tool learning.", "locator": "abstract", "provenance_snippet": "Large Language Model (LLM) agents significantly extend the capabilities of standalone LLMs, empowering them to interact with external tools (e.g., APIs, functions) and complete var", "source_ref": "https://doi.org/10.18653/v1/2024.emnlp-main.929" }, { "atom_type": "procedure", "confidence": "medium", "content": "Large Language Model (LLM) agents significantly extend the capabilities of standalone LLMs, empowering them to interact with external tools (e.g., APIs, functions) and complete various tasks in a self-directed fashion.The challenge of tool use demands that LLMs not only understand user queries and generate answers accurately but also excel in task planning, tool invocation, and result summarization.While traditional works focus on training a single LLM with all these capabilities, performance limitations become apparent, particularly with smaller models.To overcome these challenges, we propose a novel approach that decomposes the aforementioned capabilities into a planner, caller, and summarizer.Each component is implemented by a single LLM that focuses on a specific capability and collaborates with others to accomplish the task.This modular framework facilitates individual updates and the potential use of smaller LLMs for building each capability.To effectively train this framework, we introduce a two-stage training paradigm.First, we fine-tune a backbone LLM on the entire dataset without discriminating sub-tasks, providing the model with a comprehensive understanding of the task.Second, the fine-tuned LLM is used to instantiate the planner, caller, and summarizer respectively, which are continually fine-tuned on respective sub-tasks.Evaluation across various tool-use benchmarks illustrates that our proposed multi-LLM framework surpasses the traditional single-LLM approach, highlighting its efficacy and advantages in tool 
learning.", "locator": "abstract", "provenance_snippet": "Large Language Model (LLM) agents significantly extend the capabilities of standalone LLMs, empowering them to interact with external tools (e.g., APIs, functions) and complete var", "source_ref": "https://doi.org/10.18653/v1/2024.emnlp-main.929" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_6702ca722658", "key_equations": [], "limitations": [ "Large Language Model (LLM) agents significantly extend the capabilities of standalone LLMs, empowering them to interact with external tools (e.g., APIs, functions) and complete various tasks in a self-directed fashion.The challenge of tool use demands that LLMs not only understand user queries and generate answers accurately but also excel in task planning, tool invocation, and result summarization.While traditional works focus on training a single LLM with all these capabilities, performance limitations become apparent, particularly with smaller models.To overcome these challenges, we propose a novel approach that decomposes the aforementioned capabilities into a planner, caller, and summarizer.Each component is implemented by a single LLM that focuses on a specific capability and collaborates with others to accomplish the task.This modular framework facilitates individual updates and the potential use of smaller LLMs for building each capability.To effectively train this framework, we introduce a two-stage training paradigm.First, we fine-tune a backbone LLM on the entire dataset without discriminating sub-tasks, providing the model with a comprehensive understanding of the task.Second, the fine-tuned LLM is used to instantiate the planner, caller, and summarizer respectively, which are continually fine-tuned on respective sub-tasks.Evaluation across various tool-use benchmarks illustrates that our proposed multi-LLM framework surpasses the traditional single-LLM approach, highlighting its efficacy and advantages in tool learning." 
], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Large Language Model (LLM) agents significantly extend the capabilities of standalone LLMs, empowering them to interact with external tools (e.g., APIs, functions) and complete various tasks in a self-directed fashion.The challenge of tool use demands that LLMs not only understand user queries and generate answers accurately but also excel in task planning, tool invocation, and result summarization.While traditional works focus on training a single LLM with all these capabilities, performance limitations become apparent, particularly with smaller models.To overcome these challenges, we propose a novel approach that decomposes the aforementioned capabilities into a planner, caller, and summarizer.Each component is implemented by a single LLM that focuses on a specific capability and collaborates with others to accomplish the task.This modular framework facilitates individual updates and the potential use of smaller LLMs for building each capability.To effectively train this framework, we introduce a two-stage training paradigm.First, we fine-tune a backbone LLM on the entire dataset without discriminating sub-tasks, providing the model with a comprehensive understanding of the task.Second, the fine-tuned LLM is used to instantiate the planner, caller, and summarizer respectively, which are continually fine-tuned on respective sub-tasks.Evaluation across various tool-use benchmarks illustrates that our proposed multi-LLM framework surpasses the traditional single-LLM approach, highlighting its efficacy and advantages in tool learning.", "theorem_proof_scaffolds": [], "title": "Small LLMs Are Weak Tool Learners: A Multi-LLM Agent", "url": "https://doi.org/10.18653/v1/2024.emnlp-main.929", "venue": "", "year": 2024 }, { "assumptions": [], "authors": [ "Jialin Liu", "Changyu Wang", "Changyu Wang", "Siru Liu" ], "baseline_details": [], "bibtex": "", "citation": "Jialin Liu; Changyu Wang; Changyu Wang; Siru Liu (2025). Prompt Engineering in Clinical Practice: Tutorial for Clinicians. https://doi.org/10.2196/72644", "claims": [ "Large language models (LLMs), such as OpenAI's GPT series and Google's PaLM, are transforming health care by improving clinical decision-making, enhancing patient communication, and simplifying administrative tasks.", "This framework helps clinicians leverage LLMs to improve decision-making, streamline documentation, and enhance patient communication while maintaining ethical standards and ensuring patient safety." ], "comparator_lineage": [], "conclusions": [ "Large language models (LLMs), such as OpenAI's GPT series and Google's PaLM, are transforming health care by improving clinical decision-making, enhancing patient communication, and simplifying administrative tasks.", "This framework helps clinicians leverage LLMs to improve decision-making, streamline documentation, and enhance patient communication while maintaining ethical standards and ensuring patient safety." ], "contradiction_pairs": [], "contributions": [ "Large language models (LLMs), such as OpenAI's GPT series and Google's PaLM, are transforming health care by improving clinical decision-making, enhancing patient communication, and simplifying administrative tasks." 
], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "Large language models (LLMs), such as OpenAI's GPT series and Google's PaLM, are transforming health care by improving clinical decision-making, enhancing patient communication, and simplifying administrative tasks.", "locator": "abstract", "provenance_snippet": "Large language models (LLMs), such as OpenAI's GPT series and Google's PaLM, are transforming health care by improving clinical decision-making, enhancing patient communication, an", "source_ref": "https://doi.org/10.2196/72644" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_5b90080b5321", "key_equations": [], "limitations": [ "However, their performance relies heavily on prompt design, as small changes in wording or structure can greatly impact output quality.", "This presents challenges for clinicians who are not experts in natural language processing (NLP)." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Large language models (LLMs), such as OpenAI's GPT series and Google's PaLM, are transforming health care by improving clinical decision-making, enhancing patient communication, and simplifying administrative tasks. However, their performance relies heavily on prompt design, as small changes in wording or structure can greatly impact output quality.", "theorem_proof_scaffolds": [], "title": "Prompt Engineering in Clinical Practice: Tutorial for Clinicians", "url": "https://doi.org/10.2196/72644", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Xin Yang", "Jie-Jing Shao", "Lan-Zhe Guo", "B. Zhang", "Zhi Zhou", "Lin-Han Jia", "Wang-Zhou Dai", "Yu-Feng Li" ], "baseline_details": [], "bibtex": "", "citation": "Xin Yang; Jie-Jing Shao; Lan-Zhe Guo; B. Zhang; Zhi Zhou; Lin-Han Jia; Wang-Zhou Dai; Yu-Feng Li (2025). Neuro-Symbolic Artificial Intelligence: Towards Improving the Reasoning Abilities of Large Language Models. https://doi.org/10.24963/ijcai.2025/1195", "claims": [ "Large Language Models (LLMs) have shown promising results across various tasks, yet their reasoning capabilities remain a fundamental challenge.", "Then, we discuss neuro-symbolic methods for improving the reasoning capabilities of LLMs from three perspectives: Symbolic->LLM, LLM->Symbolic, and LLM+Symbolic." ], "comparator_lineage": [], "conclusions": [ "Large Language Models (LLMs) have shown promising results across various tasks, yet their reasoning capabilities remain a fundamental challenge.", "Then, we discuss neuro-symbolic methods for improving the reasoning capabilities of LLMs from three perspectives: Symbolic->LLM, LLM->Symbolic, and LLM+Symbolic." ], "contradiction_pairs": [], "contributions": [ "This paper comprehensively reviews recent developments in neuro-symbolic approaches for enhancing LLM reasoning." 
], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "This paper comprehensively reviews recent developments in neuro-symbolic approaches for enhancing LLM reasoning.", "locator": "abstract", "provenance_snippet": "This paper comprehensively reviews recent developments in neuro-symbolic approaches for enhancing LLM reasoning.", "source_ref": "https://doi.org/10.24963/ijcai.2025/1195" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_69018508daf9", "key_equations": [], "limitations": [ "Large Language Models (LLMs) have shown promising results across various tasks, yet their reasoning capabilities remain a fundamental challenge.", "Finally, we discuss several key challenges and promising future directions." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "Large Language Models (LLMs) have shown promising results across various tasks, yet their reasoning capabilities remain a fundamental challenge. Developing AI systems with strong reasoning capabilities is regarded as a crucial milestone in the pursuit of Artificial General Intelligence (AGI) and has garnered considerable attention from both academia and industry.", "theorem_proof_scaffolds": [], "title": "Neuro-Symbolic Artificial Intelligence: Towards Improving the Reasoning Abilities of Large Language Models", "url": "https://doi.org/10.24963/ijcai.2025/1195", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Seyed Mahmoud Sajjadi Mohammadabadi", "Burak Cem Kara", "Can Ey\u00fcpo\u011flu", "Can Uzay", "Mehmet Serkan Tosun", "Oktay Karaku\u015f" ], "baseline_details": [], "bibtex": "", "citation": "Seyed Mahmoud Sajjadi Mohammadabadi; Burak Cem Kara; Can Ey\u00fcpo\u011flu; Can Uzay; Mehmet Serkan Tosun; Oktay Karaku\u015f (2025). A Survey of Large Language Models: Evolution, Architectures, Adaptation, Benchmarking, Applications, Challenges, and Societal Implications. https://doi.org/10.3390/electronics14183580", "claims": [], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "This survey provides an in-depth review of large language models (LLMs), highlighting the significant paradigm shift they represent in artificial intelligence." 
], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "This survey provides an in-depth review of large language models (LLMs), highlighting the significant paradigm shift they represent in artificial intelligence.", "locator": "abstract", "provenance_snippet": "This survey provides an in-depth review of large language models (LLMs), highlighting the significant paradigm shift they represent in artificial intelligence.", "source_ref": "https://doi.org/10.3390/electronics14183580" }, { "atom_type": "procedure", "confidence": "medium", "content": "The methodology involves a thorough survey of real-world LLM applications across the scientific, engineering, healthcare, and creative sectors, coupled with a review of current benchmarks.", "locator": "abstract", "provenance_snippet": "The methodology involves a thorough survey of real-world LLM applications across the scientific, engineering, healthcare, and creative sectors, coupled with a review of current ben", "source_ref": "https://doi.org/10.3390/electronics14183580" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_c8044018d572", "key_equations": [], "limitations": [ "Ultimately, we conclude that overcoming these complex technical, economic, and social challenges necessitates collaborative advancements in adaptation, evaluation, infrastructure, and governance." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "This survey provides an in-depth review of large language models (LLMs), highlighting the significant paradigm shift they represent in artificial intelligence. Our purpose is to consolidate state-of-the-art advances in LLM design, training, adaptation, evaluation, and application for both researchers and practitioners.", "theorem_proof_scaffolds": [], "title": "A Survey of Large Language Models: Evolution, Architectures, Adaptation, Benchmarking, Applications, Challenges, and Societal Implications", "url": "https://doi.org/10.3390/electronics14183580", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "Jorge Cisneros-Gonz\u00e1lez", "Natalia Gordo-Herrera", "Iv\u00e1n Barcia-Santos", "Javier S\u00e1nchez-Soriano" ], "baseline_details": [], "bibtex": "", "citation": "Jorge Cisneros-Gonz\u00e1lez; Natalia Gordo-Herrera; Iv\u00e1n Barcia-Santos; Javier S\u00e1nchez-Soriano (2025). JorGPT: Instructor-Aided Grading of Programming Assignments with Large Language Models (LLMs). https://doi.org/10.3390/fi17060265", "claims": [ "Specifically, the reduced model using statistically significant variables demonstrates a high explanatory power, with an adjusted R2 of 0.9156 and a Mean Absolute Error of 0.4579, indicating that LLMs can effectively replicate human grading.", "The findings suggest that LLMs can automate grading when paired with human oversight, drastically reducing the instructor workload, transforming a task estimated to take more than 300 h of manual work into less than 15 min of automated processing and improving the efficiency and consistency of assessment in computer science education." 
], "comparator_lineage": [], "conclusions": [ "Specifically, the reduced model using statistically significant variables demonstrates a high explanatory power, with an adjusted R2 of 0.9156 and a Mean Absolute Error of 0.4579, indicating that LLMs can effectively replicate human grading.", "The findings suggest that LLMs can automate grading when paired with human oversight, drastically reducing the instructor workload, transforming a task estimated to take more than 300 h of manual work into less than 15 min of automated processing and improving the efficiency and consistency of assessment in computer science education." ], "contradiction_pairs": [], "contributions": [ "This paper explores the application of large language models (LLMs) to automate the evaluation of programming assignments in an undergraduate \u201cIntroduction to Programming\u201d course." ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "This paper explores the application of large language models (LLMs) to automate the evaluation of programming assignments in an undergraduate \u201cIntroduction to Programming\u201d course.", "locator": "abstract", "provenance_snippet": "This paper explores the application of large language models (LLMs) to automate the evaluation of programming assignments in an undergraduate \u201cIntroduction to Programming\u201d course.", "source_ref": "https://doi.org/10.3390/fi17060265" }, { "atom_type": "procedure", "confidence": "medium", "content": "A comparative analysis, using LLMs from OpenAI, Google, DeepSeek and ALIBABA to evaluate student code submissions, revealed a strong correlation between LLM-generated grades and those assigned by human instructors.", "locator": "abstract", "provenance_snippet": "A comparative analysis, using LLMs from OpenAI, Google, DeepSeek and ALIBABA to evaluate student code submissions, revealed a strong correlation between LLM-generated grades and th", "source_ref": "https://doi.org/10.3390/fi17060265" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_5836e3a1777e", "key_equations": [], "limitations": [ "This study addresses the challenges of manual grading, including time constraints and potential inconsistencies, by proposing a system that integrates several LLMs to streamline the assessment process." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "This paper explores the application of large language models (LLMs) to automate the evaluation of programming assignments in an undergraduate \u201cIntroduction to Programming\u201d course. 
This study addresses the challenges of manual grading, including time constraints and potential inconsistencies, by proposing a system that integrates several LLMs to streamline the assessment process.", "theorem_proof_scaffolds": [], "title": "JorGPT: Instructor-Aided Grading of Programming Assignments with Large Language Models (LLMs)", "url": "https://doi.org/10.3390/fi17060265", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "DeepSeek-AI", "Daya Guo", "Dejian Yang", "Haowei Zhang", "Junxiao Song", "Ruoyu Zhang", "Runxin Xu", "Qihao Zhu", "Shirong Ma", "Peiyi Wang", "Xiao Bi", "Xiaokang Zhang", "Xingkai Yu", "Yu Wu", "Zhenhua Wu", "Zhibin Gou", "Zhihong Shao", "Zhuoshu Li", "Ziyi Gao", "Aixin Liu", "Bing Xue", "Bingxuan Wang", "Bowen Wu", "Bei Feng", "Chengda Lu", "Chenggang Zhao", "Chengqi Deng", "Chenyu Zhang", "Chong Ruan", "Damai Dai", "Deli Chen", "Dongjie Ji", "Erhang Li", "Fangyun Lin", "Fengze Dai", "Fuli Luo", "Guangbo Hao", "Guan-Ting Chen", "Guowei Li", "Hongjun Zhang", "Han Bao", "Hanwei Xu", "Haocheng Wang", "Honghui Ding", "Huajian Xin", "Huazuo Gao", "Hui Qu", "Hui Li", "Jianzhong Guo", "Jiashi Li", "Jiawei Wang", "Jingchang Chen", "Jingyang Yuan", "Junjie Qiu", "Junlong Li", "Jiali Cai", "Jiaqi Ni", "Jian Liang", "Jing Chen", "Kai Dong", "Kai Hu", "Kaige Gao", "Kang Guan", "Kexin Huang", "Kuai Yu", "Lean Wang", "Lecong Zhang", "Liang Zhao", "Litong Wang", "Liyue Zhang", "Lei Xu", "L. Xia", "Mingchuan Zhang", "Minghua Zhang", "Minghui Tang", "Meng Li", "Miaojun Wang", "Mingming Li", "Ning Tian", "Panpan Huang", "Peng Zhang", "Qiancheng Wang", "Qinyu Chen", "Qiushi Du", "Ruiqi Ge", "Ruisong Zhang", "Rui\u2010Le Pan", "Runji Wang", "R. J. Chen", "Rong Jin", "Ruyi Chen", "Shanghao Lu", "Shangyan Zhou", "Shanhuang Chen", "Shengfeng Ye", "Shiyu Wang", "Shuiping Yu", "Shunfeng Zhou", "Shuting Pan", "Sansan Li" ], "baseline_details": [], "bibtex": "", "citation": "DeepSeek-AI; Daya Guo; Dejian Yang; Haowei Zhang; Junxiao Song; Ruoyu Zhang; Runxin Xu; Qihao Zhu; Shirong Ma; Peiyi Wang; Xiao Bi; Xiaokang Zhang; Xingkai Yu; Yu Wu; Zhenhua Wu; Zhibin Gou; Zhihong Shao; Zhuoshu Li; Ziyi Gao; Aixin Liu; Bing Xue; Bingxuan Wang; Bowen Wu; Bei Feng; Chengda Lu; Chenggang Zhao; Chengqi Deng; Chenyu Zhang; Chong Ruan; Damai Dai; Deli Chen; Dongjie Ji; Erhang Li; Fangyun Lin; Fengze Dai; Fuli Luo; Guangbo Hao; Guan-Ting Chen; Guowei Li; Hongjun Zhang; Han Bao; Hanwei Xu; Haocheng Wang; Honghui Ding; Huajian Xin; Huazuo Gao; Hui Qu; Hui Li; Jianzhong Guo; Jiashi Li; Jiawei Wang; Jingchang Chen; Jingyang Yuan; Junjie Qiu; Junlong Li; Jiali Cai; Jiaqi Ni; Jian Liang; Jing Chen; Kai Dong; Kai Hu; Kaige Gao; Kang Guan; Kexin Huang; Kuai Yu; Lean Wang; Lecong Zhang; Liang Zhao; Litong Wang; Liyue Zhang; Lei Xu; L. Xia; Mingchuan Zhang; Minghua Zhang; Minghui Tang; Meng Li; Miaojun Wang; Mingming Li; Ning Tian; Panpan Huang; Peng Zhang; Qiancheng Wang; Qinyu Chen; Qiushi Du; Ruiqi Ge; Ruisong Zhang; Rui\u2010Le Pan; Runji Wang; R. J. Chen; Rong Jin; Ruyi Chen; Shanghao Lu; Shangyan Zhou; Shanhuang Chen; Shengfeng Ye; Shiyu Wang; Shuiping Yu; Shunfeng Zhou; Shuting Pan; Sansan Li (2025). Can Open Large Language Models Catch Vulnerabilities?. 
https://doi.org/10.4230/oasics.icpec.2025.4", "claims": [], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "As Large Language Models (LLMs) become increasingly integrated into secure software development workflows, a critical question remains unanswered: can these models not only detect insecure code but also reliably classify vulnerabilities according to standardized taxonomies?" ], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "As Large Language Models (LLMs) become increasingly integrated into secure software development workflows, a critical question remains unanswered: can these models not only detect insecure code but also reliably classify vulnerabilities according to standardized taxonomies?", "locator": "abstract", "provenance_snippet": "As Large Language Models (LLMs) become increasingly integrated into secure software development workflows, a critical question remains unanswered: can these models not only detect ", "source_ref": "https://doi.org/10.4230/oasics.icpec.2025.4" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_cd0174c2a064", "key_equations": [], "limitations": [ "Moreover, we analyze model-specific biases and common failure modes, shedding light on the limitations of current LLMs in performing fine-grained security reasoning.These insights are especially relevant in educational contexts, where LLMs are being adopted as learning aids despite their limitations.", "Our results expose key challenges that must be addressed before LLMs can be reliably deployed in security-sensitive environments." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "As Large Language Models (LLMs) become increasingly integrated into secure software development workflows, a critical question remains unanswered: can these models not only detect insecure code but also reliably classify vulnerabilities according to standardized taxonomies? In this work, we conduct a systematic evaluation of three state-of-the-art LLMs - Llama3, Codestral, and Deepseek R1 - using a carefully filtered subset of the Big-Vul dataset annotated with eight representative Common Weakness Enumeration categories.", "theorem_proof_scaffolds": [], "title": "Can Open Large Language Models Catch Vulnerabilities?", "url": "https://doi.org/10.4230/oasics.icpec.2025.4", "venue": "", "year": 2025 }, { "assumptions": [], "authors": [ "David Aguero", "Scott D. Nelson" ], "baseline_details": [], "bibtex": "", "citation": "David Aguero; Scott D. Nelson (2024). The Potential Application of Large Language Models in Pharmaceutical Supply Chain Management. 
https://doi.org/10.5863/1551-6776-29.2.200", "claims": [ "Issues such as shortage mitigation, inconsistency in regulatory compliance, cost control, cold-chain storage, adaptation to technologic advances, and secure information sharing all pose significant difficulties to pharmaceutical supply chain management.1 Consequently, this adds immense pressure on health system pharmacies to enhance their arsenal of available tools.2 Pediatric hospitals in particular encounter unique challenges relating to pharmaceutical and supply shortages, which can cause delays in vital patient procedures, alterations in care protocols, and even unexpected changes in care locations. Historically, supply chain operations have been improved by using physical automation technologies and methodologies such as robotic process automation.", "LLMs can theoretically be used to help improve supply chain resilience by simulating disruption scenarios to help generate and improve risk-management plans.", "One possible way to overcome these challenges is by using an application programming interface (API) to improve the prompt." ], "comparator_lineage": [], "conclusions": [ "Issues such as shortage mitigation, inconsistency in regulatory compliance, cost control, cold-chain storage, adaptation to technologic advances, and secure information sharing all pose significant difficulties to pharmaceutical supply chain management.1 Consequently, this adds immense pressure on health system pharmacies to enhance their arsenal of available tools.2 Pediatric hospitals in particular encounter unique challenges relating to pharmaceutical and supply shortages, which can cause delays in vital patient procedures, alterations in care protocols, and even unexpected changes in care locations. Historically, supply chain operations have been improved by using physical automation technologies and methodologies such as robotic process automation.", "LLMs can theoretically be used to help improve supply chain resilience by simulating disruption scenarios to help generate and improve risk-management plans." ], "contradiction_pairs": [], "contributions": [ "The pharmaceutical supply chain is growing increasingly intricate, with various challenges arising in drug inventory management, procurement, distribution, and dispensing processes."
], "evidence_atoms": [ { "atom_type": "claim", "confidence": "medium", "content": "The pharmaceutical supply chain is growing increasingly intricate, with various challenges arising in drug inventory management, procurement, distribution, and dispensing processes.", "locator": "abstract", "provenance_snippet": "The pharmaceutical supply chain is growing increasingly intricate, with various challenges arising in drug inventory management, procurement, distribution, and dispensing processes", "source_ref": "https://doi.org/10.5863/1551-6776-29.2.200" }, { "atom_type": "procedure", "confidence": "medium", "content": "Issues such as shortage mitigation, inconsistency in regulatory compliance, cost control, cold-chain storage, adaptation to technologic advances, and secure information sharing all pose significant difficulties to pharmaceutical supply chain management.1 Consequently, this adds immense pressure on health system pharmacies to enhance their arsenal of available tools.2Pediatric hospitals in particular encounter unique challenges relating to pharmaceutical and supply shortages, which can cause delays in vital patient procedures, alterations in care protocols, and even unexpected changes in care locations.Historically, supply chain operations have been improved by using physical automation technologies and methodologies such as robotic process automation.", "locator": "abstract", "provenance_snippet": "Issues such as shortage mitigation, inconsistency in regulatory compliance, cost control, cold-chain storage, adaptation to technologic advances, and secure information sharing all", "source_ref": "https://doi.org/10.5863/1551-6776-29.2.200" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_02cef4ce5d4a", "key_equations": [], "limitations": [ "The pharmaceutical supply chain is growing increasingly intricate, with various challenges arising in drug inventory management, procurement, distribution, and dispensing processes.", "Issues such as shortage mitigation, inconsistency in regulatory compliance, cost control, cold-chain storage, adaptation to technologic advances, and secure information sharing all pose significant difficulties to pharmaceutical supply chain management.1 Consequently, this adds immense pressure on health system pharmacies to enhance their arsenal of available tools.2Pediatric hospitals in particular encounter unique challenges relating to pharmaceutical and supply shortages, which can cause delays in vital patient procedures, alterations in care protocols, and even unexpected changes in care locations.Historically, supply chain operations have been improved by using physical automation technologies and methodologies such as robotic process automation.", "Efficient operations use business intelligence to drive informed decisions regarding sourcing, pricing, and patient care, which can be facilitated either by internal teams or through vendor-provided solutions.3 While this approach does mitigate several problems, it rarely results in holistic problem resolution\u2014like a chronic illness, these issues wax and wane.Addressing these supply chain challenges is crucial to ensure that patients receive the most optimal care available.3 Emerging technologies like large language models (LLMs) and generative artificial intelligence (AI) present new opportunities to address these supply chain challenges." 
], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "paper", "summary": "The pharmaceutical supply chain is growing increasingly intricate, with various challenges arising in drug inventory management, procurement, distribution, and dispensing processes. Issues such as shortage mitigation, inconsistency in regulatory compliance, cost control, cold-chain storage, adaptation to technologic advances, and secure information sharing all pose significant difficulties to pharmaceutical supply chain management.1 Consequently, this adds immense pressure on health system pharmacies to enhance their arsenal of available tools.2Pediatric hospitals in particular encounter unique challenges relating to pharmaceutical and supply shortages, which can cause delays in vital patient procedures, alterations in care protocols, and even unexpected changes in care locations.Historically, supply chain operations have been improved by using physical automation technologies and methodologies such as robotic process automation.", "theorem_proof_scaffolds": [], "title": "The Potential Application of Large Language Models in Pharmaceutical Supply Chain Management", "url": "https://doi.org/10.5863/1551-6776-29.2.200", "venue": "", "year": 2024 }, { "assumptions": [ "Configured models/apis are available." ], "authors": [ "ExtensityAI" ], "baseline_details": [], "bibtex": "", "citation": "https://github.com/ExtensityAI/benchmark", "claims": [], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "Provides benchmark task execution scripts and config flow." ], "evidence_atoms": [ { "atom_type": "procedure", "confidence": "high", "content": "Run test.py with task flags to execute benchmark suite.", "locator": "README usage", "provenance_snippet": "python test.py --context_associations ...", "source_ref": "https://github.com/ExtensityAI/benchmark" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_2d30e7648342", "key_equations": [], "limitations": [ "Results vary with model provider/version choices." ], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "code", "summary": "Benchmark harness for SymbolicAI workflow evaluation with task flags and config artifacts.", "theorem_proof_scaffolds": [], "title": "ExtensityAI/benchmark", "url": "https://github.com/ExtensityAI/benchmark", "venue": "", "year": 2026 }, { "assumptions": [ "Configured model backends are available." ], "authors": [ "ExtensityAI" ], "baseline_details": [], "bibtex": "", "citation": "https://github.com/ExtensityAI/symbolicai", "claims": [], "comparator_lineage": [], "conclusions": [], "contradiction_pairs": [], "contributions": [ "Provides runnable framework implementation and examples." ], "evidence_atoms": [ { "atom_type": "procedure", "confidence": "high", "content": "Install and configure engines before running examples/tests.", "locator": "README", "provenance_snippet": "pip install symbolicai[all] ... configure engines", "source_ref": "https://github.com/ExtensityAI/symbolicai" } ], "extraction_completeness": "partial", "extraction_confidence": "medium", "future_work": [], "id": "src_182ca0e3292b", "key_equations": [], "limitations": [ "External model/API dependencies impact reproducibility." 
], "notation_seeds": [], "parameters": [], "procedures": [], "provenance_notes": [], "reusable_limitations": [], "source_type": "code", "summary": "Reference implementation of SymbolicAI primitives/contracts/engines with docs, tests, and packaging manifests.", "theorem_proof_scaffolds": [], "title": "ExtensityAI/symbolicai", "url": "https://github.com/ExtensityAI/symbolicai", "venue": "", "year": 2026 }, { "citation": "phase_outputs/knowledge_acquisition.json", "source_type": "other", "summary": "The full authoritative source list (60 records, one per canonical URL) is persisted in phase_outputs/knowledge_acquisition.json and mirrored in knowledge/refs.jsonl and knowledge/source_index.json.", "title": "Authoritative corpus persisted in workspace canonical artifacts", "url": "phase_outputs/knowledge_acquisition.json" } ]