[
  {
    "id": "gptbot",
    "name": "GPTBot",
    "ua": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot",
    "uaToken": "GPTBot",
    "owner": "OpenAI",
    "purpose": "training",
    "docsUrl": "https://platform.openai.com/docs/gptbot",
    "robotsDirective": "Disallow",
    "notes": "Crawls public web pages to gather data for training OpenAI foundation models. Honors robots.txt."
  },
  {
    "id": "chatgpt-user",
    "name": "ChatGPT-User",
    "ua": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot",
    "uaToken": "ChatGPT-User",
    "owner": "OpenAI",
    "purpose": "user-agent",
    "docsUrl": "https://platform.openai.com/docs/bots",
    "robotsDirective": "Allow",
    "notes": "On-demand fetcher for user actions in ChatGPT and Custom GPTs (browse, link previews). Allowing it improves citation quality in answers."
  },
  {
    "id": "oai-searchbot",
    "name": "OAI-SearchBot",
    "ua": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot",
    "uaToken": "OAI-SearchBot",
    "owner": "OpenAI",
    "purpose": "search",
    "docsUrl": "https://platform.openai.com/docs/bots",
    "robotsDirective": "Allow",
    "notes": "Indexer powering ChatGPT Search results. Allow to be discoverable in ChatGPT-driven search."
  },
  {
    "id": "claudebot",
    "name": "ClaudeBot",
    "ua": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; ClaudeBot/1.0; +claudebot@anthropic.com)",
    "uaToken": "ClaudeBot",
    "owner": "Anthropic",
    "purpose": "training",
    "docsUrl": "https://support.claude.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler",
    "robotsDirective": "Disallow",
    "notes": "Anthropic's primary crawler. Collects publicly available web content to improve Claude models. Honors robots.txt."
  },
  {
    "id": "anthropic-ai",
    "name": "anthropic-ai",
    "ua": "anthropic-ai",
    "uaToken": "anthropic-ai",
    "owner": "Anthropic",
    "purpose": "training",
    "docsUrl": "https://support.claude.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler",
    "robotsDirective": "Disallow",
    "notes": "Deprecated identifier still seen in older logs. Anthropic recommends targeting ClaudeBot instead, but keeping the rule does no harm."
  },
  {
    "id": "claude-web",
    "name": "Claude-Web",
    "ua": "Claude-Web",
    "uaToken": "Claude-Web",
    "owner": "Anthropic",
    "purpose": "training",
    "docsUrl": "https://support.claude.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler",
    "robotsDirective": "Disallow",
    "notes": "Deprecated identifier. Listed for completeness because many robots.txt files in the wild still reference it."
  },
  {
    "id": "perplexitybot",
    "name": "PerplexityBot",
    "ua": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; PerplexityBot/1.0; +https://perplexity.ai/perplexitybot)",
    "uaToken": "PerplexityBot",
    "owner": "Perplexity",
    "purpose": "search",
    "docsUrl": "https://docs.perplexity.ai/docs/resources/perplexity-crawlers",
    "robotsDirective": "Allow",
    "notes": "Indexer used to surface websites in Perplexity AI search answers. Allow to remain citable in Perplexity."
  },
  {
    "id": "perplexity-user",
    "name": "Perplexity-User",
    "ua": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Perplexity-User/1.0; +https://perplexity.ai/perplexity-user)",
    "uaToken": "Perplexity-User",
    "owner": "Perplexity",
    "purpose": "user-agent",
    "docsUrl": "https://docs.perplexity.ai/docs/resources/perplexity-crawlers",
    "robotsDirective": "Allow",
    "notes": "On-demand fetcher when a Perplexity user asks a question that requires browsing a specific page. Allow to surface as a cited source."
  },
  {
    "id": "google-extended",
    "name": "Google-Extended",
    "ua": "Google-Extended",
    "uaToken": null,
    "owner": "Google",
    "purpose": "training",
    "docsUrl": "https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers",
    "robotsDirective": "Allow",
    "notes": "Policy-only token (no separate HTTP user-agent). Controls whether Google may use your content to train Gemini and Vertex AI generative APIs. Does not affect Google Search."
  },
  {
    "id": "applebot-extended",
    "name": "Applebot-Extended",
    "ua": "Applebot-Extended",
    "uaToken": null,
    "owner": "Apple",
    "purpose": "training",
    "docsUrl": "https://support.apple.com/en-us/119829",
    "robotsDirective": "Allow",
    "notes": "Policy-only token (no separate HTTP user-agent). Controls whether Apple may use Applebot-crawled content to train Apple Intelligence foundation models. Does not affect Spotlight/Siri search inclusion."
  },
  {
    "id": "bytespider",
    "name": "Bytespider",
    "ua": "Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; Bytespider; spider-feedback@bytedance.com)",
    "uaToken": "Bytespider",
    "owner": "ByteDance",
    "purpose": "training",
    "docsUrl": "https://bytespider.bytedance.com/",
    "robotsDirective": "Disallow",
    "notes": "ByteDance's crawler, gathers data for Doubao and other ByteDance AI products. Has been reported to crawl aggressively and to occasionally ignore robots.txt."
  },
  {
    "id": "ccbot",
    "name": "CCBot",
    "ua": "CCBot/2.0 (https://commoncrawl.org/faq/)",
    "uaToken": "CCBot",
    "owner": "Common Crawl",
    "purpose": "training",
    "docsUrl": "https://commoncrawl.org/ccbot",
    "robotsDirective": "Disallow",
    "notes": "Open-data crawler whose dumps are widely used as a training corpus for LLMs. Blocking CCBot transitively reduces inclusion in many third-party training sets."
  },
  {
    "id": "mistralai-user",
    "name": "MistralAI-User",
    "ua": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; MistralAI-User/1.0; +https://docs.mistral.ai/robots)",
    "uaToken": "MistralAI-User",
    "owner": "Mistral AI",
    "purpose": "user-agent",
    "docsUrl": "https://docs.mistral.ai/robots",
    "robotsDirective": "Allow",
    "notes": "On-demand fetcher for Le Chat user actions. Not used for bulk training crawls."
  },
  {
    "id": "duckassistbot",
    "name": "DuckAssistBot",
    "ua": "DuckAssistBot/1.2; (+http://duckduckgo.com/duckassistbot.html)",
    "uaToken": "DuckAssistBot",
    "owner": "DuckDuckGo",
    "purpose": "user-agent",
    "docsUrl": "https://duckduckgo.com/duckduckgo-help-pages/results/duckassistbot",
    "robotsDirective": "Allow",
    "notes": "On-demand fetcher used when a DuckDuckGo user triggers DuckAssist. Does not crawl proactively."
  },
  {
    "id": "meta-externalagent",
    "name": "Meta-ExternalAgent",
    "ua": "meta-externalagent/1.1 (+https://developers.facebook.com/docs/sharing/webmasters/crawler)",
    "uaToken": "meta-externalagent",
    "owner": "Meta",
    "purpose": "training",
    "docsUrl": "https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/",
    "robotsDirective": "Disallow",
    "notes": "Meta's AI training and indexing crawler. Honors robots.txt."
  },
  {
    "id": "facebookbot",
    "name": "FacebookBot",
    "ua": "FacebookBot",
    "uaToken": "FacebookBot",
    "owner": "Meta",
    "purpose": "training",
    "docsUrl": "https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/",
    "robotsDirective": "Disallow",
    "notes": "Meta's crawler for training speech-recognition models on public web content. Distinct from facebookexternalhit (link previews)."
  },
  {
    "id": "amazonbot",
    "name": "Amazonbot",
    "ua": "Mozilla/5.0 (compatible; Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot) Chrome/119.0.6045.214 Safari/537.36",
    "uaToken": "Amazonbot",
    "owner": "Amazon",
    "purpose": "search",
    "docsUrl": "https://developer.amazon.com/amazonbot",
    "robotsDirective": "Allow",
    "notes": "General Amazon crawler used to improve Alexa answers and Amazon services; Amazon states data may also be used to train AI models."
  },
  {
    "id": "cohere-training-data-crawler",
    "name": "cohere-training-data-crawler",
    "ua": "cohere-training-data-crawler",
    "uaToken": "cohere-training-data-crawler",
    "owner": "Cohere",
    "purpose": "training",
    "docsUrl": "https://docs.cohere.com/docs/cohere-training-data-crawler",
    "robotsDirective": "Disallow",
    "notes": "Collects training data for Cohere's enterprise LLMs. Honors robots.txt."
  },
  {
    "id": "diffbot",
    "name": "Diffbot",
    "ua": "Mozilla/5.0 (compatible; Diffbot/0.1; +http://www.diffbot.com/our-apis/crawler/)",
    "uaToken": "Diffbot",
    "owner": "Diffbot",
    "purpose": "training",
    "docsUrl": "https://docs.diffbot.com/docs/does-crawl-respect-robotstxt",
    "robotsDirective": "Disallow",
    "notes": "Extracts structured data sold to third parties, including for AI training pipelines. Honors robots.txt."
  },
  {
    "id": "petalbot",
    "name": "PetalBot",
    "ua": "Mozilla/5.0 (Linux; Android 7.0;) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; PetalBot;+https://webmaster.petalsearch.com/site/petalbot)",
    "uaToken": "PetalBot",
    "owner": "Huawei",
    "purpose": "search",
    "docsUrl": "https://aspiegel.com/petalbot",
    "robotsDirective": "Allow",
    "notes": "Indexer behind Petal Search and Huawei AI assistant Celia. Content reportedly used for AI training too."
  },
  {
    "id": "youbot",
    "name": "YouBot",
    "ua": "Mozilla/5.0 (compatible; YouBot (+http://www.you.com))",
    "uaToken": "YouBot",
    "owner": "You.com",
    "purpose": "search",
    "docsUrl": "https://about.you.com/youbot/",
    "robotsDirective": "Allow",
    "notes": "Indexer for the You.com AI search engine. Allow to remain citable in You.com's grounded answers."
  },
  {
    "id": "timpibot",
    "name": "Timpibot",
    "ua": "Mozilla/5.0 (compatible; Timpibot/0.8; +https://www.timpi.io/)",
    "uaToken": "Timpibot",
    "owner": "Timpi",
    "purpose": "search",
    "docsUrl": "https://www.timpi.io/timpibot",
    "robotsDirective": "Allow",
    "notes": "Decentralized AI search. Lower traffic than the big four but aims to be a third-party citation source."
  },
  {
    "id": "googleother",
    "name": "GoogleOther",
    "ua": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GoogleOther) Chrome/W.X.Y.Z Safari/537.36",
    "uaToken": "GoogleOther",
    "owner": "Google",
    "purpose": "training",
    "docsUrl": "https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers",
    "robotsDirective": "Allow",
    "notes": "Catch-all crawler used by Google product teams for non-Search use cases (R&D, internal tooling, occasional model training). Distinct from Google-Extended which is the policy-only token for generative APIs."
  },
  {
    "id": "omgilibot",
    "name": "Omgilibot",
    "ua": "Omgilibot/0.13 +http://www.omgili.com",
    "uaToken": "Omgili",
    "owner": "Webz.io",
    "purpose": "training",
    "docsUrl": "https://docs.webz.io/docs/our-crawler",
    "robotsDirective": "Disallow",
    "notes": "Crawler behind Webz.io's data feeds, widely sold to LLM training pipelines. Honors robots.txt; the legacy `Omgili` UA is sometimes seen alongside."
  }
]