[ { "id": "gptbot", "name": "GPTBot", "ua": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot", "uaToken": "GPTBot", "owner": "OpenAI", "purpose": "training", "docsUrl": "https://platform.openai.com/docs/gptbot", "robotsDirective": "Disallow", "notes": "Crawls public web pages to gather data for training OpenAI foundation models. Honors robots.txt." }, { "id": "chatgpt-user", "name": "ChatGPT-User", "ua": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot", "uaToken": "ChatGPT-User", "owner": "OpenAI", "purpose": "user-agent", "docsUrl": "https://platform.openai.com/docs/bots", "robotsDirective": "Allow", "notes": "On-demand fetcher for user actions in ChatGPT and Custom GPTs (browse, link previews). Allowing it improves citation quality in answers." }, { "id": "oai-searchbot", "name": "OAI-SearchBot", "ua": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot", "uaToken": "OAI-SearchBot", "owner": "OpenAI", "purpose": "search", "docsUrl": "https://platform.openai.com/docs/bots", "robotsDirective": "Allow", "notes": "Indexer powering ChatGPT Search results. Allow to be discoverable in ChatGPT-driven search." }, { "id": "claudebot", "name": "ClaudeBot", "ua": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; ClaudeBot/1.0; +claudebot@anthropic.com)", "uaToken": "ClaudeBot", "owner": "Anthropic", "purpose": "training", "docsUrl": "https://support.claude.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler", "robotsDirective": "Disallow", "notes": "Anthropic's primary crawler. Collects publicly available web content to improve Claude models. Honors robots.txt." 
}, { "id": "anthropic-ai", "name": "anthropic-ai", "ua": "anthropic-ai", "uaToken": "anthropic-ai", "owner": "Anthropic", "purpose": "training", "docsUrl": "https://support.claude.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler", "robotsDirective": "Disallow", "notes": "Deprecated identifier still seen in older logs. Anthropic recommends targeting ClaudeBot instead, but keeping the rule does no harm." }, { "id": "claude-web", "name": "Claude-Web", "ua": "Claude-Web", "uaToken": "Claude-Web", "owner": "Anthropic", "purpose": "training", "docsUrl": "https://support.claude.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler", "robotsDirective": "Disallow", "notes": "Deprecated identifier. Listed for completeness because many robots.txt files in the wild still reference it." }, { "id": "perplexitybot", "name": "PerplexityBot", "ua": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; PerplexityBot/1.0; +https://perplexity.ai/perplexitybot)", "uaToken": "PerplexityBot", "owner": "Perplexity", "purpose": "search", "docsUrl": "https://docs.perplexity.ai/docs/resources/perplexity-crawlers", "robotsDirective": "Allow", "notes": "Indexer used to surface websites in Perplexity AI search answers. Allow to remain citable in Perplexity." }, { "id": "perplexity-user", "name": "Perplexity-User", "ua": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Perplexity-User/1.0; +https://perplexity.ai/perplexity-user)", "uaToken": "Perplexity-User", "owner": "Perplexity", "purpose": "user-agent", "docsUrl": "https://docs.perplexity.ai/docs/resources/perplexity-crawlers", "robotsDirective": "Allow", "notes": "On-demand fetcher when a Perplexity user asks a question that requires browsing a specific page. Allow to surface as a cited source." 
}, { "id": "google-extended", "name": "Google-Extended", "ua": "Google-Extended", "uaToken": null, "owner": "Google", "purpose": "training", "docsUrl": "https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers", "robotsDirective": "Allow", "notes": "Policy-only token (no separate HTTP user-agent). Controls whether Google may use your content to train Gemini and Vertex AI generative APIs. Does not affect Google Search." }, { "id": "applebot-extended", "name": "Applebot-Extended", "ua": "Applebot-Extended", "uaToken": null, "owner": "Apple", "purpose": "training", "docsUrl": "https://support.apple.com/en-us/119829", "robotsDirective": "Allow", "notes": "Policy-only token (no separate HTTP user-agent). Controls whether Apple may use Applebot-crawled content to train Apple Intelligence foundation models. Does not affect Spotlight/Siri search inclusion." }, { "id": "bytespider", "name": "Bytespider", "ua": "Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; Bytespider; spider-feedback@bytedance.com)", "uaToken": "Bytespider", "owner": "ByteDance", "purpose": "training", "docsUrl": "https://bytespider.bytedance.com/", "robotsDirective": "Disallow", "notes": "ByteDance's crawler, which gathers data for Doubao and other ByteDance AI products. Has been reported to crawl aggressively and to occasionally ignore robots.txt." }, { "id": "ccbot", "name": "CCBot", "ua": "CCBot/2.0 (https://commoncrawl.org/faq/)", "uaToken": "CCBot", "owner": "Common Crawl", "purpose": "training", "docsUrl": "https://commoncrawl.org/ccbot", "robotsDirective": "Disallow", "notes": "Open-data crawler whose dumps are widely used as a training corpus for LLMs. Blocking CCBot transitively reduces inclusion in many third-party training sets." 
}, { "id": "mistralai-user", "name": "MistralAI-User", "ua": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; MistralAI-User/1.0; +https://docs.mistral.ai/robots)", "uaToken": "MistralAI-User", "owner": "Mistral AI", "purpose": "user-agent", "docsUrl": "https://docs.mistral.ai/robots", "robotsDirective": "Allow", "notes": "On-demand fetcher for Le Chat user actions. Not used for bulk training crawls." }, { "id": "duckassistbot", "name": "DuckAssistBot", "ua": "DuckAssistBot/1.2; (+http://duckduckgo.com/duckassistbot.html)", "uaToken": "DuckAssistBot", "owner": "DuckDuckGo", "purpose": "user-agent", "docsUrl": "https://duckduckgo.com/duckduckgo-help-pages/results/duckassistbot", "robotsDirective": "Allow", "notes": "On-demand fetcher used when a DuckDuckGo user triggers DuckAssist. Does not crawl proactively." }, { "id": "meta-externalagent", "name": "Meta-ExternalAgent", "ua": "meta-externalagent/1.1 (+https://developers.facebook.com/docs/sharing/webmasters/crawler)", "uaToken": "meta-externalagent", "owner": "Meta", "purpose": "training", "docsUrl": "https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/", "robotsDirective": "Disallow", "notes": "Meta's AI training and indexing crawler. Honors robots.txt." }, { "id": "facebookbot", "name": "FacebookBot", "ua": "FacebookBot", "uaToken": "FacebookBot", "owner": "Meta", "purpose": "training", "docsUrl": "https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/", "robotsDirective": "Disallow", "notes": "Meta's crawler for training speech-recognition models on public web content. Distinct from facebookexternalhit (link previews)." 
}, { "id": "amazonbot", "name": "Amazonbot", "ua": "Mozilla/5.0 (compatible; Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot) Chrome/119.0.6045.214 Safari/537.36", "uaToken": "Amazonbot", "owner": "Amazon", "purpose": "search", "docsUrl": "https://developer.amazon.com/amazonbot", "robotsDirective": "Allow", "notes": "General Amazon crawler used to improve Alexa answers and Amazon services; Amazon states data may also be used to train AI models." }, { "id": "cohere-training-data-crawler", "name": "cohere-training-data-crawler", "ua": "cohere-training-data-crawler", "uaToken": "cohere-training-data-crawler", "owner": "Cohere", "purpose": "training", "docsUrl": "https://docs.cohere.com/docs/cohere-training-data-crawler", "robotsDirective": "Disallow", "notes": "Collects training data for Cohere's enterprise LLMs. Honors robots.txt." }, { "id": "diffbot", "name": "Diffbot", "ua": "Mozilla/5.0 (compatible; Diffbot/0.1; +http://www.diffbot.com/our-apis/crawler/)", "uaToken": "Diffbot", "owner": "Diffbot", "purpose": "training", "docsUrl": "https://docs.diffbot.com/docs/does-crawl-respect-robotstxt", "robotsDirective": "Disallow", "notes": "Extracts structured data sold to third parties, including for AI training pipelines. Honors robots.txt." }, { "id": "petalbot", "name": "PetalBot", "ua": "Mozilla/5.0 (Linux; Android 7.0;) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; PetalBot;+https://webmaster.petalsearch.com/site/petalbot)", "uaToken": "PetalBot", "owner": "Huawei", "purpose": "search", "docsUrl": "https://aspiegel.com/petalbot", "robotsDirective": "Allow", "notes": "Indexer behind Petal Search and Huawei's AI assistant Celia. Content is reportedly used for AI training too." 
}, { "id": "youbot", "name": "YouBot", "ua": "Mozilla/5.0 (compatible; YouBot (+http://www.you.com))", "uaToken": "YouBot", "owner": "You.com", "purpose": "search", "docsUrl": "https://about.you.com/youbot/", "robotsDirective": "Allow", "notes": "Indexer for the You.com AI search engine. Allow to remain citable in You.com's grounded answers." }, { "id": "timpibot", "name": "Timpibot", "ua": "Mozilla/5.0 (compatible; Timpibot/0.8; +https://www.timpi.io/)", "uaToken": "Timpibot", "owner": "Timpi", "purpose": "search", "docsUrl": "https://www.timpi.io/timpibot", "robotsDirective": "Allow", "notes": "Indexer for Timpi's decentralized AI search engine. Lower traffic than the largest AI search providers, but it aims to be a third-party citation source." }, { "id": "googleother", "name": "GoogleOther", "ua": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GoogleOther) Chrome/W.X.Y.Z Safari/537.36", "uaToken": "GoogleOther", "owner": "Google", "purpose": "training", "docsUrl": "https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers", "robotsDirective": "Allow", "notes": "Catch-all crawler used by Google product teams for non-Search use cases (R&D, internal tooling, occasional model training). Distinct from Google-Extended which is the policy-only token for generative APIs." }, { "id": "omgilibot", "name": "Omgilibot", "ua": "Omgilibot/0.13 +http://www.omgili.com", "uaToken": "Omgili", "owner": "Webz.io", "purpose": "training", "docsUrl": "https://docs.webz.io/docs/our-crawler", "robotsDirective": "Disallow", "notes": "Crawler behind Webz.io's data feeds, widely sold to LLM training pipelines. Honors robots.txt; the legacy `Omgili` UA is sometimes seen alongside." } ]