{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"\\nimport requests as req\\n\\ndef get_plausible_url(domain):\\n yield 'https://www.{}'.format(domain)\\n yield 'https://{}'.format(domain)\\n yield 'http://{}'.format(domain)\\n yield 'http://www.{}'.format(domain)\\n\\nwith open('top10000.txt') as alexa_list:\\n index = 1\\n for website in alexa_list:\\n website = website.strip()\\n\\n for url in get_plausible_url(website):\\n robotstxt_url = '{}/robots.txt'.format(url)\\n print('[{}] downloading {}'.format(index, robotstxt_url))\\n try:\\n response = req.get(robotstxt_url, timeout=5)\\n except:\\n print('[{}] download failed for {}'.format(index, robotstxt_url))\\n continue\\n else:\\n with open('robotstxt_new/{}'.format(website), 'wb') as f:\\n f.write(response.content)\\n print('[{}] written {}'.format(index, robotstxt_url))\\n break\\n\\n index += 1\\n\""
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''\n",
"import requests as req\n",
"\n",
"def get_plausible_url(domain):\n",
" yield 'https://www.{}'.format(domain)\n",
" yield 'https://{}'.format(domain)\n",
" yield 'http://{}'.format(domain)\n",
" yield 'http://www.{}'.format(domain)\n",
"\n",
"with open('top10000.txt') as alexa_list:\n",
" index = 1\n",
" for website in alexa_list:\n",
" website = website.strip()\n",
"\n",
" for url in get_plausible_url(website):\n",
" robotstxt_url = '{}/robots.txt'.format(url)\n",
" print('[{}] downloading {}'.format(index, robotstxt_url))\n",
" try:\n",
" response = req.get(robotstxt_url, timeout=5)\n",
" except:\n",
" print('[{}] download failed for {}'.format(index, robotstxt_url))\n",
" continue\n",
" else:\n",
" with open('robotstxt_new/{}'.format(website), 'wb') as f:\n",
" f.write(response.content)\n",
" print('[{}] written {}'.format(index, robotstxt_url))\n",
" break\n",
"\n",
" index += 1\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"No of websites : 965\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"error occured while parsing beeg.com\n",
"error occured while parsing drive2.ru\n",
"error occured while parsing media.tumblr.com\n",
"error occured while parsing epochtimes.com\n",
"error occured while parsing codepen.io\n",
"error occured while parsing istockphoto.com\n",
"error occured while parsing amazonaws.com\n",
"error occured while parsing myway.com\n",
"error occured while parsing thewhizmarketing.com\n",
"error occured while parsing bancodevenezuela.com\n",
"error occured while parsing billdesk.com\n",
"error occured while parsing ozon.ru\n",
"error occured while parsing banesconline.com\n",
"error occured while parsing leboncoin.fr\n",
"error occured while parsing sportzbonanza.com\n",
"error occured while parsing tabelog.com\n",
"error occured while parsing ltn.com.tw\n",
"error occured while parsing taleo.net\n",
"error occured while parsing bet9ja.com\n",
"error occured while parsing aparat.com\n",
"error occured while parsing yy.com\n",
"error occured while parsing alipay.com\n",
"error occured while parsing jqw.com\n",
"error occured while parsing jd.hk\n",
"error occured while parsing t.me\n",
"error occured while parsing varzesh3.com\n",
"error occured while parsing wiktionary.org\n",
"error occured while parsing farsnews.com\n",
"error occured while parsing citi.com\n",
"error occured while parsing 126.com\n",
"error occured while parsing nga.cn\n",
"error occured while parsing justdial.com\n",
"error occured while parsing lordfilms.tv\n",
"error occured while parsing kissanime.ru\n",
"error occured while parsing onlinesbi.com\n",
"error occured while parsing telegram.org\n",
"error occured while parsing xinhuanet.com\n",
"error occured while parsing huffpost.com\n",
"error occured while parsing jianshu.com\n",
"error occured while parsing kissasian.sh\n",
"error occured while parsing incometaxindiaefiling.gov.in\n",
"error occured while parsing qualtrics.com\n",
"error occured while parsing gamespot.com\n",
"error occured while parsing zcool.com.cn\n",
"error occured while parsing epfindia.gov.in\n",
"error occured while parsing macys.com\n",
"error occured while parsing notifications.website\n",
"error occured while parsing force.com\n",
"error occured while parsing irctc.co.in\n",
"error occured while parsing nintendo.com\n",
"error occured while parsing investing.com\n",
"error occured while parsing exhentai.org\n",
"error occured while parsing tencent.com\n",
"error occured while parsing uptobox.com\n",
"error occured while parsing ptt.cc\n",
"error occured while parsing rpgmasterleague.com\n",
"error occured while parsing grammarly.com\n",
"error occured while parsing acs.org\n",
"error occured while parsing 178.com\n",
"error occured while parsing w3school.com.cn\n",
"error occured while parsing storiespace.com\n",
"error occured while parsing delta.com\n",
"error occured while parsing eastday.com\n",
"error occured while parsing redfin.com\n",
"error occured while parsing godaddy.com\n",
"error occured while parsing office365.com\n",
"error occured while parsing youdao.com\n",
"error occured while parsing animeflv.net\n",
"error occured while parsing 360.com\n",
"error occured while parsing uidai.gov.in\n",
"error occured while parsing syosetu.com\n",
"error occured while parsing dangdang.com\n",
"error occured while parsing herokuapp.com\n",
"error occured while parsing naukri.com\n",
"error occured while parsing cnzz.com\n",
"error occured while parsing yahoo.co.jp\n",
"error occured while parsing sciencedirect.com\n",
"error occured while parsing mobile01.com\n",
"error occured while parsing caixa.gov.br\n",
"error occured while parsing afreecatv.com\n",
"error occured while parsing sina.com.cn\n",
"error occured while parsing binance.com\n",
"error occured while parsing wikimedia.org\n",
"error occured while parsing wattpad.com\n",
"error occured while parsing abola.pt\n",
"error occured while parsing prnt.sc\n",
"error occured while parsing live.com\n",
"error occured while parsing gmarket.co.kr\n",
"error occured while parsing notify-service.com\n",
"error occured while parsing rednet.cn\n",
"error occured while parsing line.me\n",
"error occured while parsing gamersky.com\n",
"error occured while parsing mercari.com\n",
"error occured while parsing azure.com\n",
"error occured while parsing heroesofrpg.com\n",
"error occured while parsing zhaopin.com\n",
"error occured while parsing yespornplease.com\n",
"error occured while parsing akoam.net\n",
"error occured while parsing zhibo8.cc\n",
"error occured while parsing intoday.in\n",
"error occured while parsing jb51.net\n",
"error occured while parsing naver.jp\n",
"error occured while parsing bestbuy.com\n",
"error occured while parsing blog.me\n",
"error occured while parsing getawesome1.com\n",
"error occured while parsing bankmellat.ir\n",
"error occured while parsing crptgate.com\n",
"error occured while parsing panda.tv\n",
"error occured while parsing dspmulti.com\n",
"error occured while parsing myshopify.com\n",
"error occured while parsing hatenablog.com\n",
"error occured while parsing fc2.com\n",
"error occured while parsing japanpost.jp\n",
"error occured while parsing patria.org.ve\n",
"error occured while parsing joins.com\n",
"error occured while parsing jooble.org\n",
"error occured while parsing poe.trade\n",
"error occured while parsing asos.com\n",
"error occured while parsing canada.ca\n",
"error occured while parsing brilio.net\n",
"error occured while parsing drudgereport.com\n",
"error occured while parsing aliyun.com\n",
"error occured while parsing myanmarload.com\n",
"error occured while parsing book118.com\n",
"error occured while parsing wikipedia.org\n",
"error occured while parsing pchome.com.tw\n",
"error occured while parsing mayoclinic.org\n",
"error occured while parsing caijing.com.cn\n",
"error occured while parsing hh.ru\n",
"error occured while parsing siteadvisor.com\n",
"error occured while parsing accuweather.com\n",
"error occured while parsing hp.com\n",
"error occured while parsing office.com\n"
]
}
],
"source": [
"from os import listdir\n",
"from os.path import isfile, join\n",
"import pandas as pd\n",
"from termcolor import colored\n",
"import sys\n",
"\n",
"PATH_TO_DIR = './robotstxt'\n",
"files = (f for f in listdir(PATH_TO_DIR) if isfile(join(PATH_TO_DIR, f)))\n",
"\n",
"user_agents = set()\n",
"directive_count = {}\n",
"count_per_website = {}\n",
"\n",
"def parse_robotstxt(content):\n",
" directives_on_this_website = set()\n",
" \n",
" lines = content.split('\\n')\n",
" for line in lines:\n",
" hash_pos = line.find('#')\n",
" if hash_pos != -1:\n",
" line = line[0: hash_pos].strip()\n",
"\n",
" line = line.strip()\n",
" if not line:\n",
" continue\n",
"\n",
" line = line.strip()\n",
" field, value = line.split(':', 1)\n",
" field = field.strip().lower()\n",
" value = value.strip()\n",
"\n",
" if field not in directive_count:\n",
" directive_count[field] = 0\n",
" directive_count[field] += 1\n",
" \n",
" if field not in directives_on_this_website:\n",
" directives_on_this_website.add(field)\n",
" if field not in count_per_website:\n",
" count_per_website[field] = 0\n",
" count_per_website[field] += 1\n",
" \n",
" if field == 'user-agent':\n",
" user_agents.add(value)\n",
"\n",
" \n",
"website_count = 0\n",
"for filename in files:\n",
" with open('robotstxt/{}'.format(filename), 'r') as f:\n",
" website_count += 1\n",
" try:\n",
" content = f.read()\n",
" parse_robotstxt(content)\n",
" except:\n",
" print(\"error occured while parsing {}\".format(filename), file=sys.stderr)\n",
"print(\"No of websites : {}\".format(website_count))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" User Agents (Distinct)\n",
"0 sitecheck.internetseer.com\n",
"1 Bookmark search tool\n",
"2 Googlebot-Image\n",
"3 Telesoft\n",
"4 libwww\n",
"5 gsa-crawler\n",
"6 OmtrBot/1.0\n",
"7 Xenu Link Sleuth/1.3.8\n",
"8 WebBandit\n",
"9 TelegramBot\n",
"10 proxem\n",
"11 BingPreview\n",
"12 archive.org\n",
"13 Baiduspider-video\n",
"14 Teleport\n",
"15 NetResearchServer*\n",
"16 AmiSoftware\n",
"17 BunnySlippers\n",
"18 Pipl\n",
"19 mozDex*\n",
"20 AdsBot-Google-Mobile\n",
"21 wotbox\n",
"22 008\n",
"23 aibang-bot Disallow: /\n",
"24 grapeshot\n",
"25 Niki-Bot\n",
"26 ScoutJet\n",
"27 www.aibang.com Disallow: /\n",
"28 YaDirectFetcher\n",
"29 coccoc\n",
"30 WeSEE_Bot\n",
"31 Yandex\n",
"32 NICErsPRO\n",
"33 linko\n",
"34 naverbot\n",
"35 gsa-crawler-www\n",
"36 Riddlerbot\n",
"37 Naverbot\n",
"38 FAST\n",
"39 msnbot-mobile\n",
"40 Flipboard\n",
"41 Mozilla/4.0 (compatible; MSIE 4.01; Windows NT...\n",
"42 Pinterestbot\n",
"43 Maxthon\n",
"44 Cision\n",
"45 Rome Client (http://tinyurl.com/64t5n) Ver: 0.9\n",
"46 fr_crawler\n",
"47 MIIxpc\n",
"48 Foobot\n",
"49 Bullseye/1.0\n",
"50 Cincobot\n",
"51 Digimind\n",
"52 infoseek\n",
"53 FlipboardProxy\n",
"54 Sogou web spider/4.0\n",
"55 CherryPickerElite/1.0\n",
"56 zoomRank/2.0\n",
"57 alexa site audit\n",
"58 facebookexternalhit\n",
"59 Superfeedr bot/2.0\n",
"60 LinkWalker\n",
"61 Yahoo-MMCrawler\n",
"62 Sogou web spider/3.0\n",
"63 LinkedInBot\n",
"64 JennyBot\n",
"65 vsw\n",
"66 Bitvorebot\n",
"67 Sogou\n",
"68 YodaoBot\n",
"69 Yeti\n",
"70 WebAlta Crawler\n",
"71 auramundi\n",
"72 StackRambler\n",
"73 googlebot-news\n",
"74 DuckDuckbot\n",
"75 Gigabot\n",
"76 Mozilla/5.0 (compatible; Sosospider/2.0; +http...\n",
"77 Googlebot-News\n",
"78 voltron\n",
"79 Rome Client\n",
"80 Openbot\n",
"81 URLy Warning\n",
"82 GetRight/4.2\n",
"83 Uptimebot\n",
"84 BBot\n",
"85 Aqua_Products\n",
"86 looksmart\n",
"87 Mediapartners-Google*\n",
"88 Baiduspider\n",
"89 omgili\n",
"90 KDDI-Googlebot-Mobile\n",
"91 Exabot/3.0\n",
"92 JikeSpider\n",
"93 360spider\n",
"94 googlebot_news\n",
"95 flipboard\n",
"96 Feedly\n",
"97 adequat-systems\n",
"98 sosobot\n",
"99 Screaming Frog SEO Spider\n",
"100 msrbot\n",
"101 NPBot-1/2.0\n",
"102 WikiDo\n",
"103 omgili/0.5 +https://omgili.com\n",
"104 HTTrack\n",
"105 MS Search 4.0 Robot\n",
"106 Baiduspider+(+http://www.baidu.com/search/spid...\n",
"107 BackDoorBot/1.0\n",
"108 HTTrack 3.0\n",
"109 NetAnts\n",
"110 FunWebProducts\n",
"111 BUbiNG\n",
"112 Rambler\n",
"113 Clickagy Intelligence\n",
"114 test-url\n",
"115 bingbot\n",
"116 Baiduspider+\n",
"117 Sogou spider2\n",
"118 Mozilla/5.0 (compatible; heritrix/3.2.0 +http:...\n",
"119 Wget*\n",
"120 httplib\n",
"121 HaosouSpider\n",
"122 omgilibot\n",
"123 psbot\n",
"124 Tailrank\n",
"125 Mozilla/4.0 (compatible; BullsEye; Windows 95)\n",
"126 proximic\n",
"127 Mozilla/4.0 (compatible; MSIE 4.01; Windows NT...\n",
"128 CCBot/2.0 (http://commoncrawl.org/faq/)\n",
"129 YandexOntoDBAPI\n",
"130 MS Search 6.0 Robot\n",
"131 YandexBot\n",
"132 Genieo/1.0\n",
"133 Mata Hari\n",
"134 Zeus 32297 Webster Pro V2.9 Win32\n",
"135 Superfeedr\n",
"136 TurnitinBot*\n",
"137 adequat\n",
"138 yahoo-mmcrawler\n",
"139 BrandONbot\n",
"140 Bingbot\n",
"141 aibangbot Disallow: /\n",
"142 EasouSpider\n",
"143 netEstate NE Crawler\n",
"144 YandexSitelinks\n",
"145 LexiBot\n",
"146 Sogou blog\n",
"147 Ocelli\n",
"148 Flaming AttackBot\n",
"149 lwp-trivial\n",
"150 ZyBORG\n",
"151 Yisouspider\n",
"152 Crescent Internet ToolPak HTTP OLE Control v.1.0\n",
"153 TurnitinBot/1.5\n",
"154 NPBot\n",
"155 trendkite-akashic-crawler\n",
"156 YoudaoBot\n",
"157 Meltawer\n",
"158 VCI WebViewer VCI WebViewer Win32\n",
"159 bender\n",
"160 Sogou Pic Spider/3.0(+http://www.sogou.com/doc...\n",
"161 ichiro/mobile goo\n",
"162 LNSpiderguy\n",
"163 NimbleCrawler\n",
"164 CNCDialer\n",
"165 Botify\n",
"166 adidxbot\n",
"167 Exabot\n",
"168 Youmag\n",
"169 koubei.com Disallow: /\n",
"170 Powermarks\n",
"171 URL_Spider_Pro\n",
"172 MnoGoSearch/*\n",
"173 Mozilla/5.0 (compatible; bnf.fr_bot; +http://w...\n",
"174 Speedy\n",
"175 lwp-trivial/1.34\n",
"176 YandexImages\n",
"177 WebmasterWorld Extractor\n",
"178 discoverybot/2.0\n",
"179 WWW-Collector-E\n",
"180 ConveraCrawler\n",
"181 WebSnake\n",
"182 google-hoteladsverifier\n",
"183 Snarfer/1.0.2\n",
"184 b2w/0.1\n",
"185 spotter\n",
"186 vecteurplus\n",
"187 ProPowerBot/2.14\n",
"188 Nutch\n",
"189 Daumoa\n",
"190 Microsoft URL Control - 6.00.8169\n",
"191 sistrix\n",
"192 5erue\n",
"193 googlebot-mobile\n",
"194 Googlebot-Mobile\n",
"195 STC-bot\n",
"196 Clickagy*\n",
"197 BlogSearch/2 +http://www.icerocket.com/\n",
"198 EmailCollector\n",
"199 ToutiaoSpider\n",
"200 Acunetix Web Vulnerability Scanner\n",
"201 Bender\n",
"202 Augure\n",
"203 moatbot\n",
"204 EmailSiphon\n",
"205 humanlinks\n",
"206 PerMan\n",
"207 Baiduspider-mobile\n",
"208 Archive-It\n",
"209 Sogou News Spider\n",
"210 ia_archiver/1.6\n",
"211 ExtractorPro\n",
"212 BotALot\n",
"213 Xenu's Link Sleuth 1.1c\n",
"214 A6-Indexer/1.0\n",
"215 Wget\n",
"216 exabot\n",
"217 Teoma\n",
"218 EmailWolf\n",
"219 DeepCrawl\n",
"220 Gort\n",
"221 Uniscan\n",
"222 Moreover\n",
"223 URL Control\n",
"224 MauiBot\n",
"225 SearchmetricsBot\n",
"226 PortalBSpider\n",
"227 Sitereport\n",
"228 baiduspider\n",
"229 TheNomad\n",
"230 ChinasoSpider\n",
"231 KSCrawler\n",
"232 NetinfoBot\n",
"233 FairAd Client\n",
"234 DotBot\n",
"235 Yahoo! Slurp\n",
"236 DISCo Pump\n",
"237 SputnikBot\n",
"238 LinkextractorPro\n",
"239 SocSciBot\n",
"240 ArchitextSpider\n",
"241 AhrefsBot\n",
"242 WebSearch*\n",
"243 TightTwatBot\n",
"244 aibang Disallow: /\n",
"245 TeleportPro\n",
"246 Baiduspider/2.0;+http://www.baidu.com/search/s...\n",
"247 Slurp\n",
"248 Orthogaffe\n",
"249 koubeispider Disallow: /\n",
"250 Ezooms\n",
"251 Atomz/1.0\n",
"252 The Intraformant\n",
"253 Download Ninja\n",
"254 yacybot\n",
"255 Xagool\n",
"256 hloader\n",
"257 YisouSpider\n",
"258 Zite\n",
"259 CazoodleBot\n",
"260 WochachaSpider\n",
"261 AdsBot-Google-Mobile-Apps\n",
"262 Szukacz/1.4\n",
"263 Oracle Ultra Search\n",
"264 ZyBorg*\n",
"265 Pu_iN*\n",
"266 trendeo\n",
"267 Spinn3r\n",
"268 AhrefsBots\n",
"269 Mozilla/5.0(compatible; Baiduspider/2.0; +http...\n",
"270 Telefonica\n",
"271 Baiduspider-image+(+http://www.baidu.com/searc...\n",
"272 CompSpyBot\n",
"273 Zealbot\n",
"274 Offline Explorer\n",
"275 grub-client\n",
"276 grub\n",
"277 Facebot\n",
"278 Netvibes\n",
"279 Clickagy Intelligence Bot v2\n",
"280 WebBandit/3.50\n",
"281 BLP_bbot\n",
"282 Microsoft.URL.Control\n",
"283 msnbot\n",
"284 ADmantX\n",
"285 DittoSpyder\n",
"286 Meltwater\n",
"287 Go-http-client\n",
"288 Y!J-MBS/1.0\n",
"289 Alexabot\n",
"290 EdisterBot\n",
"291 discobot\n",
"292 linkfluence\n",
"293 GermCrawler\n",
"294 puf\n",
"295 YandexScreenshotBot\n",
"296 Openfind\n",
"297 PHP\n",
"298 Baiduspider-favo\n",
"299 dotbot/1.0\n",
"300 NetMechanic\n",
"301 True_Robot\n",
"302 test-url/1.0 libwww-perl/5.800\n",
"303 blinkx\n",
"304 Openfind data gatherer\n",
"305 Corporama\n",
"306 VCI\n",
"307 PGBot\n",
"308 WebmasterWorldForumBot\n",
"309 NextGenSearchBot\n",
"310 MIIxpc/4.2\n",
"311 NaverBot\n",
"312 Link*\n",
"313 Scrubby\n",
"314 Sogou web spider\n",
"315 Sindup\n",
"316 FAST Enterprise Crawler 6 / Scirus\n",
"317 Swiftbot\n",
"318 googlebot\n",
"319 CherryPickerSE/1.0\n",
"320 twiceler\n",
"321 HMSE_Robot\n",
"322 YandexSearchShop\n",
"323 Microsoft URL Control - 5.01.4511\n",
"324 OnetSzukaj\n",
"325 Twiceler\n",
"326 spbot\n",
"327 BaiduMobaider\n",
"328 Psbot\n",
"329 vebidoobot\n",
"330 alexabot\n",
"331 trendybuzz\n",
"332 FBSearchBot\n",
"333 360Spider\n",
"334 Libreprensabot/1.0\n",
"335 FAST*\n",
"336 EtaoSpider\n",
"337 Zao\n",
"338 test url\n",
"339 EroCrawler\n",
"340 Newzbin\n",
"341 BlowFish/1.0\n",
"342 searchpreview\n",
"343 Mail.Ru\n",
"344 Webster Pro\n",
"345 Pingdom\n",
"346 CCBot/2.0\n",
"347 SemrushBot-SA\n",
"348 BaiduImagespider\n",
"349 magpie-crawler\n",
"350 Synthesio\n",
"351 Baiduspider-cpro\n",
"352 Baiduspider-ads\n",
"353 Web Image Collector\n",
"354 Freedom\n",
"355 +Baiduspider/2.0\n",
"356 Pinterest\n",
"357 Teleport Pro\n",
"358 CCBot\n",
"359 Qwam content intelligence\n",
"360 RepoMonkey Bait & Tackle/v1.01\n",
"361 wegobot\n",
"362 externalfacebookhit\n",
"363 score3\n",
"364 MegaIndex\n",
"365 smspider\n",
"366 nutch\n",
"367 MJ12bot\n",
"368 GurujiBot\n",
"369 CheeseBot\n",
"370 omgilibot/0.3\n",
"371 uipbot\n",
"372 MagpieRSS\n",
"373 Mozilla/4.0 (compatible; Netcraft Web Server S...\n",
"374 RMA\n",
"375 Yahoo Pipes 1.0\n",
"376 *\n",
"377 DotBot*\n",
"378 turingos\n",
"379 Y!J-SRD/1.0\n",
"380 innosense/Nutch-1.0\n",
"381 WBSearchBot\n",
"382 ContextAd Bot\n",
"383 dotbot\n",
"384 GwdangSpider\n",
"385 Gaisbot\n",
"386 Robozilla\n",
"387 TrustpilotCrawler\n",
"388 YandexMobileBot\n",
"389 Googlebot\n",
"390 TurnitinBot\n",
"391 Seekbot*\n",
"392 WebAuto\n",
"393 wget\n",
"394 duckduckbot\n",
"395 Talkwater\n",
"396 Nigma.ru\n",
"397 Cliqzbot\n",
"398 blinkx_ff_spider\n",
"399 Iron33/1.0.2\n",
"400 Lizard\n",
"401 Baiduspider-news\n",
"402 Scooter*\n",
"403 WebCopier v3.2a\n",
"404 Keyword Density/0.9\n",
"405 www.integromedb.org/Crawler\n",
"406 5emeRue\n",
"407 Relcybot\n",
"408 gigabot\n",
"409 cisco-googlebot-enterprise\n",
"410 ichiro/mobile\n",
"411 Snapbot/1.0\n",
"412 trendictionbot\n",
"413 BLP_bbot/0.1\n",
"414 berlin-fu-cow\n",
"415 Java Browser\n",
"416 DOC\n",
"417 larbin\n",
"418 OrangeBot-Collector\n",
"419 Fast corporate crawler\n",
"420 InfoNaviRobot\n",
"421 Mozilla/5.0 (compatible; Taboolabot/3.7; +http...\n",
"422 googlebot-image\n",
"423 coexel\n",
"424 Microsoft URL Control\n",
"425 IDentity\n",
"426 AdsBot-Google\n",
"427 NerdyBot\n",
"428 linguatools\n",
"429 WebAlta\n",
"430 Balihoo\n",
"431 Acunetix Security Scanner\n",
"432 VoilaBot\n",
"433 aibangspider Disallow: /\n",
"434 Xenu's\n",
"435 ProWebWalker\n",
"436 CRAZYWEBCRAWLER*\n",
"437 ia_archiver\n",
"438 WebEnhancer\n",
"439 Sosospider\n",
"440 careerbot\n",
"441 WebZIP/5.0\n",
"442 SlySearch*\n",
"443 MSIECrawler\n",
"444 Sogou Orion spider\n",
"445 Knowings\n",
"446 Website Quester\n",
"447 YandexMetrika\n",
"448 netseer\n",
"449 Googlebot-image\n",
"450 Web-By-Mail\n",
"451 BecomeBot\n",
"452 Mediapartners-Google\n",
"453 moget/2.1\n",
"454 ADmantX Platform Semantic Analyzer\n",
"455 Browsershots\n",
"456 Fetch\n",
"457 CatchBot\n",
"458 BlogSearch\n",
"459 Charlotte\n",
"460 MoodleBot/1.0\n",
"461 yeti\n",
"462 Ask n read\n",
"463 DuckDuckBot\n",
"464 PiplBot\n",
"465 Adsbot-Google\n",
"466 k2spider\n",
"467 bhcBot\n",
"468 Sogou inst spider\n",
"469 aibang.com Disallow: /\n",
"470 Copernic\n",
"471 trovitBot\n",
"472 leadbox\n",
"473 sitebot\n",
"474 YandexWebmaster\n",
"475 YandexVideoParser\n",
"476 Google-HotelAdsVerifier\n",
"477 suzuran\n",
"478 Jetbot\n",
"479 deepcrawl\n",
"480 YandexNews\n",
"481 ellisphere\n",
"482 +Baiduspider\n",
"483 *;\n",
"484 Solbot\n",
"485 CopyRightCheck\n",
"486 yisouspider\n",
"487 ShopWiki\n",
"488 KaBot\n",
"489 verticalsearch\n",
"490 JobdiggerSpider\n",
"491 magpie-crawler/1.1\n",
"492 Yahoo! Slurp\n",
"493 mytwip\n",
"494 YandexMedianaBot\n",
"495 Zeus Link Scout\n",
"496 YoudaoBot/1.0\n",
"497 moget\n",
"498 SimplePie/1.1.3\n",
"499 WebSauger\n",
"500 YandexAccessibilityBot\n",
"501 BLEXBot\n",
"502 rogerbot\n",
"503 Aboundexbot\n",
"504 psnsearch\n",
"505 WebZip\n",
"506 msnbot-media\n",
"507 psbot/0.1\n",
"508 SWEBot\n",
"509 CrazyWebCrawler-Spider\n",
"510 WebZIP\n",
"511 Adidxbot\n",
"512 nsa\n",
"513 WebVac\n",
"514 WebReaper\n",
"515 spanner\n",
"516 Open*\n",
"517 LinkChecker\n",
"518 Applebot\n",
"519 MaxPointCrawler\n",
"520 Domain Re-Animator Bot\n",
"521 HuihuiSpider\n",
"522 NewsNow\n",
"523 heritrix\n",
"524 libWeb/clsHTTP\n",
"525 Alexibot\n",
"526 WebCopier\n",
"527 Crescent\n",
"528 Owlin Bot\n",
"529 ParadigmCrawler*\n",
"530 Baiduspider-image\n",
"531 seznambot\n",
"532 Radiation Retriever 1.1\n",
"533 SQUID_configured_as_described_at_/help/faq/cache\n",
"534 YandexDirectDyn\n",
"535 Mozilla/5.0 (compatible; Google-Podcast)\n",
"536 SpankBot\n",
"537 OrangeBot\n",
"538 daumoa\n",
"539 mention\n",
"540 Openfind data gathere\n",
"541 Mister PiX\n",
"542 asknread.com\n",
"543 Mozilla/5.0 (compatible; JikeSpider; +http://s...\n",
"544 CherryPicker\n",
"545 WebCapture 2.0\n",
"546 archive.org_bot\n",
"547 toCrawl/UrlDispatcher\n",
"548 SiteArc\n",
"549 vobsub\n",
"550 yahoo-blogs/v3.9\n",
"551 Wget/1.5.3\n",
"552 RepoMonkey\n",
"553 scoop.it\n",
"554 Jakarta Commons-HttpClient/3.1\n",
"555 QueryN Metasearch\n",
"556 google\n",
"557 red-app-gsa-p-one\n",
"558 asterias\n",
"559 Baiduspider/2.0\n",
"560 YandexDirect\n",
"561 tadcrawler\n",
"562 SiteSnagger\n",
"563 Search360-Crawler\n",
"564 True_Robot/1.0\n",
"565 SiteExplorer\n",
"566 Callpod Keeper\n",
"567 cosmos\n",
"568 Krugle\n",
"569 UbiCrawler\n",
"570 Mozilla/4.0 (compatible; MSIE 6.0; Windows NT;...\n",
"571 Xenu\n",
"572 Kraken\n",
"573 WebZip/4.0\n",
"574 endeca\n",
"575 TerrawizBot/1.0\n",
"576 Twitterbot/1.0\n",
"577 IntuitGSACrawler\n",
"578 YadirectBot\n",
"579 ia_archiver-web.archive.org\n",
"580 teoma\n",
"581 +Baiduspider/2.0;++http://www.baidu.com/search...\n",
"582 AdIdxBot\n",
"583 SemrushBot\n",
"584 Googlebot-Video\n",
"585 Laserlikebot\n",
"586 YandexVideo\n",
"587 SandDollar\n",
"588 cis455crawler\n",
"589 Python-urllib\n",
"590 kbcrawl\n",
"591 MSNBot\n",
"592 Wget/1.6\n",
"593 YandexCalendar\n",
"594 Mail.RU_Bot*\n",
"595 BuiltBotTough\n",
"596 test-url/1.0 libwww-perl/5.801\n",
"597 OmniExplorer_Bot\n",
"598 Google-Sitemaps\n",
"599 SeznamBot\n",
"600 Searchie\n",
"601 YahooSeeker/M1A1-R2D2\n",
"602 HaoSouSpider\n",
"603 WebCopier v.2.2\n",
"604 aibang-spider Disallow: /\n",
"605 ChangeDetection\n",
"606 WebZIP/4.21\n",
"607 CrystalSemanticsBot\n",
"608 Harvest/1.5\n",
"609 Zeus\n",
"610 BotRightHere\n",
"611 GoogleBot\n",
"612 test-url/1.0 libwww-perl/5.803\n",
"613 winello\n",
"614 MSNPTC/1.0\n",
"615 Mozilla/4.0 (compatible; MSIE 6.0; Windows NT;...\n",
"616 Twitterbot\n",
"617 larbin*\n",
"618 Mail.RU_Bot\n",
"619 WebStripper\n",
"620 Flipboard/3.2.6 CFNetwork/711.0.6 Darwin/14.0.0\n",
"621 opinion-tracker\n",
"622 Speobot\n",
"623 Kenjin Spider\n",
"624 LinkScan/8.1a Unix\n",
"625 Sogouspider\n",
"626 Wandoujia Spider\n",
"627 Ezooms/1.0\n"
]
}
],
"source": [
"\n",
"print(pd.DataFrame(user_agents, columns=['User Agents (Distinct)']).to_string())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Directive | \n",
" Count | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" user-agent | \n",
" 2917 | \n",
"
\n",
" \n",
" 1 | \n",
" disallow | \n",
" 56575 | \n",
"
\n",
" \n",
" 2 | \n",
" crawl-delay | \n",
" 97 | \n",
"
\n",
" \n",
" 3 | \n",
" sitemap | \n",
" 5210 | \n",
"
\n",
" \n",
" 4 | \n",
" allow | \n",
" 5426 | \n",
"
\n",
" \n",
" 5 | \n",
" clean-param | \n",
" 46 | \n",
"
\n",
" \n",
" 6 | \n",
" host | \n",
" 38 | \n",
"
\n",
" \n",
" 7 | \n",
" <!doctype html><html class=\"en-us no-js \" lan... | \n",
" 1 | \n",
"
\n",
" \n",
" 8 | \n",
" noindex | \n",
" 502 | \n",
"
\n",
" \n",
" 9 | \n",
" <!doctype html public \"-//w3c//dtd xhtml 1.0 s... | \n",
" 5 | \n",
"
\n",
" \n",
" 10 | \n",
" <html xmlns=\"http | \n",
" 9 | \n",
"
\n",
" \n",
" 11 | \n",
" user-agent | \n",
" 6 | \n",
"
\n",
" \n",
" 12 | \n",
" request-rate | \n",
" 6 | \n",
"
\n",
" \n",
" 13 | \n",
" visit-time | \n",
" 1 | \n",
"
\n",
" \n",
" 14 | \n",
" <html style=\"background | \n",
" 1 | \n",
"
\n",
" \n",
" 15 | \n",
" {\"timestamp\" | \n",
" 1 | \n",
"
\n",
" \n",
" 16 | \n",
" <!-- fd | \n",
" 1 | \n",
"
\n",
" \n",
" 17 | \n",
" <!doctype html public \"-//w3c//dtd xhtml 1.0 t... | \n",
" 4 | \n",
"
\n",
" \n",
" 18 | \n",
" <!doctype html><html><head><title>ign error 40... | \n",
" 1 | \n",
"
\n",
" \n",
" 19 | \n",
" <!doctype html /><html><head><title data-react... | \n",
" 1 | \n",
"
\n",
" \n",
" 20 | \n",
" new date().gettime(),event | \n",
" 1 | \n",
"
\n",
" \n",
" 21 | \n",
" <!doctype html><html><head><title>apache tomca... | \n",
" 1 | \n",
"
\n",
" \n",
" 22 | \n",
" <!doctype html><html><head><meta charset=\"utf-... | \n",
" 1 | \n",
"
\n",
" \n",
" 23 | \n",
" <!doctype html><html class=\"no-js\" lang=\"en\" d... | \n",
" 1 | \n",
"
\n",
" \n",
" 24 | \n",
" @media (min-width | \n",
" 1 | \n",
"
\n",
" \n",
" 25 | \n",
" <!doctype html public \"-//w3c//dtd html 4.01 t... | \n",
" 1 | \n",
"
\n",
" \n",
" 26 | \n",
" disllow | \n",
" 1 | \n",
"
\n",
" \n",
" 27 | \n",
" <!doctype html public \"-//softquad//dtd hotmet... | \n",
" 1 | \n",
"
\n",
" \n",
" 28 | \n",
" <!doctype html> <html lang=\"en\"> <head> <meta ... | \n",
" 1 | \n",
"
\n",
" \n",
" 29 | \n",
" <!doctype html><html lang=\"en\"><head><meta cha... | \n",
" 1 | \n",
"
\n",
" \n",
" 30 | \n",
" var $default_lang=\"\";</script><link rel=\"style... | \n",
" 1 | \n",
"
\n",
" \n",
" 31 | \n",
" <!doctype html public \"-//w3c//dtd xhtml 1.0 t... | \n",
" 1 | \n",
"
\n",
" \n",
" 32 | \n",
" <html xmlns=\"https | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Directive Count\n",
"0 user-agent 2917\n",
"1 disallow 56575\n",
"2 crawl-delay 97\n",
"3 sitemap 5210\n",
"4 allow 5426\n",
"5 clean-param 46\n",
"6 host 38\n",
"7 ign error 40... 1\n",
"19 apache tomca... 1\n",
"22 \n",
"\n",
"\n",
" \n",
" \n",
" | \n",
" Directive | \n",
" Number of Websites | \n",
" % | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" user-agent | \n",
" 786 | \n",
" 81.450777 | \n",
"
\n",
" \n",
" 1 | \n",
" disallow | \n",
" 749 | \n",
" 77.616580 | \n",
"
\n",
" \n",
" 2 | \n",
" crawl-delay | \n",
" 70 | \n",
" 7.253886 | \n",
"
\n",
" \n",
" 3 | \n",
" sitemap | \n",
" 436 | \n",
" 45.181347 | \n",
"
\n",
" \n",
" 4 | \n",
" allow | \n",
" 348 | \n",
" 36.062176 | \n",
"
\n",
" \n",
" 5 | \n",
" clean-param | \n",
" 14 | \n",
" 1.450777 | \n",
"
\n",
" \n",
" 6 | \n",
" host | \n",
" 38 | \n",
" 3.937824 | \n",
"
\n",
" \n",
" 7 | \n",
" <!doctype html><html class=\"en-us no-js \" lan... | \n",
" 1 | \n",
" 0.103627 | \n",
"
\n",
" \n",
" 8 | \n",
" noindex | \n",
" 25 | \n",
" 2.590674 | \n",
"
\n",
" \n",
" 9 | \n",
" <!doctype html public \"-//w3c//dtd xhtml 1.0 s... | \n",
" 5 | \n",
" 0.518135 | \n",
"
\n",
" \n",
" 10 | \n",
" <html xmlns=\"http | \n",
" 9 | \n",
" 0.932642 | \n",
"
\n",
" \n",
" 11 | \n",
" user-agent | \n",
" 6 | \n",
" 0.621762 | \n",
"
\n",
" \n",
" 12 | \n",
" request-rate | \n",
" 4 | \n",
" 0.414508 | \n",
"
\n",
" \n",
" 13 | \n",
" visit-time | \n",
" 1 | \n",
" 0.103627 | \n",
"
\n",
" \n",
" 14 | \n",
" <html style=\"background | \n",
" 1 | \n",
" 0.103627 | \n",
"
\n",
" \n",
" 15 | \n",
" {\"timestamp\" | \n",
" 1 | \n",
" 0.103627 | \n",
"
\n",
" \n",
" 16 | \n",
" <!-- fd | \n",
" 1 | \n",
" 0.103627 | \n",
"
\n",
" \n",
" 17 | \n",
" <!doctype html public \"-//w3c//dtd xhtml 1.0 t... | \n",
" 4 | \n",
" 0.414508 | \n",
"
\n",
" \n",
" 18 | \n",
" <!doctype html><html><head><title>ign error 40... | \n",
" 1 | \n",
" 0.103627 | \n",
"
\n",
" \n",
" 19 | \n",
" <!doctype html /><html><head><title data-react... | \n",
" 1 | \n",
" 0.103627 | \n",
"
\n",
" \n",
" 20 | \n",
" new date().gettime(),event | \n",
" 1 | \n",
" 0.103627 | \n",
"
\n",
" \n",
" 21 | \n",
" <!doctype html><html><head><title>apache tomca... | \n",
" 1 | \n",
" 0.103627 | \n",
"
\n",
" \n",
" 22 | \n",
" <!doctype html><html><head><meta charset=\"utf-... | \n",
" 1 | \n",
" 0.103627 | \n",
"
\n",
" \n",
" 23 | \n",
" <!doctype html><html class=\"no-js\" lang=\"en\" d... | \n",
" 1 | \n",
" 0.103627 | \n",
"
\n",
" \n",
" 24 | \n",
" @media (min-width | \n",
" 1 | \n",
" 0.103627 | \n",
"
\n",
" \n",
" 25 | \n",
" <!doctype html public \"-//w3c//dtd html 4.01 t... | \n",
" 1 | \n",
" 0.103627 | \n",
"
\n",
" \n",
" 26 | \n",
" disllow | \n",
" 1 | \n",
" 0.103627 | \n",
"
\n",
" \n",
" 27 | \n",
" <!doctype html public \"-//softquad//dtd hotmet... | \n",
" 1 | \n",
" 0.103627 | \n",
"
\n",
" \n",
" 28 | \n",
" <!doctype html> <html lang=\"en\"> <head> <meta ... | \n",
" 1 | \n",
" 0.103627 | \n",
"
\n",
" \n",
" 29 | \n",
" <!doctype html><html lang=\"en\"><head><meta cha... | \n",
" 1 | \n",
" 0.103627 | \n",
"
\n",
" \n",
" 30 | \n",
" var $default_lang=\"\";</script><link rel=\"style... | \n",
" 1 | \n",
" 0.103627 | \n",
"
\n",
" \n",
" 31 | \n",
" <!doctype html public \"-//w3c//dtd xhtml 1.0 t... | \n",
" 1 | \n",
" 0.103627 | \n",
"
\n",
" \n",
" 32 | \n",
" <html xmlns=\"https | \n",
" 1 | \n",
" 0.103627 | \n",
"
\n",
" \n",
"
\n",
""
],
"text/plain": [
" Directive Number of Websites \\\n",
"0 user-agent 786 \n",
"1 disallow 749 \n",
"2 crawl-delay 70 \n",
"3 sitemap 436 \n",
"4 allow 348 \n",
"5 clean-param 14 \n",
"6 host 38 \n",
"7 ign error 40... 1 \n",
"19 apache tomca... 1 \n",
"22