\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
rankuri
3 4 yahoo.com
4 5 baidu.com
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 46, "text": [ " rank uri\n", "0 1 facebook.com\n", "1 2 google.com\n", "2 3 youtube.com\n", "3 4 yahoo.com\n", "4 5 baidu.com" ] } ], "prompt_number": 46 }, { "cell_type": "code", "collapsed": false, "input": [ "# Okay for this exercise we need the 2LD and nothing else\n", "import tldextract\n", "\n", "def domain_extract(uri):\n", " ext = tldextract.extract(uri)\n", " if (not ext.suffix):\n", " return np.nan\n", " else:\n", " return ext.domain\n", "\n", "alexa_dataframe['domain'] = [ domain_extract(uri) for uri in alexa_dataframe['uri']]\n", "del alexa_dataframe['rank']\n", "del alexa_dataframe['uri']\n", "alexa_dataframe.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
domain
3 yahoo
4 baidu
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 47, "text": [ " domain\n", "0 facebook\n", "1 google\n", "2 youtube\n", "3 yahoo\n", "4 baidu" ] } ], "prompt_number": 47 }, { "cell_type": "code", "collapsed": false, "input": [ "alexa_dataframe.tail()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
domain
99995 rhbabyandchild
99996 rm
99997 sat1
99998 nahimunkar
99999 musi
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 48, "text": [ " domain\n", "99995 rhbabyandchild\n", "99996 rm\n", "99997 sat1\n", "99998 nahimunkar\n", "99999 musi" ] } ], "prompt_number": 48 }, { "cell_type": "code", "collapsed": false, "input": [ "# It's possible we have NaNs from blanklines or whatever\n", "alexa_dataframe = alexa_dataframe.dropna()\n", "alexa_dataframe = alexa_dataframe.drop_duplicates()\n", "\n", "# Set the class\n", "alexa_dataframe['class'] = 'legit'\n", "\n", "# Shuffle the data (important for training/testing)\n", "alexa_dataframe = alexa_dataframe.reindex(np.random.permutation(alexa_dataframe.index))\n", "alexa_total = alexa_dataframe.shape[0]\n", "print 'Total Alexa domains %d' % alexa_total\n", "\n", "# Hold out 10%\n", "hold_out_alexa = alexa_dataframe[alexa_total*.9:]\n", "alexa_dataframe = alexa_dataframe[:alexa_total*.9]\n", "\n", "print 'Number of Alexa domains: %d' % alexa_dataframe.shape[0]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Total Alexa domains 91712\n", "Number of Alexa domains: 82540\n" ] } ], "prompt_number": 49 }, { "cell_type": "code", "collapsed": false, "input": [ "alexa_dataframe.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
domainclass
20904 transworld legit
82690 lkfun legit
85167 islam2all legit
62859 pulitzer legit
85573 sge legit
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 50, "text": [ " domain class\n", "20904 transworld legit\n", "82690 lkfun legit\n", "85167 islam2all legit\n", "62859 pulitzer legit\n", "85573 sge legit" ] } ], "prompt_number": 50 }, { "cell_type": "code", "collapsed": false, "input": [ "# Read in the DGA domains\n", "dga_dataframe = pd.read_csv('data/dga_domains.txt', names=['raw_domain'], header=None, encoding='utf-8')\n", "\n", "# We noticed that the blacklist values just differ by capitalization or .com/.org/.info\n", "dga_dataframe['domain'] = dga_dataframe.applymap(lambda x: x.split('.')[0].strip().lower())\n", "del dga_dataframe['raw_domain']\n", "\n", "# It's possible we have NaNs from blanklines or whatever\n", "dga_dataframe = dga_dataframe.dropna()\n", "dga_dataframe = dga_dataframe.drop_duplicates()\n", "dga_total = dga_dataframe.shape[0]\n", "print 'Total DGA domains %d' % dga_total\n", "\n", "# Set the class\n", "dga_dataframe['class'] = 'dga'\n", "\n", "# Hold out 10%\n", "hold_out_dga = dga_dataframe[dga_total*.9:]\n", "dga_dataframe = dga_dataframe[:dga_total*.9]\n", "\n", "print 'Number of DGA domains: %d' % dga_dataframe.shape[0]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Total DGA domains 2664\n", "Number of DGA domains: 2397\n" ] } ], "prompt_number": 51 }, { "cell_type": "code", "collapsed": false, "input": [ "dga_dataframe.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
domainclass
0 04055051be412eea5a61b7da8438be3d dga
1 1cb8a5f36f dga
2 30acd347397c34fc273e996b22951002 dga
3 336c986a284e2b3bc0f69f949cb437cb dga
5 40a43e61e56a5c218cf6c22aca27f7ee dga
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 52, "text": [ " domain class\n", "0 04055051be412eea5a61b7da8438be3d dga\n", "1 1cb8a5f36f dga\n", "2 30acd347397c34fc273e996b22951002 dga\n", "3 336c986a284e2b3bc0f69f949cb437cb dga\n", "5 40a43e61e56a5c218cf6c22aca27f7ee dga" ] } ], "prompt_number": 52 }, { "cell_type": "code", "collapsed": false, "input": [ "# Concatenate the domains in a big pile!\n", "all_domains = pd.concat([alexa_dataframe, dga_dataframe], ignore_index=True)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 53 }, { "cell_type": "code", "collapsed": false, "input": [ "# Add a length field for the domain\n", "all_domains['length'] = [len(x) for x in all_domains['domain']]\n", "\n", "# Okay since we're trying to detect dynamically generated domains and short\n", "# domains (length <=6) are crazy random even for 'legit' domains we're going\n", "# to punt on short domains (perhaps just white/black list for short domains?)\n", "all_domains = all_domains[all_domains['length'] > 6]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 54 }, { "cell_type": "code", "collapsed": false, "input": [ "# Grabbed this from Rosetta Code (rosettacode.org)\n", "import math\n", "from collections import Counter\n", " \n", "def entropy(s):\n", " p, lns = Counter(s), float(len(s))\n", " return -sum( count/lns * math.log(count/lns, 2) for count in p.values())" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 55 }, { "cell_type": "code", "collapsed": false, "input": [ "# Add an entropy field for the domain\n", "all_domains['entropy'] = [entropy(x) for x in all_domains['domain']]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 56 }, { "cell_type": "code", "collapsed": false, "input": [ "all_domains.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
domainclasslengthentropy
0 transworld legit 10 3.121928
2 islam2all legit 9 2.419382
3 pulitzer legit 8 3.000000
6 danarimedia legit 11 2.663533
7 heartbreakers legit 13 2.815072
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 57, "text": [ " domain class length entropy\n", "0 transworld legit 10 3.121928\n", "2 islam2all legit 9 2.419382\n", "3 pulitzer legit 8 3.000000\n", "6 danarimedia legit 11 2.663533\n", "7 heartbreakers legit 13 2.815072" ] } ], "prompt_number": 57 }, { "cell_type": "code", "collapsed": false, "input": [ "all_domains.tail()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
domainclasslengthentropy
84932 ulxxqduryvv dga 11 2.913977
84933 ummvzhin dga 8 2.750000
84934 umsgnwgc dga 8 2.750000
84935 umzsbhpkrgo dga 11 3.459432
84936 umzuyjrfwyf dga 11 2.913977
\n", "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
domainclasslengthentropy
29392 theukwebdesigncompany legit 21 4.070656
37378 texaswithlove1982-amomentlikethis legit 33 4.051822
55073 congresomundialjjrperu2009 legit 26 4.056021
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 61, "text": [ " domain class length entropy\n", "29392 theukwebdesigncompany legit 21 4.070656\n", "37378 texaswithlove1982-amomentlikethis legit 33 4.051822\n", "55073 congresomundialjjrperu2009 legit 26 4.056021" ] } ], "prompt_number": 61 }, { "cell_type": "code", "collapsed": false, "input": [ "high_entropy_domains[high_entropy_domains['class']=='dga'].head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
domainclasslengthentropy
82558 a17btkyb38gxe41pwd50nxmzjxiwjwdwfrp52 dga 37 4.540402
82559 a17c49l68ntkqnuhvkrmyb28fubvn30e31g43dq dga 39 4.631305
82560 a17d60gtnxk47gskti15izhvlviyksh64nqkz dga 37 4.270132
82561 a17erpzfzh64c69csi35bqgvp52drita67jzmy dga 38 4.629249
82562 a17fro51oyk67b18ksfzoti55j36p32o11fvc29cr dga 41 4.305859
\n", "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
word
37 a
48 aa
51 aaa
53 aaaa
54 aaaaaa
55 aaal
56 aaas
57 aaberg
58 aachen
59 aae
\n", "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
domainclasslengthentropyalexa_gramsword_grams
0 transworld legit 10 3.121928 39.051439 44.033642
2 islam2all legit 9 2.419382 15.475215 17.367964
3 pulitzer legit 8 3.000000 14.458222 28.441721
6 danarimedia legit 11 2.663533 40.189599 54.829856
7 heartbreakers legit 13 2.815072 45.354321 69.734483
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 75, "text": [ " domain class length entropy alexa_grams word_grams\n", "0 transworld legit 10 3.121928 39.051439 44.033642\n", "2 islam2all legit 9 2.419382 15.475215 17.367964\n", "3 pulitzer legit 8 3.000000 14.458222 28.441721\n", "6 danarimedia legit 11 2.663533 40.189599 54.829856\n", "7 heartbreakers legit 13 2.815072 45.354321 69.734483" ] } ], "prompt_number": 75 }, { "cell_type": "code", "collapsed": false, "input": [ "all_domains.tail()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
domainclasslengthentropyalexa_gramsword_grams
84932 ulxxqduryvv dga 11 2.913977 3.745231 6.464859
84933 ummvzhin dga 8 2.750000 6.183945 7.180022
84934 umsgnwgc dga 8 2.750000 3.272306 3.847079
84935 umzsbhpkrgo dga 11 3.459432 1.653213 2.546543
84936 umzuyjrfwyf dga 11 2.913977 0.000000 0.000000
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 76, "text": [ " domain class length entropy alexa_grams word_grams\n", "84932 ulxxqduryvv dga 11 2.913977 3.745231 6.464859\n", "84933 ummvzhin dga 8 2.750000 6.183945 7.180022\n", "84934 umsgnwgc dga 8 2.750000 3.272306 3.847079\n", "84935 umzsbhpkrgo dga 11 3.459432 1.653213 2.546543\n", "84936 umzuyjrfwyf dga 11 2.913977 0.000000 0.000000" ] } ], "prompt_number": 76 }, { "cell_type": "code", "collapsed": false, "input": [ "# Use the vectorized operations of the dataframe to investigate differences\n", "# between the alexa and word grams\n", "all_domains['diff'] = all_domains['alexa_grams'] - all_domains['word_grams']\n", "all_domains.sort(['diff'], ascending=True).head(10)\n", "\n", "# The table below shows those domain names that are more 'dictionary' and less 'web'" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
domainclasslengthentropyalexa_gramsword_gramsdiff
63819 bipolardisorderdepressionanxiety legit 32 3.616729 115.885999 193.844156-77.958157
34524 stirringtroubleinternationally legit 30 3.481728 131.209086 207.204729-75.995643
63954 americansforresponsiblesolutions legit 32 3.667838 145.071369 218.363956-73.292587
49070 channel4embarrassingillnesses legit 29 3.440070 98.201709 169.721499-71.519790
5902 pragmatismopolitico legit 19 3.326360 59.877723 121.536223-61.658500
49210 egaliteetreconciliation legit 23 3.186393 92.257111 152.125325-59.868214
74130 interoperabilitybridges legit 23 3.588354 93.803640 153.626312-59.822673
36976 foreclosurephilippines legit 22 3.447402 72.844280 132.514638-59.670358
47055 corazonindomablecapitulos legit 25 3.813661 74.706878 133.762750-59.055872
70113 annamalicesissyselfhypnosis legit 27 3.429908 68.066490 126.667692-58.601201
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 77, "text": [ " domain class length entropy alexa_grams \\\n", "63819 bipolardisorderdepressionanxiety legit 32 3.616729 115.885999 \n", "34524 stirringtroubleinternationally legit 30 3.481728 131.209086 \n", "63954 americansforresponsiblesolutions legit 32 3.667838 145.071369 \n", "49070 channel4embarrassingillnesses legit 29 3.440070 98.201709 \n", "5902 pragmatismopolitico legit 19 3.326360 59.877723 \n", "49210 egaliteetreconciliation legit 23 3.186393 92.257111 \n", "74130 interoperabilitybridges legit 23 3.588354 93.803640 \n", "36976 foreclosurephilippines legit 22 3.447402 72.844280 \n", "47055 corazonindomablecapitulos legit 25 3.813661 74.706878 \n", "70113 annamalicesissyselfhypnosis legit 27 3.429908 68.066490 \n", "\n", " word_grams diff \n", "63819 193.844156 -77.958157 \n", "34524 207.204729 -75.995643 \n", "63954 218.363956 -73.292587 \n", "49070 169.721499 -71.519790 \n", "5902 121.536223 -61.658500 \n", "49210 152.125325 -59.868214 \n", "74130 153.626312 -59.822673 \n", "36976 132.514638 -59.670358 \n", "47055 133.762750 -59.055872 \n", "70113 126.667692 -58.601201 " ] } ], "prompt_number": 77 }, { "cell_type": "code", "collapsed": false, "input": [ "all_domains.sort(['diff'], ascending=False).head(50)\n", "\n", "# The table below shows those domain names that are more 'web' and less 'dictionary'\n", "# Good O' web...." ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
domainclasslengthentropyalexa_gramsword_gramsdiff
22647 gay-sex-pics-porn-pictures-gay-sex-porn-gay-se... legit 56 3.661056 160.035734 85.124184 74.911550
44091 article-directory-free-submission-free-content legit 46 3.786816 233.518879 188.230453 45.288426
63865 stream-free-movies-online legit 25 3.509275 118.944026 74.496915 44.447110
38570 top-bookmarking-site-list legit 25 3.723074 117.162056 74.126061 43.035995
79963 best-online-shopping-site legit 25 3.452879 122.152194 79.596640 42.555554
12532 watch-free-movie-online legit 23 3.708132 101.010995 58.943451 42.067543
30198 free-online-directory legit 21 3.403989 122.359797 80.735030 41.624767
40859 free-links-articles-directory legit 29 3.702472 152.063809 110.955361 41.108448
30875 online-web-directory legit 20 3.584184 114.439863 74.082948 40.356915
79001 web-directory-online legit 20 3.584184 114.313583 74.082948 40.230634
78947 movie-news-online legit 17 3.175123 81.036910 41.705735 39.331174
51532 xxx-porno-sexvideos legit 19 3.260828 73.025165 35.176549 37.848617
42200 free-tv-video-online legit 20 3.284184 83.341214 45.662984 37.678230
40771 freegamesforyourwebsite legit 23 3.551191 114.291735 78.515881 35.775855
58275 free-web-mobile-themes legit 22 3.356492 88.503556 54.149725 34.353831
70724 seowebdirectoryonline legit 21 3.499228 126.111921 91.819498 34.292423
18894 web-link-directory-site legit 23 3.729446 102.993078 69.367186 33.625893
4838 the-web-directory legit 17 3.454822 87.520339 54.697986 32.822353
65871 social-bookmarking-site legit 23 3.762267 116.664791 84.545021 32.119769
21743 free-links-directory legit 20 3.646439 104.050046 71.956644 32.093402
74449 money-news-online legit 17 3.101881 77.587799 45.775375 31.812424
48456 free-sexvideosfc2 legit 17 3.381580 63.659477 31.878432 31.781045
57427 your-new-directory-site legit 23 3.555533 99.130671 67.468067 31.662605
49041 addsiteurlfreewebdirectory legit 26 3.609496 134.446230 103.178748 31.267482
34821 own-free-website legit 16 3.250000 59.564153 28.839294 30.724859
10080 web-directory-plus legit 18 3.836592 89.030979 58.484138 30.546841
43762 web-directory-sites legit 19 3.471354 98.528255 68.088416 30.439839
34811 free-sex-for-you legit 16 3.030639 46.653059 16.670504 29.982555
21390 online-deal-coupon legit 18 3.308271 77.862004 47.886115 29.975889
48204 acme-people-search-forum legit 24 3.553509 87.829242 57.898987 29.930255
73304 free-webdirectory legit 17 3.337175 93.606205 63.858372 29.747833
44221 good-web-directory legit 18 3.461320 88.201881 58.629789 29.572091
57095 free-link-directory legit 19 3.536887 95.869062 66.507042 29.362020
58652 global-web-directory legit 20 3.721928 100.465474 71.293587 29.171887
74259 online-games-zone legit 17 3.292770 74.987811 45.881826 29.105985
77290 us-web-directory legit 16 3.625000 80.044863 50.969551 29.075312
72128 bookmarking-sites-lists legit 23 3.621176 115.664939 86.595393 29.069546
64948 web-marketing-directory legit 23 3.849224 125.587313 96.714227 28.873086
79557 freewebdirectory101 legit 19 3.471354 100.131488 71.474824 28.656664
72737 free-seo-news legit 13 2.777363 45.267539 17.089020 28.178520
53449 website-traffic-hog legit 19 3.721612 77.199578 49.156126 28.043452
50837 myonlinewebdirectory legit 20 3.584184 121.155376 93.276322 27.879054
29303 business-web-directorys legit 23 3.621176 125.854338 98.160126 27.694212
41310 free-online-submission legit 22 3.413088 113.459411 85.792712 27.666699
76645 linkdirectoryonline legit 19 3.326360 116.879367 89.392747 27.486621
30430 online-deal-site legit 16 3.202820 68.103656 40.887484 27.216172
27227 free-site-submit legit 16 3.202820 64.158023 37.127294 27.030729
62951 mybusiness-web-directory legit 24 3.772055 124.553982 97.538670 27.015312
\n", "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
domainclasslengthentropyalexa_gramsword_gramsdiff
3429 dftc777 legit 7 2.128085 2.707570 0 2.707570
3715 5221766 legit 7 2.235926 0.000000 0 0.000000
4144 28365365 legit 8 2.250000 4.050612 0 4.050612
4235 mm-mm-mm legit 8 0.811278 4.260668 0 4.260668
4297 fzzfgjj legit 7 1.950212 0.954243 0 0.954243
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 82, "text": [ " domain class length entropy alexa_grams word_grams diff\n", "3429 dftc777 legit 7 2.128085 2.707570 0 2.707570\n", "3715 5221766 legit 7 2.235926 0.000000 0 0.000000\n", "4144 28365365 legit 8 2.250000 4.050612 0 4.050612\n", "4235 mm-mm-mm legit 8 0.811278 4.260668 0 4.260668\n", "4297 fzzfgjj legit 7 1.950212 0.954243 0 0.954243" ] } ], "prompt_number": 82 }, { "cell_type": "code", "collapsed": false, "input": [ "# Okay these look kinda weird, lets use some nice Pandas functionality\n", "# to look at some statistics around our new features.\n", "all_domains[all_domains['class']=='legit'].describe()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
lengthentropyalexa_gramsword_gramsdiff
count 60897.000000 60897.000000 60897.000000 60897.000000 60897.000000
mean 10.873032 2.930306 33.083440 40.901852 -7.818413
std 3.393407 0.347134 19.233994 23.302539 9.388916
min 7.000000 -0.000000 0.000000 0.000000 -77.958157
25% 8.000000 2.725481 19.136340 24.056214 -12.938013
50% 10.000000 2.947703 28.703813 36.259089 -7.108820
75% 13.000000 3.169925 42.400101 53.036218 -1.995136
max 56.000000 4.070656 233.518879 233.648571 74.911550
\n", "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
domainclasslengthentropyalexa_gramsword_gramsdiff
85 9to5lol legit 7 2.235926 1.991226 2.359835-0.368609
2611 akb48mt legit 7 2.807355 1.301030 1.041393 0.259637
3715 5221766 legit 7 2.235926 0.000000 0.000000 0.000000
4297 fzzfgjj legit 7 1.950212 0.954243 0.000000 0.954243
6045 crx7601 legit 7 2.807355 0.000000 0.000000 0.000000
8531 mw7zrv2 legit 7 2.807355 0.000000 0.000000 0.000000
10802 jmm1818 legit 7 1.950212 0.903090 0.000000 0.903090
11961 qq66699 legit 7 1.556657 1.322219 0.000000 1.322219
13200 twcczhu legit 7 2.521641 1.724276 0.000000 1.724276
13756 hljdns4 legit 7 2.807355 1.724276 0.000000 1.724276
14763 6470355 legit 7 2.521641 0.000000 0.000000 0.000000
17322 d20pfsrd legit 8 2.750000 0.000000 0.000000 0.000000
20591 lgcct27 legit 7 2.521641 1.176091 0.845098 0.330993
23458 jdoqocy legit 7 2.521641 0.000000 2.813581-2.813581
24661 95178114 legit 8 2.405639 1.591065 0.000000 1.591065
24720 ggmmxxoo legit 8 2.000000 1.113943 0.602060 0.511883
26454 ggmm777 legit 7 1.556657 1.477121 0.602060 0.875061
27222 rkg1866 legit 7 2.521641 0.954243 0.000000 0.954243
27676 1616bbs legit 7 1.950212 1.806180 1.322219 0.483961
29142 5278bbs legit 7 2.521641 1.806180 1.322219 0.483961
29551 05tz2e9 legit 7 2.807355 0.000000 0.000000 0.000000
29858 1532777 legit 7 2.128085 1.477121 0.000000 1.477121
30119 5311314 legit 7 1.842371 1.000000 0.000000 1.000000
30290 zzgcjyzx legit 8 2.405639 0.000000 0.000000 0.000000
30739 xn--g5t518j legit 11 3.095795 1.000000 0.000000 1.000000
31465 7210578 legit 7 2.521641 0.903090 0.000000 0.903090
31951 fj96336 legit 7 2.235926 0.000000 0.000000 0.000000
34455 xn--42cgk1gc8crdb1htg3d legit 23 3.849224 1.255273 2.411620-1.156347
35554 720pmkv legit 7 2.807355 0.000000 0.000000 0.000000
36166 d4ffr55 legit 7 2.235926 1.079181 2.260071-1.180890
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 85, "text": [ " domain class length entropy alexa_grams word_grams diff\n", "85 9to5lol legit 7 2.235926 1.991226 2.359835 -0.368609\n", "2611 akb48mt legit 7 2.807355 1.301030 1.041393 0.259637\n", "3715 5221766 legit 7 2.235926 0.000000 0.000000 0.000000\n", "4297 fzzfgjj legit 7 1.950212 0.954243 0.000000 0.954243\n", "6045 crx7601 legit 7 2.807355 0.000000 0.000000 0.000000\n", "8531 mw7zrv2 legit 7 2.807355 0.000000 0.000000 0.000000\n", "10802 jmm1818 legit 7 1.950212 0.903090 0.000000 0.903090\n", "11961 qq66699 legit 7 1.556657 1.322219 0.000000 1.322219\n", "13200 twcczhu legit 7 2.521641 1.724276 0.000000 1.724276\n", "13756 hljdns4 legit 7 2.807355 1.724276 0.000000 1.724276\n", "14763 6470355 legit 7 2.521641 0.000000 0.000000 0.000000\n", "17322 d20pfsrd legit 8 2.750000 0.000000 0.000000 0.000000\n", "20591 lgcct27 legit 7 2.521641 1.176091 0.845098 0.330993\n", "23458 jdoqocy legit 7 2.521641 0.000000 2.813581 -2.813581\n", "24661 95178114 legit 8 2.405639 1.591065 0.000000 1.591065\n", "24720 ggmmxxoo legit 8 2.000000 1.113943 0.602060 0.511883\n", "26454 ggmm777 legit 7 1.556657 1.477121 0.602060 0.875061\n", "27222 rkg1866 legit 7 2.521641 0.954243 0.000000 0.954243\n", "27676 1616bbs legit 7 1.950212 1.806180 1.322219 0.483961\n", "29142 5278bbs legit 7 2.521641 1.806180 1.322219 0.483961\n", "29551 05tz2e9 legit 7 2.807355 0.000000 0.000000 0.000000\n", "29858 1532777 legit 7 2.128085 1.477121 0.000000 1.477121\n", "30119 5311314 legit 7 1.842371 1.000000 0.000000 1.000000\n", "30290 zzgcjyzx legit 8 2.405639 0.000000 0.000000 0.000000\n", "30739 xn--g5t518j legit 11 3.095795 1.000000 0.000000 1.000000\n", "31465 7210578 legit 7 2.521641 0.903090 0.000000 0.903090\n", "31951 fj96336 legit 7 2.235926 0.000000 0.000000 0.000000\n", "34455 xn--42cgk1gc8crdb1htg3d legit 23 3.849224 1.255273 2.411620 -1.156347\n", "35554 720pmkv legit 7 2.807355 0.000000 0.000000 0.000000\n", "36166 
d4ffr55 legit 7 2.235926 1.079181 2.260071 -1.180890" ] } ], "prompt_number": 85 }, { "cell_type": "code", "collapsed": false, "input": [ "# Epiphany... Alexa really may not be the best 'exemplar' set... \n", "# (probably a no-shit moment for everyone else :)\n", "#\n", "# Discussion: If you're using these as exemplars of NOT DGA, then you're probably\n", "# making things very hard on your machine learning algorithm.\n", "# Perhaps we should have two categories of Alexa domains, 'legit'\n", "# and 'weird', based on some definition of weird.\n", "# Looking at the entries above... we have approx 80 domains\n", "# that we're going to mark as 'weird'.\n", "#\n", "all_domains.loc[weird_cond, 'class'] = 'weird'\n", "print all_domains['class'].value_counts()\n", "all_domains[all_domains['class'] == 'weird'].head()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "legit 60818\n", "dga 2397\n", "weird 79\n", "dtype: int64\n" ] }, { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
domainclasslengthentropyalexa_gramsword_gramsdiff
85 9to5lol weird 7 2.235926 1.991226 2.359835-0.368609
2611 akb48mt weird 7 2.807355 1.301030 1.041393 0.259637
3715 5221766 weird 7 2.235926 0.000000 0.000000 0.000000
4297 fzzfgjj weird 7 1.950212 0.954243 0.000000 0.954243
6045 crx7601 weird 7 2.807355 0.000000 0.000000 0.000000
\n", "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
domainclasslengthentropyalexa_gramsword_grams
0 alcatelonetouch legit 15 3.106891 49.001768 79.015001
1 optumhealthfinancial legit 20 3.584184 68.667084 87.158661
4 elderscrollsonline legit 18 3.016876 76.441834 94.462092
5 mobango legit 7 2.521641 18.020832 22.072036
6 costaud legit 7 2.807355 16.037393 25.008755
\n", "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
domainclasslengthentropyalexa_gramsword_gramspred
896 dom2-fan legit 8 3.000000 6.568955 5.656685 dga
1296 mm8mm8-6642 legit 11 2.368523 0.000000 0.000000 dga
1378 4390208 legit 7 2.521641 0.000000 0.000000 dga
1514 sqrt121 legit 7 2.521641 0.000000 0.000000 dga
1687 02022222222 legit 11 0.684038 0.903090 0.000000 dga
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 96, "text": [ " domain class length entropy alexa_grams word_grams pred\n", "896 dom2-fan legit 8 3.000000 6.568955 5.656685 dga\n", "1296 mm8mm8-6642 legit 11 2.368523 0.000000 0.000000 dga\n", "1378 4390208 legit 7 2.521641 0.000000 0.000000 dga\n", "1514 sqrt121 legit 7 2.521641 0.000000 0.000000 dga\n", "1687 02022222222 legit 11 0.684038 0.903090 0.000000 dga" ] } ], "prompt_number": 96 }, { "cell_type": "code", "collapsed": false, "input": [ "misclassified[misclassified['class'] == 'dga'].head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
domainclasslengthentropyalexa_gramsword_gramspred
9184 usbiezgac dga 9 3.169925 7.825928 9.172547 legit
9185 ushcnewo dga 8 3.000000 12.265642 13.904812 legit
9187 usnspdph dga 8 2.500000 5.182278 6.556287 legit
9190 utamehz dga 7 2.807355 10.741352 14.733893 legit
9192 utfowept dga 8 2.750000 7.095911 17.416355 legit
\n", "