{"cells": [{"metadata": {}, "cell_type": "code", "source": "import os\nimport shutil\nimport zipfile\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nfrom nltk.corpus import stopwords\nfrom sklearn.feature_extraction.text import CountVectorizer", "execution_count": 1, "outputs": []}, {"metadata": {}, "cell_type": "markdown", "source": "## Data Download"}, 
{"metadata": {"scrolled": true}, "cell_type": "code", "source": "# Download the data archive with wget\n!wget https://.../Downloads.zip\n\n# Create the target folder (no-op when it already exists) and unpack the\n# archive; the context manager closes the zip file even if extraction fails.\nos.makedirs(\"genome\", exist_ok=True)\nwith zipfile.ZipFile('Downloads.zip') as genome_zip:\n    genome_zip.extractall('genome')", "execution_count": null, "outputs": []}, {"metadata": {}, "cell_type": "code", "source": "os.listdir('genome')", "execution_count": 3, "outputs": [{"output_type": "execute_result", "execution_count": 3, "data": {"text/plain": "['anatidae.fasta',\n 'gallus_gallus.fasta',\n 'chiropter.fasta',\n 'bos_taurus.fasta',\n 'homo_sapiens.fasta']"}, "metadata": {}}]}, {"metadata": {}, "cell_type": "markdown", "source": "## Data Load & Tokenization"}, 
{"metadata": {}, "cell_type": "code", "source": "def generate_ngrams(s1):\n    \"\"\"Convert sequences into length-normalised character n-gram frequencies.\n\n    s1 -- list of sequence strings, one entry per record.\n\n    Returns a DataFrame with one row per sequence and one column per\n    character 2-, 3- and 4-gram found in s1; every count is divided by the\n    length of the sequence it was taken from.\n    \"\"\"\n    count_vect = CountVectorizer(lowercase=False, ngram_range=(2, 4), analyzer='char')\n    X1 = count_vect.fit_transform(s1)\n\n    # get_feature_names() is gone from recent scikit-learn releases; use its\n    # replacement when it exists and fall back on older installations.\n    if hasattr(count_vect, 'get_feature_names_out'):\n        feature_names = count_vect.get_feature_names_out()\n    else:\n        feature_names = count_vect.get_feature_names()\n\n    count_vect_df = pd.DataFrame(X1.toarray(), columns=feature_names)\n\n    # Normalise each row by its sequence length in one vectorised division\n    # instead of a row-by-row apply.\n    lengths = np.array([len(seq) for seq in s1], dtype=float)\n    return count_vect_df.div(lengths, axis=0)", "execution_count": 4, "outputs": []}, 
{"metadata": {}, "cell_type": "code", "source": "def read_fasta_sequences(filename):\n    \"\"\"Return the sequences stored in a FASTA file, without their headers.\n\n    Lines starting with '>' are treated as record separators and discarded;\n    the remaining lines of each record are concatenated with all whitespace\n    removed. Records that contain no sequence data are skipped.\n    \"\"\"\n    sequences = []\n    chunks = []\n    # The context manager closes the file even if parsing fails.\n    with open(filename) as fasta:\n        for line in fasta:\n            line = ''.join(line.split())\n            if line.startswith('>'):\n                # A header line closes the record collected so far.\n                if chunks:\n                    sequences.append(''.join(chunks))\n                chunks = []\n            elif line:\n                chunks.append(line)\n    # Flush the final record: no header follows it, so the loop never does.\n    if chunks:\n        sequences.append(''.join(chunks))\n    return sequences\n\n\ndef process_file(filename, target_val):\n    \"\"\"Build the labelled n-gram feature table for one FASTA file.\n\n    filename   -- path of the FASTA file to read.\n    target_val -- label stored in the 'target' column of every row.\n\n    Returns the DataFrame produced by generate_ngrams() for the sequences in\n    the file, with the extra 'target' column.\n    \"\"\"\n    s1 = read_fasta_sequences(filename)\n    count_vect_df = generate_ngrams(s1)\n    count_vect_df['target'] = target_val\n    return count_vect_df", "execution_count": 5, "outputs": []}, 
{"metadata": {}, "cell_type": "code", "source": "# Process the chicken, duck, cattle and bat datasets\ndf1 = process_file('genome/gallus_gallus.fasta',\"chicken\")\ndf2 = process_file('genome/anatidae.fasta',\"duck\")\ndf3 = process_file('genome/bos_taurus.fasta',\"cattle\")\ndf4 = process_file('genome/chiropter.fasta',\"bat\")", "execution_count": 6, "outputs": []}, {"metadata": {"scrolled": true}, "cell_type": "code", "source": "# Plot the share (%) of each host dataset\nplt.rcParams[\"figure.figsize\"] = (8, 6)\n\n# sort=True keeps the column ordering pandas applied implicitly before and\n# silences the FutureWarning; ignore_index avoids duplicated row labels\ndf = pd.concat([df1, df2, df3, df4], sort=True, ignore_index=True)\n# Keep only the n-gram columns that occur in every host dataset\ndf = df.dropna(axis=1)\nax = df['target'].value_counts().plot(kind='pie', autopct='%1.0f%%')\nax.set_title('Share of sequences per host');", "execution_count": 8, "outputs": [{"output_type": "stream", "text": "/opt/conda/envs/Python36/lib/python3.6/site-packages/ipykernel/__main__.py:8: FutureWarning: Sorting because non-concatenation axis is not aligned. 
A future version\nof pandas will change to not sort by default.\n\nTo accept the future behavior, pass 'sort=False'.\n\nTo retain the current behavior and silence the warning, pass 'sort=True'.\n\n", "name": "stderr"}, {"output_type": "execute_result", "execution_count": 8, "data": {"text/plain": ""}, "metadata": {}}, {"output_type": "display_data", "data": {"text/plain": "
", "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWgAAAFbCAYAAAD4CxTWAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3Xm83OPd//HXZ9azz8l+EhFjJ9GE277TVrU9lKpS1Yqb0pb2FkqN/pShLVO0FHUjbrW3lNQ2LUFJCEKKSIgtMZFIIslZ5uzrXL8/voMj29lm5vrOzOf5eMyjOXNmeZ9o3uea6/v9XpcYY1BKKeU+HtsBlFJKbZoWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtFJKuZQWtAUicoeIHL+J+yeIyIP9PDchIqOzl04p5RY+2wHU54wxq4CNilspVZx0BJ0DInKKiLwpIgtF5O703YeIyIsisuzT0bSIhEVkcfrPXhG5RkQWpZ/78w1es1REnhCRM9Jf/0BEXhGRN0TkFhHxpu9vEZHfpd/7ZREZl8MfXSk1DFrQWSYiU4D/B3zZGDMNOCf9rfHAQcBRQGwTTz0T2BbYwxgzFbi3z/cqgMeA+4wxM0VkV+BE4EBjzO5AL3By+rHlwMvp954LnJHJn08plT06xZF9XwYeNMasBzDG1IsIwMPGmBTw9mZGtV8FbjbG9Hz6vD7fewS4yhjzaWl/BdgTeDX92qXA2vT3uoDH03/+D3BEpn4wpVR2aUFnnwBmE/d3bvCYgT4PYB7wDRG5zxhj0o+90xhz0SYe251+DDgj64L7bx6OxEtwPin0vZXh/P31AN0b3HpwfnE1J2K1TTYyKzUQBfeP1YWeAf4hItcaY+pEZOQAnzcb+ImIPGeM6RGRkX1G0ZcAvwZuAn6afo9H0u+xNv0elcaY5Zn+YXIlHImXA1sDW23mNh4I4RSxdxjv0wM0AHXAOmBNn9ty4H3g/USstmGo76HUUGlBZ5kx5i0R+R0wR0R6gdcH+NTbgJ2AN0WkG5gJ3Njn+zOA20XkKmPML0XkYmC2iHhwRoln4xSMq4Uj8TJgMjClz203nHLe1CeLTPMBY9K3XTb3oHAkXke6rPvc3gTeScRqUznIqYqQfP7pV6nsCkfiAZy58oOAA4CpOAdCc1HE2dIMvAa8ArwKvJKI1br+F6PKD1rQKmvCkXg1cGD6dhCwN1BiNVRurMUp6znA7ESsdqHlPCpPaUGrjAlH4oJTwkelb7uT36PjTFkDPIVzXOGpRKz2E8t5VJ7QglbDkj6YdwRwNPBNoMZuItczOHPXTwAPJWK1r1rOo1xMC1oNWjgSrwS+m74dDgTtJsprS4EHgL8lYrVv2g6j3EULWg1IOBL34IyUTwG+jXMxjMqsJcD9OGX9ru0wyj4taLVF4Uh8CjAd59LxCZbjFJMFwM3AfYlYbbvtMMoOLWi1kfSVeScBZwF7WY5T7BqBO4GbErHa92yHUbmlBa0+E47Et8K5wOUMQNecdhcD/Bvn6tFHErHaXst5VA5oQSvCkfg04HycFfH8luOo/q0E/gjckojVttkOo7JHC7qIhSPxQ3HW9PiK7SxqSNbiFPVNiVhts+0wKvO0oItQOBLfF/gdWsyFoh74E3B9IlbbaDuMyhwt6CKSnsr4Lc5VfqrwJIE/A9fo6nuFQQu6CIQj8V2Ay3H2O9RLrwtfPXAZztRHj+0waui0oAtYOBI
fDVwBnMYw1kxWeesd4PxErDZuO4gaGi3oApS+6u/HONMZA90gQBWup4DzErHaxbaDqMHRgi4w6QOAf8ZZd1mpT/XibAJxcSJWu952GDUwWtAFIhyJj8HZHfy/0XlmtXnrcUbTd9sOovqnBV0AwpH4acA1wAjbWVTemA38JBGr/dB2ELV5WtB5LByJj8P52KqnzamhaAUiwJ8TsVotAhfSgs5T4Uj8OzirnemaGWq4ngNOT8Rql9kOor5ICzrPpPf5uxFn+U+lMqUV+FkiVnuH7SDqc1rQeSQciR8B3A5MtJ1FFay7gLMSsdpW20GUFnReSJ/X/BvgIvQMDZV97wAnJGK1i2wHKXZa0C4XjsRHAX/F2W5KqVzpAM5JxGpvtR2kmGlBu1g4Et8beBCYZDuLKlp/BX6sy5na4bEdQG1aOBI/A3geLWdl10nA/HAkvp3tIMVIR9AuE47EgziXap9uO4tSfawHvp2I1b5gO0gx0YJ2kXAkPhJ4FDjQdhalNqEL+JFeJp47WtAuEY7Ew8C/gF0sR1GqP78Dfq1XH2afFrQLhCPxPYB/AjW2syg1QH8Hpidite22gxQyLWjLwpH413DO1Ki0nUWpQZoPfEO318oePYvDonAkPh2Io+Ws8tO+wLPppW5VFmhBWxKOxH8B3AH4LEdRajimAXPCkfgE20EKkRa0BelyvsZ2DqUyZFdgbjgS38Z2kEKjBZ1j4Uj8PLScVeHZHqekd7AdpJBoQedQOBI/F/iD7RxKZckk4PlwJD7ZdpBCoWdx5Eg4Ep8BXGs7h1I5sBo4ULfTGj4dQeeAlrMqMuOBp9Jbsqlh0BF0loUj8f/GWWRfqWKzEDg0EatN2g6Sr7SgsygciR8JPI6eSqeK1/PAkXrF4dBoQWdJOBLfHZiLXoSi1OM4K+H12A6Sb3QOOgvCkfgknLU1tJyVgqPQab4h0YLOsPSu2//COVCilHL8MByJX2w7RL7RKY4MCkfiAeBJ4DDLUZRyIwMcm4jVPmo7SL7QEXRm3YKWs1KbI8A9eiHLwGlBZ0g4Ej8LONV2DqVcrhJ4JByJj7AdJB/oFEcGhCPxA4FnAb/tLErlidnANxOx2l7bQdxMR9DDlL5a6u9oOSs1GF8DrrIdwu10BD0M4UjcAzwNHG47i1J56juJWO0s2yHcSkfQw3MZWs5KDcfMcCQ+0XYIt9IR9BCFI/EjcE6pE9tZlMpzzwJfTcRqU7aDuI2uETEE6YtRbqfIyznV0ULdv66na/1HAIz+5jm0L11A2wfzQQRvWTWjvjkDX+UoWt+dR/L5e/GUVjDmuIvxllbR3bCaxrl3MeaYCy3/JMqyw4ELgN/bDuI2OoIegnAkfhfwQ9s5bFsf/yPBiVOonHYkprcb090J4sETLAOgacGjdNd9xKgjf8aau89n7AmX07pkLqa3m6o9j2bdo1dRfdDJ+EduZfknUS7QDRyQiNUusB3ETXQOepDCkfixaDmT6myjY8VbVEz9GgDi9eMpqfisnAFMdweffcgQj1PiPZ2Ix0vHisV4y0doOatP+YH7wpF4ue0gbqJTHIMQjsRH41wtWPR6GtfgLaui7p/X0bX2Q4I1OzDiK2fiCZTQMPcuWhf/G0+wjHEnXQlA6MCTWPvAJXgrRjL66PNZ93CM0Tq1ob5oR+A64AzbQdxCpzgGIRyJPwh8x3YON+hc/T5r7v4FNT+4muCEnal/+hY8gTKqD/n8w0XypQcwPd1UH3zyF57bsugZUp0tBMfvTNMrs/CUVDDiq2fi8Zfk+sdQ7nR4Ilb7nO0QbqBTHAMUjsS/j5bzZ3yVo/FWjiY4YWcAynY+kK5Pln7hMeWTD6PtvXlfuC/V3UHL4meo3KOWhrl3MuqbMwjU7EDrW8/lKrpyv1vCkXjQdgg30IIegPTUxg22c7iJt2IEvqrRdNetBKBj+UL8oyfRXf/xZ49p+2A+/pFfPMW1af5DVO31LcTrw3R3OXeKB9PTmbPsyvV2An5lO4Q
b6BTHAIQj8VvRebGNdH2yjLonrsf09uCrrmHUN2dQ/6/r6a5fCeLBVzWGkUeeja9yNAA9zXXUP3kjY4+/FIDWd14g+cJ9eErKnVPvykI2fxzlLl3A1ESs9l3bQWzSgu5HOBLfC5iPftpQKtfmJGK1h9kOYZOWzhaEI3EBbkT/npSy4dBwJP7ftkPYpMWzZacC+9oOoVQRuzociY+0HcIWLejNCEfiISBmO4dSRW4U8GvbIWzRgt68y4GxtkMopTgrHIlvbzuEDVrQmxCOxHcCzrKdQykFQIAi/TSrBb1pl6KXwSvlJseHI/GiOx6kBb2BcCQ+Bfie7RxKqY1caTtArmlBbyyK/r0o5UaHpzfKKBpaRH2EI/Fp6HobSrnZFbYD5JIW9BddTpHvkqKUy+0VjsS/ZjtErmhBp4Uj8b2Bb9nOoZTqV9EsJK4F/blLbAdQSg3Il9Nr5BQ8LWg+O++51nYOpdSA/dJ2gFzQgnb8Dzr3rFQ++U44Et/BdohsK/qCDkfi1TiLIiml8ocHON92iGwr+oIGfgToTsJK5Z/p4Uh8nO0Q2VTUBR2OxL3Az2znUEoNSQkFvtNRURc0cCywje0QSqkhOy29sUZBKvaCPsd2AKXUsGwLfMV2iGwp2oJOn1p3sO0cSqlh+5HtANlStAUN/MB2AKVURhwbjsRH2Q6RDUVZ0Ok5Ky1opQpDEPih7RDZUJQFDRyIM3ellCoMp9sOkA3FWtAF+dtWqSK2WzgS38d2iEwruoIOR+JB4ATbOZRSGfdd2wEyregKGjgaqLYdQimVcd+2HSDTirGgT7IdQCmVFdund0UqGEVV0OnpjaLZjUGpInSc7QCZVFQFDRwGVNgOoZTKGi3oPKaL8itV2HYLR+I72g6RKVrQSqlCUzCj6KIp6HAkPhnYznYOpVTWHW07QKYUTUGjo2elisU+4Ui8IDbhKKaCPsp2AKVUTvhxlnPIe0VR0OnfpgfYzqGUypnDbQfIhKIoaGAfwGc7hFIqZ7Sg80hBfNxRSg3YnuFIvNJ2iOHSglZKFSIfBbBjUsEXdHpx/v1s51BK5VzeT3MUfEEDU9DV65QqRnl/YkAxFHTe/0dSSg3J1HAkntcdl9fhB0jnn5UqThXADrZDDEcxFPRetgMopazZ3XaA4Sjogg5H4gFgJ9s5lFLWaEG72K7oBSpKFTMtaBf7ku0ASimr9rAdYDgKvaAn2w6glLKqJhyJj7MdYqgKvaB3sR1AKWVd3vZAoRf0zrYDKKWs29Z2gKEq2IIOR+Je8vwcSKVURoRtBxiqgi1oYCsgYDuEUsq6sO0AQ1XoBa2UUmHbAYaqkAt6vO0ASilXCNsOMFSFXNATbAdQSrnCxHAknpcXrPVb0CKy0WJDm7rPhbSglVIAXmCi7RBDMZAR9A0DvM9ttKCVUp/KyynPzQ77RWR/nLWUx4jIeX2+VYXzG8nt8vI/iFIqK/Jy044tzcsEcNZT9QF9N19sAo7PZqgM0RG0UupThVXQxpg5wBwRucMYs1xEyo0xrTnMNlyjbQdQSrlGXhb0QOagJ4jI28ASABGZJiI3ZTdWRpTZDqCUco2CLejrgCOBOgBjzELgkGyGyhAtaKXUpwq2oDHGrNjgrt4sZMmYcCTuRxfqV0p9Li8LeiAltkJEDgCMiASA/yE93eFiOnpWSvWVlwU9kBH0T4Czcda2WImzhczZ2QyVAVrQSqm+ym0HGIp+R9DGmPXAyTnIkkmltgMopVwlH67d2Ei/BS0i12/i7iSwwBjzSOYjZYSOoJVSfeXlukMDCV2CM63xfvo2FRgJnC4i12Ux23D4bQdQSrlKYY6gcXYl+bIxpgdARP4XmA0cASzKYrbh6LYdQNk3nro1twUue+/J3lDDku5gYESjmHENMDaZ8oxqSQWr2nvLSrpT5YDYzmrDyq5O369XLp/Y0NPj8wgcVT2y4bQxNXW/X7Vy3KutzZXbBUvar5q07ccAD9WvDyV7e3ynjamps517KNp
9gTXEam3HGLSBFPRWOBPsyfTX5cAEY0yviHRmLdnwdNkOoOwK0dL4bPC85hLpPmSKZz1dfjofH1u+8K5QVfdSv28qIpUAvl7TNbKZdWMbTeO4BlrG15vOmkZSo5uMN9RCSXknlYEeRophtOTpx+TNGZnqITp2LJNLSmhN9XJ8IlFzRGlJzbKOVv617bZcsGpV8OPWxupJ/gDPJuu5deLW+Hs6amznHoryno7VtjMMxUAK+irgDRF5DmekcQhwhYiUA09nMdtwaEEXsRI62+cGZ3xUIt1TP70vAMHjWlr3Oa6llU6h45GKivl3V1X2Jvy+aWurZau11bLV4vDmX9OTMr3VLXwyJkl9TaNpqak3HTUNpMYkjYxoIVjeQUWwmxEewxjJkym2MT4fY3xOBZR7vGwXDLK6p5tuYzDG0GlS+BBur6/nB9Uj8Etef9BI2Q4wFFssaBERnOmMfwL74BT0r4wxq9IPuSC78YZMC7pIeent+XfwF4tC0rbP5h4TNJSc0Nyy7wnNLXSItM+qKH/pnlClrPD5piKyyQPMKY9466sYV1/FuHe33kJRGWOq2qgb00T9uAbTVNNAe02D6RnTaGRkC8HKNspKuqn2phgjLjrb6OPuLpZ0dLB3zXiWVXRx3PIE+5WVUen1srijnbNG5/3SNj2ZeBERiQItxphrBvm8BLBX+qy4AdtiQRtjjIg8bIzZE3DrGRubogVdlIyJB3718gSpP2igzygxpvT7zS37f7+5hTaR1ocqK168t6rS+7HPOw2RkkFHEJGmckY1lTNq6fgtjzjLOkxydBN14xpMsqaBtpoG0zO2EUY1G19VG2WlXVT7ehklzhK/WdOaSnHOxx9z0dhxVHi9nD5qFKePGgXAr9es5mejx/BgYyPz2lrZORjkJ6PysqzbbQcYioFMcbwsInsbY17NeprM0YIuQvf4r5y7i2fFoUN9fpkx5T9saj7gh03NtIg0P1BV8drfqip9q73eaYgEM5kVoK1EQh+VEPpo7JaLPNBt2kY3sW5so0nWNNA6vt50jW3EjG42vqpWSss6qfL3MMrjnF01KN3GMOPjjzmqKsQRlZVf+N7bHR0AhAMBrlz7CXdP2oZfrPqYRFcX4UBgsG9lW9tQnygi/w84BVgBrAP+k57yPd8Ys0BERuOcdhwWES/we5z1iwww0xhzQ5/XKgX+ATxkjJnZ33sPpKAPB34sIsuBVpxpDmOMmbrlp1mlBV1krvLdMucg7+Ihl/OGKoypPC3ZfMBpyWaaPJK8v7Jywf1VFYFPvN7dEcnpHHOXX8pWjWKbVaO2XOQbHfBsMJ3jGpwDntWtlJR3UNHngKfXGMOv16xmu2CAU0du3O03rF/HZTU19BhDyjj3eRA6Unk5nTukpZJFZE/ge8AeOH35GvCfLTzlTGBbYA9jTI+I9P2LrQD+BtxljLlrIO8/kIL+xkBeyGU6cQ4KFNRRd7VpM3wPPn+Cb07GynlDVSkTOiPZdOAZySaSHk/jfVUVi/9eWVGyzilr1yzK1eOVwNpqBnbAs5W1qQUtbS/c2RQeWelvfXJlq/GmjJwxYczKb5VV9Txb11QzuaSkeqzP7wWYVlrKMR9+yE7BILuUDH7mxwWahvi8g4F/GGPaAETk0X4e/1Xg5k9PSzbG1Pf53iPAVcaYewf65gO51Ht5OthYnItWXC8RqzXhSLwOGGM7i8qu73n/Pf8c76wDcvV+oVSq+qeNTQf9tLGJBo+n/p6qyrceqqoor/N4puF8vHW9lEe89ZWM5fBKdjt8N+izTsUs2HkWACPBGPNGO/VjktTVNNQ0n5SeJ3/LOeAZqGynvKSLUPqAp9uv3k32/5DNMpu4r4fPB4B9e1E283iAecA3ROQ+Y8zmHvMFA7nU+1vAH3C2kFoLbIOzmt2UgbyBRWvRgi5oh3teX3il77ZpInauEhuRSo38eWPy4J83JqnzeNbfFapa8o/K8ooGp6zz/9ObiDSXMbK5jJHL+jngWdp
hmkY3UTfOmSdvG19vujc44BlKH/AM5Sj9hoZa0HOBO0QkhtOXRwO3AAlgT+AVvrgF4GzgJyLy3KdTHH1G0ZcAvwZuAn46kDcfyMez3wD7AU8bY/YQkcOBkwby4patxf2/RNQQTZMP3rvdf3VYxB2f6kalUqPPbWg8+NyGRtZ5PevuCFUteaSiPJT0eL5UEGXdj/YSqVpRQtWK/g94to9yDng21jTQOr4hfcCzyfhDrZSkD3iOFBgpmb3Cc91QnmSMeU1E7gfeAJYDz6e/dQ3wgIj8EPh3n6fcBuwEvCki3cBM4MY+358B3C4iVxljftnf+0t/I20RWWCM2UtEFuJMfKdE5BVjzGbPM3WDcCT+N+BE2zlU5oVl9YpnAucHvWLG2s7SnzVe75q/hKrefayifGSzR3ZD8vtqj1zx9prukc2sG5s0DRse8BzRsvEBzwG85GG7vrNkTtaDZ9hARtCNIlKBM9S/V0TWkh9rXay1HUBl3lga1s0O/DKVD+UMUNPbW3NRfUPNRfUNrPJ5V98eqnovXlE+usXjsf7pzqQMS6NL8Y/ws82527Di5hV0rOygcvdKao53ruhe+8haSrYuoeq/snoq9kZ6veJfV82EddUy4a1tNv84MSZV3cLaMUnqaxpMc02D6ahpoHdM0niqWwhUdFBR0k11t5c1uUufOQMp6IU45xCei7MudAjndBG304IuMBW0NT0XPK8uIL272M4yFBN6esdfXNcw/uK6Blb4fCtvq65a+kR52Zg2j2eyjTx1s+sITgiSak/RscI553nH3+7IsiuW0dvWS6orRfuydsYe497fhUbE01DJ2IZKxr43cYsfTta4dWW3LRnI3NjhxpiUMabHGHOnMeZ6YO9sB8sALegCEqC7c27w3GVl0pmX5byhrXt6Jl62vv7Q+ctXTn5sxaqPjmlumVOaSr2Tq/fvru+meWEzIw4Z4dzhBdNtMCmD6THggbWz1jL2OPeW8yC0LJq+aDhncViz2RG0iPwUOAvYXkTe7POtSpzTRdxuVf8PUfnAQ6r36cD5r4+U5v1sZ8mGcE/PpN+ur5/02/X1LPX7EjOrQ4mny0q36vR4dszWe66+bzU1J9bQ2+7s/1wyoQT/SD9LL11K9QHVdH3iXOtVuo1rlgsZjo9tBxiqLU1x3Af8C7gSiPS5v3mDk6/d6gPbAVRm/CNwyYuTPOsOtp0jF7bv7gnH1tWFAd71+5fdVl310b/Lyrbu8sj2mXqPpjea8FX5KA2X0rKk5bP7x588/rM/L792ORNOncDaR9fSsaKDiikVjDxs0FeSu8VK2wGGarMFbYxJ4pw7mA+n1G3KUqCXPN1JQTlm+v8wZ5pnWdauEnSznbu7t7t6Xd12UMfbAf8HM6tDH88pK53ULbLtcF637f02ml5vonlhM6bb0NvRy4pbVrD1j7cGoOm1Jkq3LSXVmaLz404mnT2JZVcso3r/ajzBvDxj8H3bAYbKNZepZloiVtsdjsQTQMZGHiq3LvPdMecI73+Kspw3NLmre4dr167fAeDNYOC9maGq1S+UlYZ7RLZwjsOm1Xy3hprvOmdptCxpoe6Jus/K2fQY6p6qY5sZ29D5SefnZyIb53tkfMmonMjZ3H6mFWxBp72HFnReOtP7+LxTvLMPsZ3DjaZ2du10w9r1OwG8Hgy8M7M6tOal0pLte0S2Hu5r1z1TR/WBzki5ZOsSMPD+xe9TObUSb3nefhjN24Lu90KVfBaOxK8DzrGdQw3OMZ55C67z/3maSH7sTOIWr5YE374tVLVufmnJDr0iW9nO4yLhRdMXLbcdYiiKYQSt8siBnsWLr/P/ebKW8+Dt3dE5ee+OdRgw80uCi2dWh+oWlAR3SomM7//ZBasN+Mh2iKHSglauMVkSS+/2X7GViOtXRnM1Admvo3O3/dasxYCZV1ry5m2hqsbXS4I7p0TG2c6XY+8tmr4ob6cJCr2g37YdQA3MRFm36tHAxWUeYYTtLIVEQA5q75h
6UHsHKUjNLS1d+H/VVcmFwcCuRqQYVnvM2/lnKPA5aIBwJP4xzlKpyqVG0FT/cvBnjUHp2c52lmLRC73PlpUuuj1U1bQ4GJhiREbZzpQlFy2avihmO8RQFfoIGpz1Wo+1HUJtWhkdrXOD564OSo/1xYOKiRe8X21r3/2rbe30QM/T5WX/+Uuosn1JIDDFiBTSp5gFtgMMR16edT5Ir9gOoDbNT0/Xc8Fz36mUdi1ni3zg+3pr2573r/rkoP8kVlRcuXb9gp07u17AuVgtnxmyXNAicpiIHNDn62NFZHKfr+8QkeM3/ez+FcMIer7tAGpjQir1RODCBWMlmbPtqlT//OA/qrVtr6Na2+iCrn9WlL96Z6iy6wO//0uI5HbN0eH7YNH0RY1Zfo/DgBbgxfTXxwKPk6HjX8VQ0AvQDWRd5/7Ab57f3rNarxJ0sQAEjm1p3fvYlla6oPOxivJX7gpVdS/z+6bhrBHvdi8P9YkicgpwPs4o/E3gAeBiIADU4Sy9XAr8BOgVkR/gXHPxLeBQEbkY+M4Gr7kn8Eec5ZrXA6caY1ZvMUehHyQECEfibwFW1txVG7vef8Ocb3lf0nLOU51Cx8MVFW/cHao0y32+qYiU9/8sK366aPqimwf7JBGZAswCDjTGrBeRkThF3WiMMSLyI2BXY8wvRCQKtBhjrkk/9w7gcWPMg32/xtnRew5wjDFmnYicCBxpjDltS1mKYQQNzjy0FrQLRHz3zdVyzm9BQ8mJzS37ndjcQrtI26zK8pfuraqUFT7fNETctD7pS0N83peBB40x6wGMMfUi8iXgfnEu+gkAHw7yNXcGdgOeSu965gW2OHqG4inoF4BTbYcodqd4n3zpx97HD7KdQ2VOqTFlJze17H9yUwutIi0PVla8eF9VpXeVz7s7IjaXVmoEFg/xuYIzYu7rBuCPxphHReQwIDqE13zLGLP/YJ5ULPOys20HKHZf98x/7TLfnXuKFM3/54pOuTEV05uaD3hy5ap9X1y+snNGfeOLNT09r2BMl4U4zyyavqh3qM8FTpD0ueHpKY4Qny/8P73PY5txNjHZ3NefehcYIyL7p1/Tn55K2aKi+MeSiNWuAJbYzlGs9pZ3lvyv/087ihCwnUXlRqXA+cXdAAAQyUlEQVQxVacnmw54asWqfeZ9tLL9Zw2NL4zt6VmAMbnacHrIgzJjzFvA74A5IrIQ58BeFPi7iDyPc4DvU48B3xaRN0TkYOBvwAUi8rrI55ssGOeX1PHA79Ov+QbQ7xlMRXGQECAciV8LzLCdo9jsKCsTTwYurPCIGW07i7Kv0eNpuLeq8q0HKytK13s90xDJ1jRr3q5g11cxFfTXcbbwUjkynro1c4MzevzSO9F2FuU+9R5P3d2hyrdnVVaU13s80xDJ1ILT7y2avmjnDL2WVcVykBCcU1w6gBLbQYpBiJbGZ4PnNfulN2sbn6r8NjKVGnVOQ/LgcxqSrPd61t1ZVfXOw5XllY0ez1REhjP9+mTGQlpWNCNogHAkPhs4wnaOQldCZ/v84Nnvh6Rtqu0sKv+s9XrX/iVUueTRivIRTR7Pl0iflzYIRy+avujxrITLsaI4SNjHE7YDFDovvT3/Dv5ikZazGqqxvb1jL6xvPHTeRx9Pnb1i1ZqTmprnVPamFjOw0WQH8Gy2M+ZKsY2gd0QX8c8iY54IRObt4lmh5zqrjFvp8358e6jqg39WlI9u9Xg2d4raI4umLyqY1SuLqqABwpH4G8A02zkK0d3+K+cc7F2kVwmqrPvI51t5W3XV0ifLy8a2eTy79vnWKYumL7rbWrAMK6aDhJ96AC3ojPu971YtZ5Uzk3p6Jl6+vn7i5evr+dDvWz4zVPXh0+VlY9o9nkdtZ8ukYhxB7wC8bztHITnH+9AL5/of0mkNZdtjRJPfsh0ik4rtICGJWO0H5PkuC27yXe9zr8zwPTSo9QWUypL7bAfItKIr6LR7bQcoBId7Xl94le/WL4mQqQsMlBq
qVqCgpjegeAv6r8BQF1JRwDT54L3b/VeHRXDT8pKqeD1KNNlmO0SmFWVBJ2K1nwBP2c6Rr8KyesWswKUhEUK2syiVNtN2gGwoyoJOu9V2gHw0msZ1swO/THnFjLOdRam0d4kmC+bilL6KuaAfBVbYDpFPKmhrmhs8ty4gvdvYzqJUH4Pe1ipfFG1BJ2K1vcAttnPkiwDdnXOD5y4rk85dbGdRqo924E7bIbKlaAs6bSZgY7eHvOIh1ft04PzXR0rz7razKLWB+4kmG2yHyJaiLuhErHYt8KDtHG73j8AlL07yrNvPdg6lNqFgpzegyAs67c+2A7jZTP8f5kzzLDvYdg6lNuF1osn5tkNkU9EXdCJW+yLO/mBqA5f57phzhPc/ur6GcqsbbAfItqIv6LRrbQdwmzO9j887xTv7ENs5lNqM5cA9tkNkmxa04z5gqe0QbnGMZ96Ci3z37SPCYHeyUCpXriKazNUO4dZoQQOJWG0PcKXtHG5woGfx4uv8f54sgt92FqU2YzVwu+0QuaAF/bm7gITtEDZNlsTSu/1XbCVCme0sSm3BH4gmO2yHyAUt6LRErLabIh5FT5R1qx4NXFzmEUbYzqLUFtRR4KfW9aUF/UV3AB/ZDpFrI2iqfybwiw6fpMbbzqJUP64jmmy1HSJXtKD7SMRqu4Df286RS2V0tM4Nnrs6KD3b2c6iVD8aKYJT6/rSgt7Y/1Ekiyj56Ol+LnjuO5XSvrkdkpVyk98QTSZth8glLegNJGK1ncCFtnNknzFPBi58dawk97SdRKkB+AC40XaIXNOC3oRErPavwEu2c2TT/YHfzN3es/oA2zmUGqALiSaLbmEzLejNOwcoyC3P/+S/Yc6+nnf0Em6VL54nmpxlO4QNWtCbkYjVvkoBXkp6oe+vc4/xvqTlrPKFAc6zHcIWLegti+DsFlwQfuid/fJPvI8dZDuHUoNwL9HkAtshbNGC3oJErHYVELOdIxOO9Lzy+uW+O/YQ0f/mKm+0Ar+yHcIm/cfav2uAD22HGI695N0lN/uv20GEoO0sSg3CxUSTRXHK6+ZoQfcjEavtAM6wnWOodpSViQcCl48WodJ2FqUGYT5wve0QtmlBD0AiVvsMzv6FeWU8dWv+GbjI5xEzxnYWpQahCzidaDJlO4htWtADdz55dIVhiJbGZ4PnNfuld6LtLEoN0pVEk2/ZDuEGWtADlIjVNgE/tp1jIErobJ8bnPFRiXTvaDtLvujoMewzs4VpN7cw5aYWLn3WWc3y5Flt7HxjC7vd1MJpj7TT3eucGv/Q291MuamFg//SSl2bM9BbWp/iew+2WfsZCsRbwBW2Q7iFGFOQ12JkTTgSvxM4xXaOzfHS2/N88JzXJkj9Praz5BNjDK3dUBEQunsNB/2llT99vYT6dsM3dvAB8P1Z7RwyycdP9w5wwP+18uQPyvjb4m46euDn+wY46aE2Lj8syI6jvJZ/mryVAg4kmnzZdhC30BH04M3A2dHBhYyJB371spbz4IkIFQFnh6/uFHT3ggDf3NGPiCAi7DPBy8omZ7TsEejsNbR1G/xeeH55D+MrPFrOw3OtlvMXaUEPUiJW2wD8CBdeBn63PzZ3F88KvRBliHpTht1vbmHs1c0csZ2PfSf6Pvted6/h7je7+Xp6NH3poUGOvKeNpz/s5aTd/Pz2+U5+fYiexTgMrwIX2Q7hNjrFMUThSPxqnAOHrvB7361zTvQ9p5dwZ0Bjh+Hb97dxwzdK2G2sMyI+49F2ygPCdV8v2ejxd77RRWOHYd+JXq55sYsRJcKfvlFCmV/33B2gJmAPoslltoO4jY6gh+4i4EXbIQDO8T70gpZz5lSXCIdt4+OJD3oAuOy5Tta1Gf545MYj5LZuw50Luzlr7wAXPdPJ7ceUsucEL/e+WfAbTmfSGVrOm6YFPUTpncC/h7NHmjXf9T73ygzfQ/vbzFAI1rWmaOxwPk22dxue/rCHXUZ7uO21Lp5c2sNfv1OKRzY
eEV81r5Nz9g3g9wrt3c68tUec4lYDcivR5AO2Q7iVTnEMUzgSrwUew/m3mVOHe15feLv/6p1EKM31exeaNz/pZfrD7fSmIGXghCl+Ljk0iO/yJrapFirTBxCP29W5H2BVc4ozH+vg8e87m6D//a1uonM6qS4RHj6xlDHlOv7px2JgH6LJdttB3EoLOgPCkfhVwAW5fM9p8sF7DwcuGSdCKJfvq1SGtAF7E02+bTuIm+mv+Mz4FTAvV28WltUrZgUuDWk5qzxlgFO1nPunBZ0B6fno44BEtt9rNI3rZgd+mfKKGZft91IqSy4lmvy77RD5QKc4MigciU/GObMjKyPbCtqaXgmevapMOnfJxusrlQN/JZr8vu0Q+UJH0BmUiNW+DZwA9GT6tQN0d84JnrtUy1nlsZeB02yHyCda0BmWiNXOBn6eydcUUqmnAxe8Pkqa98jk6yqVQx8BxxJNdtgOkk+0oLMgEau9Gfhjpl7vH4FL503yrN0vU6+nVI61AN8imvzEdpB8owWdPRcAjwz3RW7x/2HO7p6lB2cgj1I2dAHfJZpcaDtIPtKCzpJErDYFnAQ8O9TXuNR355wjvf/RS7hVvuoFTiKafMJ2kHylBZ1FiVhtO3A0QzhH+kzv4/NO9T55SOZTKZUTn57rPMt2kHymBZ1liVhtK/BNnOUUB+Roz4sLLvLdt49I7i8fVypDziKavMd2iHynBZ0D6e2yjgT6nYc7wLP4rev9N+4qgj/7yZTKiguIJm+2HaIQ6IUqORSOxEcDc4DJm/r+rrJ8aTzwqxEeMSNzm0ypjLmcaPJS2yEKhRZ0joUj8Rqckt6p7/0TZd2q5wLnik9S4+0kU2rYLiOajNoOUUh0iiPHErHaNcAh9JnuGEFT/TOBX3RoOas8dr6Wc+bpCNqScCQeAuJldOw+P3h2olLap9jOpNQQpHAOCN5iO0gh0hG0JYlYbRL42qzApfdpOas81QV8T8s5e3QEbVs05ANuA6bbjqLUIDQD3yaafMZ2kEKmBe0W0dDvcBb+V8rtVgNHEU2+ZjtIodOCdpNo6AzgRiBgO4pSm/EKzsh5le0gxUAL2m2ioQOAh4Aa21GU2sA9wBm6ZGjuaEG7UTS0FTAL2Md2FKVwztSIEE1ebTtIsdGCdqtoKAjcDJxqOYkqbkmcFen+ZTtIMdKCdrto6Oc4i//7bEdRRecdnPnmd2wHKVZ6HrTbRZM3AF8BVtiOoorKbcCeWs526Qg6X0RD1ThTHifajqIKWgNwJtHkg7aDKC3o/BMNnYJzKl6l7Siq4MwFfkA0qZ/WXEKnOPJNNHkXMA140XYUVTB6gEuAw7Wc3UVH0PkqGvLiXHl4CXoAUQ3dEuB0osmXbAdRG9OCznfR0O44c9P72o6i8kon8Dvg90STXbbDqE3Tgi4E0ZAH+DFwBVBtOY1yv+eAHxNNvmc7iNoyLehCEg2NA64FTrIdRblSPc5+gbfbDqIGRgu6EEVDRwA3ATvYjqJcwQD3AucRTa6zHUYNnBZ0oYqGSoAZwIXotEcxew5n1LzAdhA1eFrQhS4aGglcBPwMKLGcRuXOW8CFRJNx20HU0GlBF4toaGvgMuAUwGs5jcqeVcClwF+IJntth1HDowVdbKKhyThnexxjO4rKqDqcA8TXEk222Q6jMkMLulhFQ3sBFwDfQUfU+WwFzmqHM4kmW22HUZmlBV3soqHtgPOA/wbKLKdRA7cEuAq4l2iy23YYlR1a0MoRDY3GOZB4NjDachq1efOBGPAI0aT+4y1wWtDqi6KhUmA6cCawh+U0ytEC/A1nGuMV22FU7mhBq82LhvYATgNOBkZYTlOMFgAzgb8STTbbDqNyTwta9c+56OXbwOnAlwGxG6igNeFc9Xcr0eQbtsMou7Sg1eBEQ2GctT6+DextN0zBaAQeBR4CZhNNdljOo1xCC1oNXTQ0ETgWOA44BD1dbzDWAw/jlPIzeiaG2hQtaJUZ0dAo4CickfVXgAq7gVx
pCfAU8AgwR6/0U/3RglaZFw35gH1wivrLwH4U5zogK4A5wDPA00STKy3nUXlGC1plXzQUwJmvPjh9+y+gxmqmzOsEFgGv4+wXOYdo8kO7kVS+04JWdkRDY3E2v+172wXw24w1QE3AGzhl/DrwGrCEaLLHaipVcLSglXs4I+1dge2BbYBJ6f/99DYqR0lSOKvCLQM+3MTtY72KT+WCFrTKH9FQOU5pjwJCQFX6f0MbfO3F2UVkc7dWoKHPrXGDr9foRqrKDbSglVLKpTy2AyillNo0LWillHIpLWillHIpLWilckxEwiKyeBCPP1VEJmQzk3InLWil3O9UQAu6CGlBK2WHT0TuFJE3ReRBESkTkUtE5FURWSwit4rjeGAv4F4ReUNESm0HV7mjBa2UHTsDtxpjpuJcmXgWcKMxZm9jzG5AKXCUMeZBnIX7TzbG7G6MabcXWeWaFrRSdqwwxsxL//ke4CDgcBGZLyKLcBaZmmItnXIFn+0AShWpDa8QM8BNwF7GmBUiEqU4VwBUfegIWik7JonI/uk/nwS8kP7zehGpAI7v89hmoDKX4ZQ76AhaKTuWANNF5BbgfeB/cTbmXQQkgFf7PPYO4GYRaQf213no4qFrcSillEvpFIdSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrmUFrRSSrnU/wdSjbuOLU+vGgAAAABJRU5ErkJggg==\n"}, "metadata": {}}]}, {"metadata": {}, "cell_type": "markdown", "source": "## Data Preprocessing"}, {"metadata": {}, "cell_type": "code", "source": "cov = process_file('genome/homo_sapiens.fasta',\"COVID-19\")\n\n#\ubaa8\ub378\ub85c \uc0ac\uc6a9\ud560 \uac83\uc774 \uc544\ub2c8\uae30\ub54c\ubb38\uc5d0 target\uc740 drop\ncov = cov.drop('target', axis=1)", "execution_count": 9, "outputs": []}, {"metadata": {"scrolled": true}, "cell_type": "code", "source": "# \ub450 \ub370\uc774\ud130\uc14b\uc758 column \uac1c\uc218\nprint('Number of Column(df) : ',len(df.columns))\nprint('Number of Column(cov) : ',len(cov.columns))", "execution_count": 11, "outputs": [{"output_type": "stream", "text": "Number of Column(df) : 348\nNumber of Column(cov) : 923\n", "name": "stdout"}]}, {"metadata": {}, "cell_type": "code", "source": "# \ub3d9\ubb3c\uc5d0\uac90 \uc788\uace0 \uc0ac\ub78c\uc5d0\uac90 \uc5c6\ub294 \uc5f4\ucc3e\uae30\ny=df.pop('target')\nmc = df.columns.difference(cov.columns)\nmc\n#\ud574\ub2f9 \uc5f4 \uc0ad\uc81c\ndf = df.drop(mc, axis=1)", "execution_count": 12, "outputs": []}, 
{"metadata": {}, "cell_type": "code", "source": "# Find the n-gram columns present in the human data but absent from the animal data\nrf = cov.columns.difference(df.columns)\n# Drop those columns, then put the rest in the same order as df so that the\n# positional feature matrices built from df and cov line up column by column\ncov = cov.drop(rf, axis=1)\ncov = cov[df.columns]", "execution_count": 13, "outputs": []}, 
{"metadata": {}, "cell_type": "code", "source": "# Number of columns in the two datasets\nprint('Number of Column(df) : ',len(df.columns))\nprint('Number of Column(cov) : ',len(cov.columns))", "execution_count": 14, "outputs": [{"output_type": "stream", "text": "Number of Column(df) : 342\nNumber of Column(cov) : 342\n", "name": "stdout"}]}, {"metadata": {}, "cell_type": "markdown", "source": "## \ubaa8\ub378 \uc0dd\uc131 \ubc0f \ud2b8\ub808\uc774\ub2dd"}, 
{"metadata": {}, "cell_type": "code", "source": "# Create and train the model\nfrom sklearn.model_selection import train_test_split \nfrom xgboost import XGBClassifier\nfrom xgboost import plot_importance\nimport xgboost\n\n# y (the target labels) was already popped from df in the preprocessing step\nX = df.values\n\n# create a train/test split\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7, shuffle=True)\n\nmodel = XGBClassifier()\nmodel.fit(X_train, y_train)\n\n# Report accuracy on the held-out 20% so the predictions below can be judged\nprint('Hold-out accuracy: %.3f' % model.score(X_test, y_test))\nmodel", "execution_count": 15, "outputs": [{"output_type": "execute_result", "execution_count": 15, "data": {"text/plain": "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,\n max_depth=3, min_child_weight=1, missing=None, n_estimators=100,\n n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,\n reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n silent=True, subsample=1)"}, "metadata": {}}]}, {"metadata": {}, "cell_type": "markdown", "source": "## Prediction"}, 
{"metadata": {}, "cell_type": "code", "source": "# dataframe -> numpy expression\n# Guard against a silent feature mismatch between training and prediction data\nassert list(cov.columns) == list(df.columns)\nc = cov.values", "execution_count": 25, "outputs": []}, {"metadata": {"scrolled": true}, "cell_type": "code", "source": 
"model.predict(c)", "execution_count": 18, "outputs": [{"output_type": "execute_result", "execution_count": 18, "data": {"text/plain": "array(['bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat'], dtype=object)"}, "metadata": {}}]}, {"metadata": {}, "cell_type": "code", "source": "import numpy as np\nprint(model.classes_)\nsimilarities = model.predict_proba(c)\nnp.round(similarities, 3)", "execution_count": 19, "outputs": [{"output_type": "stream", "text": "['bat' 'cattle' 'chicken' 'duck']\n", "name": "stdout"}, {"output_type": "execute_result", "execution_count": 19, "data": {"text/plain": "array([[0.987, 
0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 
0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n 
[0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.983, 0.001, 0.014, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 
0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001]], dtype=float32)"}, "metadata": {}}]}], "metadata": {"kernelspec": {"name": "python3", "display_name": "Python 3.6", "language": "python"}, "language_info": {"name": "python", "version": "3.6.9", "mimetype": "text/x-python", "codemirror_mode": {"name": "ipython", "version": 3}, "pygments_lexer": "ipython3", "nbconvert_exporter": "python", "file_extension": ".py"}}, "nbformat": 4, "nbformat_minor": 1}