{"cells": [{"metadata": {}, "cell_type": "code", "source": "from nltk.corpus import stopwords \nfrom sklearn.feature_extraction.text import CountVectorizer\nimport pandas as pd\nimport numpy as np", "execution_count": 1, "outputs": []}, {"metadata": {}, "cell_type": "markdown", "source": "## Data Download"}, {"metadata": {"scrolled": true}, "cell_type": "code", "source": "import os\n# wget\uc744 \ud65c\uc6a9\ud574\uc11c data \ub2e4\uc6b4\ub85c\ub4dc\n!wget https://.../Downloads.zip\n\nif \"genome\" not in os.listdir():\n os.makedirs(\"genome\")\nelse:\n pass\n\nimport zipfile\nimport shutil\n \ngenome_zip = zipfile.ZipFile('Downloads.zip')\ngenome_zip.extractall('genome')\n \ngenome_zip.close()", "execution_count": null, "outputs": []}, {"metadata": {}, "cell_type": "code", "source": "os.listdir('genome')", "execution_count": 3, "outputs": [{"output_type": "execute_result", "execution_count": 3, "data": {"text/plain": "['anatidae.fasta',\n 'gallus_gallus.fasta',\n 'chiropter.fasta',\n 'bos_taurus.fasta',\n 'homo_sapiens.fasta']"}, "metadata": {}}]}, {"metadata": {}, "cell_type": "markdown", "source": "## Data Load & Tokenization"}, {"metadata": {}, "cell_type": "code", "source": "#\uc5fc\uae30\uc11c\uc5f4\uc744 \ud1a0\ud070\ubcc4\ub85c \uc798\ub77c\uc11c \uc218\uce58\ud654\ndef generate_ngrams(s1):\n count_vect = CountVectorizer(lowercase=False, ngram_range=(2,4),analyzer='char')\n X1 = count_vect.fit_transform(s1)\n \n lcount = list()\n lcount = []\n for i in s1:\n count = len(i)\n #print(count)\n lcount.append(count)\n \n count_vect_df = pd.DataFrame(X1.todense(), columns=count_vect.get_feature_names())\n count_vect_df=count_vect_df.apply(lambda x: x / lcount[x.name] ,axis=1)\n\n return count_vect_df", "execution_count": 4, "outputs": []}, {"metadata": {}, "cell_type": "code", "source": "#\ud30c\uc77c\ub85c\ubd80\ud130 \uc2a4\ud2b8\ub9c1\uc77d\uc5b4\uc11c \uac01\uc5fc\uae30\uc11c\uc5f4\uc758 description\uc0ad\uc81c\ndef process_file(filename,target_val):\n f = open(filename) #'datasets\\\\corona-nucleo-chicken-complete.fasta')\n lines = \"\"\n s1 = list()\n step = 0\n term = 0\n for line in f:\n line = ''.join(line.split())\n #print('step: ',step,' ',line)\n if line.startswith(\">\") and step==0:\n line = line.split('>',1)[0].strip()\n step = step + 1\n if line.startswith(\">\") and step>=1:\n line = line.split('>',1)[0].strip()\n s1.append(lines)\n lines = \"\"\n step = step + 1\n term = 0\n lines = lines + line\n \n count_vect_df = generate_ngrams(s1) \n count_vect_df['target'] = target_val\n return count_vect_df", "execution_count": 5, "outputs": []}, {"metadata": {}, "cell_type": "code", "source": "# \ub2ed, \uc624\ub9ac, \uc18c, \ubc15\uc950 \ub370\uc774\ud130\uc14b \ucc98\ub9ac\ndf1 = process_file('genome/gallus_gallus.fasta',\"chicken\")\ndf2 = process_file('genome/anatidae.fasta',\"duck\")\ndf3 = process_file('genome/bos_taurus.fasta',\"cattle\")\ndf4 = process_file('genome/chiropter.fasta',\"bat\")", "execution_count": 6, "outputs": []}, {"metadata": {"scrolled": true}, "cell_type": "code", "source": "# \uac01 \ub370\uc774\ud130\uc14b\uc758 % \uadf8\ub798\ud504\ub85c \ucd9c\ub825\nimport matplotlib.pyplot as plt\nplot_size = plt.rcParams[\"figure.figsize\"]\nplot_size[0] = 8\nplot_size[1] = 6\nplt.rcParams[\"figure.figsize\"] = plot_size\n\ndf=pd.concat([df1,df2,df3,df4])\n#df=df.fillna(0)\ndf=df.dropna(axis=1)\ndf['target'].value_counts().plot(kind='pie', autopct='%1.0f%%')", "execution_count": 8, "outputs": [{"output_type": "stream", "text": "/opt/conda/envs/Python36/lib/python3.6/site-packages/ipykernel/__main__.py:8: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\nof pandas will change to not sort by default.\n\nTo accept the future behavior, pass 'sort=False'.\n\nTo retain the current behavior and silence the warning, pass 'sort=True'.\n\n", "name": "stderr"}, {"output_type": "execute_result", "execution_count": 8, "data": {"text/plain": ""}, "metadata": {}}, {"output_type": "display_data", "data": {"text/plain": "
", "image/png": "\n"}, "metadata": {}}]}, {"metadata": {}, "cell_type": "markdown", "source": "## Data Preprocessing"}, {"metadata": {}, "cell_type": "code", "source": "cov = process_file('genome/homo_sapiens.fasta',\"COVID-19\")\n\n#\ubaa8\ub378\ub85c \uc0ac\uc6a9\ud560 \uac83\uc774 \uc544\ub2c8\uae30\ub54c\ubb38\uc5d0 target\uc740 drop\ncov = cov.drop('target', axis=1)", "execution_count": 9, "outputs": []}, {"metadata": {"scrolled": true}, "cell_type": "code", "source": "# \ub450 \ub370\uc774\ud130\uc14b\uc758 column \uac1c\uc218\nprint('Number of Column(df) : ',len(df.columns))\nprint('Number of Column(cov) : ',len(cov.columns))", "execution_count": 11, "outputs": [{"output_type": "stream", "text": "Number of Column(df) : 348\nNumber of Column(cov) : 923\n", "name": "stdout"}]}, {"metadata": {}, "cell_type": "code", "source": "# \ub3d9\ubb3c\uc5d0\uac90 \uc788\uace0 \uc0ac\ub78c\uc5d0\uac90 \uc5c6\ub294 \uc5f4\ucc3e\uae30\ny=df.pop('target')\nmc = df.columns.difference(cov.columns)\nmc\n#\ud574\ub2f9 \uc5f4 \uc0ad\uc81c\ndf = df.drop(mc, axis=1)", "execution_count": 12, "outputs": []}, {"metadata": {}, "cell_type": "code", "source": "#\uc0ac\ub78c\uc5d0\uac90 \uc788\uace0 \ub3d9\ubb3c\uc5d0\uac90 \uc5c6\ub294 \uc5f4\ucc3e\uae30\nrf = cov.columns.difference(df.columns)\nrf\n#\ud574\ub2f9 \uc5f4 \uc0ad\uc81c\ncov = cov.drop(rf, axis=1)", "execution_count": 13, "outputs": []}, {"metadata": {}, "cell_type": "code", "source": "# \ub450 \ub370\uc774\ud130\uc14b\uc758 column \uac1c\uc218\nprint('Number of Column(df) : ',len(df.columns))\nprint('Number of Column(cov) : ',len(cov.columns))", "execution_count": 14, "outputs": [{"output_type": "stream", "text": "Number of Column(df) : 342\nNumber of Column(cov) : 342\n", "name": "stdout"}]}, {"metadata": {}, "cell_type": "markdown", "source": "## \ubaa8\ub378 \uc0dd\uc131 \ubc0f \ud2b8\ub808\uc774\ub2dd"}, {"metadata": {}, "cell_type": "code", "source": "#\ubaa8\ub378 \uc0dd\uc131\nfrom sklearn.model_selection import train_test_split \nfrom xgboost import XGBClassifier\nfrom xgboost import plot_importance\nimport xgboost\n\n#y=df.pop('target')\nX=df.values\n\n# create a train/test split\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7, shuffle=True)\n\nmodel = XGBClassifier()\nmodel.fit(X_train, y_train)", "execution_count": 15, "outputs": [{"output_type": "execute_result", "execution_count": 15, "data": {"text/plain": "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,\n max_depth=3, min_child_weight=1, missing=None, n_estimators=100,\n n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,\n reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n silent=True, subsample=1)"}, "metadata": {}}]}, {"metadata": {}, "cell_type": "markdown", "source": "## Prediction"}, {"metadata": {}, "cell_type": "code", "source": "# dataframe -> numpy expression\nc=cov.values", "execution_count": 25, "outputs": []}, {"metadata": {"scrolled": true}, "cell_type": "code", "source": "model.predict(c)", "execution_count": 18, "outputs": [{"output_type": "execute_result", "execution_count": 18, "data": {"text/plain": "array(['bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat', 'bat',\n 'bat'], dtype=object)"}, "metadata": {}}]}, {"metadata": {}, "cell_type": "code", "source": "import numpy as np\nprint(model.classes_)\nsimilarities = model.predict_proba(c)\nnp.round(similarities, 3)", "execution_count": 19, "outputs": [{"output_type": "stream", "text": "['bat' 'cattle' 'chicken' 'duck']\n", "name": "stdout"}, {"output_type": "execute_result", "execution_count": 19, "data": {"text/plain": "array([[0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.983, 0.001, 0.014, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.985, 0.001, 0.012, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001],\n [0.987, 0.001, 0.011, 0.001]], dtype=float32)"}, "metadata": {}}]}], "metadata": {"kernelspec": {"name": "python3", "display_name": "Python 3.6", "language": "python"}, "language_info": {"name": "python", "version": "3.6.9", "mimetype": "text/x-python", "codemirror_mode": {"name": "ipython", "version": 3}, "pygments_lexer": "ipython3", "nbconvert_exporter": "python", "file_extension": ".py"}}, "nbformat": 4, "nbformat_minor": 1}