{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# DNA Classification\n", "> In this project, it will show the Machine Learning Model for classifying DNA sequence. K-Nearest Neighborhood and Support Vector Machine and several algorithm for classification will be used. The original data is from UCI Machine Learning Repository.\n", "\n", "- toc: true \n", "- badges: true\n", "- comments: true\n", "- author: Chanseok Kang\n", "- categories: [Python, Machine_Learning]\n", "- image: images/dna_sequence.jpg" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Required Packages" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import sys\n", "import numpy as np\n", "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import pandas as pd\n", "import sklearn\n", "\n", "plt.rcParams['figure.figsize'] = (8, 8)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Version check" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Python: 3.7.6 (default, Jan 8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)]\n", "Numpy: 1.18.1\n", "Pandas: 1.0.1\n" ] } ], "source": [ "print('Python: {}'.format(sys.version))\n", "print('Numpy: {}'.format(np.__version__))\n", "print('Pandas: {}'.format(pd.__version__))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Prepare dataset\n", "The original data is from [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Molecular+Biology+%28Promoter+Gene+Sequences%29)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ClassidSequence
0+S10\\t\\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1+AMPC\\t\\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2+AROH\\t\\tgtactagagaactagtgcattagcttatttttttgttatcat...
3+DEOP2\\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4+LEU1_TRNA\\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...
\n", "
" ], "text/plain": [ " Class id Sequence\n", "0 + S10 \\t\\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...\n", "1 + AMPC \\t\\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...\n", "2 + AROH \\t\\tgtactagagaactagtgcattagcttatttttttgttatcat...\n", "3 + DEOP2 \\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...\n", "4 + LEU1_TRNA \\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc..." ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# UCI molecular biology (promoter gene sequences dataset)\n", "url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data'\n", "names = ['Class', 'id', 'Sequence']\n", "\n", "data = pd.read_csv(url, names=names)\n", "data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preprocess Data" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "+ 53\n", "- 53\n", "Name: Class, dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# build dataset using a custom pandas dataframe\n", "# each column in a dataframe is called a series\n", "classes = data.loc[:, 'Class']\n", "classes.value_counts()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['t', 'a', 'c', 't', 'a', 'g', 'c', 'a', 'a', 't', 'a', 'c', 'g', 'c', 't', 't', 'g', 'c', 'g', 't', 't', 'c', 'g', 'g', 't', 'g', 'g', 't', 't', 'a', 'a', 'g', 't', 'a', 't', 'g', 't', 'a', 't', 'a', 'a', 't', 'g', 'c', 'g', 'c', 'g', 'g', 'g', 'c', 't', 't', 'g', 't', 'c', 'g', 't', '+']\n" ] } ], "source": [ "# Generate list of DNA sequences\n", "sequences = data.loc[:, 'Sequence'].tolist()\n", "dataset = {}\n", "\n", "# Loop throught the sequences and split into individual nucleotides\n", "for i, seq in enumerate(sequences):\n", " # split into nucleotides, remove tab characters\n", " nucleotides = list(seq)\n", " nucleotides = [x for x in nucleotides if x != '\\t']\n", " \n", " # Append class assignment\n", " nucleotides.append(classes[i])\n", " \n", " # add to dataset\n", " dataset[i] = nucleotides\n", " \n", "print(dataset[0])" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...48495051525354555657
0tactagcaat...gcttgtcgt+
1tgctatcctg...catcgccaa+
2gtactagaga...cacccggcg+
3aattgtgatg...aacaaactc+
4tcgataatta...ccgtggtag+
..................................................................
101cctcaatggc...gaactatat-
102gtattctcaa...tcaacattg-
103cgcgactacg...aaggcttcc-
104ctcgtcctca...aggaggaac-
105taacattaat...tcaagaact-
\n", "

106 rows × 58 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 ... 48 49 50 51 52 53 54 55 56 57\n", "0 t a c t a g c a a t ... g c t t g t c g t +\n", "1 t g c t a t c c t g ... c a t c g c c a a +\n", "2 g t a c t a g a g a ... c a c c c g g c g +\n", "3 a a t t g t g a t g ... a a c a a a c t c +\n", "4 t c g a t a a t t a ... c c g t g g t a g +\n", ".. .. .. .. .. .. .. .. .. .. .. ... .. .. .. .. .. .. .. .. .. ..\n", "101 c c t c a a t g g c ... g a a c t a t a t -\n", "102 g t a t t c t c a a ... t c a a c a t t g -\n", "103 c g c g a c t a c g ... a a g g c t t c c -\n", "104 c t c g t c c t c a ... a g g a g g a a c -\n", "105 t a a c a t t a a t ... t c a a g a a c t -\n", "\n", "[106 rows x 58 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Turn dataset into pandas dataframe\n", "df = pd.DataFrame(dataset).T\n", "df" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...484950515253545556Class
0tactagcaat...gcttgtcgt+
1tgctatcctg...catcgccaa+
2gtactagaga...cacccggcg+
3aattgtgatg...aacaaactc+
4tcgataatta...ccgtggtag+
..................................................................
101cctcaatggc...gaactatat-
102gtattctcaa...tcaacattg-
103cgcgactacg...aaggcttcc-
104ctcgtcctca...aggaggaac-
105taacattaat...tcaagaact-
\n", "

106 rows × 58 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 ... 48 49 50 51 52 53 54 55 56 Class\n", "0 t a c t a g c a a t ... g c t t g t c g t +\n", "1 t g c t a t c c t g ... c a t c g c c a a +\n", "2 g t a c t a g a g a ... c a c c c g g c g +\n", "3 a a t t g t g a t g ... a a c a a a c t c +\n", "4 t c g a t a a t t a ... c c g t g g t a g +\n", ".. .. .. .. .. .. .. .. .. .. .. ... .. .. .. .. .. .. .. .. .. ...\n", "101 c c t c a a t g g c ... g a a c t a t a t -\n", "102 g t a t t c t c a a ... t c a a c a t t g -\n", "103 c g c g a c t a c g ... a a g g c t t c c -\n", "104 c t c g t c c t c a ... a g g a g g a a c -\n", "105 t a a c a t t a a t ... t c a a g a a c t -\n", "\n", "[106 rows x 58 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# rename the last column to class\n", "df.rename(columns={57: 'Class'}, inplace=True)\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Describe Dataset" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...484950515253545556Class
count106106106106106106106106106106...106106106106106106106106106106
unique4444444444...4444444442
toptaacaaaaaa...cccttccct+
freq38343030364238343336...36423133353229293453
\n", "

4 rows × 58 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 ... 48 49 50 \\\n", "count 106 106 106 106 106 106 106 106 106 106 ... 106 106 106 \n", "unique 4 4 4 4 4 4 4 4 4 4 ... 4 4 4 \n", "top t a a c a a a a a a ... c c c \n", "freq 38 34 30 30 36 42 38 34 33 36 ... 36 42 31 \n", "\n", " 51 52 53 54 55 56 Class \n", "count 106 106 106 106 106 106 106 \n", "unique 4 4 4 4 4 4 2 \n", "top t t c c c t + \n", "freq 33 35 32 29 29 34 53 \n", "\n", "[4 rows x 58 columns]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...484950515253545556Class
t38.026.027.026.022.024.030.032.032.028.0...21.022.023.033.035.030.023.029.034.0NaN
c27.022.021.030.019.018.021.020.022.022.0...36.042.031.032.021.032.029.029.017.0NaN
a26.034.030.022.036.042.038.034.033.036.0...23.024.028.027.025.022.026.024.027.0NaN
g15.024.028.028.029.022.017.020.019.020.0...26.018.024.014.025.022.028.024.028.0NaN
+NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaN53.0
-NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaN53.0
\n", "

6 rows × 58 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 ... 48 \\\n", "t 38.0 26.0 27.0 26.0 22.0 24.0 30.0 32.0 32.0 28.0 ... 21.0 \n", "c 27.0 22.0 21.0 30.0 19.0 18.0 21.0 20.0 22.0 22.0 ... 36.0 \n", "a 26.0 34.0 30.0 22.0 36.0 42.0 38.0 34.0 33.0 36.0 ... 23.0 \n", "g 15.0 24.0 28.0 28.0 29.0 22.0 17.0 20.0 19.0 20.0 ... 26.0 \n", "+ NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN \n", "- NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN \n", "\n", " 49 50 51 52 53 54 55 56 Class \n", "t 22.0 23.0 33.0 35.0 30.0 23.0 29.0 34.0 NaN \n", "c 42.0 31.0 32.0 21.0 32.0 29.0 29.0 17.0 NaN \n", "a 24.0 28.0 27.0 25.0 22.0 26.0 24.0 27.0 NaN \n", "g 18.0 24.0 14.0 25.0 22.0 28.0 24.0 28.0 NaN \n", "+ NaN NaN NaN NaN NaN NaN NaN NaN 53.0 \n", "- NaN NaN NaN NaN NaN NaN NaN NaN 53.0 \n", "\n", "[6 rows x 58 columns]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Record value counts for each sequence\n", "series = []\n", "\n", "for name in df.columns:\n", " series.append(df[name].value_counts())\n", " \n", "info = pd.DataFrame(series)\n", "details = info.T\n", "details" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0_a0_c0_g0_t1_a1_c1_g1_t2_a2_c...55_a55_c55_g55_t56_a56_c56_g56_tClass_+Class_-
00001100001...0010000110
10001001001...1000100010
20010000110...0100001010
31000100000...0001010010
40001010000...1000001010
\n", "

5 rows × 230 columns

\n", "
" ], "text/plain": [ " 0_a 0_c 0_g 0_t 1_a 1_c 1_g 1_t 2_a 2_c ... 55_a 55_c 55_g \\\n", "0 0 0 0 1 1 0 0 0 0 1 ... 0 0 1 \n", "1 0 0 0 1 0 0 1 0 0 1 ... 1 0 0 \n", "2 0 0 1 0 0 0 0 1 1 0 ... 0 1 0 \n", "3 1 0 0 0 1 0 0 0 0 0 ... 0 0 0 \n", "4 0 0 0 1 0 1 0 0 0 0 ... 1 0 0 \n", "\n", " 55_t 56_a 56_c 56_g 56_t Class_+ Class_- \n", "0 0 0 0 0 1 1 0 \n", "1 0 1 0 0 0 1 0 \n", "2 0 0 0 1 0 1 0 \n", "3 1 0 1 0 0 1 0 \n", "4 0 0 0 1 0 1 0 \n", "\n", "[5 rows x 230 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Switch to numerical data using pd.get_dummies()\n", "numerical_df = pd.get_dummies(df)\n", "numerical_df.head()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0_a0_c0_g0_t1_a1_c1_g1_t2_a2_c...54_t55_a55_c55_g55_t56_a56_c56_g56_tClass
00001100001...0001000011
10001001001...0100010001
20010000110...0010000101
31000100000...0000101001
40001010000...1100000101
..................................................................
1010100010000...1100000010
1020010000110...1000100100
1030100001001...1010001000
1040100000101...0100001000
1050001100010...0010000010
\n", "

106 rows × 229 columns

\n", "
" ], "text/plain": [ " 0_a 0_c 0_g 0_t 1_a 1_c 1_g 1_t 2_a 2_c ... 54_t 55_a 55_c \\\n", "0 0 0 0 1 1 0 0 0 0 1 ... 0 0 0 \n", "1 0 0 0 1 0 0 1 0 0 1 ... 0 1 0 \n", "2 0 0 1 0 0 0 0 1 1 0 ... 0 0 1 \n", "3 1 0 0 0 1 0 0 0 0 0 ... 0 0 0 \n", "4 0 0 0 1 0 1 0 0 0 0 ... 1 1 0 \n", ".. ... ... ... ... ... ... ... ... ... ... ... ... ... ... \n", "101 0 1 0 0 0 1 0 0 0 0 ... 1 1 0 \n", "102 0 0 1 0 0 0 0 1 1 0 ... 1 0 0 \n", "103 0 1 0 0 0 0 1 0 0 1 ... 1 0 1 \n", "104 0 1 0 0 0 0 0 1 0 1 ... 0 1 0 \n", "105 0 0 0 1 1 0 0 0 1 0 ... 0 0 1 \n", "\n", " 55_g 55_t 56_a 56_c 56_g 56_t Class \n", "0 1 0 0 0 0 1 1 \n", "1 0 0 1 0 0 0 1 \n", "2 0 0 0 0 1 0 1 \n", "3 0 1 0 1 0 0 1 \n", "4 0 0 0 0 1 0 1 \n", ".. ... ... ... ... ... ... ... \n", "101 0 0 0 0 0 1 0 \n", "102 0 1 0 0 1 0 0 \n", "103 0 0 0 1 0 0 0 \n", "104 0 0 0 1 0 0 0 \n", "105 0 0 0 0 0 1 0 \n", "\n", "[106 rows x 229 columns]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Remove one of the class columns and rename to simply 'Class'\n", "df = numerical_df.drop(columns=['Class_-'])\n", "\n", "df.rename(columns={'Class_+':'Class'}, inplace=True)\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Build the Machine Learning Model" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.neural_network import MLPClassifier\n", "from sklearn.gaussian_process import GaussianProcessClassifier\n", "from sklearn.gaussian_process.kernels import RBF\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn.svm import SVC\n", "from sklearn.metrics import classification_report, accuracy_score\n", "from sklearn.model_selection import KFold, train_test_split, cross_val_score" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "# Create X and y dataset for training\n", "X = df.drop(['Class'], axis=1).to_numpy()\n", "y = df['Class'].to_numpy()\n", "\n", "# Split the data into training and test dataset\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "K Nearest Neighbors: 0.7946428571428571 (0.1569499627789906)\n", "Gaussian Process: 0.9125 (0.08003905296791061)\n", "Decision Tree: 0.7839285714285714 (0.15980895801307413)\n", "Random Forest: 0.6339285714285714 (0.187329854774395)\n", "Neural Network: 0.8732142857142857 (0.0969726713027533)\n", "AdaBoost: 0.8482142857142858 (0.12222689256176861)\n", "Naive Bayes: 0.8607142857142858 (0.11785714285714285)\n", "SVM Linear: 0.8964285714285714 (0.08253014291636673)\n", "SVM RBF: 0.8607142857142858 (0.13044273119821195)\n", "SVM Sigmoid: 0.95 (0.09999999999999999)\n" ] } ], "source": [ "# Define scoring method\n", "scoring = 'accuracy'\n", "\n", "# Define models to train\n", "names = ['K Nearest Neighbors', 'Gaussian Process', 'Decision Tree', 'Random Forest', \n", " 'Neural Network', 'AdaBoost', 'Naive Bayes', 'SVM Linear', 'SVM RBF', 'SVM Sigmoid']\n", "\n", "classifiers = [\n", " KNeighborsClassifier(n_neighbors=3),\n", " GaussianProcessClassifier(1.0 * RBF(1.0)),\n", " DecisionTreeClassifier(max_depth=5),\n", " RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),\n", " MLPClassifier(alpha=1, max_iter=500),\n", " AdaBoostClassifier(),\n", " GaussianNB(),\n", " SVC(kernel='linear'),\n", " SVC(kernel='rbf'),\n", " SVC(kernel='sigmoid')\n", "]\n", "\n", "models = zip(names, classifiers)\n", "\n", "# Evaluate each model in turn\n", "results = []\n", "names = []\n", "\n", "for name, model in models:\n", " kfold = KFold(n_splits=10, shuffle=True)\n", " cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)\n", " results.append(cv_results)\n", " names.append(name)\n", " msg = '{0}: {1} ({2})'.format(name, cv_results.mean(), cv_results.std())\n", " print(msg)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "K Nearest Neighbors\n", "0.8518518518518519\n", " precision recall f1-score support\n", "\n", " 0 1.00 0.73 0.85 15\n", " 1 0.75 1.00 0.86 12\n", "\n", " accuracy 0.85 27\n", " macro avg 0.88 0.87 0.85 27\n", "weighted avg 0.89 0.85 0.85 27\n", "\n", "Gaussian Process\n", "0.9259259259259259\n", " precision recall f1-score support\n", "\n", " 0 0.88 1.00 0.94 15\n", " 1 1.00 0.83 0.91 12\n", "\n", " accuracy 0.93 27\n", " macro avg 0.94 0.92 0.92 27\n", "weighted avg 0.93 0.93 0.92 27\n", "\n", "Decision Tree\n", "0.8148148148148148\n", " precision recall f1-score support\n", "\n", " 0 0.86 0.80 0.83 15\n", " 1 0.77 0.83 0.80 12\n", "\n", " accuracy 0.81 27\n", " macro avg 0.81 0.82 0.81 27\n", "weighted avg 0.82 0.81 0.82 27\n", "\n", "Random Forest\n", "0.8148148148148148\n", " precision recall f1-score support\n", "\n", " 0 0.92 0.73 0.81 15\n", " 1 0.73 0.92 0.81 12\n", "\n", " accuracy 0.81 27\n", " macro avg 0.82 0.82 0.81 27\n", "weighted avg 0.84 0.81 0.81 27\n", "\n", "Neural Network\n", "0.9259259259259259\n", " precision recall f1-score support\n", "\n", " 0 0.88 1.00 0.94 15\n", " 1 1.00 0.83 0.91 12\n", "\n", " accuracy 0.93 27\n", " macro avg 0.94 0.92 0.92 27\n", "weighted avg 0.93 0.93 0.92 27\n", "\n", "AdaBoost\n", "0.9259259259259259\n", " precision recall f1-score support\n", "\n", " 0 1.00 0.87 0.93 15\n", " 1 0.86 1.00 0.92 12\n", "\n", " accuracy 0.93 27\n", " macro avg 0.93 0.93 0.93 27\n", "weighted avg 0.94 0.93 0.93 27\n", "\n", "Naive Bayes\n", "0.9259259259259259\n", " precision recall f1-score support\n", "\n", " 0 0.88 1.00 0.94 15\n", " 1 1.00 0.83 0.91 12\n", "\n", " accuracy 0.93 27\n", " macro avg 0.94 0.92 0.92 27\n", "weighted avg 0.93 0.93 0.92 27\n", "\n", "SVM Linear\n", "0.8888888888888888\n", " precision recall f1-score support\n", "\n", " 0 0.88 0.93 0.90 15\n", " 1 0.91 0.83 0.87 12\n", "\n", " accuracy 0.89 27\n", " macro avg 0.89 0.88 0.89 27\n", "weighted avg 0.89 0.89 0.89 27\n", "\n", "SVM RBF\n", "0.9259259259259259\n", " precision recall f1-score support\n", "\n", " 0 0.88 1.00 0.94 15\n", " 1 1.00 0.83 0.91 12\n", "\n", " accuracy 0.93 27\n", " macro avg 0.94 0.92 0.92 27\n", "weighted avg 0.93 0.93 0.92 27\n", "\n", "SVM Sigmoid\n", "0.8518518518518519\n", " precision recall f1-score support\n", "\n", " 0 0.87 0.87 0.87 15\n", " 1 0.83 0.83 0.83 12\n", "\n", " accuracy 0.85 27\n", " macro avg 0.85 0.85 0.85 27\n", "weighted avg 0.85 0.85 0.85 27\n", "\n" ] } ], "source": [ "models = zip(names, classifiers)\n", "# Test the algorithm on the validation dataset\n", "for name, model in models:\n", " model.fit(X_train, y_train)\n", " predictions = model.predict(X_test)\n", " print(name)\n", " print(accuracy_score(y_test, predictions))\n", " print(classification_report(y_test, predictions))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }