{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Using Dimension Reduction to create synthetic Concept Drift"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn import linear_model\n",
    "from sklearn.metrics import accuracy_score\n",
    "from alibi_detect.cd import MMDDrift, FETDrift, CVMDrift, KSDrift\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from skmultiflow.drift_detection import DDM\n",
    "from skmultiflow.drift_detection.adwin import ADWIN"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Call in data and use bag of words to get feature values for each item in terms of how often a word appears in the item name."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "t1 = pd.read_excel('https://raw.githubusercontent.com/UNECE/ML_dataset/master/Stats%20Poland%20ECOICOP%20data.xlsx', sheet_name = 'English')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorizer = CountVectorizer(token_pattern='\\w\\w+|[1-9]\\.[1-9]\\%|[1-9]\\,[1-9]\\%|[1-9]\\.[1-9]|[1-9]\\,[1-9]|[1-9]\\%')\n",
    "vectorizer.fit(t1['produkt'])\n",
    "X = pd.DataFrame(vectorizer.transform(t1['produkt']).todense(), columns=vectorizer.get_feature_names())#.to_numpy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use PCA to reduce dimensions then change some labels based on the PCA outputs. (Probably better to use a different method than PCA eventually)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.72556198,  0.06901139],\n",
       "       [ 0.69524086,  0.14352036],\n",
       "       [-0.24980595,  0.03262967],\n",
       "       ...,\n",
       "       [-0.26754292, -0.02725797],\n",
       "       [-0.26748707, -0.02712377],\n",
       "       [-0.26748707, -0.02712377]])"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pca = PCA(n_components=2)\n",
    "pca.fit(X)\n",
    "X2 = pca.transform(X)\n",
    "X2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.72556184,  0.06892948, -0.37163337, -0.02271674, -0.0702986 ],\n",
       "       [ 0.69524059,  0.14204164, -0.30490279,  0.12051952, -0.04961939],\n",
       "       [-0.24980628,  0.03201551, -0.06682947,  0.09084413, -0.07885592],\n",
       "       ...,\n",
       "       [-0.2675442 , -0.02849962, -0.0842153 ,  0.00583738,  0.00302388],\n",
       "       [-0.26748833, -0.02835428, -0.08371878,  0.00587023,  0.00299765],\n",
       "       [-0.26748833, -0.02835428, -0.08371878,  0.00587023,  0.00299765]])"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pca = PCA(n_components=5)\n",
    "pca.fit(X)\n",
    "X5 = pca.transform(X)\n",
    "X5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.72556183,  0.06902345, -0.37144639, ..., -0.07512212,\n",
       "         0.22133783,  0.52939237],\n",
       "       [ 0.69524052,  0.14197847, -0.30481788, ...,  0.3714309 ,\n",
       "         1.19984569, -0.35706358],\n",
       "       [-0.24980602,  0.03177583, -0.06657378, ...,  0.09136378,\n",
       "         0.47933746, -0.07830631],\n",
       "       ...,\n",
       "       [-0.26754369, -0.02833311, -0.08380358, ..., -0.09032943,\n",
       "        -0.08066121, -0.02543885],\n",
       "       [-0.26748782, -0.0281893 , -0.08331103, ..., -0.08996202,\n",
       "        -0.08031969, -0.02511831],\n",
       "       [-0.26748782, -0.0281893 , -0.08331103, ..., -0.08996202,\n",
       "        -0.08031969, -0.02511831]])"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pca = PCA(n_components=10)\n",
    "pca.fit(X)\n",
    "X10 = pca.transform(X)\n",
    "X10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0              Confectionery products\n",
       "1          Fruit and vegetable juices\n",
       "2        Artificial sugar substitutes\n",
       "3          Jams, marmalades and honey\n",
       "4          Jams, marmalades and honey\n",
       "                     ...             \n",
       "17094        Mineral or spring waters\n",
       "17095        Mineral or spring waters\n",
       "17096        Mineral or spring waters\n",
       "17097        Mineral or spring waters\n",
       "17098        Mineral or spring waters\n",
       "Name: kategoria, Length: 17099, dtype: object"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t1['kategoria']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "test=t1['kategoria'].to_numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "X10 = np.column_stack((X10,test)) #need to change variable name later"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "n = round(X10.shape[0]/100)*-1\n",
    "n2 = round(X10.shape[0]/10)*-1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "temp = np.argpartition(X10[:,0], n)[n:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y = t1['kategoria']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "category = np.amin(Y[temp])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y2 = Y.copy()\n",
    "Y2[temp] = category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3511     Baby food\n",
       "16901    Baby food\n",
       "10261    Baby food\n",
       "132      Baby food\n",
       "9621     Baby food\n",
       "           ...    \n",
       "2908     Baby food\n",
       "8726     Baby food\n",
       "10258    Baby food\n",
       "905      Baby food\n",
       "809      Baby food\n",
       "Name: kategoria, Length: 171, dtype: object"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Y2[temp]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "temp2 = np.argpartition(X10[:,0], -5000)[-5000:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "category2 = np.amin(Y[temp2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y3 = Y.copy()\n",
    "Y3[temp2] = \"Sugar\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8203     Sugar\n",
       "10269    Sugar\n",
       "16581    Sugar\n",
       "3119     Sugar\n",
       "4129     Sugar\n",
       "         ...  \n",
       "2908     Sugar\n",
       "8726     Sugar\n",
       "10258    Sugar\n",
       "905      Sugar\n",
       "809      Sugar\n",
       "Name: kategoria, Length: 5000, dtype: object"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Y3[temp2]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Split data into train and test then train classifier and predict on test data to verify model performance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, train_size=0.7, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SGDClassifier()"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf = linear_model.SGDClassifier()\n",
    "clf.fit(Xtrain, Ytrain)\n",
    "#clf = MultinomialNB()\n",
    "#clf.fit(Xtrain, Ytrain)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_predictions = clf.predict(Xtrain)\n",
    "test_predictions = clf.predict(Xtest)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model training accuracy:  0.9778594702982706\n",
      "Model test accuracy:  0.8947368421052632\n"
     ]
    }
   ],
   "source": [
    "print(\"Model training accuracy: \", accuracy_score(Ytrain, train_predictions))\n",
    "print(\"Model test accuracy: \", accuracy_score(Ytest, test_predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy:  0.9599391777296918\n"
     ]
    }
   ],
   "source": [
    "clf.fit(X, Y)\n",
    "x_predictions = clf.predict(X)\n",
    "print(\"Accuracy: \", accuracy_score(Y2, x_predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy:  0.6873501374349377\n"
     ]
    }
   ],
   "source": [
    "print(\"Accuracy: \", accuracy_score(Y3, x_predictions))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Confusion Matrix"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Creating a multiclass label confusion matrix and calculating different metrics from that for each individual label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "lbls = np.unique(Y3)\n",
    "cm = confusion_matrix(Y3, x_predictions, labels = lbls)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 38,   0,   0, ...,   0,   0,   0],\n",
       "       [  0,  96,   0, ...,   0,   0,   0],\n",
       "       [  0,   0,  53, ...,   0,   0,   0],\n",
       "       ...,\n",
       "       [ 10, 207,   9, ...,  88, 195, 155],\n",
       "       [  0,   0,   0, ...,   0, 440,   0],\n",
       "       [  0,   0,   0, ...,   0,   0, 321]], dtype=int64)"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "cmv = []\n",
    "metrics = []\n",
    "l = cm[0].size\n",
    "for i in range(l):\n",
    "    tp = cm[i,i]\n",
    "    fp = sum(cm[:,i])-tp\n",
    "    fn = sum(cm[i,:])-tp\n",
    "    tn = Y3.size-tp-fp-fn\n",
    "    cmv.append([tp,fp,fn,tn])\n",
    "    acc = (tp+tn)/(tp+fp+fn+tn)\n",
    "    prec = tp/(tp+fp)\n",
    "    rec = tp/(tp+fn)\n",
    "    f1 = 2*((prec*rec)/(prec+rec))\n",
    "    metrics.append([acc,prec,rec,f1])\n",
    "cmv = np.asarray(cmv)\n",
    "metrics = np.asarray(metrics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "t1 = np.random.uniform(0.9,1,5000)\n",
    "t2 = np.random.uniform(0.1,0.2,1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "t = np.concatenate((t1, t2), axis=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Drift Detection Methods Testing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is just some testing of different methods and quite messy. Would need some work to get certain parts to run"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "ddm = DDM(min_num_instances=30, warning_level=2, out_control_level=3)\n",
    "m = 0 #set as metric column\n",
    "for i in range(6000):\n",
    "    ddm.add_element(t[i])\n",
    "    if ddm.detected_warning_zone():\n",
    "        print('Warning zone has been detected in data: ' + str(t[i]) + ' - of index: ' + str(i))\n",
    "    if ddm.detected_change():\n",
    "        print('Change has been detected in data: ' + str(t[i]) + ' - of index: ' + str(i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y = x_predictions\n",
    "Ys = np.unique(x_predictions)\n",
    "Ys = dict(zip(Ys, range(len(Ys))))\n",
    "test = np.vectorize(Ys.get)(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Change detected at index 1567, input value: 1\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "from river import drift\n",
    "\n",
    "rng = random.Random(12345)\n",
    "adwin = drift.ADWIN()\n",
    "\n",
    "# Simulate a data stream composed by two data distributions\n",
    "data_stream = rng.choices([0, 1], k=1000)\n",
    "# Increase the probability of 1's appearing in the next 1000 instances\n",
    "data_stream = data_stream + rng.choices([0, 1], k=1000, weights=[0.3, 0.7])\n",
    "\n",
    "# Update drift detector and verify if change is detected\n",
    "for i, val in enumerate(data_stream):\n",
    "    _ = adwin.update(val)\n",
    "    if adwin.drift_detected:\n",
    "        print(f\"Change detected at index {i}, input value: {val}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "from river import drift\n",
    "\n",
    "rng = random.Random(42)\n",
    "ddm = drift.DDM()\n",
    "\n",
    "# Simulate a data stream where the first 1000 instances come from a uniform distribution\n",
    "# of 1's and 0's\n",
    "data_stream = rng.choices([0, 1], k=1000)\n",
    "# Increase the probability of 1's appearing in the next 1000 instances\n",
    "data_stream = data_stream + rng.choices([0, 1], k=1000, weights=[0.3, 0.7])\n",
    "\n",
    "print_warning = True\n",
    " # Update drift detector and verify if change is detected\n",
    "for i, x in enumerate(test):\n",
    "    _ = ddm.update(x)\n",
    "    if ddm.warning_detected and print_warning:\n",
    "        print(f\"Warning detected at index {i}\")\n",
    "        print_warning = False\n",
    "    if ddm.drift_detected:\n",
    "        print(f\"Change detected at index {i}\")\n",
    "        print_warning = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "adwin = ADWIN()\n",
    "for i in range(l):\n",
    "    adwin.add_element(metrics[i,m])\n",
    "    if adwin.detected_change():\n",
    "        print('Change detected in data: ' + str(metrics[i,m]) + ' - at index: ' + str(i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "from skmultiflow.drift_detection import PageHinkley\n",
    "ph = PageHinkley()\n",
    "for i in range(l):\n",
    "    ph.add_element(metrics[i,m])\n",
    "    if ph.detected_change():\n",
    "        print('Change has been detected in data: ' + str(metrics[i,m]) + ' - of index: ' + str(i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([[3., 1., 3., 6., 1., 2., 1., 5., 2., 1., 2., 2., 3., 2., 4., 4.,\n",
       "         4., 1., 3., 3.]]),\n",
       " array([[1, 1, 1, 1, 0, 0, 0, 1, 0, 1]]))"
      ]
     },
     "execution_count": 144,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from skmultiflow.data.multilabel_generator import MultilabelGenerator\n",
    "# Setting up the stream\n",
    "stream = MultilabelGenerator(n_samples=100, n_features=20, n_targets=10, n_labels=10)\n",
    "stream.next_sample()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "lossRef = (x_predictions == Y.to_numpy()).astype(int)\n",
    "loss = (x_predictions == Y3.to_numpy()).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "fetDetective = FETDrift(lossRef, p_val=0.05, alternative='less')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "loss\n",
      "Drift? Yes!\n",
      "p-value: 0.0\n"
     ]
    }
   ],
   "source": [
    "losses = {'loss': loss}\n",
    "label = ['No!', 'Yes!']\n",
    "for name, lossArr in losses.items():\n",
    "    print('\\n%s' % name)\n",
    "    preds = fetDetective.predict(lossArr)\n",
    "    print('Drift? {}'.format(label[preds['data']['is_drift']]))\n",
    "    print('p-value: {}'.format(preds['data']['p_val'][0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#import matplotlib.pyplot as plt\n",
    "#from sklearn.metrics import ConfusionMatrixDisplay\n",
    "#disp = ConfusionMatrixDisplay(confusion_matrix=cm)\n",
    "#disp.plot()\n",
    "#plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9634452094832361"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}