{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Using Dimension Reduction to create synthetic Concept Drift"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn import linear_model\n",
    "from sklearn.metrics import accuracy_score\n",
    "from alibi_detect.cd import MMDDrift, FETDrift, CVMDrift, KSDrift\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from skmultiflow.drift_detection import DDM\n",
    "from skmultiflow.drift_detection.adwin import ADWIN"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load the data and use a bag-of-words model to build the feature matrix: each feature counts how often a word (token) appears in the item name."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Source workbook; the 'English' sheet supplies the `produkt` (item name) and\n",
    "# `kategoria` (label) columns used in the rest of the notebook.\n",
    "DATA_URL = 'https://raw.githubusercontent.com/UNECE/ML_dataset/master/Stats%20Poland%20ECOICOP%20data.xlsx'\n",
    "t1 = pd.read_excel(DATA_URL, sheet_name='English')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raw string: the pattern contains regex escapes (\\w, \\., \\%) that are not valid\n",
    "# string escapes, so a plain literal triggers invalid-escape warnings.\n",
    "TOKEN_PATTERN = r'\\w\\w+|[1-9]\\.[1-9]\\%|[1-9]\\,[1-9]\\%|[1-9]\\.[1-9]|[1-9]\\,[1-9]|[1-9]\\%'\n",
    "vectorizer = CountVectorizer(token_pattern=TOKEN_PATTERN)\n",
    "counts = vectorizer.fit_transform(t1['produkt'])\n",
    "\n",
    "# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;\n",
    "# prefer its replacement but fall back so older installs keep working.\n",
    "if hasattr(vectorizer, 'get_feature_names_out'):\n",
    "    feature_names = vectorizer.get_feature_names_out()\n",
    "else:\n",
    "    feature_names = vectorizer.get_feature_names()\n",
    "\n",
    "# toarray() yields a plain ndarray (todense() returns the discouraged np.matrix type).\n",
    "X = pd.DataFrame(counts.toarray(), columns=feature_names)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use PCA to reduce dimensions then change some labels based on the PCA outputs. 
(Probably better to use a different method than PCA eventually)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.72556198, 0.06901139],\n",
       " [ 0.69524086, 0.14352036],\n",
       " [-0.24980595, 0.03262967],\n",
       " ...,\n",
       " [-0.26754292, -0.02725797],\n",
       " [-0.26748707, -0.02712377],\n",
       " [-0.26748707, -0.02712377]])"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# random_state pins the projection when PCA uses its randomized SVD solver,\n",
    "# so the rows selected for relabelling below are the same on every run.\n",
    "pca = PCA(n_components=2, random_state=42)\n",
    "X2 = pca.fit_transform(X)\n",
    "X2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.72556184, 0.06892948, -0.37163337, -0.02271674, -0.0702986 ],\n",
       " [ 0.69524059, 0.14204164, -0.30490279, 0.12051952, -0.04961939],\n",
       " [-0.24980628, 0.03201551, -0.06682947, 0.09084413, -0.07885592],\n",
       " ...,\n",
       " [-0.2675442 , -0.02849962, -0.0842153 , 0.00583738, 0.00302388],\n",
       " [-0.26748833, -0.02835428, -0.08371878, 0.00587023, 0.00299765],\n",
       " [-0.26748833, -0.02835428, -0.08371878, 0.00587023, 0.00299765]])"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pca = PCA(n_components=5, random_state=42)\n",
    "X5 = pca.fit_transform(X)\n",
    "X5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.72556183, 0.06902345, -0.37144639, ..., -0.07512212,\n",
       " 0.22133783, 0.52939237],\n",
       " [ 0.69524052, 0.14197847, -0.30481788, ..., 0.3714309 ,\n",
       " 1.19984569, -0.35706358],\n",
       " [-0.24980602, 0.03177583, -0.06657378, ..., 0.09136378,\n",
       " 0.47933746, -0.07830631],\n",
       " ...,\n",
       " [-0.26754369, -0.02833311, -0.08380358, ..., -0.09032943,\n",
       " -0.08066121, -0.02543885],\n",
       " [-0.26748782, -0.0281893 , -0.08331103, ..., -0.08996202,\n",
       " -0.08031969, -0.02511831],\n",
       " [-0.26748782, -0.0281893 , -0.08331103, ..., -0.08996202,\n",
       " -0.08031969, -0.02511831]])"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pca = PCA(n_components=10, random_state=42)\n",
    "X10 = pca.fit_transform(X)\n",
    "X10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0 Confectionery products\n",
       "1 Fruit and vegetable juices\n",
       "2 Artificial sugar substitutes\n",
       "3 Jams, marmalades and honey\n",
       "4 Jams, marmalades and honey\n",
       " ... \n",
       "17094 Mineral or spring waters\n",
       "17095 Mineral or spring waters\n",
       "17096 Mineral or spring waters\n",
       "17097 Mineral or spring waters\n",
       "17098 Mineral or spring waters\n",
       "Name: kategoria, Length: 17099, dtype: object"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t1['kategoria']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "test = t1['kategoria'].to_numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the labelled copy under its own name. Rebinding X10 here made the cell\n",
    "# non-idempotent (each re-run appended another label column) and turned the\n",
    "# float matrix into an object array.\n",
    "X10_labelled = np.column_stack((X10, test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Negative count: 1% of the rows, used to slice the tail of an argpartition result.\n",
    "n = -round(X10.shape[0] / 100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Positions of the 1% of rows with the largest first principal component.\n",
    "temp = np.argpartition(X10[:, 0], n)[n:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y = t1['kategoria']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `temp` holds row positions, so select with .iloc; min() of the label strings\n",
    "# is the alphabetically first category among the selected rows.\n",
    "category = Y.iloc[temp].min()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Small synthetic drift: relabel the selected 1% of rows with a single category.\n",
    "Y2 = Y.copy()\n",
    "Y2.iloc[temp] = category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3511 Baby food\n",
       "16901 Baby food\n",
       "10261 Baby food\n",
       "132 Baby food\n",
       "9621 Baby food\n",
       " ... \n",
       "2908 Baby food\n",
       "8726 Baby food\n",
       "10258 Baby food\n",
       "905 Baby food\n",
       "809 Baby food\n",
       "Name: kategoria, Length: 171, dtype: object"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Y2.iloc[temp]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Positions of the 5000 rows with the largest first principal component.\n",
    "temp2 = np.argpartition(X10[:, 0], -5000)[-5000:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Large synthetic drift: relabel those 5000 rows as \"Sugar\".\n",
    "Y3 = Y.copy()\n",
    "Y3.iloc[temp2] = \"Sugar\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8203 Sugar\n",
       "10269 Sugar\n",
       "16581 Sugar\n",
       "3119 Sugar\n",
       "4129 Sugar\n",
       " ... \n",
       "2908 Sugar\n",
       "8726 Sugar\n",
       "10258 Sugar\n",
       "905 Sugar\n",
       "809 Sugar\n",
       "Name: kategoria, Length: 5000, dtype: object"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Y3.iloc[temp2]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Split data into train and test then train classifier and predict on test data to verify model performance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, train_size=0.7, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SGDClassifier()"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Seeded so the accuracy figures reported below can be reproduced.\n",
    "clf = linear_model.SGDClassifier(random_state=42)\n",
    "clf.fit(Xtrain, Ytrain)\n",
    "#clf = MultinomialNB()\n",
    "#clf.fit(Xtrain, Ytrain)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_predictions = clf.predict(Xtrain)\n",
    "test_predictions = clf.predict(Xtest)"
   ]
  },
  {
   "cell_type": "code",
"execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model training accuracy: 0.9778594702982706\n",
      "Model test accuracy: 0.8947368421052632\n"
     ]
    }
   ],
   "source": [
    "print(\"Model training accuracy: \", accuracy_score(Ytrain, train_predictions))\n",
    "print(\"Model test accuracy: \", accuracy_score(Ytest, test_predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.9599391777296918\n"
     ]
    }
   ],
   "source": [
    "# Refit on the full, un-drifted data, then score the predictions against the\n",
    "# lightly drifted labels (Y2).\n",
    "clf.fit(X, Y)\n",
    "x_predictions = clf.predict(X)\n",
    "print(\"Accuracy: \", accuracy_score(Y2, x_predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.6873501374349377\n"
     ]
    }
   ],
   "source": [
    "# Same predictions scored against the heavily drifted labels (Y3).\n",
    "print(\"Accuracy: \", accuracy_score(Y3, x_predictions))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Confusion Matrix"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a multiclass confusion matrix and derive per-label metrics (accuracy, precision, recall, F1) from it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "lbls = np.unique(Y3)\n",
    "cm = confusion_matrix(Y3, x_predictions, labels=lbls)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 38, 0, 0, ..., 0, 0, 0],\n",
       " [ 0, 96, 0, ..., 0, 0, 0],\n",
       " [ 0, 0, 53, ..., 0, 0, 0],\n",
       " ...,\n",
       " [ 10, 207, 9, ..., 88, 195, 155],\n",
       " [ 0, 0, 0, ..., 0, 440, 0],\n",
       " [ 0, 0, 0, ..., 0, 0, 321]], dtype=int64)"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "def safe_divide(num, den):\n",
    "    \"\"\"Element-wise num / den, returning 0.0 wherever den is 0 instead of NaN.\"\"\"\n",
    "    num = np.asarray(num, dtype=float)\n",
    "    den = np.asarray(den, dtype=float)\n",
    "    return np.divide(num, den, out=np.zeros_like(num), where=den != 0)\n",
    "\n",
    "\n",
    "# Number of classes; reused by the drift-detector loops further down.\n",
    "l = cm[0].size\n",
    "\n",
    "# One-vs-rest counts per class: rows of cm are true labels, columns are predictions.\n",
    "tp = np.diag(cm)\n",
    "fp = cm.sum(axis=0) - tp\n",
    "fn = cm.sum(axis=1) - tp\n",
    "tn = Y3.size - tp - fp - fn\n",
    "cmv = np.column_stack((tp, fp, fn, tn))\n",
    "\n",
    "# A class that is never predicted (tp + fp == 0) or has no samples (tp + fn == 0)\n",
    "# would otherwise produce NaN, which then flows into the detectors below.\n",
    "acc = safe_divide(tp + tn, tp + fp + fn + tn)\n",
    "prec = safe_divide(tp, tp + fp)\n",
    "rec = safe_divide(tp, tp + fn)\n",
    "f1 = safe_divide(2 * prec * rec, prec + rec)\n",
    "\n",
    "# Columns: accuracy, precision, recall, F1 (one row per class).\n",
    "metrics = np.column_stack((acc, prec, rec, f1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Synthetic stream with an abrupt change after 5000 samples: values drawn from\n",
    "# [0.9, 1.0) followed by values drawn from [0.1, 0.2).\n",
    "# Seeded for reproducibility, and named so the `t1` DataFrame is not overwritten.\n",
    "np_rng = np.random.default_rng(42)\n",
    "t_high = np_rng.uniform(0.9, 1, 5000)\n",
    "t_low = np_rng.uniform(0.1, 0.2, 1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "t = np.concatenate((t_high, t_low), axis=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Drift Detection Methods Testing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This section is exploratory testing of several drift-detection methods and is still rough; some cells need further work before they run cleanly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "ddm = DDM(min_num_instances=30, warning_level=2, out_control_level=3)\n",
    "m = 0 #set as metric column\n",
    "# NOTE(review): DDM is normally fed a 0/1 error indicator; `t` holds floats - confirm this is intended.\n",
    "for i, value in enumerate(t):\n",
    "    ddm.add_element(value)\n",
    "    if ddm.detected_warning_zone():\n",
    "        print('Warning zone has been detected in data: ' + str(value) + ' - of index: ' + str(i))\n",
    "    if ddm.detected_change():\n",
    "        print('Change has been detected in data: ' + str(value) + ' - of index: ' + str(i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode the predicted class names as integer codes for the stream detectors.\n",
    "# `Y` is deliberately not rebound here: it must stay the Series of true labels,\n",
    "# because the FETDrift cells further down call `Y.to_numpy()` on it.\n",
    "class_names = np.unique(x_predictions)\n",
    "Ys = dict(zip(class_names, range(len(class_names))))\n",
    "test = np.vectorize(Ys.get)(x_predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Change detected at index 1567, input value: 1\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "from river import drift\n",
    "\n",
    "rng = random.Random(12345)\n",
    "adwin = drift.ADWIN()\n",
    "\n",
    "# Simulate a data stream composed by 
two data distributions\n",
    "data_stream = rng.choices([0, 1], k=1000)\n",
    "# Increase the probability of 1's appearing in the next 1000 instances\n",
    "data_stream = data_stream + rng.choices([0, 1], k=1000, weights=[0.3, 0.7])\n",
    "\n",
    "# Update drift detector and verify if change is detected\n",
    "for i, val in enumerate(data_stream):\n",
    "    _ = adwin.update(val)\n",
    "    if adwin.drift_detected:\n",
    "        print(f\"Change detected at index {i}, input value: {val}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "from river import drift\n",
    "\n",
    "rng = random.Random(42)\n",
    "ddm = drift.DDM()\n",
    "\n",
    "# Simulate a data stream where the first 1000 instances come from a uniform distribution\n",
    "# of 1's and 0's\n",
    "data_stream = rng.choices([0, 1], k=1000)\n",
    "# Increase the probability of 1's appearing in the next 1000 instances\n",
    "data_stream = data_stream + rng.choices([0, 1], k=1000, weights=[0.3, 0.7])\n",
    "\n",
    "print_warning = True\n",
    "# Update drift detector and verify if change is detected\n",
    "# NOTE(review): the loop feeds `test` (integer class codes), not the simulated\n",
    "# `data_stream` built above, which is unused here - confirm which input is intended.\n",
    "for i, x in enumerate(test):\n",
    "    _ = ddm.update(x)\n",
    "    if ddm.warning_detected and print_warning:\n",
    "        print(f\"Warning detected at index {i}\")\n",
    "        print_warning = False\n",
    "    if ddm.drift_detected:\n",
    "        print(f\"Change detected at index {i}\")\n",
    "        print_warning = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feed the selected metric column (index `m`), one class at a time, to ADWIN.\n",
    "adwin = ADWIN()\n",
    "for i, value in enumerate(metrics[:, m]):\n",
    "    adwin.add_element(value)\n",
    "    if adwin.detected_change():\n",
    "        print('Change detected in data: ' + str(value) + ' - at index: ' + str(i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "from skmultiflow.drift_detection import PageHinkley\n",
    "\n",
    "# Same metric column, this time monitored with the Page-Hinkley detector.\n",
    "ph = PageHinkley()\n",
    "for i, value in enumerate(metrics[:, m]):\n",
    "    ph.add_element(value)\n",
    "    if ph.detected_change():\n",
    "        print('Change has been detected in data: ' + str(value) + ' - of index: ' + str(i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([[3., 1., 3., 6., 1., 2., 1., 5., 2., 1., 2., 2., 3., 2., 4., 4.,\n",
       " 4., 1., 3., 3.]]),\n",
       " array([[1, 1, 1, 1, 0, 0, 0, 1, 0, 1]]))"
      ]
     },
     "execution_count": 144,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from skmultiflow.data.multilabel_generator import MultilabelGenerator\n",
    "# Setting up the stream\n",
    "stream = MultilabelGenerator(n_samples=100, n_features=20, n_targets=10, n_labels=10)\n",
    "stream.next_sample()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reference labels are rebuilt from the train/test split rather than read from `Y`:\n",
    "# by this point `Y` may have been rebound to the predictions array, which has no\n",
    "# `.to_numpy()` and would make the reference trivially all-correct.\n",
    "y_true = pd.concat([Ytrain, Ytest]).sort_index().to_numpy()\n",
    "\n",
    "# 1 where the prediction matches the label, 0 otherwise.\n",
    "lossRef = (x_predictions == y_true).astype(int)\n",
    "loss = (x_predictions == Y3.to_numpy()).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "fetDetective = FETDrift(lossRef, p_val=0.05, alternative='less')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "loss\n",
      "Drift? Yes!\n",
      "p-value: 0.0\n"
     ]
    }
   ],
   "source": [
    "losses = {'loss': loss}\n",
    "label = ['No!', 'Yes!']\n",
    "for name, lossArr in losses.items():\n",
    "    print('\\n%s' % name)\n",
    "    preds = fetDetective.predict(lossArr)\n",
    "    print('Drift? {}'.format(label[preds['data']['is_drift']]))\n",
    "    print('p-value: {}'.format(preds['data']['p_val'][0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#import matplotlib.pyplot as plt\n",
    "#from sklearn.metrics import ConfusionMatrixDisplay\n",
    "#disp = ConfusionMatrixDisplay(confusion_matrix=cm)\n",
    "#disp.plot()\n",
    "#plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}