{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "d8873b61-8581-493e-919e-a7b89a52dcec", "metadata": { "executionInfo": { "elapsed": 1207, "status": "ok", "timestamp": 1647798193679, "user": { "displayName": "Arjie Cristobal", "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GihC545mgquPZtV8FArv-dQnp6CN4zsac4wJXu3Bu8=s64", "userId": "08755999937279005778" }, "user_tz": 240 }, "id": "d8873b61-8581-493e-919e-a7b89a52dcec" }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 2, "id": "04fc494b-c015-4972-aa45-f8f8c861efe0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: ipynb in d:\\x-20a\\anaconda3\\lib\\site-packages (0.5.1)\n" ] } ], "source": [ "# Importing an ipynb file from another ipynb file\n", "!pip install ipynb" ] }, { "cell_type": "code", "execution_count": 3, "id": "549ec8b7-5236-47f6-8c52-02925aa1e81e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: nbimporter in d:\\x-20a\\anaconda3\\lib\\site-packages (0.3.4)\n" ] } ], "source": [ "# Importing functions from another jupyter notebook\n", "!pip install nbimporter" ] }, { "cell_type": "code", "execution_count": 4, "id": "01f4dcfb-3814-4459-95b6-71dcf5c31b38", "metadata": { "executionInfo": { "elapsed": 316, "status": "ok", "timestamp": 1647798193990, "user": { "displayName": "Arjie Cristobal", "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GihC545mgquPZtV8FArv-dQnp6CN4zsac4wJXu3Bu8=s64", "userId": "08755999937279005778" }, "user_tz": 240 }, "id": "01f4dcfb-3814-4459-95b6-71dcf5c31b38" }, "outputs": [], "source": [ "from sklearn.ensemble import AdaBoostClassifier\n", "from sklearn.ensemble import ExtraTreesClassifier\n", "from sklearn.ensemble import GradientBoostingClassifier\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.svm import SVC\n", "\n", "from sklearn.ensemble import IsolationForest\n", "\n", "class UrlDatasetLoader():\n", " \n", " def __init__(self):\n", " print('init Loader notebook')\n", " \n", " def load_data(self, url=\"https://raw.githubusercontent.com/quickheaven/scs-3253-machine-learning/master/datasets/ISCX-URL2016_All.csv\"): \n", " \"\"\" \n", " (string) --> dataframe\n", "\n", " This function returns the dataframe of maliciours url. 
\n", "\n", " Parameters\n", " ----------\n", " url: By default, it fetch the data from github otherwise a local path or url can be provided so the data can be loaded faster.\n", " \"\"\"\n", " df = pd.read_csv(url)\n", " return df\n", "\n", " def prepare_data(self, data, fill_na=True, feature_selection=True, show_graph=False):\n", " \"\"\"\n", " (DataFrame, boolean, boolean) --> X and y of the dataframe.\n", "\n", " This function returns the X and y of the malicious url dataframe.\n", "\n", " Parameters\n", " ----------\n", " fill_na : True to fill the na records with mean values otherwise drop the features.\n", "\n", " feature_selection : True to remove one or more features that have a correlation higher than 0.9 othewise do not perform that type of feature selection.\n", " https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf\n", " \n", " show_graph : True to display the graph after applying fill_na or feature_selection. \n", " \"\"\"\n", " from sklearn.preprocessing import MinMaxScaler\n", " from sklearn.preprocessing import LabelEncoder\n", " from sklearn.ensemble import IsolationForest\n", "\n", " data = data.copy()\n", "\n", " y_feature = 'URL_Type_obf_Type'\n", "\n", " if (fill_na == True):\n", " data['avgpathtokenlen'] = data['avgpathtokenlen'].fillna(data['avgpathtokenlen'].mean())\n", " data['NumberRate_DirectoryName'] = data['NumberRate_DirectoryName'].fillna(data['NumberRate_DirectoryName'].mean())\n", " data['NumberRate_FileName'] = data['NumberRate_FileName'].fillna(data['NumberRate_FileName'].mean())\n", " data['NumberRate_Extension'] = data['NumberRate_Extension'].fillna(data['NumberRate_Extension'].mean())\n", " data['NumberRate_AfterPath'] = data['NumberRate_AfterPath'].fillna(data['NumberRate_AfterPath'].mean())\n", " data['Entropy_DirectoryName'] = data['Entropy_DirectoryName'].fillna(data['Entropy_DirectoryName'].mean())\n", " data['Entropy_Filename'] = data['Entropy_Filename'].fillna(data['Entropy_Filename'].mean())\n", " data['Entropy_Extension'] = data['Entropy_Extension'].fillna(data['Entropy_Extension'].mean())\n", " data['Entropy_Afterpath'] = data['Entropy_Afterpath'].fillna(data['Entropy_Afterpath'].mean())\n", " else:\n", " data.dropna(axis='index', inplace=True)\n", "\n", " data = data.drop(\"argPathRatio\", axis=1) # simply drop this since it does not affect the scores.\n", " if (show_graph == True):\n", " plt.figure(figsize=(10, 8))\n", " ax = plt.axes()\n", " sns.heatmap(data.isnull(), ax=ax, yticklabels=False, cbar=False, cmap=\"cividis\")\n", " plt.show()\n", "\n", " le = LabelEncoder()\n", "\n", " data[y_feature] = le.fit_transform(data[y_feature]) \n", "\n", " if (feature_selection == True):\n", " corr = data.corr() \n", " # Selecting features based on correlation:\n", " # compare the correlation between features and remove one of more features that have a correlation higher than 0.9\n", " # https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf\n", " columns = np.full((corr.shape[0],), True, dtype=bool)\n", " for i in range(corr.shape[0]):\n", " for j in range(i+1, corr.shape[0]):\n", " if corr.iloc[i,j] >= 0.9:\n", " if columns[j]:\n", " columns[j] = False\n", " selected_columns = data.columns[columns]\n", " data = data[selected_columns] \n", "\n", " if (show_graph == True):\n", " corr = data.corr()\n", " plt.figure(figsize=(18,15))\n", " sns.heatmap(corr, annot=True, vmin=-1.0, cmap='cividis')\n", " plt.title('Correlation Heatmap')\n", " plt.show()\n", " \n", " scaler = 
  {
   "cell_type": "markdown",
   "id": "34cf35ea-305d-4a72-b55b-5ad88f839351",
   "metadata": {},
   "source": [
    "#### Test the loader notebook and class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e7ca79b9-a3e0-42c7-8eca-f3a45f9cd301",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "init Loader notebook\n"
     ]
    }
   ],
   "source": [
    "import nbimporter\n",
    "import loader_nb\n",
    "\n",
    "loader = loader_nb.UrlDatasetLoader()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b34b2c20-67ec-40e1-a006-2db027476ce7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(36707, 80)\n"
     ]
    }
   ],
   "source": [
    "df = loader.load_data()\n",
    "print(df.shape)"
   ]
  },
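  {
   "cell_type": "markdown",
   "id": "b2e4f6a8-1c3d-4e5f-9a7b-2d4c6e8f0a1b",
   "metadata": {},
   "source": [
    "An optional sanity check (an added sketch, not part of the original run): inspect the class balance of the raw target column `URL_Type_obf_Type`, the same column that `prepare_data` later label-encodes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4f6a8b2-3e5c-4a7d-8b9e-1f2a3c4d5e6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative sketch: class distribution of the raw target labels.\n",
    "# 'URL_Type_obf_Type' is the target column used by prepare_data.\n",
    "df['URL_Type_obf_Type'].value_counts()"
   ]
  },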
\n", " False to use all the possible hyperparameters.\n", "\n", " Sources:\n", " https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms\n", " https://medium.com/swlh/the-hyperparameter-cheat-sheet-770f1fed32ff\n", " https://medium.com/@chaudhurysrijani/tuning-of-adaboost-with-computational-complexity-8727d01a9d20 \n", " '''\n", "\n", " params_knn = dict()\n", " params_tre = dict()\n", " params_ran = dict()\n", " params_gra = dict()\n", "\n", " params_log = dict()\n", " params_svc = dict()\n", "\n", " params_ada = dict()\n", " params_mnb = dict()\n", "\n", " if is_best_params == False:\n", "\n", " params_knn['n_neighbors'] = [2, 4, 6]\n", " params_knn['weights'] = ['uniform','distance']\n", " params_knn['metric'] = ['minkowski','euclidean','manhattan']\n", "\n", " params_tre['criterion'] = ['gini', 'entropy']\n", " params_tre['max_depth'] = [1, 3, 5, 10]\n", " params_tre['min_samples_split'] = [5, 10] \n", " params_tre['min_samples_leaf'] = [5, 10] \n", "\n", " params_ran['criterion'] = ['gini', 'entropy']\n", " params_ran['n_estimators'] = [100, 150, 200]\n", " params_ran['max_depth'] = [1, 3, 5, 10]\n", " params_ran['min_samples_split'] = [5, 10] \n", " params_ran['min_samples_leaf'] = [5, 10] \n", "\n", " params_gra['learning_rate'] = [0.001, 0.01, 0.1]\n", " # params_gra['n_estimators'] = [100, 1000] no significant impact\n", " params_gra['subsample'] = [0.5, 0.7, 1.0]\n", " params_gra['max_depth'] = [3, 7, 9]\n", "\n", " # https://www.kaggle.com/code/satishgunjal/multiclass-logistic-regression-using-sklearn/notebook\n", " # Since we are going to use One Vs Rest algorithm, set > multi_class='ovr'\n", " # Note: since we are using One Vs Rest algorithm we must use 'liblinear' solver with it. \n", " params_log['multi_class'] = ['ovr']\n", " params_log['solver'] = ['liblinear']\n", " params_log['penalty'] = ['l2']\n", " params_log['C'] = [100, 10, 1.0, 0.1] \n", "\n", " # https://www.baeldung.com/cs/svm-multiclass-classification\n", " #params_svc['kernel'] = ['rbf'] \n", " #params_svc['gamma'] = [0.1, 0.5, 1.0]\n", " #params_svc['C'] = [0.01, 0.1]\n", "\n", " params_ada['learning_rate'] = [0.01, 0.1, 1.0]\n", " params_ada['algorithm'] = ['SAMME', 'SAMME.R']\n", " \n", " params_mnb['alpha']=[0.50, 1.0, 2.0] \n", " params_mnb['class_prior']=[None] \n", " params_mnb['fit_prior']=[True, False]\n", "\n", " else:\n", " params_knn['n_neighbors'] = [2]\n", " params_knn['weights'] = ['distance']\n", " params_knn['metric'] = ['manhattan']\n", "\n", " params_tre['criterion'] = ['entropy']\n", " params_tre['max_depth'] = [10] \n", " params_tre['min_samples_leaf'] = [5] \n", " params_tre['min_samples_split'] = [5] \n", "\n", " params_ran['criterion'] = ['entropy']\n", " params_ran['n_estimators'] = [200]\n", " params_ran['max_depth'] = [10] \n", " params_ran['min_samples_leaf'] = [5] \n", " params_ran['min_samples_split'] = [5]\n", " \n", " params_gra['learning_rate'] = [0.1]\n", " params_gra['subsample'] = [0.7]\n", " params_gra['max_depth'] = [9] \n", "\n", " params_log['multi_class'] = ['ovr']\n", " params_log['solver'] = ['liblinear']\n", " params_log['penalty'] = ['l2']\n", " params_log['C'] = [100] \n", "\n", " # https://www.baeldung.com/cs/svm-multiclass-classification\n", " params_svc['kernel'] = ['rbf'] \n", " params_svc['gamma'] = [1.0]\n", " params_svc['C'] = [0.1]\n", "\n", " params_ada['learning_rate'] = [1.0]\n", " params_ada['algorithm'] = ['SAMME']\n", " \n", " params_mnb['alpha']=[1.0] \n", " params_mnb['class_prior']=[None] \n", " 
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18a264f6-8a7b-499a-9222-90c3993603ab",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "name": "loader_nb.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}