{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "d8873b61-8581-493e-919e-a7b89a52dcec", "metadata": { "executionInfo": { "elapsed": 1207, "status": "ok", "timestamp": 1647798193679, "user": { "displayName": "Arjie Cristobal", "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GihC545mgquPZtV8FArv-dQnp6CN4zsac4wJXu3Bu8=s64", "userId": "08755999937279005778" }, "user_tz": 240 }, "id": "d8873b61-8581-493e-919e-a7b89a52dcec" }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 2, "id": "04fc494b-c015-4972-aa45-f8f8c861efe0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: ipynb in d:\\x-20a\\anaconda3\\lib\\site-packages (0.5.1)\n" ] } ], "source": [ "# Importing an ipynb file from another ipynb file\n", "!pip install ipynb" ] }, { "cell_type": "code", "execution_count": 3, "id": "549ec8b7-5236-47f6-8c52-02925aa1e81e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: nbimporter in d:\\x-20a\\anaconda3\\lib\\site-packages (0.3.4)\n" ] } ], "source": [ "# Importing functions from another jupyter notebook\n", "!pip install nbimporter" ] }, { "cell_type": "code", "execution_count": 4, "id": "01f4dcfb-3814-4459-95b6-71dcf5c31b38", "metadata": { "executionInfo": { "elapsed": 316, "status": "ok", "timestamp": 1647798193990, "user": { "displayName": "Arjie Cristobal", "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GihC545mgquPZtV8FArv-dQnp6CN4zsac4wJXu3Bu8=s64", "userId": "08755999937279005778" }, "user_tz": 240 }, "id": "01f4dcfb-3814-4459-95b6-71dcf5c31b38" }, "outputs": [], "source": [ "from sklearn.ensemble import AdaBoostClassifier\n", "from sklearn.ensemble import ExtraTreesClassifier\n", "from sklearn.ensemble import GradientBoostingClassifier\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.svm import SVC\n", "\n", "from sklearn.ensemble import IsolationForest\n", "\n", "class UrlDatasetLoader():\n", " \n", " def __init__(self):\n", " print('init Loader notebook')\n", " \n", " def load_data(self, url=\"https://raw.githubusercontent.com/quickheaven/scs-3253-machine-learning/master/datasets/ISCX-URL2016_All.csv\"): \n", " \"\"\" \n", " (string) --> dataframe\n", "\n", " This function returns the dataframe of maliciours url. 
\n", "\n", " Parameters\n", " ----------\n", " url: By default, it fetch the data from github otherwise a local path or url can be provided so the data can be loaded faster.\n", " \"\"\"\n", " df = pd.read_csv(url)\n", " return df\n", "\n", " def prepare_data(self, data, fill_na=True, feature_selection=True, show_graph=False):\n", " \"\"\"\n", " (DataFrame, boolean, boolean) --> X and y of the dataframe.\n", "\n", " This function returns the X and y of the malicious url dataframe.\n", "\n", " Parameters\n", " ----------\n", " fill_na : True to fill the na records with mean values otherwise drop the features.\n", "\n", " feature_selection : True to remove one or more features that have a correlation higher than 0.9 othewise do not perform that type of feature selection.\n", " https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf\n", " \n", " show_graph : True to display the graph after applying fill_na or feature_selection. \n", " \"\"\"\n", " from sklearn.preprocessing import MinMaxScaler\n", " from sklearn.preprocessing import LabelEncoder\n", " from sklearn.ensemble import IsolationForest\n", "\n", " data = data.copy()\n", "\n", " y_feature = 'URL_Type_obf_Type'\n", "\n", " if (fill_na == True):\n", " data['avgpathtokenlen'] = data['avgpathtokenlen'].fillna(data['avgpathtokenlen'].mean())\n", " data['NumberRate_DirectoryName'] = data['NumberRate_DirectoryName'].fillna(data['NumberRate_DirectoryName'].mean())\n", " data['NumberRate_FileName'] = data['NumberRate_FileName'].fillna(data['NumberRate_FileName'].mean())\n", " data['NumberRate_Extension'] = data['NumberRate_Extension'].fillna(data['NumberRate_Extension'].mean())\n", " data['NumberRate_AfterPath'] = data['NumberRate_AfterPath'].fillna(data['NumberRate_AfterPath'].mean())\n", " data['Entropy_DirectoryName'] = data['Entropy_DirectoryName'].fillna(data['Entropy_DirectoryName'].mean())\n", " data['Entropy_Filename'] = data['Entropy_Filename'].fillna(data['Entropy_Filename'].mean())\n", " data['Entropy_Extension'] = data['Entropy_Extension'].fillna(data['Entropy_Extension'].mean())\n", " data['Entropy_Afterpath'] = data['Entropy_Afterpath'].fillna(data['Entropy_Afterpath'].mean())\n", " else:\n", " data.dropna(axis='index', inplace=True)\n", "\n", " data = data.drop(\"argPathRatio\", axis=1) # simply drop this since it does not affect the scores.\n", " if (show_graph == True):\n", " plt.figure(figsize=(10, 8))\n", " ax = plt.axes()\n", " sns.heatmap(data.isnull(), ax=ax, yticklabels=False, cbar=False, cmap=\"cividis\")\n", " plt.show()\n", "\n", " le = LabelEncoder()\n", "\n", " data[y_feature] = le.fit_transform(data[y_feature]) \n", "\n", " if (feature_selection == True):\n", " corr = data.corr() \n", " # Selecting features based on correlation:\n", " # compare the correlation between features and remove one of more features that have a correlation higher than 0.9\n", " # https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf\n", " columns = np.full((corr.shape[0],), True, dtype=bool)\n", " for i in range(corr.shape[0]):\n", " for j in range(i+1, corr.shape[0]):\n", " if corr.iloc[i,j] >= 0.9:\n", " if columns[j]:\n", " columns[j] = False\n", " selected_columns = data.columns[columns]\n", " data = data[selected_columns] \n", "\n", " if (show_graph == True):\n", " corr = data.corr()\n", " plt.figure(figsize=(18,15))\n", " sns.heatmap(corr, annot=True, vmin=-1.0, cmap='cividis')\n", " plt.title('Correlation Heatmap')\n", " plt.show()\n", " \n", " scaler = 
  {
   "cell_type": "markdown",
   "id": "34cf35ea-305d-4a72-b55b-5ad88f839351",
   "metadata": {},
   "source": [
    "#### Test the loader notebook and class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e7ca79b9-a3e0-42c7-8eca-f3a45f9cd301",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "init Loader notebook\n"
     ]
    }
   ],
   "source": [
    "import nbimporter\n",
    "import loader_nb\n",
    "\n",
    "loader = loader_nb.UrlDatasetLoader()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b34b2c20-67ec-40e1-a006-2db027476ce7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(36707, 80)\n"
     ]
    }
   ],
   "source": [
    "df = loader.load_data()\n",
    "print(df.shape)"
   ]
  },
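  {
   "cell_type": "markdown",
   "id": "b2e4f6a8-1c3d-4e5f-9a7b-2d4c6e8f0a1b",
   "metadata": {},
   "source": [
    "An optional sanity check (an added sketch, not part of the original run): inspect the class balance of the raw target column `URL_Type_obf_Type`, the same column that `prepare_data` later label-encodes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4f6a8b2-3e5c-4a7d-8b9e-1f2a3c4d5e6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative sketch: class distribution of the raw target labels.\n",
    "# 'URL_Type_obf_Type' is the target column used by prepare_data.\n",
    "df['URL_Type_obf_Type'].value_counts()"
   ]
  },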
\n", " False to use all the possible hyperparameters.\n", "\n", " Sources:\n", " https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms\n", " https://medium.com/swlh/the-hyperparameter-cheat-sheet-770f1fed32ff\n", " https://medium.com/@chaudhurysrijani/tuning-of-adaboost-with-computational-complexity-8727d01a9d20 \n", " '''\n", "\n", " params_knn = dict()\n", " params_tre = dict()\n", " params_ran = dict()\n", " params_gra = dict()\n", "\n", " params_log = dict()\n", " params_svc = dict()\n", "\n", " params_ada = dict()\n", " params_mnb = dict()\n", "\n", " if is_best_params == False:\n", "\n", " params_knn['n_neighbors'] = [2, 4, 6]\n", " params_knn['weights'] = ['uniform','distance']\n", " params_knn['metric'] = ['minkowski','euclidean','manhattan']\n", "\n", " params_tre['criterion'] = ['gini', 'entropy']\n", " params_tre['max_depth'] = [1, 3, 5, 10]\n", " params_tre['min_samples_split'] = [5, 10] \n", " params_tre['min_samples_leaf'] = [5, 10] \n", "\n", " params_ran['criterion'] = ['gini', 'entropy']\n", " params_ran['n_estimators'] = [100, 150, 200]\n", " params_ran['max_depth'] = [1, 3, 5, 10]\n", " params_ran['min_samples_split'] = [5, 10] \n", " params_ran['min_samples_leaf'] = [5, 10] \n", "\n", " params_gra['learning_rate'] = [0.001, 0.01, 0.1]\n", " # params_gra['n_estimators'] = [100, 1000] no significant impact\n", " params_gra['subsample'] = [0.5, 0.7, 1.0]\n", " params_gra['max_depth'] = [3, 7, 9]\n", "\n", " # https://www.kaggle.com/code/satishgunjal/multiclass-logistic-regression-using-sklearn/notebook\n", " # Since we are going to use One Vs Rest algorithm, set > multi_class='ovr'\n", " # Note: since we are using One Vs Rest algorithm we must use 'liblinear' solver with it. \n", " params_log['multi_class'] = ['ovr']\n", " params_log['solver'] = ['liblinear']\n", " params_log['penalty'] = ['l2']\n", " params_log['C'] = [100, 10, 1.0, 0.1] \n", "\n", " # https://www.baeldung.com/cs/svm-multiclass-classification\n", " #params_svc['kernel'] = ['rbf'] \n", " #params_svc['gamma'] = [0.1, 0.5, 1.0]\n", " #params_svc['C'] = [0.01, 0.1]\n", "\n", " params_ada['learning_rate'] = [0.01, 0.1, 1.0]\n", " params_ada['algorithm'] = ['SAMME', 'SAMME.R']\n", " \n", " params_mnb['alpha']=[0.50, 1.0, 2.0] \n", " params_mnb['class_prior']=[None] \n", " params_mnb['fit_prior']=[True, False]\n", "\n", " else:\n", " params_knn['n_neighbors'] = [2]\n", " params_knn['weights'] = ['distance']\n", " params_knn['metric'] = ['manhattan']\n", "\n", " params_tre['criterion'] = ['entropy']\n", " params_tre['max_depth'] = [10] \n", " params_tre['min_samples_leaf'] = [5] \n", " params_tre['min_samples_split'] = [5] \n", "\n", " params_ran['criterion'] = ['entropy']\n", " params_ran['n_estimators'] = [200]\n", " params_ran['max_depth'] = [10] \n", " params_ran['min_samples_leaf'] = [5] \n", " params_ran['min_samples_split'] = [5]\n", " \n", " params_gra['learning_rate'] = [0.1]\n", " params_gra['subsample'] = [0.7]\n", " params_gra['max_depth'] = [9] \n", "\n", " params_log['multi_class'] = ['ovr']\n", " params_log['solver'] = ['liblinear']\n", " params_log['penalty'] = ['l2']\n", " params_log['C'] = [100] \n", "\n", " # https://www.baeldung.com/cs/svm-multiclass-classification\n", " params_svc['kernel'] = ['rbf'] \n", " params_svc['gamma'] = [1.0]\n", " params_svc['C'] = [0.1]\n", "\n", " params_ada['learning_rate'] = [1.0]\n", " params_ada['algorithm'] = ['SAMME']\n", " \n", " params_mnb['alpha']=[1.0] \n", " params_mnb['class_prior']=[None] \n", " 
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18a264f6-8a7b-499a-9222-90c3993603ab",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "name": "loader_nb.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}