{ "cells": [ { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# -*- coding: utf-8 -*-\n", "import scipy.misc as mpimg\n", "\n", "import numpy as np\n", "\n", "import time\n", "import datetime\n", "import csv\n", "\n", "from sklearn import svm\n", "\n", "import numpy.ma as ma\n", "\n", "import argparse\n", "from sklearn.cross_validation import train_test_split\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.externals import joblib\n", "import os.path\n", "import tifffile as tiff\n", "import numpy as np\n", "import pandas as pd\n", "# Visualization \n", "import matplotlib.pyplot as plt\n", "from pandas.tools.plotting import scatter_matrix\n", "\n", "# Feature Selection and Encoding\n", "from sklearn.feature_selection import RFE, RFECV\n", "from sklearn.svm import SVR\n", "from sklearn.decomposition import PCA\n", "from sklearn.preprocessing import OneHotEncoder, LabelEncoder, label_binarize\n", "\n", "# Machine learning \n", "import sklearn.ensemble as ske\n", "from sklearn import datasets, model_selection, tree, preprocessing, metrics, linear_model\n", "from sklearn.svm import LinearSVC\n", "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, SGDClassifier\n", "from sklearn.tree import DecisionTreeClassifier\n", "import tensorflow as tf\n", "\n", "# Grid and Random Search\n", "import scipy.stats as st\n", "from scipy.stats import randint as sp_randint\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.model_selection import RandomizedSearchCV\n", "\n", "# Metrics\n", "from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc\n", "\n", "# Managing Warnings \n", "import warnings\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def readdata():\n", " Path = \"/mnt/datapool/RemoteSensingData/S1_DV/radasat2/r2_zip/NewFolder/GF3/\"\n", " gt=tiff.imread(Path+\"gt.tif\")\n", " img=tiff.imread(Path+\"img.tif\")\n", " hh=img[:,:,0]\n", " hv=img[:,:,1]\n", " datah=hh.reshape(-1)\n", " datav=hv.reshape(-1) \n", " label=gt.reshape(-1) \n", " \n", " datah=datah[np.where(label<>255)]\n", " datav=datav[np.where(label<>255)]\n", " data=np.vstack(np.stack((datah, datav), axis=-1))\n", " label=label[label<>255]# 255 NonData\n", " #[l.index(l) for i in l if l == 0]\n", " #print(datah[:100])\n", " #print(datah.shape)\n", " print(label.shape)\n", " print(data.shape)\n", " return(data,label)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "X,y=readdata()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(286066, 2)\n", "(286066,)\n" ] } ], "source": [ "X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.3, random_state=0)\n", "print (X_train.shape)\n", "print(y_train.shape)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def plot_roc_curve(y_test, preds):\n", " fpr, tpr, threshold = metrics.roc_curve(y_test, preds)\n", " roc_auc = metrics.auc(fpr, tpr)\n", " plt.title('Receiver Operating Characteristic')\n", " plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)\n", " plt.legend(loc = 'lower right')\n", " plt.plot([0, 1], [0, 1],'r--')\n", " plt.xlim([-0.01, 1.01])\n", " plt.ylim([-0.01, 1.01])\n", " plt.ylabel('True Positive Rate')\n", " plt.xlabel('False Positive Rate')\n", " plt.show()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def fit_ml_algo(algo, X_train, y_train, X_test, cv):\n", " # One Pass\n", " model = algo.fit(X_train, y_train)\n", " test_pred = model.predict(X_test)\n", " if (isinstance(algo, (LogisticRegression, \n", " KNeighborsClassifier, \n", " GaussianNB, \n", " DecisionTreeClassifier, \n", " RandomForestClassifier,\n", " GradientBoostingClassifier))):\n", " probs = model.predict_proba(X_test)[:,1]\n", " else:\n", " probs = \"Not Available\"\n", " acc = round(model.score(X_test, y_test) * 100, 2) \n", " # CV \n", " train_pred = model_selection.cross_val_predict(algo, \n", " X_train, \n", " y_train, \n", " cv=cv, \n", " n_jobs = -1)\n", " acc_cv = round(metrics.accuracy_score(y_train, train_pred) * 100, 2)\n", " return train_pred, test_pred, acc, acc_cv, probs" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RandomizedSearchCV took 23.10 seconds for 10 candidates parameter settings.\n", "Model with rank: 1\n", "Mean validation score: 0.943 (std: 0.001)\n", "Parameters: {'penalty': 'l1', 'C': 0.18110306173696183, 'intercept_scaling': 1.0698008294791242e-07, 'class_weight': 'balanced'}\n", "\n", "Model with rank: 2\n", "Mean validation score: 0.940 (std: 0.001)\n", "Parameters: {'penalty': 'l2', 'C': 61727.551134160356, 'intercept_scaling': 1.9816625542394187e-10, 'class_weight': 'balanced'}\n", "\n", "Model with rank: 3\n", "Mean validation score: 0.940 (std: 0.000)\n", "Parameters: {'penalty': 'l1', 'C': 6.134494449932439e+17, 'intercept_scaling': 3.6387824850747874e-11, 'class_weight': 'balanced'}\n", "\n", "Model with rank: 4\n", "Mean validation score: 0.930 (std: 0.000)\n", "Parameters: {'penalty': 'l2', 'C': 8452646356.7554703, 'intercept_scaling': 0.011529386030693178, 'class_weight': None}\n", "\n", "Model with rank: 5\n", "Mean validation score: 0.930 (std: 0.000)\n", "Parameters: {'penalty': 'l2', 'C': 289.89867115042091, 'intercept_scaling': 1.600370075165793e-18, 'class_weight': None}\n", "\n" ] } ], "source": [ "# Logistic Regression - Random Search for Hyperparameters\n", "\n", "# Utility function to report best scores\n", "def report(results, n_top=5):\n", " for i in range(1, n_top + 1):\n", " candidates = np.flatnonzero(results['rank_test_score'] == i)\n", " for candidate in candidates:\n", " print(\"Model with rank: {0}\".format(i))\n", " print(\"Mean validation score: {0:.3f} (std: {1:.3f})\".format(\n", " results['mean_test_score'][candidate],\n", " results['std_test_score'][candidate]))\n", " print(\"Parameters: {0}\".format(results['params'][candidate]))\n", " print(\"\")\n", " \n", "# Specify parameters and distributions to sample from\n", "param_dist = {'penalty': ['l2', 'l1'], \n", " 'class_weight': [None, 'balanced'],\n", " 'C': np.logspace(-20, 20, 10000), \n", " 'intercept_scaling': np.logspace(-20, 20, 10000)}\n", "\n", "# Run Randomized Search\n", "n_iter_search = 10\n", "lrc = LogisticRegression()\n", "random_search = RandomizedSearchCV(lrc, \n", " n_jobs=-1, \n", " param_distributions=param_dist, \n", " n_iter=n_iter_search)\n", "\n", "start = time.time()\n", "random_search.fit(X_train, y_train)\n", "print(\"RandomizedSearchCV took %.2f seconds for %d candidates\"\n", " \" parameter settings.\" % ((time.time() - start), n_iter_search))\n", "report(random_search.cv_results_)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 93.41\n", "Accuracy CV 10-Fold: 93.22\n", "Running Time: 0:00:16.047096\n" ] } ], "source": [ "# Logistic Regression\n", "start_time = time.time()\n", "train_pred_log, test_pred_log, acc_log, acc_cv_log, probs_log = fit_ml_algo(LogisticRegression(n_jobs = -1), \n", " X_train, \n", " y_train, \n", " X_test, \n", " 10)\n", "log_time = (time.time() - start_time)\n", "print(\"Accuracy: %s\" % acc_log)\n", "print(\"Accuracy CV 10-Fold: %s\" % acc_cv_log)\n", "print(\"Running Time: %s\" % datetime.timedelta(seconds=log_time))" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 94.29\n", "Accuracy CV 10-Fold: 94.22\n", "Running Time: 0:02:51.263258\n" ] } ], "source": [ "#k-Nearest Neighbors\n", "start_time = time.time()\n", "train_pred_knn, test_pred_knn, acc_knn, acc_cv_knn, probs_knn = fit_ml_algo(KNeighborsClassifier(n_neighbors = 3,\n", " n_jobs = -1), \n", " X_train, \n", " y_train, \n", " X_test, \n", " 10)\n", "knn_time = (time.time() - start_time)\n", "print(\"Accuracy: %s\" % acc_knn)\n", "print(\"Accuracy CV 10-Fold: %s\" % acc_cv_knn)\n", "print(\"Running Time: %s\" % datetime.timedelta(seconds=knn_time))" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 93.26\n", "Accuracy CV 10-Fold: 93.07\n", "Running Time: 0:00:00.964405\n" ] } ], "source": [ "# Gaussian Naive Bayes\n", "start_time = time.time()\n", "train_pred_gaussian, test_pred_gaussian, acc_gaussian, acc_cv_gaussian, probs_gau = fit_ml_algo(GaussianNB(), \n", " X_train, \n", " y_train, \n", " X_test, \n", " 10)\n", "gaussian_time = (time.time() - start_time)\n", "print(\"Accuracy: %s\" % acc_gaussian)\n", "print(\"Accuracy CV 10-Fold: %s\" % acc_cv_gaussian)\n", "print(\"Running Time: %s\" % datetime.timedelta(seconds=gaussian_time))" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 92.33\n", "Accuracy CV 10-Fold: 91.66\n", "Running Time: 0:09:26.554808\n" ] } ], "source": [ "# Linear SVC\n", "start_time = time.time()\n", "train_pred_svc, test_pred_svc, acc_linear_svc, acc_cv_linear_svc, _ = fit_ml_algo(LinearSVC(),\n", " X_train, \n", " y_train,\n", " X_test, \n", " 10)\n", "linear_svc_time = (time.time() - start_time)\n", "print(\"Accuracy: %s\" % acc_linear_svc)\n", "print(\"Accuracy CV 10-Fold: %s\" % acc_cv_linear_svc)\n", "print(\"Running Time: %s\" % datetime.timedelta(seconds=linear_svc_time))" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 91.99\n", "Accuracy CV 10-Fold: 91.51\n", "Running Time: 0:00:02.301628\n" ] } ], "source": [ "# Stochastic Gradient Descent\n", "start_time = time.time()\n", "train_pred_sgd, test_pred_sgd, acc_sgd, acc_cv_sgd, _ = fit_ml_algo(SGDClassifier(n_jobs = -1), \n", " X_train, \n", " y_train, \n", " X_test, \n", " 10)\n", "sgd_time = (time.time() - start_time)\n", "print(\"Accuracy: %s\" % acc_sgd)\n", "print(\"Accuracy CV 10-Fold: %s\" % acc_cv_sgd)\n", "print(\"Running Time: %s\" % datetime.timedelta(seconds=sgd_time))" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 96.84\n", "Accuracy CV 10-Fold: 96.34\n", "Running Time: 0:00:03.184184\n" ] } ], "source": [ "# Decision Tree Classifier\n", "start_time = time.time()\n", "train_pred_dt, test_pred_dt, acc_dt, acc_cv_dt, probs_dt = fit_ml_algo(DecisionTreeClassifier(), \n", " X_train, \n", " y_train, \n", " X_test, \n", " 10)\n", "dt_time = (time.time() - start_time)\n", "print(\"Accuracy: %s\" % acc_dt)\n", "print(\"Accuracy CV 10-Fold: %s\" % acc_cv_dt)\n", "print(\"Running Time: %s\" % datetime.timedelta(seconds=dt_time))\n" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 95.06\n", "Accuracy CV 10-Fold: 94.85\n", "Running Time: 0:00:17.790645\n" ] } ], "source": [ "# Random Forest Classifier\n", "start_time = time.time()\n", "rfc = RandomForestClassifier(n_estimators=10, \n", " min_samples_leaf=2,\n", " min_samples_split=17, \n", " criterion='gini', \n", " max_features=2)\n", "train_pred_rf, test_pred_rf, acc_rf, acc_cv_rf, probs_rf = fit_ml_algo(rfc, \n", " X_train, \n", " y_train, \n", " X_test, \n", " 10)\n", "rf_time = (time.time() - start_time)\n", "print(\"Accuracy: %s\" % acc_rf)\n", "print(\"Accuracy CV 10-Fold: %s\" % acc_cv_rf)\n", "print(\"Running Time: %s\" % datetime.timedelta(seconds=rf_time))" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 95.02\n", "Accuracy CV 10-Fold: 94.88\n", "Running Time: 0:05:33.582986\n" ] } ], "source": [ "# Gradient Boosting Trees\n", "start_time = time.time()\n", "train_pred_gbt, test_pred_gbt, acc_gbt, acc_cv_gbt, probs_gbt = fit_ml_algo(GradientBoostingClassifier(), \n", " X_train, \n", " y_train, \n", " X_test, \n", " 10)\n", "gbt_time = (time.time() - start_time)\n", "print(\"Accuracy: %s\" % acc_gbt)\n", "print(\"Accuracy CV 10-Fold: %s\" % acc_cv_gbt)\n", "print(\"Running Time: %s\" % datetime.timedelta(seconds=gbt_time))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "LR: 0.904087 (0.108555)\n", "LDA: 0.912762 (0.093905)\n", "KNN: 0.916538 (0.086706)\n", "CART: 0.908201 (0.082164)\n", "NB: 0.902763 (0.106315)\n" ] } ], "source": [ "from sklearn import model_selection\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn.svm import SVC\n", "seed = 7\n", "models = []\n", "\n", "models.append(('LR', LogisticRegression()))\n", "models.append(('LDA', LinearDiscriminantAnalysis()))\n", "models.append(('KNN', KNeighborsClassifier()))\n", "models.append(('CART', DecisionTreeClassifier()))\n", "models.append(('NB', GaussianNB()))\n", "models.append(('SVM', SVC()))\n", "# evaluate each model in turn\n", "results = []\n", "names = []\n", "scoring = 'accuracy'\n", "for name, model in models:\n", "\tkfold = model_selection.KFold(n_splits=10, random_state=seed)\n", "\tcv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)\n", "\tresults.append(cv_results)\n", "\tnames.append(name)\n", "\tmsg = \"%s: %f (%f)\" % (name, cv_results.mean(), cv_results.std())\n", "\tprint(msg)\n", "# boxplot algorithm comparison\n", "fig = plt.figure()\n", "fig.suptitle('Algorithm Comparison')\n", "ax = fig.add_subplot(111)\n", "plt.boxplot(results)\n", "ax.set_xticklabels(names)\n", "plt.show()\n", "\n", "\n", "'''\n", "\n", "'''" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.13" } }, "nbformat": 4, "nbformat_minor": 2 }