{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from scipy.optimize import minimize\n",
    "from scipy.special import expit, xlogy\n",
    "from sklearn.datasets import load_boston\n",
    "from sklearn.neural_network import MLPRegressor as skMLPRegressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "class MLPRegressor():\n",
    "    def __init__(self, hidden_layer_sizes=(100,), alpha=0.0001, max_iter=200, random_state=0):\n",
    "        self.hidden_layer_sizes = hidden_layer_sizes\n",
    "        self.alpha = alpha\n",
    "        self.max_iter = max_iter\n",
    "        self.random_state = random_state\n",
    "\n",
    "    def _pack(self, coefs, intercepts):\n",
    "        return np.hstack([cur.ravel() for cur in coefs + intercepts])\n",
    "\n",
    "    def _unpack(self, packed_coef):\n",
    "        for i in range(self.n_layers_ - 1):\n",
    "            start, end, shape = self._coef_indptr[i]\n",
    "            self.coefs_[i] = np.reshape(packed_coef[start:end], shape)\n",
    "            start, end = self._intercept_indptr[i]\n",
    "            self.intercepts_[i] = packed_coef[start:end]\n",
    "\n",
    "    def _forward_pass(self, activations):\n",
    "        for i in range(self.n_layers_ - 1):\n",
    "            activations[i + 1] = np.dot(activations[i], self.coefs_[i]) + self.intercepts_[i]\n",
    "            if i + 1 != self.n_layers_ - 1:\n",
    "                activations[i + 1] = expit(activations[i + 1])\n",
    "        return activations\n",
    "\n",
    "    def _cost_grad(self, packed_coef, X, y_train, activations, deltas, coef_grads, intercept_grads):\n",
    "        self._unpack(packed_coef)\n",
    "        # forward pass\n",
    "        activations = self._forward_pass(activations)\n",
    "        loss = np.mean(np.square(y_train - activations[-1])) / 2\n",
    "        loss += 0.5 * self.alpha * np.sum([np.dot(c.ravel(), c.ravel()) for c in self.coefs_]) / X.shape[0]\n",
    "        # backward pass\n",
    "        deltas[self.n_layers_ - 2] = activations[-1] - y_train\n",
    "        coef_grads[self.n_layers_ - 2] = np.dot(activations[self.n_layers_ - 2].T, deltas[self.n_layers_ - 2])\n",
    "        coef_grads[self.n_layers_ - 2] += (self.alpha * self.coefs_[self.n_layers_ - 2])\n",
    "        coef_grads[self.n_layers_ - 2] /= X.shape[0]\n",
    "        intercept_grads[self.n_layers_ - 2] = np.mean(deltas[self.n_layers_ - 2], axis=0)\n",
    "        for i in range(self.n_layers_ - 2, 0, -1):\n",
    "            deltas[i - 1] = np.dot(deltas[i], self.coefs_[i].T)\n",
    "            deltas[i - 1] *= activations[i] * (1 - activations[i])\n",
    "            coef_grads[i - 1] = np.dot(activations[i - 1].T, deltas[i - 1])\n",
    "            coef_grads[i - 1] += (self.alpha * self.coefs_[i - 1])\n",
    "            coef_grads[i - 1] /= X.shape[0]\n",
    "            intercept_grads[i - 1] = np.mean(deltas[i - 1], axis=0)\n",
    "        grad = self._pack(coef_grads, intercept_grads)\n",
    "        return loss, grad\n",
    "        \n",
    "    def fit(self, X, y):\n",
    "        y_train = y[:, np.newaxis]\n",
    "        self.n_outputs_ = y_train.shape[1]\n",
    "        layer_units = ([X.shape[1]] + list(self.hidden_layer_sizes) + [self.n_outputs_])\n",
    "        self.n_layers_ = len(layer_units)\n",
    "        self.coefs_, self.intercepts_ = [], []\n",
    "        rng = np.random.RandomState(self.random_state)\n",
    "        for i in range(self.n_layers_ - 1):\n",
    "            init_bound = np.sqrt(2 / (layer_units[i] + layer_units[i + 1]))\n",
    "            self.coefs_.append(rng.uniform(-init_bound, init_bound, (layer_units[i], layer_units[i + 1])))\n",
    "            self.intercepts_.append(rng.uniform(-init_bound, init_bound, layer_units[i + 1]))\n",
    "        activations = [X]\n",
    "        activations.extend(np.empty((X.shape[0], n_fan_out)) for n_fan_out in layer_units[1:])\n",
    "        deltas = [np.empty_like(a_layer) for a_layer in activations[1:]]\n",
    "        coef_grads = [np.empty((n_fan_in, n_fan_out)) for n_fan_in, n_fan_out in zip(layer_units[:-1], layer_units[1:])]\n",
    "        intercept_grads = [np.empty(n_fan_out) for n_fan_out in layer_units[1:]]\n",
    "        self._coef_indptr, self._intercept_indptr = [], []\n",
    "        start = 0\n",
    "        for i in range(self.n_layers_ - 1):\n",
    "            end = start + (self.coefs_[i].shape[0] * self.coefs_[i].shape[1])\n",
    "            self._coef_indptr.append((start, end, (self.coefs_[i].shape[0], self.coefs_[i].shape[1])))\n",
    "            start = end\n",
    "        for i in range(self.n_layers_ - 1):\n",
    "            end = start + self.intercepts_[i].shape[0]\n",
    "            self._intercept_indptr.append((start, end))\n",
    "            start = end\n",
    "        packed_coef = self._pack(self.coefs_, self.intercepts_)\n",
    "        res = minimize(fun=self._cost_grad, jac=True, x0=packed_coef,\n",
    "                       args=(X, y_train, activations, deltas, coef_grads, intercept_grads), method='L-BFGS-B',\n",
    "                       options = {\"maxiter\": self.max_iter})\n",
    "        self._unpack(res.x)\n",
    "        return self\n",
    "\n",
    "    def _predict(self, X):\n",
    "        layer_units = ([X.shape[1]] + list(self.hidden_layer_sizes) + [self.n_outputs_])\n",
    "        activations = [X]\n",
    "        activations.extend(np.empty((X.shape[0], n_fan_out)) for n_fan_out in layer_units[1:])\n",
    "        self._forward_pass(activations)\n",
    "        y_pred = activations[-1]\n",
    "        return y_pred\n",
    "\n",
    "    def predict(self, X):\n",
    "        y_pred = self._predict(X)\n",
    "        return y_pred.ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "X, y = load_boston(return_X_y=True)\n",
    "clf1 = MLPRegressor(hidden_layer_sizes=(5,), max_iter=10, random_state=0).fit(X, y)\n",
    "clf2 = skMLPRegressor(hidden_layer_sizes=(5,), max_iter=10, activation=\"logistic\", solver=\"lbfgs\",\n",
    "                      random_state=0, tol=1e-5).fit(X, y)\n",
    "assert len(clf1.coefs_) == len(clf2.coefs_)\n",
    "for i in range(len(clf1.coefs_)):\n",
    "    assert np.allclose(clf1.coefs_[i], clf2.coefs_[i])\n",
    "assert len(clf2.intercepts_) == len(clf2.intercepts_)\n",
    "for i in range(len(clf1.intercepts_)):\n",
    "    assert np.allclose(clf1.intercepts_[i], clf2.intercepts_[i])\n",
    "pred1 = clf1.predict(X)\n",
    "pred2 = clf2.predict(X)\n",
    "assert np.allclose(pred1, pred2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "X, y = load_boston(return_X_y=True)\n",
    "clf1 = MLPRegressor(hidden_layer_sizes=(5, 5), max_iter=10, random_state=0).fit(X, y)\n",
    "clf2 = skMLPRegressor(hidden_layer_sizes=(5, 5), max_iter=10, activation=\"logistic\", solver=\"lbfgs\",\n",
    "                      random_state=0, tol=1e-5).fit(X, y)\n",
    "assert len(clf1.coefs_) == len(clf2.coefs_)\n",
    "for i in range(len(clf1.coefs_)):\n",
    "    assert np.allclose(clf1.coefs_[i], clf2.coefs_[i])\n",
    "assert len(clf2.intercepts_) == len(clf2.intercepts_)\n",
    "for i in range(len(clf1.intercepts_)):\n",
    "    assert np.allclose(clf1.intercepts_[i], clf2.intercepts_[i])\n",
    "pred1 = clf1.predict(X)\n",
    "pred2 = clf2.predict(X)\n",
    "assert np.allclose(pred1, pred2)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dev",
   "language": "python",
   "name": "dev"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}