{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f39ddf36",
   "metadata": {},
   "source": [
    "# To use this notebook...\n",
    "\n",
    "1) Click on the _Loading Cell_ below and push Command + Return to run it. \n",
    "\n",
    "2) Scroll down to the _Active Cell_, and enter any text you want where it says \"ADD YOUR TEXT HERE\" (if you don't have a block of text handy, any of [Paul Graham's essays](http://www.paulgraham.com/articles.html) are easy to copy and paste)\n",
    "\n",
    "3) Push Command + Return and it will output the summary (takes 5-60 seconds, depending on the length of the text)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "6bac4741",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to\n",
      "[nltk_data]     /Users/zachobront/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n",
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     /Users/zachobront/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "### LOADING CELL ###\n",
    "\n",
    "import re\n",
    "import nltk\n",
    "nltk.download('punkt')\n",
    "from nltk.tokenize import sent_tokenize\n",
    "from nltk.corpus import stopwords\n",
    "nltk.download('stopwords')\n",
    "stop_words = stopwords.words('english')\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import networkx as nx\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "\n",
    "# Map each word to its embedding vector. Every non-blank line of the file is\n",
    "# read as `<word> <coef> <coef> ...`, split on whitespace.\n",
    "word_embeddings = {}\n",
    "# The context manager closes the file even if a line fails to parse.\n",
    "with open('glove.6B.100d.shortened.txt', encoding='utf-8') as f:\n",
    "    for line in f:\n",
    "        x = line.split()\n",
    "        if not x:\n",
    "            continue  # skip blank lines instead of failing on x[0]\n",
    "        word = x[0]\n",
    "        coefs = np.asarray(x[1:], dtype='float32')\n",
    "        word_embeddings[word] = coefs\n",
    "\n",
    "def clean_text(text):\n",
    "    \"\"\"Split `text` into sentences and build a normalised copy of each one.\n",
    "\n",
    "    Returns a `(sentences, clean)` pair of equal-length lists: `sentences` holds\n",
    "    the sentences as produced by `sent_tokenize`, and `clean` holds the same\n",
    "    sentences with every non-letter character replaced by a space, lower-cased,\n",
    "    and with the words found in `stop_words` removed.\n",
    "    \"\"\"\n",
    "    sentences = list(sent_tokenize(text))\n",
    "    # A set makes each stop-word lookup O(1) instead of scanning the list.\n",
    "    stop_set = set(stop_words)\n",
    "    clean = []\n",
    "    for sentence in sentences:\n",
    "        # Plain `re.sub` avoids the pandas round-trip, whose `.str` accessor can\n",
    "        # reject an empty, non-object Series when `text` contains no sentences.\n",
    "        letters_only = re.sub(\"[^a-zA-Z]\", \" \", sentence).lower()\n",
    "        clean.append(\" \".join(word for word in letters_only.split() if word not in stop_set))\n",
    "    return sentences, clean\n",
    "\n",
    "def create_sentence_vecs(text):\n",
    "    \"\"\"Build one vector per cleaned sentence in `text` and return them as a list.\n",
    "\n",
    "    A non-empty sentence maps to the sum of its word vectors (words missing from\n",
    "    `word_embeddings` count as a 100-entry zero vector) divided by its word count\n",
    "    plus 0.001; an empty sentence maps to a 100-entry zero vector.\n",
    "    \"\"\"\n",
    "    def embed(sentence):\n",
    "        if len(sentence) == 0:\n",
    "            return np.zeros((100,))\n",
    "        total = sum([word_embeddings.get(token, np.zeros((100,))) for token in sentence.split(\" \")])\n",
    "        # The small constant keeps the divisor non-zero for whitespace-only input.\n",
    "        return total / (len(sentence.split()) + 0.001)\n",
    "\n",
    "    return [embed(sentence) for sentence in text]\n",
    "\n",
    "def calculate_rankings(sentences, sen_vecs):\n",
    "    \"\"\"Order `sentences` from highest to lowest PageRank score.\n",
    "\n",
    "    `sen_vecs[i]` is the vector of `sentences[i]`. The sentences form a graph\n",
    "    whose edge weights are the pairwise cosine similarities of their vectors\n",
    "    (self-similarity is set to zero); `nx.pagerank` scores each node and the\n",
    "    sentences are returned as a list sorted by descending score.\n",
    "    \"\"\"\n",
    "    if len(sentences) == 0:\n",
    "        return []  # nothing to rank; also avoids stacking an empty list below\n",
    "    # One vectorised call computes every pairwise similarity, replacing a Python\n",
    "    # loop of n*n single-pair calls and the hard-coded 100-dimension reshape.\n",
    "    sim_mat = cosine_similarity(np.vstack(sen_vecs[:len(sentences)]))\n",
    "    np.fill_diagonal(sim_mat, 0.0)  # no self-loops, matching the old `i != j` check\n",
    "    nx_graph = nx.from_numpy_array(sim_mat)\n",
    "    scores = nx.pagerank(nx_graph)\n",
    "    # Tuples sort element-wise, so equal scores fall back to comparing the text.\n",
    "    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)\n",
    "    return [s for (_, s) in ranked_sentences]\n",
    "\n",
    "def create_summary(text, points=1):\n",
    "    \"\"\"Return the first `points` sentences of `text` in ranked order.\n",
    "\n",
    "    Chains `clean_text`, `create_sentence_vecs` and `calculate_rankings`, then\n",
    "    slices the ranked list, so fewer than `points` sentences may come back.\n",
    "    \"\"\"\n",
    "    original, cleaned = clean_text(text)\n",
    "    ranked = calculate_rankings(original, create_sentence_vecs(cleaned))\n",
    "    return ranked[:points]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "08570759",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "HERE'S THE GIST: \n",
      "ADD YOUR TEXT HERE\n"
     ]
    }
   ],
   "source": [
    "### ACTIVE CELL ###\n",
    "\n",
    "text = '''\n",
    "ADD YOUR TEXT HERE\n",
    "'''\n",
    "\n",
    "summary = create_summary(text)\n",
    "\n",
    "# The summary list can be empty (e.g. blank text yields no sentences), so guard\n",
    "# the index instead of raising IndexError.\n",
    "if summary:\n",
    "    print(f\"HERE'S THE GIST: {summary[0]}\")\n",
    "else:\n",
    "    print(\"No sentences found - add some text above and run the cell again.\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}