{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f39ddf36",
   "metadata": {},
   "source": [
    "# To use this notebook...\n",
    "\n",
    "1) Click on the _Loading Cell_ below and push Command + Return to run it.\n",
    "\n",
    "2) Scroll down to the _Active Cell_, and enter any text you want where it says \"ADD YOUR TEXT HERE\" (if you don't have a block of text handy, any of [Paul Graham's essays](http://www.paulgraham.com/articles.html) are easy to copy and paste)\n",
    "\n",
    "3) Push Command + Return and it will output the summary (takes 5-60 seconds, depending on the length of the text)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6bac4741",
   "metadata": {},
   "outputs": [],
   "source": [
    "### LOADING CELL ###\n",
    "\n",
    "import re\n",
    "\n",
    "import networkx as nx\n",
    "import nltk\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import sent_tokenize\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "\n",
    "# Sentence tokenizer model and stop-word list used by clean_text().\n",
    "# quiet=True keeps machine-specific download paths out of the saved output.\n",
    "nltk.download('punkt', quiet=True)\n",
    "nltk.download('stopwords', quiet=True)\n",
    "stop_words = stopwords.words('english')\n",
    "\n",
    "GLOVE_PATH = 'glove.6B.100d.shortened.txt'\n",
    "EMBEDDING_DIM = 100  # must match the width of the vectors stored in GLOVE_PATH\n",
    "\n",
    "# Each line of the GloVe file is: <word> <coef_1> ... <coef_n>\n",
    "word_embeddings = {}\n",
    "with open(GLOVE_PATH, encoding='utf-8') as glove_file:\n",
    "    for line in glove_file:\n",
    "        fields = line.split()\n",
    "        if not fields:\n",
    "            continue  # tolerate blank lines instead of failing on fields[0]\n",
    "        word_embeddings[fields[0]] = np.asarray(fields[1:], dtype='float32')\n",
    "\n",
    "\n",
    "def clean_text(text):\n",
    "    \"\"\"Split text into sentences and build a normalised copy of each one.\n",
    "\n",
    "    Returns a tuple (sentences, clean). `sentences` holds the sentences as\n",
    "    produced by sent_tokenize; `clean` holds the same sentences with every\n",
    "    non-letter character replaced by a space, lower-cased, and with English\n",
    "    stop words removed. Both lists have the same length and order.\n",
    "    \"\"\"\n",
    "    sentences = sent_tokenize(text)\n",
    "    stop_set = set(stop_words)  # constant-time membership test per word\n",
    "    clean = []\n",
    "    for sentence in sentences:\n",
    "        letters_only = re.sub('[^a-zA-Z]', ' ', sentence).lower()\n",
    "        kept = [word for word in letters_only.split() if word not in stop_set]\n",
    "        clean.append(' '.join(kept))\n",
    "    return sentences, clean\n",
    "\n",
    "\n",
    "def create_sentence_vecs(text):\n",
    "    \"\"\"Turn each cleaned sentence into one vector of length EMBEDDING_DIM.\n",
    "\n",
    "    `text` is an iterable of cleaned sentences (strings). A sentence vector\n",
    "    is the sum of its words' embeddings divided by (word count + 0.001).\n",
    "    Words missing from word_embeddings contribute a zero vector, and a\n",
    "    sentence without words maps to the zero vector.\n",
    "    \"\"\"\n",
    "    unknown = np.zeros((EMBEDDING_DIM,))\n",
    "    sen_vecs = []\n",
    "    for sentence in text:\n",
    "        words = sentence.split()\n",
    "        if words:\n",
    "            word_vecs = [word_embeddings.get(word, unknown) for word in words]\n",
    "            # The 0.001 is kept from the original formula; it only rescales\n",
    "            # the vector, which cosine similarity ignores.\n",
    "            v = np.sum(word_vecs, axis=0) / (len(words) + 0.001)\n",
    "        else:\n",
    "            v = unknown.copy()\n",
    "        sen_vecs.append(v)\n",
    "    return sen_vecs\n",
    "\n",
    "\n",
    "def calculate_rankings(sentences, sen_vecs):\n",
    "    \"\"\"Order sentences from highest to lowest PageRank score.\n",
    "\n",
    "    Builds a graph whose edge weights are the pairwise cosine similarities\n",
    "    of `sen_vecs`, runs PageRank on it, and returns the sentences sorted by\n",
    "    descending score (ties fall back to comparing the sentence text).\n",
    "    Returns [] when `sentences` is empty.\n",
    "    \"\"\"\n",
    "    if not sentences:\n",
    "        return []\n",
    "    # One vectorised call instead of a separate call for every (i, j) pair.\n",
    "    sim_mat = cosine_similarity(np.vstack(sen_vecs).astype(np.float64))\n",
    "    np.fill_diagonal(sim_mat, 0.0)  # no self-similarity edges\n",
    "    nx_graph = nx.from_numpy_array(sim_mat)\n",
    "    scores = nx.pagerank(nx_graph)\n",
    "    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)\n",
    "    return [s for (_score, s) in ranked_sentences]\n",
    "\n",
    "\n",
    "def create_summary(text, points=1):\n",
    "    \"\"\"Return the `points` highest-ranked sentences of `text`, best first.\"\"\"\n",
    "    sentences, clean = clean_text(text)\n",
    "    sen_vecs = create_sentence_vecs(clean)\n",
    "    return calculate_rankings(sentences, sen_vecs)[:points]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "08570759",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "### ACTIVE CELL ###\n",
    "\n",
    "text = '''\n",
    "ADD YOUR TEXT HERE\n",
    "'''\n",
    "\n",
    "summary = create_summary(text)\n",
    "\n",
    "# create_summary returns an empty list when the text contains no sentences.\n",
    "if summary:\n",
    "    print(f\"HERE'S THE GIST: {summary[0]}\")\n",
    "else:\n",
    "    print('No sentences found - add some text above and run this cell again.')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}