{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Importing the libraries\n",
    "import bs4 as bs\n",
    "import urllib.request\n",
    "import re\n",
    "\n",
    "q1 = \"Mustafa_Kemal_Atatürk\"\n",
    "q2 = \"Donald_Trump\"\n",
    "q3 = 'Data_science'\n",
    "q4 = 'Machine_learning'\n",
    "\n",
    "# Gettings the data source\n",
    "from urllib.parse import quote  \n",
    "source = urllib.request.urlopen('https://en.wikipedia.org/wiki/'+ quote(q4)).read()\n",
    "\n",
    "# Parsing the data/ creating BeautifulSoup object\n",
    "soup = bs.BeautifulSoup(source,'lxml')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetching the data\n",
    "text = \"\"\n",
    "for paragraph in soup.find_all('p'):\n",
    "    text += paragraph.text\n",
    "\n",
    "# Preprocessing the data\n",
    "text = re.sub(r'\\[[0-9]*\\]',' ',text)\n",
    "text = re.sub(r'\\s+',' ',text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "str"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     /Users/uzaycetin/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "import nltk\n",
    "nltk.download('stopwords')\n",
    "import heapq\n",
    "\n",
    "#Preprocessing the data\n",
    "text = re.sub(r'\\[[0-9]*\\]',' ',text)\n",
    "text = re.sub(r'\\s+',' ',text)\n",
    "clean_text = text.lower()\n",
    "clean_text = re.sub(r'\\W',' ',clean_text)\n",
    "clean_text = re.sub(r'\\d',' ',clean_text)\n",
    "clean_text = re.sub(r'\\s+',' ',clean_text)\n",
    "\n",
    "# Tokenize sentences\n",
    "sentences = nltk.sent_tokenize(text)\n",
    "\n",
    "# Stopword list\n",
    "stop_words = nltk.corpus.stopwords.words('english')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Word counts \n",
    "word2count = {}\n",
    "for word in nltk.word_tokenize(clean_text):\n",
    "    if word not in stop_words:\n",
    "        if word not in word2count.keys():\n",
    "            word2count[word] = 1\n",
    "        else:\n",
    "            word2count[word] += 1\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Converting counts to weights\n",
    "maxi = max(word2count.values())\n",
    "for key in word2count.keys():\n",
    "    word2count[key] = word2count[key]/maxi\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Product sentence scores    \n",
    "sent2score = {}\n",
    "for sentence in sentences:\n",
    "    for word in nltk.word_tokenize(sentence.lower()):\n",
    "        if word in word2count.keys():\n",
    "            if len(sentence.split(' ')) < 15:\n",
    "                if sentence not in sent2score.keys():\n",
    "                    sent2score[sentence] = word2count[word]\n",
    "                else:\n",
    "                    sent2score[sentence] += word2count[word]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "---------------------------------------------------------\n",
      "Machine learning approaches in particular can suffer from different data biases.\n",
      "In machine learning, genetic algorithms found some uses in the 1980s and 1990s.\n",
      "Software suites containing a variety of machine learning algorithms include the following :\n",
      ":25 Machine learning, reorganized as a separate field, started to flourish in the 1990s.\n",
      "As a scientific endeavour, machine learning grew out of the quest for artificial intelligence.\n",
      "Machine learning and statistics are closely related fields.\n",
      "The name machine learning was coined in 1959 by Arthur Samuel.\n",
      "Machine learning poses a host of ethical questions.\n",
      "Sparse dictionary learning has also been applied in image de-noising.\n",
      "A popular heuristic method for sparse dictionary learning is K-SVD.\n"
     ]
    }
   ],
   "source": [
    "#Gettings best 10 lines             \n",
    "best_sentences = heapq.nlargest(10, sent2score, key=sent2score.get)\n",
    "\n",
    "print('---------------------------------------------------------')\n",
    "for sentence in best_sentences:\n",
    "    print(sentence)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}