{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<CENTER>\n",
    "    <h1>LegalTech HackZurich 2020</h1>\n",
    "    <br><hr><br>\n",
    "    <a href=\"http://universidad.ch/hackzurich/suite/\" class=\"icons\"><img src=\"https://hackzurich.com/media/pages/workshops/15-legaltech/4173366116-1599065131/legaltech-600x-copy-600x.png\" style=\"width:20%\"></a>\n",
    "</CENTER>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>\n",
       "div.input {\n",
       "    display:none;\n",
       "}\n",
       "</style>\n"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%%html\n",
    "<style>\n",
    "/* Hide every element matching div.input so it is not rendered.\n",
    "   NOTE(review): presumably meant to hide the code-input areas of the classic\n",
    "   Notebook UI so only outputs and markdown show; other front ends may use\n",
    "   different class names - confirm. */\n",
    "div.input {\n",
    "    display:none;\n",
    "}\n",
    "</style>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "\n",
    "with open('result.txt', 'r') as f:\n",
    "    sample = f.read()\n",
    "\n",
    "# Pipeline: split into sentences -> tokenize words -> POS-tag -> chunk entities.\n",
    "sentences = nltk.sent_tokenize(sample)\n",
    "tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]\n",
    "tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]\n",
    "\n",
    "# ne_chunk_sents hands back a lazy, one-shot iterator (NLTK 3.x). Materialise it\n",
    "# so the trees can be iterated more than once; otherwise re-running a cell that\n",
    "# loops over chunked_sentences would silently see no trees at all.\n",
    "# binary=True is what makes the entity subtrees carry the single label 'NE'.\n",
    "chunked_sentences = list(nltk.ne_chunk_sents(tagged_sentences, binary=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_entity_names(t):\n",
    "    \"\"\"Return the named-entity strings contained in a chunk tree.\n",
    "\n",
    "    Args:\n",
    "        t: A tree node exposing ``label()`` and iteration over its children,\n",
    "            or a leaf without a ``label`` attribute (presumably the\n",
    "            ``(word, tag)`` tuples produced by the POS tagger).\n",
    "\n",
    "    Returns:\n",
    "        list of str: One space-joined string per subtree labelled ``'NE'``,\n",
    "        in traversal order; an empty list when ``t`` is a leaf.\n",
    "    \"\"\"\n",
    "    # Leaves carry no label and therefore contain no entities.\n",
    "    # (The former extra `and t.label` test checked a bound method, which is\n",
    "    # always truthy, so it never filtered anything.)\n",
    "    if not hasattr(t, 'label'):\n",
    "        return []\n",
    "\n",
    "    # An 'NE' subtree is one entity: join the first item (the word) of each child.\n",
    "    if t.label() == 'NE':\n",
    "        return [' '.join(child[0] for child in t)]\n",
    "\n",
    "    # Any other node (e.g. the sentence root) may hold entities further down.\n",
    "    entity_names = []\n",
    "    for child in t:\n",
    "        entity_names.extend(extract_entity_names(child))\n",
    "    return entity_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['John C. Malone', 'Columbus Holding', 'Mr. Malone', 'Mr. Malone', 'Mr. Malone', 'Mr. Malone', 'Berkshire Hathaway Inc.', 'SEC', 'Berkshire Hathaway', 'Cox', 'Cox International Stock Fund', 'SEC', 'Cox', 'Harris Associates L.P.', 'SEC', 'Harris', 'Harris Associates', 'Liberty Global', 'Liberty Global', 'SEC', 'Robert R. Bennett', 'Mr. Bennett', 'William H. Gates Ill', 'SEC', 'William H. Gates Ill', 'Bill', 'Melinda Gates Foundation Trust', 'Melinda French Gates', 'SEC', 'LLC', 'ROIC', 'LLC', 'William', 'Jason', 'Concert', 'Liberty Global', 'Sunrise', 'Sunrise', 'Annual Report', 'Liberty Global', 'Offeror', 'Liberty Global Group', 'Liberty Global', 'Sunrise As of August', 'Sunrise', 'Sunrise Shares']\n"
     ]
    }
   ],
   "source": [
    "# Flatten the per-sentence results into a single list, keeping the sentence\n",
    "# order and any repeated names.\n",
    "entity_names = [\n",
    "    name\n",
    "    for sentence_tree in chunked_sentences\n",
    "    for name in extract_entity_names(sentence_tree)\n",
    "]\n",
    "\n",
    "# Show every extracted name, repeats included.\n",
    "print(entity_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the extracted names, one per line. The context manager guarantees the\n",
    "# file is flushed and closed even if a write fails part-way through.\n",
    "with open(\"names-extracted.txt\", \"w\") as names_file:\n",
    "    for element in entity_names:\n",
    "        print(element, file=names_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# De-duplicate and sort the exported names in pure Python instead of a\n",
    "# `cat | sort | uniq` shell pipeline: no POSIX shell is required, and I/O\n",
    "# problems raise an exception rather than yielding an ignored exit status.\n",
    "with open('names-extracted.txt') as extracted_file:\n",
    "    unique_names = {line.rstrip('\\n') for line in extracted_file}\n",
    "\n",
    "# Case-insensitive order with the exact string as tie-breaker, so that e.g.\n",
    "# 'LLC' sorts after 'Liberty Global Group'. The shell `sort` this replaces was\n",
    "# locale-dependent; this key reproduces the order of the previously saved listing.\n",
    "sorted_names = sorted(unique_names, key=lambda name: (name.casefold(), name))\n",
    "\n",
    "with open('names-sorted-uniq.txt', 'w') as sorted_file:\n",
    "    for name in sorted_names:\n",
    "        print(name, file=sorted_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Annual Report\n",
      "\n",
      "Berkshire Hathaway\n",
      "\n",
      "Berkshire Hathaway Inc.\n",
      "\n",
      "Bill\n",
      "\n",
      "Columbus Holding\n",
      "\n",
      "Concert\n",
      "\n",
      "Cox\n",
      "\n",
      "Cox International Stock Fund\n",
      "\n",
      "Harris\n",
      "\n",
      "Harris Associates\n",
      "\n",
      "Harris Associates L.P.\n",
      "\n",
      "Jason\n",
      "\n",
      "John C. Malone\n",
      "\n",
      "Liberty Global\n",
      "\n",
      "Liberty Global Group\n",
      "\n",
      "LLC\n",
      "\n",
      "Melinda French Gates\n",
      "\n",
      "Melinda Gates Foundation Trust\n",
      "\n",
      "Mr. Bennett\n",
      "\n",
      "Mr. Malone\n",
      "\n",
      "Offeror\n",
      "\n",
      "Robert R. Bennett\n",
      "\n",
      "ROIC\n",
      "\n",
      "SEC\n",
      "\n",
      "Sunrise\n",
      "\n",
      "Sunrise As of August\n",
      "\n",
      "Sunrise Shares\n",
      "\n",
      "William\n",
      "\n",
      "William H. Gates Ill\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Display the de-duplicated, sorted names. The context manager closes the file\n",
    "# once the listing has been printed.\n",
    "with open(\"names-sorted-uniq.txt\") as sorted_names_file:\n",
    "    for line in sorted_names_file:\n",
    "        # Each line read from the file still ends with its own newline, so\n",
    "        # suppress print's newline to avoid a blank line after every name.\n",
    "        print(line, end='')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<CENTER>\n",
    "    <a href=\"http://universidad.ch/hackzurich/suite/\" class=\"icons\"><img src=\"http://universidad.ch/hackzurich/suite/logos/footer-hackathon.png\" style=\"width:90%\"></a>\n",
    "</CENTER>"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}