{ "cells": [ { "metadata": { "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load in \n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the \"../input/\" directory.\n# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory\n\nimport os\nprint(os.listdir(\"../input\"))\n\n# Any results you write to the current directory are saved as output.", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a", "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", "trusted": false }, "cell_type": "code", "source": "a1 = pd.read_csv('../input/articles1.csv',index_col=0)\na2 = pd.read_csv('../input/articles2.csv',index_col=0)\na3 = pd.read_csv('../input/articles3.csv',index_col=0)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "7e9376b3282ea215f037162e393d69a7dcbdea26", "_cell_guid": "ab78cba9-3b57-401e-8929-73b471d10eab", "trusted": false }, "cell_type": "code", "source": "df = pd.concat([a1,a2,a3])", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "6f64ea7678508c9163907dd62c8c8fe158e1f25c", "_cell_guid": "a0c22de9-806a-4149-a19f-a81139f80281", "trusted": false }, "cell_type": "code", "source": "# save memory\ndel a1, a2, a3", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "c508847a37594db3471c6b8fd321df918cf778b3", "_cell_guid": "f4b52949-8e3d-441b-9084-ad4c1ab7e770", "trusted": false, "collapsed": true }, 
"cell_type": "code", "source": "df.head()", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "68a2a4e49dc4dea7b0bfb22622430404ba0fa5b6", "_cell_guid": "a5471e95-c234-4d1d-b689-83f130b88b5c", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "import matplotlib.pyplot as plt\nplt.figure(figsize=(10,7))\ndf.publication.value_counts().plot(kind='bar')", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "5a3ceff40105221f78b89628b84a20080baba994", "_cell_guid": "46517f5c-d533-494c-b1b8-eceb3ea93254", "trusted": false }, "cell_type": "code", "source": "doc = df.loc[0,'content']", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "2178ed59937b307ebf38092a1c923b6c3c7871f0", "_cell_guid": "7e3b6c54-83ea-4c22-b3a3-a7156d86b6b8", "trusted": false }, "cell_type": "code", "source": "import spacy\nfrom spacy import displacy\nnlp = spacy.load('en')", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "854d655262fd47260a228d55342bf515c68e5375", "_cell_guid": "3df9aa8d-66f2-4d92-9aa8-db9a00425ae7", "trusted": false }, "cell_type": "code", "source": "doc = nlp(doc)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "cf80b1fc5dcd06f4f5495dc8f3b71040c9856279", "scrolled": false, "_cell_guid": "ccd6b4e0-16e3-452f-85e7-585206b0b4e3", "trusted": false }, "cell_type": "code", "source": "displacy.render(doc,style='ent',jupyter=True)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "cf62ab27a2ef26721e77e643b2c373e4b5be04b6", "_cell_guid": "c12ba739-92d9-4ee5-8308-1afe4a60c6a2", "trusted": false }, "cell_type": "code", "source": "from tqdm import tqdm, tqdm_notebook", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "8ca3185749bae470a59550b9fdd873f032cf0321", "_cell_guid": "7efa8bc5-9ac4-4a47-a1b2-55f5ea84e176", "trusted": false }, 
"cell_type": "code", "source": "nlp = spacy.load('en',\n disable=['parser', \n 'tagger',\n 'textcat'])", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "ea067825a440fb27096c446e6e724f05bd60670a", "_cell_guid": "02b5cdb9-9588-4fff-8884-ed00a4201733", "trusted": false }, "cell_type": "code", "source": "frames = []\nfor i in tqdm_notebook(range(1000)):\n doc = df.loc[i,'content']\n text_id = df.loc[i,'id']\n doc = nlp(doc)\n ents = [(e.text, e.start_char, e.end_char, e.label_) \n for e in doc.ents \n if len(e.text.strip(' -—')) > 0]\n frame = pd.DataFrame(ents)\n frame['id'] = text_id\n frames.append(frame)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "20daa610abaf51fdec5d6a1ea61f5ea934fef859", "_cell_guid": "f69bac3a-0772-4db5-b53d-9b953b4a1203", "trusted": false }, "cell_type": "code", "source": "npf = pd.concat(frames)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "651ae5c9d91fcbcf5ea55fcc76fda8dcb64e3141", "_cell_guid": "82176b0d-de6b-4e51-81b7-22560ceb448a", "trusted": false }, "cell_type": "code", "source": "npf.head()", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "34a84d0695332794b17918b595e3cedb7c01637c", "_cell_guid": "39d8da30-8deb-43af-b500-c9e9abd07f50", "trusted": false }, "cell_type": "code", "source": "npf.columns = ['Text','Start','Stop','Type','id']", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "791fb5495989aa40dcc3781b406212efffa1de1a", "_cell_guid": "d1032953-44f3-48db-bab6-a8c95fda888d", "trusted": false }, "cell_type": "code", "source": "plt.figure(figsize=(10,7))\nnpf.Type.value_counts().plot(kind='bar')", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "0351e8d99d9c505a090c54272a0d56a5197671f6", "_cell_guid": "766b57a5-7a28-447b-9998-ae076fe7b267", "trusted": false }, "cell_type": "code", "source": "orgs 
= npf[npf.Type == 'ORG']", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "8330a0cd27cb2a49394faa180d13aa6e0ca527fa", "_cell_guid": "b3d20fe2-8fcb-44a6-931b-9569866a5f4f", "trusted": false }, "cell_type": "code", "source": "plt.figure(figsize=(10,7))\norgs.Text.value_counts()[:15].plot(kind='bar')", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "1b19abb873204c9eb28eb646b9848a81b0b45ec6", "_cell_guid": "394900e4-17e4-4e93-afc6-0515c4066e71", "trusted": false }, "cell_type": "code", "source": "orgs.groupby(['id','Text']).size()", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "85980a782b8708b09646481dcc5a575a0100b420", "_cell_guid": "94bcbc92-8810-4459-9878-0c0401de3634", "trusted": false }, "cell_type": "code", "source": "# The NER-only pipeline loaded earlier disables the parser and tagger, so reload the full\n# model here: the dependency plot, noun chunks and POS tags below need those components.\nnlp = spacy.load('en')\ndoc = nlp('Google to buy Apple')\ndisplacy.render(doc,style='dep',jupyter=True, options={'distance':120})", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "a52738bc06199823a64f6deef96238a648edb006", "_cell_guid": "21b13d90-0f2b-40b9-86c5-cf656a6afd49", "trusted": false }, "cell_type": "code", "source": "for chunk in doc.noun_chunks:\n print(chunk.text,'|' , chunk.root.text,'|', chunk.root.dep_,'|',\n chunk.root.head.text)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "a8b27cb8ecb73292aa97633dca66674f9bfe0a52", "_cell_guid": "d6ad6f49-0996-4abf-8505-3abe7858c14c", "trusted": false }, "cell_type": "code", "source": "\nfor token in doc:\n print(token.text,'|', token.lemma_,'|', token.pos_,'|', token.tag_,'|', token.dep_,'|',\n token.shape_,'|', token.is_alpha,'|', token.is_stop)", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "47ed826b20573c1175c1fecfbb2564ab0bb9dad5", "_cell_guid": "6feefdf7-3154-4615-91bb-1de1923550b1" }, "cell_type": "markdown", "source": "# Fine tuning SpaCy NER" }, { "metadata": { "collapsed": true, 
"_uuid": "52693b068e4743c3baa8245dfa91d13b5539d6bf", "_cell_guid": "dc389118-92c9-498c-b2c9-28abeefc4280", "trusted": false }, "cell_type": "code", "source": "import spacy\nimport random", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "54c000dcab1cef3c824660c47e153290f57fe336", "_cell_guid": "415b3cd8-38ea-45b5-a736-9dbd2c6f3e62", "trusted": false }, "cell_type": "code", "source": "TRAIN_DATA = [\n    ('Who is Shaka Khan?', {\n        'entities': [(7, 17, 'PERSON')]\n    }),\n    ('I like London and Berlin.', {\n        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]\n    })\n]", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "0eafb8b97b4393c13d4b54e418ea263bc799c306", "_cell_guid": "de11db33-506d-4a1a-a662-7a5b7649e8bf", "trusted": false }, "cell_type": "code", "source": "nlp = spacy.load('en')", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "56b68520473c5ea09a32bc484802579fc3589015", "_cell_guid": "10adba0f-bfcd-4cd8-b067-a8f010e99fbd", "trusted": false }, "cell_type": "code", "source": "# create the built-in pipeline components and add them to the pipeline\n# nlp.create_pipe works for built-ins that are registered with spaCy\nif 'ner' not in nlp.pipe_names:\n    ner = nlp.create_pipe('ner')\n    nlp.add_pipe(ner, last=True)\n# otherwise, get it so we can add labels\nelse:\n    ner = nlp.get_pipe('ner')", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "2f8b39c673444941714f25e105fa64170ffeefc1", "_cell_guid": "470ba485-f8ca-424a-9609-e7ec1340a728", "trusted": false }, "cell_type": "code", "source": "# add every entity label that occurs in the training annotations\nfor _, annotations in TRAIN_DATA:\n    for ent in annotations.get('entities'):\n        ner.add_label(ent[2])  # ent is a (start, end, label) tuple", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "fdc22eca85c71fffaefb9de6eda7cf89c395a31e", "_cell_guid": "eb12a084-222f-4abb-abab-eef2c4fa0eae", "trusted": false }, "cell_type": "code", "source": 
"", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "9d58ea7ea4f92cbd116a5c10e91a0a75bc911527", "_cell_guid": "e1175b06-d2f2-4a16-b2d4-8429776ddbb1", "trusted": false }, "cell_type": "code", "source": "n_iter = 5\n\n# get names of other pipes to disable them during training\nother_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']\n\nwith nlp.disable_pipes(*other_pipes):  # only train NER\n    # use the optimizer already set on the pipeline; create one only if there is none\n    optimizer = nlp._optimizer\n    if not optimizer:\n        optimizer = nlp.begin_training()\n\n    for itn in range(n_iter):\n        random.shuffle(TRAIN_DATA)\n        losses = {}\n        for text, annotations in TRAIN_DATA:\n            nlp.update(\n                [text],  # batch of texts\n                [annotations],  # batch of annotations\n                drop=0.5,  # dropout - make it harder to memorise data\n                sgd=optimizer,  # callable to update weights\n                losses=losses)\n        print(losses)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "fd639fa814d38e9480202c9f35f7ef80a1972d4e", "_cell_guid": "7f3b5228-59fd-494d-a6cd-278684907445", "trusted": false }, "cell_type": "code", "source": "# test the trained model\nfor text, _ in TRAIN_DATA:\n    doc = nlp(text)\n    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])\n    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])\n", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "f9d6dbbbb2238a5515e68fd33b00402f2a278e5f", "_cell_guid": "60e86902-29cf-405b-a499-0c8def98e22d" }, "cell_type": "markdown", "source": "# Rule based matching" }, { "metadata": { "collapsed": true, "_uuid": "9a5ceeeee0f707f898d6c571f131866c28a3c5e3", "_cell_guid": "ce3007f2-5fbb-4777-b7ec-31e3efc3f6f7", "trusted": false }, "cell_type": "code", "source": "import spacy\nfrom spacy.matcher import Matcher\n\nnlp = spacy.load('en')", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "b825204a8958ba80a75354325cd89ec855d817f1", "_cell_guid": 
"4729e53e-1daf-43c8-b11f-6ba6df4b257f", "trusted": false }, "cell_type": "code", "source": "# Create the matcher from the pipeline vocab before registering any patterns\nmatcher = Matcher(nlp.vocab)\n\npattern = [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}]\n\n# Register the pattern under the ID 'HelloWorld' with no on-match callback\nmatcher.add('HelloWorld', None, pattern)\n\ndoc = nlp(u'Hello, world! Hello world!')\nmatches = matcher(doc)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "9fb27c729e5c6261a330c9f532783b81d32c3196", "_cell_guid": "1a377ca3-a7c0-4917-87e2-cad9ff9ebe9b", "trusted": false }, "cell_type": "code", "source": "matches", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "16d140a049c3e9b41a68282a48a0c21e0cb7098a", "_cell_guid": "9af707fc-f91d-4c8f-a682-f26525e2175b", "trusted": false }, "cell_type": "code", "source": "doc[0:3]", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "f757a7a246b019879fd48b65a6e565fa622a3c64", "_cell_guid": "1578d68c-2ced-4399-9a8c-0835cec29dfa", "trusted": false }, "cell_type": "code", "source": "", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "9077efbc0ea0a00ddc592212c916e446c198b8c2", "_cell_guid": "22596515-1945-4712-b6e7-bedff67642c5", "trusted": false }, "cell_type": "code", "source": "df.title = df.title.fillna('')", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "e11a33c657f77309f714a952125899e65e4e7151", "_cell_guid": "494c24d1-7fdb-426b-a69f-5525bab3e16e", "trusted": false }, "cell_type": "code", "source": "np.where(df.content.str.contains('iPhone'))", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "8c1ea299cc2ed148b856678634b6df0771c9930d", "_cell_guid": "df5f98f2-3ed1-4476-a8ea-f7e826ace611", "trusted": false }, "cell_type": "code", "source": "df.loc[14]", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "d623581241696371b80c6faa9064eebd556d7b6f", "_cell_guid": "c9b05eb8-e139-46ea-a393-7e1edc7311ff", 
"trusted": false }, "cell_type": "code", "source": "import spacy\nfrom spacy.matcher import Matcher\n\nnlp = spacy.load('en')\nmatcher = Matcher(nlp.vocab)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "93d036a2e893f1c8412225746d74fdae0f5b351e", "_cell_guid": "73362d8d-fc55-4ba6-a8e6-ed42337a0878", "trusted": false }, "cell_type": "code", "source": "# Get the hash of the word 'PRODUCT'. This is required to set an entity.\nPRODUCT = nlp.vocab.strings['PRODUCT']\n\ndef add_product_ent(matcher, doc, i, matches):\n    '''On-match callback: append match number i to doc.ents as a PRODUCT entity.\n\n    matches[i] is unpacked as (match_id, start, end). The entity is appended with +=\n    so that entities already present on the doc are kept rather than overwritten.\n    '''\n    _, start, end = matches[i]\n    doc.ents += ((PRODUCT, start, end),)\n\npattern1 = [{'LOWER': 'iphone'}]\npattern2 = [{'ORTH': 'iPhone'}, {'IS_DIGIT': True}]\n\nmatcher.add('iPhone', add_product_ent, pattern1, pattern2)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "5fe6939173de6a1b38606e2905fb8e90efc51499", "_cell_guid": "88baaeb0-b12b-4ff1-8e45-ef7665ee7e91", "trusted": false }, "cell_type": "code", "source": "matches = matcher(doc)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "44966f86f57f2b66550a875e0e2dc31070f661b7", "_cell_guid": "d3300432-2b77-47b5-8da0-318d3e1b7b48", "trusted": false }, "cell_type": "code", "source": "displacy.render(doc,style='ent',jupyter=True)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "5313313f23561696b8308606348c64b273cddd54", "_cell_guid": "6b10c64f-00e4-4333-ab26-3797d7791600", "trusted": false }, "cell_type": "code", "source": "def matcher_component(doc):\n    '''Pipeline component: run the matcher on doc and return the doc.\n\n    The returned match list is discarded; the matcher is called for the effect of\n    the on-match callbacks registered on it.\n    '''\n    matcher(doc)\n    return doc", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "0698c4308e7738bf458f9658dd33fa45330588f6", "_cell_guid": "75b571ec-37db-4ec2-8960-787523689b5e", "trusted": false }, "cell_type": "code", 
"source": "nlp.add_pipe(matcher_component,last=True)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "6b50a664ec6d07d5aabfd6e8e3e74e9a5a786b73", "_cell_guid": "7eef57af-ff16-467b-84db-175fbd00ba74", "trusted": false }, "cell_type": "code", "source": "doc = nlp(df.content.iloc[14])\n#matcher(doc)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "ac204a19975efb3c0e347573e71a750869d9cb3e", "_cell_guid": "16f1ff38-c045-49db-b0cd-0cfbdbb0160f", "trusted": false }, "cell_type": "code", "source": "displacy.render(doc,style='ent',jupyter=True)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "7060543d981438d9252e5373761b417c7c4a19a1", "_cell_guid": "af6bb57f-6fa1-4b46-901a-24658f8a74a4" }, "cell_type": "markdown", "source": "# Regex" }, { "metadata": { "collapsed": true, "_cell_guid": "fdd0b232-dbd7-48a6-beab-6b6011deb6c1", "_uuid": "531e7b844fae00e2a86ebfd16f35b4108e3aceee", "trusted": false }, "cell_type": "code", "source": "import re", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_cell_guid": "46cec129-9ab7-4bcb-adf1-8f2f52e64c61", "_uuid": "0ea99cbfe47253986206cb1b865db547e66829f3", "trusted": false }, "cell_type": "code", "source": "pattern = 'NL[0-9]{9}B[0-9]{2}'", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_cell_guid": "176ef283-d957-47a4-aba8-6d0897cd0996", "_uuid": "578e2768b179a9e6783819918b9d1a0e89111e3e", "trusted": false }, "cell_type": "code", "source": "my_string = 'ING Bank N.V. 
BTW:NL003028112B01'", "execution_count": null, "outputs": [] }, { "metadata": { "_cell_guid": "ec97f613-530c-46eb-942e-f5c9c0d029f5", "_uuid": "f90b3e8b3b6494d927fd2168c90c5c4402380aaf", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "re.findall(pattern,my_string)", "execution_count": null, "outputs": [] }, { "metadata": { "_cell_guid": "4ff2aa71-d34b-4af4-a1fb-77b7cf0a3c2a", "_uuid": "7293819c4c74f44b5129f5ee2903c8351d330033", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "match = re.search(pattern,my_string)", "execution_count": null, "outputs": [] }, { "metadata": { "_cell_guid": "1b73dc11-d39b-4e07-954c-fe515ebe33b1", "_uuid": "db0626a26c7ca37115ad8304926bbfc4d5a6f013", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "match.span()", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_cell_guid": "4befe83c-cc4f-4519-a443-58a65ae324a6", "_uuid": "d5119c6720366540991602d6bbade6cc4ba2f82c", "trusted": false }, "cell_type": "code", "source": "", "execution_count": null, "outputs": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.6.5", "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", "version": 3 }, "pygments_lexer": "ipython3", "nbconvert_exporter": "python", "file_extension": ".py" } }, "nbformat": 4, "nbformat_minor": 1 }