{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Minimal CoreNLP Scala example\n", "\n", "You'll need to get the CoreNLP jars, for example by declaring CoreNLP as a dependency in some Maven project so that Maven downloads them into your local repository.\n", "\n", "We have to load the jars from disk this way because the CoreNLP models artifact uses the classifier 'models', and jupyter-scala currently doesn't support classifiers in its dependency-loading syntax.\n", "\n", "Before running the next cell, set `myHome` to the directory that contains your `.m2` folder (typically your home directory)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import ammonite.ops.Path\n", "import $ivy.`org.slf4j:slf4j-api:1.7.6`\n", "import $ivy.`com.google.protobuf:protobuf-java:3.0.0`\n", "import $ivy.`joda-time:joda-time:2.9.4`\n", "import $ivy.`de.jollyday:jollyday:0.5.1`\n", "\n", "val coreNLPVersion = \"3.8.0\"\n", "val myHome = \"\"\n", "val pathPrefix = s\"$myHome/.m2/repository\"\n", "val stanfordPrefix = s\"${pathPrefix}/edu/stanford/nlp/stanford-corenlp/$coreNLPVersion\"\n", "\n", "interp.load.cp(\n", " Seq(\n", " Path(s\"${stanfordPrefix}/stanford-corenlp-$coreNLPVersion.jar\"),\n", " Path(s\"${stanfordPrefix}/stanford-corenlp-$coreNLPVersion-models.jar\")\n", " )\n", ")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\u001b[32mimport \u001b[39m\u001b[36medu.stanford.nlp.simple._\n", "\u001b[39m\n", "\u001b[32mimport \u001b[39m\u001b[36mscala.collection.JavaConverters._\u001b[39m" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import edu.stanford.nlp.simple._\n", "import scala.collection.JavaConverters._" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\u001b[36msentenceText\u001b[39m: \u001b[32mString\u001b[39m = \u001b[32m\"Chomsky's colorless green ideas sleep furiously.\"\u001b[39m\n", "\u001b[36msentence\u001b[39m: \u001b[32mSentence\u001b[39m = Chomsky's colorless green ideas sleep furiously." 
] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "val sentenceText = \"Chomsky's colorless green ideas sleep furiously.\"\n", "val sentence = new Sentence(sentenceText)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\u001b[36mres4\u001b[39m: \u001b[32mjava\u001b[39m.\u001b[32mutil\u001b[39m.\u001b[32mList\u001b[39m[\u001b[32mString\u001b[39m] = [PERSON, O, O, O, O, O, O, O]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sentence.nerTags" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\u001b[36mres5\u001b[39m: \u001b[32mjava\u001b[39m.\u001b[32mutil\u001b[39m.\u001b[32mList\u001b[39m[\u001b[32mString\u001b[39m] = [NNP, POS, JJ, JJ, NNS, VBP, RB, .]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sentence.posTags" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ROOT\n", " S\n", " NP\n", " NP\n", " NNP\n", " Chomsky\n", " POS\n", " 's\n", " JJ\n", " colorless\n", " JJ\n", " green\n", " NNS\n", " ideas\n", " VP\n", " VBP\n", " sleep\n", " ADVP\n", " RB\n", " furiously\n", " .\n", " .\n" ] }, { "data": { "text/plain": [ "\u001b[36mparseTree\u001b[39m: \u001b[32medu\u001b[39m.\u001b[32mstanford\u001b[39m.\u001b[32mnlp\u001b[39m.\u001b[32mtrees\u001b[39m.\u001b[32mTree\u001b[39m = (ROOT (S (NP (NP (NNP Chomsky) (POS 's)) (JJ colorless) (JJ green) (NNS ideas)) (VP (VBP sleep) (ADVP (RB furiously))) (. 
.)))" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "val parseTree = sentence.parse()\n", "parseTree.indentedListPrint" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Garden path sentences\n", "\n", "We'll try to parse some [garden path sentences](https://en.wikipedia.org/wiki/Garden_path_sentence)." ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "\u001b[36mgardenPathSentences\u001b[39m: \u001b[32mList\u001b[39m[\u001b[32mSentence\u001b[39m] = \u001b[33mList\u001b[39m(\n", " The government plans to raise taxes were approved.,\n", " The complex houses married and single soldiers and their families.,\n", " The horse raced past the barn fell.,\n", " The old man the boat.\n", ")\n", "\u001b[36mres7_1\u001b[39m: \u001b[32mList\u001b[39m[\u001b[32mList\u001b[39m[(\u001b[32mString\u001b[39m, \u001b[32mString\u001b[39m)]] = \u001b[33mList\u001b[39m(\n", " \u001b[33mList\u001b[39m(\n", " (\u001b[32m\"The\"\u001b[39m, \u001b[32m\"DT\"\u001b[39m),\n", " (\u001b[32m\"government\"\u001b[39m, \u001b[32m\"NN\"\u001b[39m),\n", " (\u001b[32m\"plans\"\u001b[39m, \u001b[32m\"VBZ\"\u001b[39m),\n", " (\u001b[32m\"to\"\u001b[39m, \u001b[32m\"TO\"\u001b[39m),\n", " (\u001b[32m\"raise\"\u001b[39m, \u001b[32m\"VB\"\u001b[39m),\n", " (\u001b[32m\"taxes\"\u001b[39m, \u001b[32m\"NNS\"\u001b[39m),\n", " (\u001b[32m\"were\"\u001b[39m, \u001b[32m\"VBD\"\u001b[39m),\n", " (\u001b[32m\"approved\"\u001b[39m, \u001b[32m\"VBN\"\u001b[39m),\n", " (\u001b[32m\".\"\u001b[39m, \u001b[32m\".\"\u001b[39m)\n", " ),\n", "\u001b[33m...\u001b[39m" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "val gardenPathSentences = List(\n", " \"The government plans to raise taxes were approved.\",\n", " \"The complex houses married and single soldiers and their families.\",\n", " \"The horse raced past the barn fell.\",\n", " \"The old 
man the boat.\" \n", " ).map(str => new Sentence(str))\n", "\n", "gardenPathSentences.map {\n", " sent => \n", " sent.words.asScala.toList.zip(\n", " sent.posTags.asScala.toList\n", " )\n", "}" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sentence:\n", "The government plans to raise taxes were approved.\n", "\n", "Parse tree:\n", "ROOT\n", " S\n", " NP\n", " DT\n", " The\n", " NN\n", " government\n", " VP\n", " VBZ\n", " plans\n", " S\n", " VP\n", " TO\n", " to\n", " VP\n", " VB\n", " raise\n", " SBAR\n", " S\n", " NP\n", " NNS\n", " taxes\n", " VP\n", " VBD\n", " were\n", " VP\n", " VBN\n", " approved\n", " .\n", " .\n", "Sentence:\n", "The complex houses married and single soldiers and their families.\n", "\n", "Parse tree:\n", "ROOT\n", " NP\n", " NP\n", " DT\n", " The\n", " ADJP\n", " JJ\n", " complex\n", " NNS\n", " houses\n", " NP\n", " NP\n", " VBN\n", " married\n", " CC\n", " and\n", " JJ\n", " single\n", " NNS\n", " soldiers\n", " CC\n", " and\n", " NP\n", " PRP$\n", " their\n", " NNS\n", " families\n", " .\n", " .\n", "Sentence:\n", "The horse raced past the barn fell.\n", "\n", "Parse tree:\n", "ROOT\n", " S\n", " NP\n", " DT\n", " The\n", " NN\n", " horse\n", " VP\n", " VBD\n", " raced\n", " SBAR\n", " S\n", " NP\n", " IN\n", 