{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Text preprocessing with NLTK\n",
    "\n",
    "Walks one product review through tokenizing, stopword removal, stemming, part-of-speech tagging and chunking, then fits a small random forest on synthetic data.\n",
    "\n",
    "**Prerequisite:** the NLTK data packages used by the tokenizer, the stopword corpus and the POS tagger must already be installed (see `nltk.download`).\n",
    "\n",
    "Run with *Restart Kernel -> Run All*; every cell relies on names defined in the cells above it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import PorterStemmer\n",
    "from nltk.tokenize import sent_tokenize, word_tokenize\n",
    "from sklearn.datasets import make_classification\n",
    "from sklearn.ensemble import RandomForestClassifier"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1st step: tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Went to order a Playstation 5 on the preorder date.', 'Could not get one to safe my life... but there were plenty of PS5 DualSense Wireless controllers available.', \"So that's something, right?\", 'Sony should be ashamed of themselves.']\n"
     ]
    }
   ],
   "source": [
    "review = \"Went to order a Playstation 5 on the preorder date. Could not get one to safe my life... but there were plenty of PS5 DualSense Wireless controllers available. So that's something, right? Sony should be ashamed of themselves.\"\n",
    "\n",
    "# Split the review into sentences.\n",
    "sent_as_token = sent_tokenize(review)\n",
    "print(sent_as_token)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Went', 'to', 'order', 'a', 'Playstation', '5', 'on', 'the', 'preorder', 'date', '.', 'Could', 'not', 'get', 'one', 'to', 'safe', 'my', 'life', '...', 'but', 'there', 'were', 'plenty', 'of', 'PS5', 'DualSense', 'Wireless', 'controllers', 'available', '.', 'So', 'that', \"'s\", 'something', ',', 'right', '?', 'Sony', 'should', 'be', 'ashamed', 'of', 'themselves', '.']\n"
     ]
    }
   ],
   "source": [
    "# Split the review into word and punctuation tokens; every later step reuses this list.\n",
    "words_as_token = word_tokenize(review)\n",
    "print(words_as_token)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2nd step: delete stopwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Went', 'to', 'order', 'a', 'Playstation', '5', 'on', 'the', 'preorder', 'date', '.', 'Could', 'not', 'get', 'one', 'to', 'safe', 'my', 'life', '...', 'but', 'there', 'were', 'plenty', 'of', 'PS5', 'DualSense', 'Wireless', 'controllers', 'available', '.', 'So', 'that', \"'s\", 'something', ',', 'right', '?', 'Sony', 'should', 'be', 'ashamed', 'of', 'themselves', '.']\n",
      "['Went', 'order', 'Playstation', '5', 'preorder', 'date', '.', 'Could', 'get', 'one', 'safe', 'life', '...', 'plenty', 'PS5', 'DualSense', 'Wireless', 'controllers', 'available', '.', \"'s\", 'something', ',', 'right', '?', 'Sony', 'ashamed', '.']\n"
     ]
    }
   ],
   "source": [
    "# Keep the word set under its own name: assigning it to `stopwords` would\n",
    "# shadow the imported corpus reader and make this cell fail when re-run.\n",
    "english_stopwords = set(stopwords.words('english'))\n",
    "\n",
    "# The stopword list is lowercase, so compare case-insensitively; otherwise\n",
    "# capitalised stopwords such as 'So' are left in the result.\n",
    "filtered_sent = [\n",
    "    word for word in words_as_token\n",
    "    if word.lower() not in english_stopwords\n",
    "]\n",
    "\n",
    "print(words_as_token)\n",
    "print(filtered_sent)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3rd step: stemming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['went', 'to', 'order', 'a', 'playstat', '5', 'on', 'the', 'preorder', 'date', '.', 'could', 'not', 'get', 'one', 'to', 'safe', 'my', 'life', '...', 'but', 'there', 'were', 'plenti', 'of', 'ps5', 'dualsens', 'wireless', 'control', 'avail', '.', 'So', 'that', \"'s\", 'someth', ',', 'right', '?', 'soni', 'should', 'be', 'asham', 'of', 'themselv', '.']\n"
     ]
    }
   ],
   "source": [
    "stemmer = PorterStemmer()\n",
    "\n",
    "# Stem the unfiltered tokens (stopwords included), one stem per token.\n",
    "stemmed_sent = [stemmer.stem(word) for word in words_as_token]\n",
    "\n",
    "print(stemmed_sent)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4th step: speech tagging"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('Went', 'NN'), ('to', 'TO'), ('order', 'NN'), ('a', 'DT'), ('Playstation', 'NN'), ('5', 'CD'), ('on', 'IN'), ('the', 'DT'), ('preorder', 'NN'), ('date', 'NN'), ('.', '.'), ('Could', 'MD'), ('not', 'RB'), ('get', 'VB'), ('one', 'CD'), ('to', 'TO'), ('safe', 'VB'), ('my', 'PRP$'), ('life', 'NN'), ('...', ':'), ('but', 'CC'), ('there', 'EX'), ('were', 'VBD'), ('plenty', 'NN'), ('of', 'IN'), ('PS5', 'NNP'), ('DualSense', 'NNP'), ('Wireless', 'NNP'), ('controllers', 'NNS'), ('available', 'JJ'), ('.', '.'), ('So', 'IN'), ('that', 'DT'), (\"'s\", 'VBZ'), ('something', 'NN'), (',', ','), ('right', 'RB'), ('?', '.'), ('Sony', 'NNP'), ('should', 'MD'), ('be', 'VB'), ('ashamed', 'VBN'), ('of', 'IN'), ('themselves', 'PRP'), ('.', '.')]\n"
     ]
    }
   ],
   "source": [
    "# pos_tag takes a LIST of tokenized words and yields (word, tag) pairs.\n",
    "tagged_words = nltk.pos_tag(words_as_token)\n",
    "\n",
    "print(tagged_words)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5th step: chunking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(S\n",
      "  Went/NN\n",
      "  to/TO\n",
      "  order/NN\n",
      "  a/DT\n",
      "  Playstation/NN\n",
      "  5/CD\n",
      "  on/IN\n",
      "  the/DT\n",
      "  preorder/NN\n",
      "  date/NN\n",
      "  ./.\n",
      "  Could/MD\n",
      "  not/RB\n",
      "  get/VB\n",
      "  one/CD\n",
      "  to/TO\n",
      "  safe/VB\n",
      "  my/PRP$\n",
      "  life/NN\n",
      "  .../:\n",
      "  but/CC\n",
      "  there/EX\n",
      "  were/VBD\n",
      "  plenty/NN\n",
      "  of/IN\n",
      "  (Chunk PS5/NNP DualSense/NNP Wireless/NNP)\n",
      "  controllers/NNS\n",
      "  available/JJ\n",
      "  ./.\n",
      "  So/IN\n",
      "  that/DT\n",
      "  's/VBZ\n",
      "  something/NN\n",
      "  ,/,\n",
      "  right/RB\n",
      "  ?/.\n",
      "  (Chunk Sony/NNP)\n",
      "  should/MD\n",
      "  be/VB\n",
      "  ashamed/VBN\n",
      "  of/IN\n",
      "  themselves/PRP\n",
      "  ./.)\n",
      "(Chunk PS5/NNP DualSense/NNP Wireless/NNP)\n",
      "(Chunk Sony/NNP)\n"
     ]
    }
   ],
   "source": [
    "# Tag pattern: any adverbs (RB*), then any verbs (VB*), then one or more\n",
    "# proper nouns (NNP), optionally followed by a singular noun (NN).\n",
    "chunk_of_grammar = r\"\"\"Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}\"\"\"\n",
    "chunkParser = nltk.RegexpParser(chunk_of_grammar)\n",
    "chunked = chunkParser.parse(tagged_words)\n",
    "\n",
    "print(chunked)\n",
    "\n",
    "# Print only the subtrees that the grammar labelled as 'Chunk'.\n",
    "for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):\n",
    "    print(subtree)\n",
    "\n",
    "# Opens the tree in a separate Tk window, so this needs a desktop display.\n",
    "chunked.draw()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using a random forest classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1]\n"
     ]
    }
   ],
   "source": [
    "# Notes: Let's generate a random classification problem\n",
    "X, y = make_classification(\n",
    "    n_samples=1000, n_features=4,\n",
    "    n_informative=2, n_redundant=0,\n",
    "    random_state=0, shuffle=False,\n",
    ")\n",
    "\n",
    "# Notes: Create the random forest. random_state controls bootstrapping\n",
    "clf = RandomForestClassifier(max_depth=2, random_state=0)\n",
    "\n",
    "# Notes: Build a forest of trees from the training set (X, y)\n",
    "clf.fit(X, y)\n",
    "\n",
    "# Notes: The predicted class of an input sample is a vote by the trees in the forest,\n",
    "# weighted by their probability estimates. That is, the predicted class is the one with\n",
    "# highest mean probability estimate across the trees.\n",
    "print(clf.predict([[0, 0, 0, 0]]))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}