{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "accelerator": "GPU", "colab": { "name": "03_AppStore_PreProc_Classification_Pandas.ipynb", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "9jR2mNIedthU" }, "source": [ "# Obsei Tutorial 03\n", "## This example shows the following Obsei workflow\n", " 1. Observe: App Store's app reviews\n", " 2. Pre-process: Clean review text properly\n", " 3. Analyze: Classify review text within given category list\n", " 4. Inform: Provide all data in Pandas DataFrame\n", " 5. Store: Store data in Google Drive in CSV format" ] }, { "cell_type": "markdown", "metadata": { "id": "mBx_RzpFVFQL" }, "source": [ "## Install Obsei from latest code, perform these steps -\n", "- Select GPU RunType for faster computation \n", "- Restart Runtime after installation\n" ] }, { "cell_type": "code", "metadata": { "id": "Oh74E2T9HO-F", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "a95aff05-684e-44e4-e21a-fbbfe8010453" }, "source": [ "!pip install obsei[all]" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Collecting git+https://github.com/lalitpagaria/obsei.git\n", " Cloning https://github.com/lalitpagaria/obsei.git to /tmp/pip-req-build-wl_1hpon\n", " Running command git clone -q https://github.com/lalitpagaria/obsei.git /tmp/pip-req-build-wl_1hpon\n", "Requirement already satisfied (use --upgrade to upgrade): obsei==0.0.9 from git+https://github.com/lalitpagaria/obsei.git in /usr/local/lib/python3.7/dist-packages\n", "Requirement already satisfied: app-store-reviews-reader==1.2 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.2)\n", "Requirement already satisfied: atlassian-python-api==3.10.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (3.10.0)\n", "Requirement already satisfied: beautifulsoup4==4.9.3 
in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (4.9.3)\n", "Requirement already satisfied: blis==0.7.4 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (0.7.4)\n", "Requirement already satisfied: cachetools==4.2.2 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (4.2.2)\n", "Requirement already satisfied: catalogue==2.0.4 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (2.0.4)\n", "Requirement already satisfied: certifi==2021.5.30 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (2021.5.30)\n", "Requirement already satisfied: chardet==4.0.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (4.0.0)\n", "Requirement already satisfied: click==7.1.2 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (7.1.2)\n", "Requirement already satisfied: courlan==0.4.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (0.4.0)\n", "Requirement already satisfied: cssselect==1.1.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.1.0)\n", "Requirement already satisfied: cymem==2.0.5 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (2.0.5)\n", "Requirement already satisfied: dateparser==1.0.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.0.0)\n", "Requirement already satisfied: deprecated==1.2.12 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.2.12)\n", "Requirement already satisfied: elasticsearch==7.13.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (7.13.1)\n", "Requirement already satisfied: feedparser==6.0.2 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (6.0.2)\n", "Requirement already satisfied: filelock==3.0.12 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (3.0.12)\n", "Requirement already satisfied: gnews==0.1.3 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (0.1.3)\n", "Requirement already satisfied: google-api-core==1.30.0 in 
/usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.30.0)\n", "Requirement already satisfied: google-api-python-client==2.8.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (2.8.0)\n", "Requirement already satisfied: google-auth==1.30.2 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.30.2)\n", "Requirement already satisfied: google-auth-httplib2==0.1.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (0.1.0)\n", "Requirement already satisfied: google-play-scraper==1.0.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.0.0)\n", "Requirement already satisfied: googleapis-common-protos==1.53.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.53.0)\n", "Requirement already satisfied: greenlet==1.1.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.1.0)\n", "Requirement already satisfied: htmldate==0.8.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (0.8.1)\n", "Requirement already satisfied: httplib2==0.19.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (0.19.1)\n", "Requirement already satisfied: huggingface-hub==0.0.8 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (0.0.8)\n", "Requirement already satisfied: idna==2.10 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (2.10)\n", "Requirement already satisfied: importlib-metadata==4.5.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (4.5.0)\n", "Requirement already satisfied: jinja2==3.0.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (3.0.1)\n", "Requirement already satisfied: joblib==1.0.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.0.1)\n", "Requirement already satisfied: justext==2.2.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (2.2.0)\n", "Requirement already satisfied: lxml==4.6.3 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (4.6.3)\n", "Requirement already satisfied: 
markupsafe==2.0.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (2.0.1)\n", "Requirement already satisfied: mmh3==3.0.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (3.0.0)\n", "Requirement already satisfied: murmurhash==1.0.5 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.0.5)\n", "Requirement already satisfied: nltk==3.6.2 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (3.6.2)\n", "Requirement already satisfied: numpy==1.20.3 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.20.3)\n", "Requirement already satisfied: oauthlib==3.1.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (3.1.1)\n", "Requirement already satisfied: packaging==20.9 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (20.9)\n", "Requirement already satisfied: pandas==1.2.4 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.2.4)\n", "Requirement already satisfied: pathy==0.5.2 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (0.5.2)\n", "Requirement already satisfied: praw==7.2.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (7.2.0)\n", "Requirement already satisfied: prawcore==2.1.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (2.1.0)\n", "Requirement already satisfied: preshed==3.0.5 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (3.0.5)\n", "Requirement already satisfied: presidio-analyzer==2.2.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (2.2.1)\n", "Requirement already satisfied: presidio-anonymizer==2.2.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (2.2.1)\n", "Requirement already satisfied: protobuf==3.17.3 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (3.17.3)\n", "Requirement already satisfied: pyasn1==0.4.8 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (0.4.8)\n", "Requirement already satisfied: pyasn1-modules==0.2.8 in 
/usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (0.2.8)\n", "Requirement already satisfied: pycryptodome==3.10.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (3.10.1)\n", "Requirement already satisfied: pydantic==1.7.4 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.7.4)\n", "Requirement already satisfied: pyparsing==2.4.7 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (2.4.7)\n", "Requirement already satisfied: python-dateutil==2.8.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (2.8.1)\n", "Requirement already satisfied: python-facebook-api==0.9.2 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (0.9.2)\n", "Requirement already satisfied: pytz==2021.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (2021.1)\n", "Requirement already satisfied: pyyaml==5.4.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (5.4.1)\n", "Requirement already satisfied: readability-lxml==0.8.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (0.8.1)\n", "Requirement already satisfied: reddit-rss-reader==1.3.2 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.3.2)\n", "Requirement already satisfied: regex==2020.11.13 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (2020.11.13)\n", "Requirement already satisfied: requests==2.25.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (2.25.1)\n", "Requirement already satisfied: requests-file==1.5.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.5.1)\n", "Requirement already satisfied: requests-oauthlib==1.3.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.3.0)\n", "Requirement already satisfied: rsa==4.7.2 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (4.7.2)\n", "Requirement already satisfied: sacremoses==0.0.45 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (0.0.45)\n", "Requirement already satisfied: 
searchtweets-v2==1.0.7 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.0.7)\n", "Requirement already satisfied: sentencepiece==0.1.95 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (0.1.95)\n", "Requirement already satisfied: sgmllib3k==1.0.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.0.0)\n", "Requirement already satisfied: six==1.16.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.16.0)\n", "Requirement already satisfied: slack-sdk==3.6.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (3.6.0)\n", "Requirement already satisfied: smart-open==3.0.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (3.0.0)\n", "Requirement already satisfied: soupsieve==2.2.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (2.2.1)\n", "Requirement already satisfied: spacy==3.0.5 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (3.0.5)\n", "Requirement already satisfied: spacy-legacy==3.0.5 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (3.0.5)\n", "Requirement already satisfied: sqlalchemy==1.4.17 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.4.17)\n", "Requirement already satisfied: srsly==2.4.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (2.4.1)\n", "Requirement already satisfied: thinc==8.0.4 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (8.0.4)\n", "Requirement already satisfied: tld==0.12.6 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (0.12.6)\n", "Requirement already satisfied: tldextract==3.1.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (3.1.0)\n", "Requirement already satisfied: tokenizers==0.10.3 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (0.10.3)\n", "Requirement already satisfied: tqdm==4.61.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (4.61.0)\n", "Requirement already satisfied: trafilatura==0.8.2 in 
/usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (0.8.2)\n", "Requirement already satisfied: transformers==4.6.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (4.6.1)\n", "Requirement already satisfied: tweet-preprocessor==0.6.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (0.6.0)\n", "Requirement already satisfied: typer==0.3.2 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (0.3.2)\n", "Requirement already satisfied: typing-extensions==3.10.0.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (3.10.0.0)\n", "Requirement already satisfied: tzlocal==2.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (2.1)\n", "Requirement already satisfied: update-checker==0.18.0 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (0.18.0)\n", "Requirement already satisfied: uritemplate==3.0.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (3.0.1)\n", "Requirement already satisfied: urllib3==1.26.5 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.26.5)\n", "Requirement already satisfied: vadersentiment==3.3.2 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (3.3.2)\n", "Requirement already satisfied: wasabi==0.8.2 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (0.8.2)\n", "Requirement already satisfied: websocket-client==1.0.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.0.1)\n", "Requirement already satisfied: wrapt==1.12.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.12.1)\n", "Requirement already satisfied: zenpy==2.0.24 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (2.0.24)\n", "Requirement already satisfied: zipp==3.4.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (3.4.1)\n", "Requirement already satisfied: torch==1.8.1 in /usr/local/lib/python3.7/dist-packages (from obsei==0.0.9) (1.8.1)\n", "Requirement already satisfied: setuptools>=40.3.0 in 
/usr/local/lib/python3.7/dist-packages (from google-api-core==1.30.0->obsei==0.0.9) (57.0.0)\n", "Requirement already satisfied: cattrs<2.0,>=1.1; python_version >= \"3.7\" and python_version < \"4.0\" in /usr/local/lib/python3.7/dist-packages (from python-facebook-api==0.9.2->obsei==0.0.9) (1.7.1)\n", "Requirement already satisfied: responses>=0.11 in /usr/local/lib/python3.7/dist-packages (from python-facebook-api==0.9.2->obsei==0.0.9) (0.13.3)\n", "Requirement already satisfied: attrs<21.0.0,>=20.1.0 in /usr/local/lib/python3.7/dist-packages (from python-facebook-api==0.9.2->obsei==0.0.9) (20.3.0)\n", "Building wheels for collected packages: obsei\n", " Building wheel for obsei (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for obsei: filename=obsei-0.0.9-cp37-none-any.whl size=65557 sha256=cce33049986ee20144625a85f90699a6ae020c7a8454bb4f156750446385e03b\n", " Stored in directory: /tmp/pip-ephem-wheel-cache-4be2m6lr/wheels/49/1a/6e/2fd83c9a275b7096fc615a0edef2d55b1fc33c3751ba45c1ad\n", "Successfully built obsei\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "52xZY6EVVFhW" }, "source": [ "## Mount your Google Drive to store CSV" ] }, { "cell_type": "code", "metadata": { "id": "nn216pBrEzIz", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "18e156b7-9f74-4dfb-f5e7-f70c5c8af95c" }, "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "JtTiwHGFVF5f" }, "source": [ "## Configure following input -\n", "- `name`: Brand name of App\n", "- `category_list`: List of categories to perform review text classification\n", "- `identifier`: Id of the app, it can be found at the end of the url of app in app 
store\n", "- `country`: Country of reviews\n", "- `lookup_period`: How many old reviews to collect (**Note**: Apple rate-limits requests and provides a maximum of 450 reviews)\n", "- `extra_stop_words`: Extra stop words to clean from review text" ] }, { "cell_type": "code", "metadata": { "id": "-ex2Uj01oyte" }, "source": [ "name = \"zomato\"\n", "category_list = [\"easy order placement\", \"realtime order tracking\", \"easy payment options\", \"rewards and discounts\",\"user interface\", \"social media Integration\"]\n", "identifier = \"434613896\"\n", "country = \"in\"\n", "lookup_period = \"365d\"\n", "extra_stop_words = [\"i\", \"-\", \"day\", \"will\", \".\", \"use\", \"n\", \"without\", \"please\", \"app\", \"ha\", \"ho\", \"nt\", \"wa\", \n", " \"thi\", \"plz\", \"pleas\", \"ff\", \"ya\", \"thank\", \"you\", \"thanks\", \"mai\"]" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "7a4yOP-9eZYJ" }, "source": [ "## Configure columns of Pandas DataFrame\n", "Only the columns listed in `included_cols` will be returned by the Pandas Sink, and `rename_cols_dict` will rename the selected `included_cols` columns to the desired names" ] }, { "cell_type": "code", "metadata": { "id": "DBE-jy4QeZzu" }, "source": [ "included_cols = [f\"segmented_data_classifier_data_{category}\" for category in category_list]\n", "included_cols.append(\"segmented_data_classifier_data_positive\")\n", "included_cols.append(\"segmented_data_classifier_data_negative\")\n", "included_cols.append(\"processed_text\")\n", "included_cols.append(\"meta_at\")\n", "included_cols.append(\"meta_date\")\n", "included_cols.append(\"meta_published date\")\n", "included_cols.append(\"meta_rating\")\n", "# included_cols.append(\"meta_title\")\n", "included_cols.append(\"meta_publisher_title\")\n", "\n", "rename_cols_dict = {f\"segmented_data_classifier_data_{category}\": category for category in category_list}\n", "rename_cols_dict[\"segmented_data_classifier_data_positive\"] = \"positive\"\n", 
"rename_cols_dict[\"segmented_data_classifier_data_negative\"] = \"negative\"\n", "rename_cols_dict[\"processed_text\"] = \"text\"\n", "rename_cols_dict[\"meta_at\"] = \"time\"\n", "rename_cols_dict[\"meta_date\"] = \"time\"\n", "rename_cols_dict[\"meta_rating\"] = \"ratings\"\n", "rename_cols_dict[\"meta_published date\"] = \"time\"\n", "# rename_cols_dict[\"meta_title\"] = \"title\"\n", "rename_cols_dict[\"meta_publisher_title\"] = \"news publisher\"\n", "rename_cols_dict['Unnamed: 0'] = 'reviews'" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "CPP5pM0eVGq2" }, "source": [ "## Configure App Store Review Observer" ] }, { "cell_type": "code", "metadata": { "id": "Yr8ucKlJHRUg" }, "source": [ "from obsei.source.appstore_scrapper import (\n", " AppStoreScrapperConfig,\n", " AppStoreScrapperSource,\n", ")\n", "\n", "source_config = AppStoreScrapperConfig(\n", " countries=[country],\n", " app_id=identifier,\n", " lookup_period=lookup_period\n", ")\n", "\n", "source = AppStoreScrapperSource()" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "kgJVp1DgVIpM" }, "source": [ "## Configure TextCleaner as Pre-Processor to clean review text\n", "These cleaning functions will run serially" ] }, { "cell_type": "code", "metadata": { "id": "esmT-8IhpHvp", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "5db65872-1017-4bf4-fc15-1dcd5cbc1790" }, "source": [ "from obsei.preprocessor.text_cleaner import TextCleaner, TextCleanerConfig\n", "from obsei.preprocessor.text_cleaning_function import *\n", "\n", "text_cleaner_config = TextCleanerConfig(\n", " stop_words=extra_stop_words,\n", " cleaning_functions = [\n", " ToLowerCase(),\n", " RemoveWhiteSpaceAndEmptyToken(),\n", " RemovePunctuation(),\n", " RemoveSpecialChars(),\n", " DecodeUnicode(),\n", " RemoveDateTime(),\n", " RemoveStopWords(),\n", " RemoveStopWords(stop_words=extra_stop_words),\n", " 
RemoveWhiteSpaceAndEmptyToken(),\n", " ]\n", ")\n", "\n", "text_cleaner = TextCleaner()" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ], "name": "stderr" } ] }, { "cell_type": "markdown", "metadata": { "id": "6_DNazzVVHVV" }, "source": [ "## Configure Classification Analyzer\n", "**Note**: Select model from https://huggingface.co/models?pipeline_tag=zero-shot-classification, if you want to try a different one" ] }, { "cell_type": "code", "metadata": { "id": "26ASFOm_HW2s" }, "source": [ "from obsei.analyzer.classification_analyzer import ClassificationAnalyzerConfig, ZeroShotClassificationAnalyzer\n", "\n", "analyzer_config=ClassificationAnalyzerConfig(\n", " labels=category_list,\n", ")\n", "\n", "text_analyzer = ZeroShotClassificationAnalyzer(\n", " model_name_or_path=\"typeform/mobilebert-uncased-mnli\",\n", " device=\"auto\"\n", ")" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "vTdPn25kVH9l" }, "source": [ "## Configure Pandas DataFrame Informer" ] }, { "cell_type": "code", "metadata": { "id": "1aLYa-QBHaDs" }, "source": [ "from pandas import DataFrame\n", "from obsei.sink.pandas_sink import PandasSink, PandasSinkConfig\n", "\n", "sink_config = PandasSinkConfig(\n", " dataframe=DataFrame(),\n", " include_columns_list=included_cols\n", ")\n", "sink = PandasSink()" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "n0hDL8ChVMuN" }, "source": [ "## Fetch app reviews" ] }, { "cell_type": "code", "metadata": { "id": "YmTz7qKayCTW" }, "source": [ "source_response_list = source.lookup(source_config)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "fPi5DG8oVNK-" }, "source": [ "## PreProcess review text to clean it" ] }, { "cell_type": "code", "metadata": { "id": "TnbjTr4vyHOk" }, 
"source": [ "cleaner_response_list = text_cleaner.preprocess_input(\n", " input_list=source_response_list,\n", " config=text_cleaner_config\n", ")" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "5T1hDzG9VNnd" }, "source": [ "## Analyze reviews to perform classification\n", "**Note**: This is a compute-heavy step" ] }, { "cell_type": "code", "metadata": { "id": "qZyRJIcjyEIu", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "60d24e0c-7f95-4e52-b311-276f31329d0e" }, "source": [ "analyzer_response_list = text_analyzer.analyze_input(\n", " source_response_list=cleaner_response_list,\n", " analyzer_config=analyzer_config\n", ")" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n" ], "name": "stderr" } ] }, { "cell_type": "markdown", "metadata": { "id": "-VhyKGLWVOSB" }, "source": [ "## Inform review data in the form of a Pandas DataFrame" ] }, { "cell_type": "code", "metadata": { "id": "zFPdISnWHr9j", "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "outputId": "9ebd0b7e-d5e3-4777-cf8c-60e813ead9a9" }, "source": [ "sink_config = PandasSinkConfig(\n", " dataframe=DataFrame(),\n", " include_columns_list=included_cols\n", ")\n", "\n", "dataframe = sink.send_data(analyzer_response_list, sink_config)\n", "dataframe.rename(rename_cols_dict,axis=1,inplace=True)\n", "\n", "\n", "dataframe[\"brand\"] = name\n", "dataframe" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", " | text | \n", "positive | \n", "user interface | \n", "rewards and discounts | \n", "negative | \n", "realtime order tracking | \n", "social media Integration | \n", "easy order placement | \n", "easy payment options | \n", "ratings | \n", "time | \n", "brand | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "awesome unmade zomato user switched limited re... | \n", "0.72 | \n", "0.11 | \n", "0.06 | \n", "0.02 | \n", "0.02 | \n", "0.02 | \n", "0.01 | \n", "0.01 | \n", "5 | \n", "2021-07-10 12:21:41 | \n", "zomato | \n", "
1 | \n", "best service mast service thii time | \n", "0.99 | \n", "0.26 | \n", "0.17 | \n", "0.00 | \n", "0.16 | \n", "0.01 | \n", "0.21 | \n", "0.29 | \n", "5 | \n", "2021-07-10 12:20:34 | \n", "zomato | \n", "
2 | \n", "nice nice | \n", "1.00 | \n", "0.70 | \n", "0.38 | \n", "0.00 | \n", "0.30 | \n", "0.06 | \n", "0.44 | \n", "0.58 | \n", "5 | \n", "2021-07-10 12:20:07 | \n", "zomato | \n", "
3 | \n", "listening single cheese burger concern love zo... | \n", "0.98 | \n", "0.81 | \n", "0.00 | \n", "0.00 | \n", "0.05 | \n", "0.00 | \n", "0.06 | \n", "0.07 | \n", "5 | \n", "2021-07-10 12:19:20 | \n", "zomato | \n", "
4 | \n", "good good | \n", "1.00 | \n", "0.62 | \n", "0.42 | \n", "0.00 | \n", "0.50 | \n", "0.05 | \n", "0.53 | \n", "0.69 | \n", "5 | \n", "2021-07-10 12:15:17 | \n", "zomato | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
495 | \n", "nice gud | \n", "1.00 | \n", "0.87 | \n", "0.30 | \n", "0.00 | \n", "0.14 | \n", "0.08 | \n", "0.36 | \n", "0.68 | \n", "5 | \n", "2021-07-07 15:54:35 | \n", "zomato | \n", "
496 | \n", "bad experience delivery guy refused take rs no... | \n", "0.00 | \n", "0.29 | \n", "0.08 | \n", "1.00 | \n", "0.02 | \n", "0.03 | \n", "0.00 | \n", "0.00 | \n", "1 | \n", "2021-07-07 15:54:24 | \n", "zomato | \n", "
497 | \n", "shikha excellent | \n", "1.00 | \n", "0.94 | \n", "0.45 | \n", "0.00 | \n", "0.48 | \n", "0.06 | \n", "0.70 | \n", "0.91 | \n", "5 | \n", "2021-07-07 15:53:40 | \n", "zomato | \n", "
498 | \n", "ordered delivery yet pathetic service | \n", "0.00 | \n", "0.27 | \n", "0.01 | \n", "1.00 | \n", "0.00 | \n", "0.02 | \n", "0.00 | \n", "0.00 | \n", "1 | \n", "2021-07-07 15:47:03 | \n", "zomato | \n", "
499 | \n", "super awesome experience | \n", "0.99 | \n", "0.37 | \n", "0.06 | \n", "0.00 | \n", "0.09 | \n", "0.01 | \n", "0.01 | \n", "0.09 | \n", "5 | \n", "2021-07-07 15:40:27 | \n", "zomato | \n", "
500 rows × 12 columns
\n", "