{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"2022-01-15-topicmodel.ipynb","provenance":[{"file_id":"https://github.com/recohut/nbs/blob/main/raw/ServiceNow_v3_Approach2_Type2.ipynb","timestamp":1644615574916}],"collapsed_sections":[],"mount_file_id":"1ta_4FdFlx2sliaxexxYOlBUHs0Phr7Vs","authorship_tag":"ABX9TyNZOiSp92pWVj2asaXXOLqD"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"markdown","source":["# Topic Modeling on ServiceNow Dataset"],"metadata":{"id":"iCE_tZl7ueIG"}},{"cell_type":"code","metadata":{"id":"oqg7Pwgp2dom"},"source":["# !pip install pyLDAvis\n","# !pip install rake-nltk"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"fRfjyGqznR17"},"source":["path = '/content'"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"32ZuGdl9NTr_"},"source":["from nltk.util import ngrams"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"ycACxmXqm6FI","colab":{"base_uri":"https://localhost:8080/","height":86},"executionInfo":{"status":"ok","timestamp":1601016105413,"user_tz":-330,"elapsed":2265,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"45fa081d-3052-40ee-b7b4-7b2c71e2eee5"},"source":["import os\n","import pandas as pd\n","\n","import re\n","import numpy as np\n","from tqdm import tqdm\n","import seaborn as sns\n","import matplotlib.pyplot as plt\n","from itertools import groupby\n","from bs4 import BeautifulSoup\n","from collections import OrderedDict\n","from wordcloud import WordCloud\n","from sklearn.feature_extraction.text import CountVectorizer\n","from sklearn.decomposition import LatentDirichletAllocation as LDA\n","\n","from pyLDAvis import sklearn as sklearn_lda\n","import pickle \n","import pyLDAvis\n","\n","from rake_nltk import Rake\n","\n","import nltk\n","from nltk.corpus import stopwords\n","from nltk.stem import PorterStemmer\n","from nltk.stem import WordNetLemmatizer\n","\n","import warnings\n","warnings.filterwarnings(\"ignore\")\n","\n","nltk.download('stopwords')\n","stopwords = list(set(stopwords.words('english')))\n","\n","ps = PorterStemmer()\n","nltk.download('wordnet') \n","lemmatizer = WordNetLemmatizer()"],"execution_count":null,"outputs":[{"output_type":"stream","text":["[nltk_data] Downloading package stopwords to /root/nltk_data...\n","[nltk_data] Package stopwords is already up-to-date!\n","[nltk_data] Downloading package wordnet to /root/nltk_data...\n","[nltk_data] Unzipping corpora/wordnet.zip.\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"00zX_eAonLxj","colab":{"base_uri":"https://localhost:8080/","height":156},"executionInfo":{"status":"ok","timestamp":1601015450499,"user_tz":-330,"elapsed":1905,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"36575d4c-7339-4cab-ba20-98d2d4e7fd42"},"source":["files = os.listdir(path); files"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["['incident_dump.xlsx',\n"," 'incident_Cleaned.xlsx',\n"," 'serviceNow_Model.h5',\n"," 'Subset_Database.xlsx',\n"," 'Subset_Hardware.xlsx',\n"," 'Subset_Inquiry.xlsx',\n"," 'Subset_Network.xlsx',\n"," 'Subset_Software.xlsx']"]},"metadata":{"tags":[]},"execution_count":9}]},{"cell_type":"code","metadata":{"id":"qGZb-j1pnTlG"},"source":["Subset_Database = pd.read_excel(os.path.join(path,'Subset_Database.xlsx'))\n","Subset_Hardware = 
pd.read_excel(os.path.join(path,'Subset_Hardware.xlsx'))\n","Subset_Inquiry = pd.read_excel(os.path.join(path,'Subset_Inquiry.xlsx'))\n","Subset_Network = pd.read_excel(os.path.join(path,'Subset_Network.xlsx'))\n","Subset_Software = pd.read_excel(os.path.join(path,'Subset_Software.xlsx'))"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"8kzyUZcynO-z"},"source":["def clean_l1(text, extended=False):\n","    # lowercase, keep letters only, collapse whitespace, then lemmatize (verbs first, then nouns)\n","    text = ' ' + text + ' '\n","    text = text.lower()\n","    text = re.sub(r'[^a-z ]', ' ', text)\n","    text = ' '.join(text.split())\n","    text = ' '.join([lemmatizer.lemmatize(w, 'v') for w in text.split()])\n","    text = ' '.join([lemmatizer.lemmatize(w) for w in text.split()])\n","    if extended:\n","        text = ' '.join([w for w in text.split() if w not in stopwords])\n","    return text"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"7SRLwqcyq5yY"},"source":["def plot_10_most_common_words(count_data, count_vectorizer):\n","    # sum term counts over all documents and plot the ten most frequent terms\n","    words = count_vectorizer.get_feature_names()\n","    total_counts = np.zeros(len(words))\n","    for t in count_data:\n","        total_counts += t.toarray()[0]\n","\n","    count_dict = zip(words, total_counts)\n","    count_dict = sorted(count_dict, key=lambda x: x[1], reverse=True)[0:10]\n","    words = [w[0] for w in count_dict]\n","    counts = [w[1] for w in count_dict]\n","    x_pos = np.arange(len(words))\n","\n","    plt.figure(2, figsize=(5, 5/1.6180))\n","    plt.subplot(title='10 most common words')\n","    sns.set_context(\"notebook\", font_scale=1.25, rc={\"lines.linewidth\": 2.5})\n","    sns.barplot(x=x_pos, y=counts, palette='husl')\n","    plt.xticks(x_pos, words, rotation=90)\n","    plt.xlabel('words')\n","    plt.ylabel('counts')\n","    plt.show()"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"ZZNzPlOl09z5"},"source":["def print_topics(model, count_vectorizer, n_top_words):\n","    # print the n_top_words highest-weighted terms for each LDA topic\n","    words = count_vectorizer.get_feature_names()\n","    for topic_idx, topic in enumerate(model.components_):\n","        print(\"\\nTopic #%d:\" % topic_idx)\n","        print(\" \".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"hD9GpzJZzjAo"},"source":["def topic_model(df, outfile='output'):\n","    # select the short-description column\n","    df = df[['Short description']].copy()\n","    # drop nulls first so clean_l1 never receives NaN, then clean and drop duplicates\n","    df = df.dropna()\n","    df['Short description'] = df['Short description'].apply(clean_l1)\n","    df = df.drop_duplicates()\n","    # drop records with one word or less\n","    df = df[df['Short description'].str.split().str.len().gt(1)]\n","    # compile list of documents\n","    docs = df['Short description'].tolist()\n",
"    # keyword extraction: keep the top 100 RAKE phrases of exactly two words\n","    min_len = 2; max_len = 2\n","    r = Rake(min_length=min_len, max_length=max_len)\n","    r.extract_keywords_from_sentences(docs)\n","    keywords = r.get_ranked_phrases()[:100]\n",
"    def tokenize(text):\n","        # build unigrams plus in-order n-grams, then keep only the RAKE keyphrases\n","        tokens = text.split()\n","        candidates = list(tokens)\n","        for i in range(2, max_len + 1):\n","            candidates.extend([' '.join(g) for g in ngrams(tokens, i)])\n","        return list(set(candidates) & set(keywords))\n",
"    # document-term matrix restricted to the extracted keyphrases\n","    count_vectorizer = CountVectorizer(stop_words='english', tokenizer=tokenize)\n","    count_data = count_vectorizer.fit_transform(docs)\n","    # plot_10_most_common_words(count_data, count_vectorizer)\n","    # generate wordcloud\n","    sum_words = count_data.sum(axis=0)\n","    words_freq = [(word, sum_words[0, idx]) for word, idx in count_vectorizer.vocabulary_.items()]\n","    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)\n",
"    words_dict = dict(words_freq)\n","    wordcloud = WordCloud(background_color=\"white\", max_words=5000, contour_width=3, contour_color='steelblue')\n","    wordcloud.generate_from_frequencies(words_dict)\n","    wordcloud.to_file(outfile + '.png')\n","    # generate topic model\n","    number_topics = 5  #PARAM\n","    number_words = 10  #PARAM\n","    lda = LDA(n_components=number_topics, n_jobs=-1)\n","    lda.fit(count_data)\n","    # print(\"Topics found via LDA:\")\n","    # print_topics(lda, count_vectorizer, number_words)\n","    # LDA Visualization\n","    LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)\n","    pyLDAvis.save_html(LDAvis_prepared, outfile + '.html')"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"3A5zyk7YnkdE"},"source":["topic_model(Subset_Database, 'Subset_Database')\n","topic_model(Subset_Hardware, 'Subset_Hardware')\n","topic_model(Subset_Inquiry, 'Subset_Inquiry')\n","topic_model(Subset_Network, 'Subset_Network')\n","topic_model(Subset_Software, 'Subset_Software')"],"execution_count":null,"outputs":[]},
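{"cell_type":"markdown","metadata":{},"source":["A quick illustrative check (not part of the original pipeline): the cell below applies `clean_l1` and the same two-word RAKE keyphrase extraction that `topic_model` uses internally to a few hypothetical short descriptions, so the effect of the cleaning and keyword filtering can be inspected in isolation. The sample strings are made up for illustration and are not records from the ServiceNow dump."]},
{"cell_type":"code","metadata":{},"source":["# Illustrative sanity check on hypothetical short descriptions (not real ServiceNow records).\n","sample_docs = [\n","    'Unable to connect to VPN from laptop',\n","    'Outlook keeps crashing after the latest update',\n","    'Password reset required for database account',\n","]\n","\n","# Apply the same cleaning used inside topic_model.\n","cleaned = [clean_l1(d) for d in sample_docs]\n","print(cleaned)\n","\n","# Extract two-word RAKE keyphrases, mirroring the settings inside topic_model.\n","r = Rake(min_length=2, max_length=2)\n","r.extract_keywords_from_sentences(cleaned)\n","print(r.get_ranked_phrases()[:10])"],"execution_count":null,"outputs":[]}]}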