{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "2020-06-20-simple-scraping-nlp-bs4-distilbert.ipynb", "provenance": [], "collapsed_sections": [], "toc_visible": true, "mount_file_id": "1V5UfUhjtiArTRWsFdQ3c5ovAhpl3ZK0j", "authorship_tag": "ABX9TyMp847BPQOf8E+WgvI8ar/K" }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [
{ "cell_type": "markdown", "metadata": { "id": "EF4jikZn8Vxd" }, "source": [ "# Job scraping and clustering\n", "> Simple web scraping with BeautifulSoup4 and NLP with DistilBERT \n", "\n", "- toc: true\n", "- badges: true\n", "- comments: true\n", "- categories: [scraping, bert]\n", "- image: " ] },
{ "cell_type": "markdown", "metadata": { "id": "RjzTML0XaBxB" }, "source": [ "## Part 1 - Environment Setup" ] },
{ "cell_type": "code", "metadata": { "id": "wHTCNnba06md" }, "source": [ "!pip install -q requests beautifulsoup4\n", "!pip install -U sentence-transformers" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "00HKz3Is0_me", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "74ee0a26-980a-4d72-ff85-fa37a50d266e" }, "source": [
"import time\n",
"import csv\n",
"import re\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import requests\n",
"import bs4\n",
"import lxml.etree as xml\n",
"\n",
"import pprint\n",
"from scipy.spatial.distance import cosine, cdist\n",
"\n",
"import nltk\n",
"nltk.download('punkt')\n",
"# Recent NLTK releases (3.9+) load the sentence tokenizer from 'punkt_tab'.\n",
"# Older releases do not know that package; the call then just returns False.\n",
"nltk.download('punkt_tab', quiet=True)\n",
"\n",
"import spacy\n",
"from spacy.lang.en import English\n",
"nlp = English()\n",
"# spaCy 3 adds built-in components by name and returns the component;\n",
"# spaCy 2 needs the component object to be created first.\n",
"if int(spacy.__version__.split('.')[0]) >= 3:\n",
"    sentencizer = nlp.add_pipe('sentencizer')\n",
"else:\n",
"    sentencizer = nlp.create_pipe('sentencizer')\n",
"    nlp.add_pipe(sentencizer)\n",
"\n",
"from sentence_transformers import SentenceTransformer\n",
"\n",
"from sklearn.cluster import KMeans\n",
"\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"%reload_ext google.colab.data_table" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to /root/nltk_data...\n", "[nltk_data] 
Package punkt is already up-to-date!\n" ], "name": "stdout" } ] },
{ "cell_type": "code", "metadata": { "id": "Ag1EdGqC1I0u", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "03f53658-c355-4175-ac4c-2930ae0bbced" }, "source": [ "URLs = [\"https://www.flexjobs.com/blog/post/job-search-strategies-for-success-v2/\",\n", " \"https://www.best-job-interview.com/job-search-strategy.html\",\n", " \"https://content.wisestep.com/job-search-strategies/\",\n", " \"https://www.thebalancecareers.com/top-strategies-for-a-successful-job-search-2060714\",\n", " \"https://www.monster.com/career-advice/article/a-winning-job-search-strategy\",\n", " \"https://interviewdoctor.com/testimonials/\",\n", " \"https://www.telenor.com/10-tips-for-job-hunting-in-the-digital-age/\",\n", " \"https://www.monster.com/career-advice/article/five-ps-of-job-search-progress\",\n", " ]\n", "\n", "requests.get(URLs[7])" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 17 } ] },
{ "cell_type": "markdown", "metadata": { "id": "Da3VLP77aKaA" }, "source": [ "## Part 2 - Scraping" ] },
{ "cell_type": "code", "metadata": { "id": "FvrmHyJHXg54" }, "source": [
"# One entry per URL, in the same order as URLs. 'name' and 'attrs' locate the\n",
"# element that wraps the article on that site; 'tags' lists the child tags\n",
"# whose text is kept.\n",
"SELECTORS = [\n",
"    {'name': 'article', 'attrs': {'class': 'single-post-page'}, 'tags': ['h2', 'p']},\n",
"    {'name': None, 'attrs': {'id': 'ContentColumn'}, 'tags': ['span', 'h2', 'p']},\n",
"    {'name': None, 'attrs': {'class': 'td-ss-main-content'}, 'tags': ['span', 'h2', 'p']},\n",
"    {'name': None, 'attrs': {'id': 'list-sc_1-0'}, 'tags': ['h2', 'p']},\n",
"    {'name': None, 'attrs': {'id': 'mainContent'}, 'tags': ['h2', 'p']},\n",
"    {'name': None, 'attrs': {'class': 'site-inner'}, 'tags': ['blockquote']},\n",
"    {'name': None, 'attrs': {'id': 'primary'}, 'tags': ['p', 'ol']},\n",
"    {'name': None, 'attrs': {'class': 'article-content'}, 'tags': ['p', 'h2']},\n",
"]\n",
"assert len(SELECTORS) == len(URLs), 'need exactly one selector entry per URL'\n",
"\n",
"\n",
"def scrape_article(url, attrs, tags, name=None, timeout=30):\n",
"    \"\"\"Download one page and return ``(title, text)``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    url : str\n",
"        Page to fetch.\n",
"    attrs : dict\n",
"        Attribute filter passed to ``find_all`` to locate the article container.\n",
"    tags : list of str\n",
"        Tag names inside the container whose text is collected, in document order.\n",
"    name : str or None\n",
"        Optional tag name of the container itself.\n",
"    timeout : float\n",
"        Seconds to wait for the server before giving up.\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple of (str, str)\n",
"        The page ``<title>`` text and the collected fragments joined with '. '.\n",
"\n",
"    Raises\n",
"    ------\n",
"    requests.HTTPError\n",
"        If the server answers with an error status.\n",
"    ValueError\n",
"        If no element on the page matches ``name``/``attrs``.\n",
"    \"\"\"\n",
"    response = requests.get(url, timeout=timeout)\n",
"    # Fail loudly instead of parsing an error page as if it were the article.\n",
"    response.raise_for_status()\n",
"    web_page = bs4.BeautifulSoup(response.text, 'lxml')\n",
"    containers = web_page.find_all(name=name, attrs=attrs)\n",
"    if not containers:\n",
"        raise ValueError('no element matching name={!r} attrs={!r} at {}'.format(name, attrs, url))\n",
"    # Only the first matching container is used.\n",
"    article = '. '.join(wp.text for wp in containers[0].find_all(tags))\n",
"    return web_page.head.title.text, article" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "VanjjiMLANOy" }, "source": [
"# Scrape every site once and build the frame in a single step.\n",
"records = []\n",
"for url, selector in zip(URLs, SELECTORS):\n",
"    title, text = scrape_article(url, **selector)\n",
"    records.append({'title': title, 'text': text})\n",
"\n",
"df = pd.DataFrame(records, columns=['title', 'text'])" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "jRzm2wCqOzkb", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "06849979-bc50-42db-9969-4ed45e85a996" }, "source": [ "df = df.dropna().reset_index(drop=True)\n", "df.info()" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "\n", "RangeIndex: 8 entries, 0 to 7\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 title 8 non-null object\n", " 1 text 8 non-null object\n", "dtypes: object(2)\n", "memory usage: 256.0+ bytes\n" ], "name": "stdout" } ] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 297 }, "id": "dHPGx4SzYi26", "outputId": "f9afed13-6e79-449f-a740-b528323149ea" }, "source": [ "df" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titletext
07 Job Search Strategies For Landing Your Next ...Job-hunting can be a tedious task, and it can ...
1Successful Job Search Strategy 2020Home. Job Search Strategy. A good job search s...
220 Effective or Successful Job Search Strategi...Just like most of the things, even the process...
3Top 10 Strategies for a Successful Job Search\\nJob searching isn't just about applying for ...
4Job Hunting | Monster.comTrying to land the right job? Learn how to cre...
5Testimonials of Career and Job Search CoachI found a new role, but before applying I rea...
610 Tips for Job Hunting in the Digital Age - T...On the hunt for a job in this digital age? Tel...
7Job Hunting | Monster.comIf you’re feeling directionless, mastering the...
\n", "
" ], "text/plain": [ " title text\n", "0 7 Job Search Strategies For Landing Your Next ... Job-hunting can be a tedious task, and it can ...\n", "1 Successful Job Search Strategy 2020 Home. Job Search Strategy. A good job search s...\n", "2 20 Effective or Successful Job Search Strategi... Just like most of the things, even the process...\n", "3 Top 10 Strategies for a Successful Job Search \\nJob searching isn't just about applying for ...\n", "4 Job Hunting | Monster.com Trying to land the right job? Learn how to cre...\n", "5 Testimonials of Career and Job Search Coach  I found a new role, but before applying I rea...\n", "6 10 Tips for Job Hunting in the Digital Age - T... On the hunt for a job in this digital age? Tel...\n", "7 Job Hunting | Monster.com If you’re feeling directionless, mastering the..." ] }, "metadata": { "tags": [] }, "execution_count": 53 } ] },
{ "cell_type": "markdown", "metadata": { "id": "1tbb-hABaP8V" }, "source": [ "## Part 3 - Text Preprocessing" ] },
{ "cell_type": "code", "metadata": { "id": "5qgGGZFwXD-m" }, "source": [
"def tokenize(x):\n",
"    \"\"\"Split the string ``x`` into a list of sentence strings with NLTK.\"\"\"\n",
"    return nltk.sent_tokenize(x)" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "gF5EQ_neRg4V" }, "source": [
"def spacy_tokenize(x):\n",
"    \"\"\"Run ``x`` through the module-level ``nlp`` pipeline and return its sentences.\n",
"\n",
"    The result is a list of the objects yielded by ``doc.sents`` (not plain\n",
"    strings); callers read ``.text`` on each element.\n",
"    \"\"\"\n",
"    doc = nlp(x)\n",
"    return list(doc.sents)" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "lx-93M5ZeHtA" }, "source": [
"def sentenize(temp, col = 'text'):\n",
"    \"\"\"Expand the list-valued column ``col`` into one row per list element.\n",
"\n",
"    The other columns are repeated for every element and the original index\n",
"    label is kept on each expanded row. A row whose list is empty is kept\n",
"    with NaN in ``col``. ``col`` is placed last in the returned frame.\n",
"    \"\"\"\n",
"    exploded = temp.explode(col)\n",
"    other_cols = [name for name in temp.columns if name != col]\n",
"    return exploded[other_cols + [col]]" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 419 }, "id": "eMa-nBckXowB", "outputId": "8041e345-8771-480b-821f-9a87e11251ca" }, "source": [
"temp = df[['text']].copy()\n",
"\n",
"# Collapse runs of dots so the sentence splitters see a single terminator.\n",
"temp = temp.assign(text=temp['text'].apply(lambda x: re.sub(r'\\.+', '.', x)))\n",
"\n",
"# Pass 1: NLTK sentence split. The old index (row of df, i.e. the source\n",
"# article) is kept as para_id.\n",
"temp = temp.assign(text=temp['text'].apply(tokenize))\n",
"temp = sentenize(temp, 'text').reset_index()\n",
"temp.columns = ['para_id', 'text']\n",
"\n",
"# Pass 2: spaCy sentencizer on each NLTK sentence.\n",
"temp = temp.assign(text=temp['text'].apply(spacy_tokenize))\n",
"temp = sentenize(temp, 'text').reset_index(drop=True)\n",
"\n",
"temp = temp.dropna()\n",
"\n",
"temp = temp.assign(text=temp['text'].apply(lambda x: x.text.lower()))\n",
"\n",
"# regex=True is required: pandas >= 2.0 treats the pattern literally by default.\n",
"temp = temp.assign(text=temp['text'].str.replace('[^a-zA-Z0-9]', ' ', regex=True))\n",
"\n",
"# Squeeze repeated spaces and trim the ends; otherwise sentences that differ\n",
"# only in spacing survive drop_duplicates.\n",
"temp = temp.assign(text=temp['text'].str.split().str.join(' '))\n",
"\n",
"# Keep only sentences with more than three words.\n",
"temp = temp[temp['text'].str.split().str.len().gt(3)]\n",
"\n",
"temp = temp.drop_duplicates(subset=['text'], keep='first')\n",
"\n",
"temp = temp.reset_index(drop=True)\n",
"\n",
"temp" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
para_idtext
00job hunting can be a tedious task and it can ...
10check out these job search strategies that can...
20job hunting can be a tedious task and without...
30how do you continue to be a focused job seeker...
40create a job hunting strategy
.........
5157could you use some help
5167join monster for free today
5177as a member you can upload up to five version...
5187additionally you can sign up for job alerts s...
5197let monster help you get focused and get hired
\n", "

520 rows × 2 columns

\n", "
" ], "text/plain": [ " para_id text\n", "0 0 job hunting can be a tedious task and it can ...\n", "1 0 check out these job search strategies that can...\n", "2 0 job hunting can be a tedious task and without...\n", "3 0 how do you continue to be a focused job seeker...\n", "4 0 create a job hunting strategy \n", ".. ... ...\n", "515 7 could you use some help \n", "516 7 join monster for free today \n", "517 7 as a member you can upload up to five version...\n", "518 7 additionally you can sign up for job alerts s...\n", "519 7 let monster help you get focused and get hired \n", "\n", "[520 rows x 2 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 57 } ] },
{ "cell_type": "markdown", "metadata": { "id": "i7-T48Ataej9" }, "source": [ "## Part 4 - Text clustering using distilbert" ] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "KSBM_wkgZeEp", "outputId": "9fead659-e02a-4105-b3fa-dfc68a9234f6" }, "source": [ "embedder = SentenceTransformer('distilbert-base-nli-mean-tokens')\n", "corpus = temp.text.tolist()\n", "corpus_embeddings = embedder.encode(corpus)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "100%|██████████| 245M/245M [00:18<00:00, 13.2MB/s]\n" ], "name": "stderr" } ] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "oVOXz_pMZena", "outputId": "4a925590-b625-4b98-b596-f8b547d66ee2" }, "source": [
"TOP_N = 5\n",
"queries = ['customize resume']\n",
"query_embeddings = embedder.encode(queries)\n",
"for query, query_embedding in zip(queries, query_embeddings):\n",
"    # cdist gives cosine *distance*, so the smallest values are the best\n",
"    # matches; argsort already puts them first.\n",
"    distances = cdist([query_embedding], corpus_embeddings, 'cosine')[0]\n",
"    topn_index = distances.argsort()[:TOP_N]\n",
"    print('Query:', query)\n",
"    print('Top {} most similar sentences in corpus:'.format(TOP_N))\n",
"    for idx in topn_index:\n",
"        # Report similarity (1 - distance) so that a higher number means a closer match.\n",
"        print('{} (Similarity: {:.4f})'.format(corpus[idx], 1 - distances[idx]))" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "y0dtT1jBZmMC" }, "source": [
"num_clusters = 20\n",
"# A fixed seed keeps the cluster ids stable between runs; the cells below\n",
"# inspect clusters by id. n_init is spelled out because its default changed\n",
"# in newer scikit-learn releases.\n",
"clustering_model = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)\n",
"cluster_assignment = clustering_model.fit_predict(corpus_embeddings)" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 419 }, "id": "mkrQFoSVZsvX", "outputId": "23b1f50a-1b1e-4f0a-f0fb-4c3f0a068e2c" }, "source": [
"# NOTE: this rebinds df (previously the scraped articles) to the per-sentence\n",
"# cluster table that the following cells filter.\n",
"df = pd.DataFrame(data={'text': corpus, 'cluster': cluster_assignment})\n",
"df" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textcluster
0job hunting can be a tedious task and it can ...2
1check out these job search strategies that can...13
2job hunting can be a tedious task and without...2
3how do you continue to be a focused job seeker...2
4create a job hunting strategy1
.........
515could you use some help10
516join monster for free today4
517as a member you can upload up to five version...18
518additionally you can sign up for job alerts s...8
519let monster help you get focused and get hired1
\n", "

520 rows × 2 columns

\n", "
" ], "text/plain": [ " text cluster\n", "0 job hunting can be a tedious task and it can ... 2\n", "1 check out these job search strategies that can... 13\n", "2 job hunting can be a tedious task and without... 2\n", "3 how do you continue to be a focused job seeker... 2\n", "4 create a job hunting strategy 1\n", ".. ... ...\n", "515 could you use some help 10\n", "516 join monster for free today 4\n", "517 as a member you can upload up to five version... 18\n", "518 additionally you can sign up for job alerts s... 8\n", "519 let monster help you get focused and get hired 1\n", "\n", "[520 rows x 2 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 62 } ] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 793 }, "id": "3caL-UitZssz", "outputId": "eebf3d59-a58b-449f-f838-0d4dd4eac965" }, "source": [
"# Rows of df whose cluster label equals c.\n",
"c = 0\n",
"df[df['cluster'] == c]" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textcluster
31it s a unique way to expand your network and p...0
46using large job boards can provide a great way...0
56while networking is always a useful job search...0
80google alerts will save you time and keep you ...0
88there are some definite benefits to sending yo...0
129jotting such things down will help you recogni...0
158you can browse these job boards in order to fi...0
166attending different career fairs is one of the...0
172your mobile can be your best way to land your ...0
173most of the reputed job boards offer mobile ap...0
178networking is still an effective job search st...0
182rank well on google0
183ranking on google is essential not just for a ...0
191such jobs are great to develop new experiences...0
203if you want to land to a reputed job then hav...0
205sharing positive information about your indust...0
241narrowing your search criteria will save time ...0
250rank well on google0
277implementing a diverse job hunting strategy ca...0
278here are seven smart tactics you can use to tr...0
309taking a temp job can help get your foot in th...0
408i ve put together a list of the top ten things...0
471job hunting can feel like an epic journey with...0
495for example attending at least one networking...0
\n", "
" ], "text/plain": [ " text cluster\n", "31 it s a unique way to expand your network and p... 0\n", "46 using large job boards can provide a great way... 0\n", "56 while networking is always a useful job search... 0\n", "80 google alerts will save you time and keep you ... 0\n", "88 there are some definite benefits to sending yo... 0\n", "129 jotting such things down will help you recogni... 0\n", "158 you can browse these job boards in order to fi... 0\n", "166 attending different career fairs is one of the... 0\n", "172 your mobile can be your best way to land your ... 0\n", "173 most of the reputed job boards offer mobile ap... 0\n", "178 networking is still an effective job search st... 0\n", "182 rank well on google 0\n", "183 ranking on google is essential not just for a ... 0\n", "191 such jobs are great to develop new experiences... 0\n", "203 if you want to land to a reputed job then hav... 0\n", "205 sharing positive information about your indust... 0\n", "241 narrowing your search criteria will save time ... 0\n", "250 rank well on google 0\n", "277 implementing a diverse job hunting strategy ca... 0\n", "278 here are seven smart tactics you can use to tr... 0\n", "309 taking a temp job can help get your foot in th... 0\n", "408 i ve put together a list of the top ten things... 0\n", "471 job hunting can feel like an epic journey with... 0\n", "495 for example attending at least one networking... 0" ] }, "metadata": { "tags": [] }, "execution_count": 63 } ] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 419 }, "id": "bEqryxqyZspl", "outputId": "e5eae5ef-e7bb-4058-e8aa-b43392dfa18f" }, "source": [
"# Rows of df whose cluster label equals c.\n",
"c = 1\n",
"df[df['cluster'] == c]" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textcluster
4create a job hunting strategy1
13a portfolio is essentially samples of your work1
27include reasons for why you want to work at th...1
28tell them what they will stand to gain if they...1
40fill out your employment history and add skills1
.........
504while you re busy job hunting remember also t...1
507try to relate some of your personality traits ...1
508so for a customer service job you might say ...1
512power up your job search1
519let monster help you get focused and get hired1
\n", "

68 rows × 2 columns

\n", "
" ], "text/plain": [ " text cluster\n", "4 create a job hunting strategy 1\n", "13 a portfolio is essentially samples of your work 1\n", "27 include reasons for why you want to work at th... 1\n", "28 tell them what they will stand to gain if they... 1\n", "40 fill out your employment history and add skills 1\n", ".. ... ...\n", "504 while you re busy job hunting remember also t... 1\n", "507 try to relate some of your personality traits ... 1\n", "508 so for a customer service job you might say ... 1\n", "512 power up your job search 1\n", "519 let monster help you get focused and get hired 1\n", "\n", "[68 rows x 2 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 64 } ] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 886 }, "id": "_PSr77HOZsmx", "outputId": "509d34eb-fa5b-4b92-ac3a-fa7ac8a6e2c4" }, "source": [
"# Rows of df whose cluster label equals c.\n",
"c = 6\n",
"df[df['cluster'] == c]" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textcluster
19customize your resume and cover letter6
35optimize your linkedin profile6
43use hashtags to make your posts searchable6
50flexjobs for example focuses on flexible and...6
52use these sites to make your search more fruit...6
54take advantage of networking opportunities6
63photo credit bigstockphoto com6
170use your mobile to search for the job6
171don t limit the usage of your mobile to playin...6
176allow your network to work for you6
184prepare visual or video resume generate linke...6
201make sure you have a clean online reputation6
202if there is any content online that reflects p...6
242customize your resume and cover letter6
252use your name for the url if possible6
310let monster pick up some of the slack6
312but did you know monster can also help bring j...6
396telenor group s chief people officer jon erik...6
418because they will google you6
419get ahead and find out what they will uncover ...6
420if some strange images come up in the search ...6
421and remember that it is possible to make cert...6
422tap into your network6
428of course you should be following them on soci...6
439maybe video is your thing maybe an online pho...6
447come up with stories from your experience that...6
502or even better record a video of yourself so...6
\n", "
" ], "text/plain": [ " text cluster\n", "19 customize your resume and cover letter 6\n", "35 optimize your linkedin profile 6\n", "43 use hashtags to make your posts searchable 6\n", "50 flexjobs for example focuses on flexible and... 6\n", "52 use these sites to make your search more fruit... 6\n", "54 take advantage of networking opportunities 6\n", "63 photo credit bigstockphoto com 6\n", "170 use your mobile to search for the job 6\n", "171 don t limit the usage of your mobile to playin... 6\n", "176 allow your network to work for you 6\n", "184 prepare visual or video resume generate linke... 6\n", "201 make sure you have a clean online reputation 6\n", "202 if there is any content online that reflects p... 6\n", "242 customize your resume and cover letter 6\n", "252 use your name for the url if possible 6\n", "310 let monster pick up some of the slack 6\n", "312 but did you know monster can also help bring j... 6\n", "396 telenor group s chief people officer jon erik... 6\n", "418 because they will google you 6\n", "419 get ahead and find out what they will uncover ... 6\n", "420 if some strange images come up in the search ... 6\n", "421 and remember that it is possible to make cert... 6\n", "422 tap into your network 6\n", "428 of course you should be following them on soci... 6\n", "439 maybe video is your thing maybe an online pho... 6\n", "447 come up with stories from your experience that... 6\n", "502 or even better record a video of yourself so... 6" ] }, "metadata": { "tags": [] }, "execution_count": 65 } ] } ] }