{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 構造化データと非構造化データを利用してマッチングを行う事例\n", "\n", "### 関連リンク\n", "* [サンプル一覧](https://cloud.ibm.com/docs/services/natural-language-understanding?topic=natural-language-understanding-sample-apps#sample-apps)\n", "* [オリジナル](https://developer.ibm.com/patterns/generate-insights-from-multiple-data-formats-using-watson-services/)\n", "* [解説(Github)](https://github.com/IBM/generate-insights-from-data-formats-with-watson/blob/master/README.md)\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 必要ライブラリの導入\n", "\n", "1. 設定 \n", "パッケージの導入、APIの認証取得など" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install nltk" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install -U watson-developer-cloud" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install PyPDF2 " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install mammoth" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import nltk\n", "nltk.download('all')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2. パッケージのimport\n", "\n", "必要ライブラリのimport" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import json\n", "import re\n", "import nltk\n", "from botocore.client import Config\n", "import ibm_boto3\n", "import zipfile\n", "from io import BytesIO\n", "from IPython.display import display, HTML\n", "\n", "import os, sys, glob, mammoth\n", "\n", "from watson_developer_cloud import NaturalLanguageUnderstandingV1\n", "from watson_developer_cloud.natural_language_understanding_v1 \\\n", " import Features, EntitiesOptions, SemanticRolesOptions, RelationsOptions, KeywordsOptions\n", "\n", "import PyPDF2 \n", "\n", "from nltk.tokenize import word_tokenize\n", "from nltk.corpus import stopwords" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### COSの認証情報\n", "\n", "sample_config.txt ファイルから**Insert Credentials** \n", "でコード自動生成します。 \n", "変数がcredentials_1でない場合はcredentials_1に変更します。" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# セル自動生成\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### cosアクセス用関数" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cos = ibm_boto3.client('s3',\n", " ibm_api_key_id=credentials_1['IBM_API_KEY_ID'],\n", " ibm_service_instance_id=credentials_1['IAM_SERVICE_ID'],\n", " ibm_auth_endpoint=credentials_1['IBM_AUTH_ENDPOINT'],\n", " config=Config(signature_version='oauth'),\n", " endpoint_url=credentials_1['ENDPOINT'])\n", "\n", "def get_file(filename):\n", " '''Retrieve file from Cloud Object Storage'''\n", " fileobject = cos.get_object(Bucket=credentials_1['BUCKET'], Key=filename)['Body']\n", " return fileobject\n", " \n", "def get_docx_file():\n", " '''Retrieve file '''\n", " docx_files=[]\n", " zip_ref = zipfile.ZipFile(BytesIO(streaming_body_1.read()),'r')\n", " paths = zip_ref.namelist()\n", " for path in paths:\n", " file=zip_ref.extract(path)\n", " docx_files.append(file)\n", " return docx_files\n", "\n", "\n", "def load_string(fileobject):\n", " '''Load the file contents into a Python string'''\n", "\n", " text = fileobject.read()\n", " return text\n", "\n", "def load_df(fileobject,sheetname):\n", " '''Load file contents into a Pandas dataframe'''\n", " excelFile = pd.ExcelFile(fileobject)\n", " df = excelFile.parse(sheetname)\n", " return df\n", "\n", "def put_file(filename, filecontents):\n", " '''Write file to Cloud Object Storage'''\n", " resp = cos.put_object(Bucket=credentials_1['BUCKET'], Key=filename, Body=filecontents)\n", " return resp" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### sample_config.txtファイルのロード" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "config_classification_json= json.loads(load_string(get_file(credentials_1['FILE'])).decode(\"utf-8\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 構造化データ Data.csvをロードしてデータフレーム(stu_df)に読み込む\n", "Data.csvから**Insert pandas DataFrame**でコードを自動生成します。 \n", "df_data_1 は stu_dfに変更します。" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# セル自動生成\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3. NLUの認証情報\n", "iam_apikeyの行を修正します。" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "natural_language_understanding = NaturalLanguageUnderstandingV1(\n", " version='2018-11-16',\n", " iam_apikey='xxxx'\n", " url='https://gateway.watsonplatform.net/natural-language-understanding/api'\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4. NLUを利用した関数定義" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def analyze_using_NLU(text_content):\n", " '''\n", " Call Watson Natural Language Understanding service to obtain analysis results.\n", " '''\n", " response = natural_language_understanding.analyze(\n", " text= text_content,\n", " features=Features(\n", " entities=EntitiesOptions(),\n", " relations=RelationsOptions(),\n", " keywords= KeywordsOptions())\n", " )\n", " return response" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def split_sentences(text):\n", " \"\"\" Split text into sentences.\n", " \"\"\"\n", " sentence_delimiters = re.compile(u'[\\\\[\\\\]\\n.!?]')\n", " sentences = sentence_delimiters.split(text)\n", " return sentences\n", "\n", "def split_into_tokens(text):\n", " \"\"\" Split text into tokens.\n", " \"\"\"\n", " tokens = nltk.word_tokenize(text)\n", " return tokens\n", " \n", "def POS_tagging(text):\n", " \"\"\" Generate Part of speech tagging of the text.\n", " \"\"\"\n", " POSofText = nltk.tag.pos_tag(text)\n", " return POSofText\n", "\n", "def keyword_tagging(tag,tagtext,text):\n", " \"\"\" Tag the text matching keywords.\n", " \"\"\"\n", " if (text.lower().find(tagtext.lower()) != -1):\n", " return text[text.lower().find(tagtext.lower()):text.lower().find(tagtext.lower())+len(tagtext)]\n", " else:\n", " return 'UNKNOWN'\n", " \n", "def regex_tagging(tag,regex,text):\n", " \"\"\" Tag the text matching REGEX.\n", " \"\"\" \n", " p = re.compile(regex, re.IGNORECASE)\n", " matchtext = p.findall(text)\n", " regex_list=[] \n", " if (len(matchtext)>0):\n", " for regword in matchtext:\n", " regex_list.append(regword)\n", " return regex_list\n", "\n", "def chunk_tagging(tag,chunk,text):\n", " \"\"\" Tag the text using chunking.\n", " \"\"\"\n", " parsed_cp = nltk.RegexpParser(chunk)\n", " pos_cp = parsed_cp.parse(text)\n", " chunk_list=[]\n", " for root in pos_cp:\n", " if isinstance(root, nltk.tree.Tree): \n", " if root.label() == tag:\n", " chunk_word = ''\n", " for child_root in root:\n", " chunk_word = chunk_word +' '+ child_root[0]\n", " chunk_list.append(chunk_word)\n", " return chunk_list\n", " \n", "def augument_NLUResponse(responsejson,updateType,text,tag):\n", " \"\"\" Update the NLU response JSON with augumented classifications.\n", " \"\"\"\n", " if(updateType == 'keyword'):\n", " if not any(d.get('text', None) == text for d in responsejson['keywords']):\n", " responsejson['keywords'].append({\"text\":text,\"relevance\":0.5})\n", " else:\n", " if not any(d.get('text', None) == text for d in responsejson['entities']):\n", " responsejson['entities'].append({\"type\":tag,\"text\":text,\"relevance\":0.5,\"count\":1}) \n", " \n", "\n", "def classify_text(text, config):\n", " \"\"\" Perform augumented classification of the text.\n", " \"\"\"\n", " \n", " response = analyze_using_NLU(text)\n", " responsejson = response.get_result()\n", " \n", " sentenceList = split_sentences(text)\n", " \n", " tokens = split_into_tokens(text)\n", " \n", " postags = POS_tagging(tokens)\n", " \n", " configjson = config\n", " \n", " for stages in configjson['configuration']['classification']['stages']:\n", " for steps in stages['steps']:\n", " if (steps['type'] == 'keywords'):\n", " for keyword in steps['keywords']:\n", " for word in sentenceList:\n", " wordtag = keyword_tagging(keyword['tag'],keyword['text'],word)\n", " if(wordtag != 'UNKNOWN'):\n", " augument_NLUResponse(responsejson,'entities',wordtag,keyword['tag'])\n", " elif(steps['type'] == 'd_regex'):\n", " for regex in steps['d_regex']:\n", " for word in sentenceList:\n", " regextags = regex_tagging(regex['tag'],regex['pattern'],word)\n", " if (len(regextags)>0):\n", " for words in regextags:\n", " augument_NLUResponse(responsejson,'entities',words,regex['tag'])\n", " elif(steps['type'] == 'chunking'):\n", " for chunk in steps['chunk']:\n", " chunktags = chunk_tagging(chunk['tag'],chunk['pattern'],postags)\n", " if (len(chunktags)>0):\n", " for words in chunktags:\n", " augument_NLUResponse(responsejson,'entities',words,chunk['tag'])\n", " else:\n", " print('UNKNOWN STEP')\n", " \n", " return responsejson\n", "\n", "def replace_unicode_strings(response):\n", " \"\"\" Convert dict with unicode strings to strings.\n", " \"\"\"\n", " if isinstance(response, dict):\n", " return {replace_unicode_strings(key): replace_unicode_strings(value) for key, value in response.iteritems()}\n", " elif isinstance(response, list):\n", " return [replace_unicode_strings(element) for element in response]\n", " elif isinstance(response, unicode):\n", " return response.encode('utf-8')\n", " else:\n", " return response" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 5. Job Descriptionから要件を抽出する" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "job_description_1 = \\\n", " \"I need a candidate with User Experience Design skills and experience should be more than 24 months. \"\n", "job_description_2 = \\\n", " \"I need a candidate with Machine Learning Expert skills and experience should be more than 27 months.\"\n", "job_description = [job_description_1, job_description_2]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def getrequirements(job_description):\n", " requirement_jd=[]\n", " for i in job_description:\n", " requirement_jd.append(classify_text(i,config_classification_json))\n", " return requirement_jd\n", "\n", "def getRequiredCandidateEntityList(requirement_jd):\n", " RequiredCandidateEntityList =[]\n", " text_type=''\n", " text_value = ''\n", " entity_dict={}\n", " for i in requirement_jd:\n", " entity_dict={}\n", " for k in i['entities']:\n", " for key1, value1 in k.items():\n", " if(key1=='type'):\n", " text_type = value1\n", " if(key1=='text'):\n", " text_value = value1\n", " entity_dict[text_type] = text_value\n", " RequiredCandidateEntityList.append(entity_dict)\n", " return RequiredCandidateEntityList\n", "\n", "def getskills_matching_candidates(RequiredCandidateEntityList):\n", " '''\n", " Filtering the Candidates matching with the required skills.\n", " '''\n", " skills_matching_candidates =[]\n", " row_list = []\n", " for i in RequiredCandidateEntityList:\n", " requirement_1= i['NAME'].lstrip()\n", " requirement_2= i['Quantity'].lstrip()\n", " for index, row in stu_df.iterrows():\n", " if '/' or ',' in row['Skills']:\n", " if requirement_1 in list(re.split('\\/|,',row['Skills'])):\n", " row_list.append(row)\n", " skills_matching_candidates.append(row['Name'])\n", " else:\n", " if(requirement_1 in row['Skills']):\n", " row_list.append(row)\n", " skills_matching_candidates.append(row['Name'])\n", " return row_list" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 要件サマリー" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "requirement_jd = getrequirements(job_description)\n", "requirement_jd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "RequiredCandidateEntityList = getRequiredCandidateEntityList(requirement_jd)\n", "RequiredCandidateEntityList" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### スキルのマッチした候補者の抽出" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "row_list = getskills_matching_candidates(RequiredCandidateEntityList)\n", "row_list\n", "filtered_dataframe = pd.DataFrame(row_list)\n", "filtered_dataframe" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 6. 候補者レジメの処理" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### data_files.zipをストリームに読み込む\n", "data_files.zipを解凍したファイルは、候補者別のプロファイルになっています。 \n", "data_diles.zip -> **Insert StreamingBody object**でコードを自動生成します。 \n", "streaming_body_2をstreaming_body_1に直します。" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# セル自動生成後修正\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### data_files.zipの解凍" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "zip_ref = zipfile.ZipFile(BytesIO(streaming_body_1.read()),'r')\n", "paths = zip_ref.namelist()\n", "\n", "for path in paths:\n", " print('path:' , path)\n", " factsheet=zip_ref.extract(path)\n", "\n", "path = '/home/dsxuser/work/'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# レジメ一覧の表示\n", "print(paths)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# レジメからテキストデータ抽出\n", "\n", "def extractingTextfromresumes():\n", " '''Extracting Text from the pool of resumes(processing word docs and pdfs)\n", " '''\n", "\n", "\n", " matching_candidates_text = []\n", " filenames = glob.glob(path+'/*.pdf')\n", " filenames_docx= glob.glob(path+'/*.docx')\n", "\n", " for filename in filenames:\n", " print(filename)\n", " pdfFileObj = open(filename,'rb')\n", " #The pdfReader variable is a readable object that will be parsed\n", " pdfReader = PyPDF2.PdfFileReader(pdfFileObj)\n", " #discerning the number of pages will allow us to parse through all #the pages\n", " num_pages = pdfReader.numPages\n", " count = 0\n", " text = \"\"\n", " #The while loop will read each page\n", " while count < num_pages:\n", " pageObj = pdfReader.getPage(count)\n", " count +=1\n", " text += pageObj.extractText()\n", " \n", " \n", " matching_candidates_text.append(text)\n", "\n", " for filename in filenames_docx:\n", " print(filename)\n", " with open(filename, \"rb\") as docx_file:\n", " result = mammoth.extract_raw_text(docx_file)\n", " text = result.value # The raw text\n", " messages = result.messages # Any messages\n", " matching_candidates_text.append(text)\n", " \n", " return matching_candidates_text\n", "\n", "def processTheTextWithWatsonNLU(matching_candidates_text):\n", " '''\n", " Process the text with Watson NLU\n", " '''\n", " NLU_Results_Matched_Candidates = []\n", " for text in matching_candidates_text:\n", " json = classify_text(text,config_classification_json)\n", " NLU_Results_Matched_Candidates.append(json)\n", " return NLU_Results_Matched_Candidates\n", "\n", "\n", "def unstructuredTexttoadataframe(NLU_Results_Matched_Candidates):\n", " '''\n", " Convert the unstructured text(entities in the result of NLU) to a dataframe\n", " '''\n", " matchedCandidateEntityList =[]\n", " entity_dict={}\n", " text_type=''\n", " text_value = ''\n", " for i in NLU_Results_Matched_Candidates:\n", " entity_dict={}\n", " for k in i['entities']:\n", " for key1, value1 in k.items():\n", " if(key1=='type'):\n", " text_type = value1\n", " if(key1=='text'):\n", " text_value = value1\n", " entity_dict[text_type] = text_value\n", " matchedCandidateEntityList.append(entity_dict)\n", " return matchedCandidateEntityList" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "matching_candidates_text = extractingTextfromresumes()\n", "matching_candidates_text" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "NLU_Results_Matched_Candidates = processTheTextWithWatsonNLU(matching_candidates_text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "NLU_Results_Matched_Candidates" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "matchedCandidateEntityList = unstructuredTexttoadataframe(NLU_Results_Matched_Candidates)\n", "matchedCandidateEntityList" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "resume_df = pd.DataFrame(matchedCandidateEntityList)\n", "resume_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 7. 推薦者の表示" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def Recommendation(resume_df):\n", " recommendation=[]\n", " display(HTML('
'+ print_card +'