# 構造化データと非構造化データを利用してマッチングを行う事例

### 関連リンク
* [サンプル一覧](https://cloud.ibm.com/docs/services/natural-language-understanding?topic=natural-language-understanding-sample-apps#sample-apps)
* [オリジナル](https://developer.ibm.com/patterns/generate-insights-from-multiple-data-formats-using-watson-services/)
* [解説(GitHub)](https://github.com/IBM/generate-insights-from-data-formats-with-watson/blob/master/README.md)



###  必要ライブラリの導入

1. 設定　
パッケージの導入、APIの認証取得など

In [None]:
!pip install nltk

In [None]:
!pip install -U watson-developer-cloud

In [None]:
!pip install PyPDF2 

In [None]:
!pip install mammoth

In [None]:
import nltk
# Download the complete NLTK data collection ('all'); later cells call
# nltk.word_tokenize and nltk.tag.pos_tag.
nltk.download('all')

### 2. パッケージのimport

必要ライブラリのimport

In [None]:
import pandas as pd
import json
import re
import nltk
from botocore.client import Config
import ibm_boto3
import zipfile
from io import BytesIO
from IPython.display import display, HTML

import os, sys, glob, mammoth

from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 \
  import Features, EntitiesOptions, SemanticRolesOptions, RelationsOptions, KeywordsOptions

import PyPDF2 

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

### COSの認証情報

sample_config.txt ファイルから**Insert Credentials**  
でコード自動生成します。  
変数がcredentials_1でない場合はcredentials_1に変更します。

In [None]:
# セル自動生成


### cosアクセス用関数

In [None]:
# Create the 's3' client used by get_file() and put_file(). Every connection
# setting is read from the credentials_1 dict, and requests are signed with
# the 'oauth' signature version.
cos = ibm_boto3.client('s3',
                    ibm_api_key_id=credentials_1['IBM_API_KEY_ID'],
                    ibm_service_instance_id=credentials_1['IAM_SERVICE_ID'],
                    ibm_auth_endpoint=credentials_1['IBM_AUTH_ENDPOINT'],
                    config=Config(signature_version='oauth'),
                    endpoint_url=credentials_1['ENDPOINT'])

def get_file(filename):
    """Fetch an object from Cloud Object Storage.

    ``filename`` is used as the object key inside the bucket named by
    ``credentials_1['BUCKET']``. The ``'Body'`` entry of the ``get_object``
    response is returned.
    """
    cos_response = cos.get_object(Bucket=credentials_1['BUCKET'], Key=filename)
    return cos_response['Body']
    
def get_docx_file():
    """Extract every member of the zip archive held in ``streaming_body_1``.

    The module-level ``streaming_body_1`` stream is read completely into
    memory and opened as a zip archive. Each member is extracted with
    ``ZipFile.extract`` (no target path, i.e. into the current working
    directory).

    Returns:
        list: the paths returned by ``ZipFile.extract``, in archive order.
    """
    archive_bytes = BytesIO(streaming_body_1.read())
    # The context manager closes the archive even if an extraction fails.
    with zipfile.ZipFile(archive_bytes, 'r') as zip_ref:
        return [zip_ref.extract(member) for member in zip_ref.namelist()]


def load_string(fileobject):
    """Return whatever ``fileobject.read()`` yields, without modification."""
    return fileobject.read()

def load_df(fileobject,sheetname):
    """Open ``fileobject`` as an Excel workbook and return one sheet.

    The workbook is opened with ``pd.ExcelFile`` and the sheet identified by
    ``sheetname`` is parsed; the parsed result is returned.
    """
    return pd.ExcelFile(fileobject).parse(sheetname)

def put_file(filename, filecontents):
    """Store ``filecontents`` in Cloud Object Storage under key ``filename``.

    The target bucket is ``credentials_1['BUCKET']``. The ``put_object``
    response is returned unchanged.
    """
    return cos.put_object(
        Bucket=credentials_1['BUCKET'],
        Key=filename,
        Body=filecontents,
    )

### sample_config.txtファイルのロード

In [None]:
# Fetch the object named by credentials_1['FILE'], decode its contents as
# UTF-8 and parse them as JSON. The result is the configuration passed to
# classify_text() by the cells below.
config_classification_json= json.loads(load_string(get_file(credentials_1['FILE'])).decode("utf-8"))

### 構造化データ Data.csvをロードしてデータフレーム(stu_df)に読み込む
Data.csvから**Insert pandas DataFrame**でコードを自動生成します。  
df_data_1 は stu_dfに変更します。

In [None]:
# セル自動生成


### 3. NLUの認証情報
iam_apikeyの行を修正します。

In [None]:
# Watson Natural Language Understanding client used by analyze_using_NLU().
# Replace the 'xxxx' placeholder with the API key of your own service instance.
natural_language_understanding = NaturalLanguageUnderstandingV1(
    version='2018-11-16',
    iam_apikey='xxxx',
    url='https://gateway.watsonplatform.net/natural-language-understanding/api'
  )

### 4. NLUを利用した関数定義

In [None]:
def analyze_using_NLU(text_content):
    '''
    Send ``text_content`` to the Watson Natural Language Understanding client,
    requesting the entities, relations and keywords features, and return the
    response object as received.
    '''
    requested_features = Features(
        entities=EntitiesOptions(),
        relations=RelationsOptions(),
        keywords=KeywordsOptions(),
    )
    return natural_language_understanding.analyze(
        text=text_content,
        features=requested_features,
    )

In [None]:
def split_sentences(text):
    """ Cut ``text`` at every '[', ']', newline, '.', '!' or '?'.

    The delimiter characters are dropped. Adjacent, leading or trailing
    delimiters produce empty strings in the returned list.
    """
    return re.split(u'[\\[\\]\n.!?]', text)

def split_into_tokens(text):
    """ Tokenize ``text`` with ``nltk.word_tokenize`` and return the tokens.
    """
    return nltk.word_tokenize(text)
    
def POS_tagging(text):
    """ Return the output of ``nltk.tag.pos_tag`` for ``text``.

    In this file the argument is the token list produced by
    split_into_tokens().
    """
    return nltk.tag.pos_tag(text)

def keyword_tagging(tag,tagtext,text):
    """ Look for ``tagtext`` inside ``text``, ignoring case.

    Returns the slice of ``text`` (original casing) at the first occurrence,
    or the string 'UNKNOWN' when there is none. ``tag`` is accepted but not
    used.
    """
    start = text.lower().find(tagtext.lower())
    if start == -1:
        return 'UNKNOWN'
    return text[start:start + len(tagtext)]
    
def regex_tagging(tag,regex,text):
    """ Collect every case-insensitive match of ``regex`` in ``text``.

    Returns a new list holding the ``findall`` results (empty when nothing
    matches). ``tag`` is accepted but not used.
    """
    return list(re.findall(regex, text, re.IGNORECASE))

def chunk_tagging(tag,chunk,text):
    """ Chunk POS-tagged ``text`` with the grammar ``chunk``.

    The grammar is handed to ``nltk.RegexpParser``. For each direct subtree of
    the parse result whose label equals ``tag``, the first item of every child
    is concatenated, each preceded by a single space. The resulting strings
    are returned as a list.
    """
    parse_tree = nltk.RegexpParser(chunk).parse(text)
    return [
        ''.join(' ' + leaf[0] for leaf in subtree)
        for subtree in parse_tree
        if isinstance(subtree, nltk.tree.Tree) and subtree.label() == tag
    ]
    
def augument_NLUResponse(responsejson,updateType,text,tag):
    """ Add an extra classification to the NLU response, in place.

    When ``updateType`` is 'keyword' a keyword record is appended to
    ``responsejson['keywords']``; any other value appends an entity record
    (typed ``tag``) to ``responsejson['entities']``. Nothing is appended if
    the target list already holds a record with the same 'text'.
    """
    if updateType == 'keyword':
        bucket = responsejson['keywords']
        record = {"text":text,"relevance":0.5}
    else:
        bucket = responsejson['entities']
        record = {"type":tag,"text":text,"relevance":0.5,"count":1}

    for existing in bucket:
        if existing.get('text', None) == text:
            # Already present: leave the response untouched.
            return
    bucket.append(record)
    

def classify_text(text, config):
    """ Analyze ``text`` with NLU and enrich the result using ``config``.

    The NLU result is obtained via analyze_using_NLU(). Every step under
    ``config['configuration']['classification']['stages']`` is then applied:

    * 'keywords' - each configured keyword is searched in every sentence.
    * 'd_regex'  - each configured pattern is matched against every sentence.
    * 'chunking' - each configured grammar is applied to the POS-tagged tokens.

    Every hit is added to the result's 'entities' list through
    augument_NLUResponse(). Any other step type prints 'UNKNOWN STEP'.

    Returns the enriched result dictionary.
    """
    responsejson = analyze_using_NLU(text).get_result()
    sentences = split_sentences(text)
    tagged_tokens = POS_tagging(split_into_tokens(text))

    for stage in config['configuration']['classification']['stages']:
        for step in stage['steps']:
            step_type = step['type']
            if step_type == 'keywords':
                for rule in step['keywords']:
                    for sentence in sentences:
                        found = keyword_tagging(rule['tag'], rule['text'], sentence)
                        if found == 'UNKNOWN':
                            continue
                        augument_NLUResponse(responsejson, 'entities', found, rule['tag'])
            elif step_type == 'd_regex':
                for rule in step['d_regex']:
                    for sentence in sentences:
                        for found in regex_tagging(rule['tag'], rule['pattern'], sentence):
                            augument_NLUResponse(responsejson, 'entities', found, rule['tag'])
            elif step_type == 'chunking':
                for rule in step['chunk']:
                    for found in chunk_tagging(rule['tag'], rule['pattern'], tagged_tokens):
                        augument_NLUResponse(responsejson, 'entities', found, rule['tag'])
            else:
                print('UNKNOWN STEP')

    return responsejson

def replace_unicode_strings(response):
    """ Recursively convert unicode strings in ``response`` to native strings.

    Dictionaries (keys and values) and lists are walked recursively. On
    Python 2, ``unicode`` objects are encoded to UTF-8 byte strings. On
    Python 3, ``str`` is already the native string type, so strings are
    returned unchanged. Any other value is returned as is.
    """
    try:
        legacy_text_type = unicode  # noqa: F821 - only exists on Python 2
    except NameError:
        legacy_text_type = None

    if isinstance(response, dict):
        # .items() is available on both Python 2 and Python 3.
        return {replace_unicode_strings(key): replace_unicode_strings(value)
                for key, value in response.items()}
    if isinstance(response, list):
        return [replace_unicode_strings(element) for element in response]
    if legacy_text_type is not None and isinstance(response, legacy_text_type):
        return response.encode('utf-8')
    return response

### 5. Job Descriptionから要件を抽出する

In [None]:
# Two sample job descriptions in free text; each names a required skill and a
# minimum amount of experience. The list is passed to getrequirements() below.
job_description_1 = \
 "I need a candidate with User Experience Design skills and experience should be more than 24 months. "
job_description_2 = \
 "I need a candidate with Machine Learning Expert skills and experience should be more than 27 months."
job_description = [job_description_1, job_description_2]

In [None]:
def getrequirements(job_description):
    """Run classify_text() on every entry of ``job_description``.

    Each entry is classified with the module-level
    ``config_classification_json``. The results are returned as a list in
    input order.
    """
    return [classify_text(description, config_classification_json)
            for description in job_description]

def getRequiredCandidateEntityList(requirement_jd):
    """Reduce each analysis result to a ``{entity type: entity text}`` dict.

    For every item of ``requirement_jd`` the records under its 'entities' key
    are folded into one dictionary. A later record of the same type overwrites
    an earlier one. One dictionary per input item is returned.
    """
    last_type = ''
    last_text = ''
    collected = []
    for analysis in requirement_jd:
        by_type = {}
        for entity in analysis['entities']:
            # A record lacking 'type' or 'text' reuses the value seen most
            # recently (initially the empty string).
            last_type = entity.get('type', last_type)
            last_text = entity.get('text', last_text)
            by_type[last_type] = last_text
        collected.append(by_type)
    return collected

def getskills_matching_candidates(RequiredCandidateEntityList, candidates=None):
    '''
    Filter the candidates whose skills match the required skills.

    Args:
        RequiredCandidateEntityList: list of dicts; the 'NAME' entry of each
            one (leading whitespace removed) is the required skill.
        candidates: DataFrame with a 'Skills' column to search. Defaults to
            the module-level ``stu_df``.

    Returns:
        list: the matching rows, grouped by requirement in input order.

    Matching rule: when 'Skills' contains '/' or ',', the value is split on
    those delimiters and the required skill must equal one of the parts.
    Otherwise the required skill only has to occur inside the 'Skills' text.
    Rows whose 'Skills' value is not a string are skipped.
    '''
    if candidates is None:
        candidates = stu_df

    row_list = []
    for requirement in RequiredCandidateEntityList:
        required_skill = requirement['NAME'].lstrip()
        for _, row in candidates.iterrows():
            skills = row['Skills']
            if not isinstance(skills, str):
                # A missing value (e.g. NaN) cannot match any skill.
                continue
            # Each delimiter needs its own membership test: "'/' or ',' in s"
            # is always true because the non-empty string '/' is truthy.
            if '/' in skills or ',' in skills:
                matched = required_skill in re.split(r'/|,', skills)
            else:
                matched = required_skill in skills
            if matched:
                row_list.append(row)
    return row_list

### 要件サマリー

In [None]:
# Classify every job description; the bare name on the last line makes the
# notebook show the result.
requirement_jd = getrequirements(job_description)
requirement_jd

In [None]:
# Reduce each classification result to a {entity type: entity text} dict.
RequiredCandidateEntityList = getRequiredCandidateEntityList(requirement_jd)
RequiredCandidateEntityList

### スキルのマッチした候補者の抽出

In [None]:
# Collect the candidate rows whose skills match a requirement.
row_list = getskills_matching_candidates(RequiredCandidateEntityList)
row_list
# Recommendation() reads filtered_dataframe as a module-level name.
filtered_dataframe = pd.DataFrame(row_list)
filtered_dataframe

### 6. 候補者レジメの処理

#### data_files.zipをストリームに読み込む
data_files.zipを解凍したファイルは、候補者別のプロファイルになっています。  
data_files.zip -> **Insert StreamingBody object**でコードを自動生成します。  
streaming_body_2をstreaming_body_1に直します。

In [None]:
# セル自動生成後修正


### data_files.zipの解凍

In [None]:
# Extract every member of the uploaded archive. ZipFile.extract() is called
# without a target path, so files go to the current working directory. The
# context manager closes the archive once extraction is done.
with zipfile.ZipFile(BytesIO(streaming_body_1.read()),'r') as zip_ref:
    paths = zip_ref.namelist()

    for path in paths:
        print('path:' , path)
        factsheet=zip_ref.extract(path)

# Directory that extractingTextfromresumes() searches for resume files.
path = '/home/dsxuser/work/'

In [None]:
# Show the list of resume files contained in the archive
print(paths)

In [None]:
# Extract the text data from the resumes

def extractingTextfromresumes():
    '''Extract the text of every resume found in the ``path`` directory.

    The module-level ``path`` is searched for ``*.pdf`` and ``*.docx`` files.
    PDF text is read page by page with PyPDF2 and concatenated. Word text is
    taken from ``mammoth.extract_raw_text``. Each file name is printed as it
    is processed.

    Returns:
        list: one text string per file, all PDFs first, then all .docx files.
    '''
    matching_candidates_text = []
    filenames = glob.glob(path+'/*.pdf')
    filenames_docx= glob.glob(path+'/*.docx')

    for filename in filenames:
        print(filename)
        # The context manager closes the file once all pages have been read.
        with open(filename, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfFileReader(pdf_file)
            page_texts = [pdf_reader.getPage(page_number).extractText()
                          for page_number in range(pdf_reader.numPages)]
        matching_candidates_text.append("".join(page_texts))

    for filename in filenames_docx:
        print(filename)
        with open(filename, "rb") as docx_file:
            # .value holds the raw text of the conversion result.
            text = mammoth.extract_raw_text(docx_file).value
        matching_candidates_text.append(text)

    return matching_candidates_text

def processTheTextWithWatsonNLU(matching_candidates_text):
    '''
    Run classify_text() on every resume text, using the module-level
    ``config_classification_json``, and return the results as a list in input
    order.
    '''
    return [classify_text(resume_text, config_classification_json)
            for resume_text in matching_candidates_text]


def unstructuredTexttoadataframe(NLU_Results_Matched_Candidates):
    '''
    Reduce each NLU result to a ``{entity type: entity text}`` dictionary.

    The records under every result's 'entities' key are folded into one
    dictionary per result; a later record of the same type overwrites an
    earlier one. The list of dictionaries is returned (the caller turns it
    into a DataFrame).
    '''
    current_type = ''
    current_text = ''
    rows = []
    for nlu_result in NLU_Results_Matched_Candidates:
        fields = {}
        for entity in nlu_result['entities']:
            # A record lacking 'type' or 'text' reuses the value seen most
            # recently (initially the empty string).
            current_type = entity.get('type', current_type)
            current_text = entity.get('text', current_text)
            fields[current_type] = current_text
        rows.append(fields)
    return rows

In [None]:
# Read the text of every extracted resume (PDF and .docx).
matching_candidates_text = extractingTextfromresumes()
matching_candidates_text

In [None]:
# Classify each resume text; one result per resume.
NLU_Results_Matched_Candidates = processTheTextWithWatsonNLU(matching_candidates_text)

In [None]:
# Show the classification results.
NLU_Results_Matched_Candidates

In [None]:
# Reduce each result to a {entity type: entity text} dict.
matchedCandidateEntityList = unstructuredTexttoadataframe(NLU_Results_Matched_Candidates)
matchedCandidateEntityList

In [None]:
# One row per resume, one column per entity type.
resume_df = pd.DataFrame(matchedCandidateEntityList)
resume_df

### 7.  推薦者の表示

In [None]:
def Recommendation(resume_df):
    """Display a warning card for each matched candidate who applied before.

    Every row of ``resume_df`` is matched by phone number ('PhoneNumber')
    against the 'Handphone' column of the module-level ``filtered_dataframe``.
    The first matching row is used. If its 'Applied Before' value is 'yes'
    (case-insensitive), a red card reading "Candidate <Name> <Comments>" is
    displayed.

    Rows whose 'PhoneNumber' cannot be converted to ``int`` (for example when
    no phone number was extracted from the resume) are skipped.

    Returns:
        None. Output is produced through ``display``.
    """
    from html import escape  # local import: only needed to render the cards

    display(HTML('<!DOCTYPE html><html><title>W3.CSS</title><meta name="viewport" content="width=device-width, initial-scale=1"><link rel="stylesheet" href="https://www.w3schools.com/w3css/4/w3.css"><h2>Recommendation</h2></html>'))
    for _, row in resume_df.iterrows():
        try:
            phone_number = int(row['PhoneNumber'])
        except (TypeError, ValueError):
            # Without a usable phone number the resume cannot be matched.
            continue

        # Filter once and reuse the first matching structured record.
        matches = filtered_dataframe[filtered_dataframe['Handphone'] == phone_number]
        if matches.empty:
            continue
        candidate = matches.iloc[0]

        if str(candidate['Applied Before']).lower() != 'yes':
            continue

        # Name and comments come from external data, so escape them before
        # embedding them in the HTML markup.
        print_card = "Candidate "+ escape(str(candidate['Name'])) +" "+ escape(str(candidate['Comments']))
        display(HTML('<!DOCTYPE html><html><title>W3.CSS</title><meta name="viewport" content="width=device-width, initial-scale=1"><link rel="stylesheet" href="https://www.w3schools.com/w3css/4/w3.css"><body><div class="w3-container"><div class="w3-panel w3-card w3-red"><p>'+ print_card +'</p></div></div></body></html>'))

In [None]:
# Render the recommendation cards for the processed resumes.
Recommendation(resume_df)

#### We can observe that for the first requirement, C1 and C10 were both eligible; however, the recommendation shows that C1 had applied before and did not accept the offer, which makes C10 the ideal candidate, likely to accept the offer if selected. For the second requirement, C11 and C14 are eligible; however, the recommendation is to select C11, whose postgraduate specialization makes them a better fit for the role.