In [None]:
import spacy

# 関連ライブラリのimport
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer
import lightgbm as lgb
#fold数
folds = 5

import pickle
import os
import gc
gc.enable()
import warnings
warnings.filterwarnings("ignore")
import time

import pandas as pd

from tqdm import tqdm_notebook as tqdm
from collections import defaultdict
import re
import matplotlib.pyplot as plt
import numpy as np

import matplotlib
import japanize_matplotlib
import seaborn as sns

import mojimoji
import itertools
import MeCab

from sklearn.metrics.pairwise import cosine_similarity

# Shared MeCab tagger, created once with the '-Owakati' output option.
tagger = MeCab.Tagger('-Owakati')


def tokenize(text):
    """Run ``text`` through the shared MeCab tagger and return the stripped result.

    Args:
        text: String handed to ``tagger.parse``.

    Returns:
        The tagger output with leading and trailing whitespace removed.
    """
    parsed = tagger.parse(text)
    return parsed.strip()

# 犯罪データ結合

In [None]:
# Load the pre-processed train and test tables.
# NOTE(review): the file names suggest geocoding and mesh codes are already
# attached -- confirm against the notebook that produces them.
train_path = "data/train_processed_add_geocoding_meshcode.csv"
test_path = "data/test_processed_add_geocoding_meshcode.csv"
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [None]:
# Stack train on top of test so both parts are processed in one pass.
# Row labels are kept as-is, so index values repeat across the two parts.
train_and_test = pd.concat([train, test])

In [None]:
# Tokenize every address ("所在地") with the tokenize helper.
train_and_test["tokens"] = train_and_test["所在地"].apply(tokenize)

In [None]:
# Read the crime open data; reset_index() exposes the row number as an
# "index" column, which is used later as the merge key.
crime_raw = pd.read_csv("./opendata/犯罪データ.csv")
crime = crime_raw.reset_index()

In [None]:
# Tokenize the area name ("エリア") of every crime record the same way as the addresses.
crime["tokens"] = crime["エリア"].apply(tokenize)

In [None]:
# Fit a bag-of-words model with the token_pattern normally used here; unlike the
# scikit-learn default it keeps single-character tokens.
bow_model_1 = CountVectorizer(token_pattern='(?u)\\b\\w+\\b', min_df=1)
bow_model_1.fit(train_and_test["tokens"])
vec1 = bow_model_1.transform(train_and_test["tokens"])
vec2 = bow_model_1.transform(crime["tokens"])

# One row per property, one column per crime record.
similarity = cosine_similarity(vec1, vec2)

# The rounded table is kept for display only.
cs_array = pd.DataFrame(np.round(similarity, 3))

# Choose the best-matching crime record from the UNROUNDED similarities.
# Taking the argmax after rounding to 3 decimals makes near-equal candidates tie,
# and ties always resolve to the lowest column, not to the real best match.
# Column labels are 0..n-1, so the positional argmax equals the column label.
cs_array["sim_id"] = similarity.argmax(axis=1)
# NOTE(review): a property sharing no token with any crime record has an all-zero
# row and therefore still gets sim_id 0 -- consider treating those as "no match".

In [None]:
# Display the similarity table together with the chosen sim_id column.
cs_array

In [None]:
# Pair every property row positionally with the crime record chosen for it.
sample1 = train_and_test[["id", "tokens"]].reset_index()
sample2 = cs_array["sim_id"].reset_index()

CRIME_COLUMNS = ['凶悪犯', '粗暴犯', '侵入窃盗', '非侵入窃盗', 'その他']

# Both reset_index() calls add an "index" column; drop them before merging so the
# only "index" left is the merge key coming from `crime`.
matched = pd.concat([sample1, sample2], axis=1).drop(columns="index")
matched = matched.merge(crime, how="left", left_on="sim_id", right_on="index")

# Keep only the crime-count columns.
sample = matched[CRIME_COLUMNS]

In [None]:
# Append the crime columns side by side. Both frames get a fresh 0..n-1 index so
# the concat lines rows up by position rather than by the repeated row labels.
# NOTE: re-running this cell appends the crime columns again.
train_and_test = pd.concat(
    [train_and_test.reset_index(drop=True), sample.reset_index(drop=True)],
    axis=1,
)

In [None]:
# Rows with a rent ("賃料") value go to train; rows without one go to test.
has_rent = train_and_test['賃料'].notnull()
train = train_and_test[has_rent].copy().reset_index(drop=True)
test = train_and_test[~has_rent].copy().reset_index(drop=True)
print(train.shape,test.shape)

In [None]:
# Write both splits, now carrying the crime columns, back to disk.
for frame, out_path in (
    (train, "data/train_processed_add_geocoding_meshcode_crime.csv"),
    (test, "data/test_processed_add_geocoding_meshcode_crime.csv"),
):
    frame.to_csv(out_path, index=False)

# trainとtestで似た物件があるか検索

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Reload the raw tables and replace every missing value with the 'No Data' marker.
train = pd.read_csv("data/train.csv")
train = train.fillna('No Data')
test = pd.read_csv("data/test.csv")
test = test.fillna('No Data')

In [None]:
# Columns joined, in this order and separated by single spaces, into the "text"
# description of a property.
TEXT_COLUMNS = ["所在地", "間取り", "築年数", "面積", "建物構造"]

for frame in (train, test):
    joined = frame[TEXT_COLUMNS[0]]
    for column in TEXT_COLUMNS[1:]:
        joined = joined + " " + frame[column]
    frame["text"] = joined

In [None]:
# Tokenize the combined description of every property, then stack train on top
# of test so a vectorizer can be fitted on both at once.
train["tokens"] = train["text"].apply(tokenize)
test["tokens"] = test["text"].apply(tokenize)
train_and_test = pd.concat([train, test])

In [None]:
# Bag-of-words cosine similarity: rows are train properties, columns are test
# properties. The TF-IDF cell that follows reassigns vec1, vec2 and cs_array.
bow_model_1 = CountVectorizer(token_pattern='(?u)\\b\\w+\\b', min_df=1)
bow_model_1.fit(train_and_test["tokens"])
vec1 = bow_model_1.transform(train["tokens"])
vec2 = bow_model_1.transform(test["tokens"])
bow_similarity = cosine_similarity(vec1, vec2)
cs_array = pd.DataFrame(bow_similarity.round(3))

In [None]:
# Build the TF-IDF model.
# token_pattern is set to the same pattern as the CountVectorizer cells: the
# scikit-learn default only keeps tokens of two or more characters, which would
# silently drop the single-character tokens MeCab produces (digits, "区", "K", ...)
# and make different properties look alike.
vectorizer = TfidfVectorizer(token_pattern='(?u)\\b\\w+\\b', smooth_idf=False)
# Fit the vocabulary and IDF weights on train and test together.
tfidf = vectorizer.fit(train_and_test["tokens"])
vec1 = tfidf.transform(train["tokens"])
vec2 = tfidf.transform(test["tokens"])
# Rows are train properties, columns are test properties; values are rounded to
# 3 decimals.
cs_array = pd.DataFrame(np.round(cosine_similarity(vec1, vec2), 3))

In [None]:
# Spot check: train rows that tie for the highest similarity to test column 2.
col_2 = cs_array[2]
cs_array.loc[col_2 == col_2.max()].index

In [None]:
# Minimum best-match similarity for a test row to be treated as a near-duplicate
# of a train row.
SIM_THRESHOLD = 0.95

# One record per test row that has a sufficiently similar train row:
# [test position, best similarity, test text, matched train texts,
#  matched train rents, mean matched rent, "all matched rents agree" flag]
d = []
for i, max_sim in enumerate(cs_array.max()):
    if max_sim < SIM_THRESHOLD:
        continue

    # Labels of every train row that ties for the best similarity to test row i.
    # Indexing the index directly avoids copying a full-width filtered frame on
    # every iteration.
    indexs = cs_array.index[(cs_array[i] == max_sim).values]
    matched_rent = train["賃料"][indexs]

    d.append([
        i,
        max_sim,
        test["text"][i],
        train["text"][indexs],
        list(matched_rent),
        matched_rent.mean(),
        # True only when every matched train row has the same rent. Comparing just
        # the first rent with the mean would also accept e.g. [5, 3, 7].
        matched_rent.nunique() == 1,
    ])

In [None]:
# Turn the collected records into a table. Passing the column names to the
# constructor also works when `d` is empty, whereas assigning seven names to a
# zero-column frame raises a length-mismatch error.
submission_convert = pd.DataFrame(
    d,
    columns=["test_index", "sim_score", "test_text", "train_text", "train_賃料", "train_平均賃料", "flg"],
)

In [None]:
# Keep only the records whose flag is set.
is_flagged = submission_convert["flg"] == True
submission_convert = submission_convert.loc[is_flagged]

In [None]:
# Load the header-less LightGBM submission; reset_index() adds the row position
# as an "index" column, which serves as the merge key.
submission = pd.read_csv("submission_lgbm.csv", header=None)
submission = submission.reset_index()

In [None]:
# Attach the near-duplicate information to each submission row; rows without a
# match keep NaN in the merged columns.
sample = submission.merge(
    submission_convert,
    how="left",
    left_on="index",
    right_on="test_index",
)

In [None]:
# Difference between the submitted value (column 1) and the mean rent of the
# matched train rows.
sample["差額"] = sample[1].sub(sample["train_平均賃料"])

In [None]:
# Inspect the rows ordered by that difference.
sample.sort_values(by="差額")

In [None]:
# Use the matched train mean rent where one exists, otherwise fall back to the
# submitted value in column 1.
matched_mean_rent = sample['train_平均賃料']
sample["賃料"] = matched_mean_rent.where(matched_mean_rent.notnull(), sample[1])

# Reduce to the two submission columns: column 0 and the final rent.
sample = sample[[0, "賃料"]]
sample

In [None]:
# Write the adjusted submission without a header row or index column.
sample.to_csv(
    "submission_lgbm_replace.csv",
    header=None,
    index=False,
    sep=",",
)