{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "68aaa804",
   "metadata": {},
   "outputs": [],
   "source": [
    "import spacy\n",
    "\n",
    "# Import related libraries\n",
    "from sklearn.model_selection import KFold, StratifiedKFold\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "import lightgbm as lgb\n",
    "# Number of folds\n",
    "folds = 5\n",
    "\n",
    "import pickle\n",
    "import os\n",
    "import gc\n",
    "gc.enable()\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "import time\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "from collections import defaultdict\n",
    "import re\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "import matplotlib\n",
    "import japanize_matplotlib\n",
    "import seaborn as sns\n",
    "\n",
    "import mojimoji\n",
    "import itertools\n",
    "import MeCab\n",
    "\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "\n",
    "# Shared MeCab tagger created with the -Owakati output format.\n",
    "tagger = MeCab.Tagger('-Owakati')\n",
    "\n",
    "\n",
    "def tokenize(text):\n",
    "    \"\"\"Segment `text` with the shared MeCab tagger.\n",
    "\n",
    "    Missing values (None / NaN) yield an empty string, and any other\n",
    "    non-string value is converted with str() first, so the function can be\n",
    "    applied to a raw DataFrame column without raising on non-string cells.\n",
    "\n",
    "    Returns the tagger output with leading/trailing whitespace stripped.\n",
    "    \"\"\"\n",
    "    if not isinstance(text, str):\n",
    "        if pd.isna(text):\n",
    "            return \"\"\n",
    "        text = str(text)\n",
    "    return tagger.parse(text).strip()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "370bb91c",
   "metadata": {},
   "source": [
    "# 犯罪データ結合"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4963ed25",
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.read_csv(\"data/train_processed_add_geocoding_meshcode.csv\")\n",
    "test = pd.read_csv(\"data/test_processed_add_geocoding_meshcode.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c0788b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack train on top of test; ignore_index avoids duplicated row labels.\n",
    "train_and_test = pd.concat([train, test], axis=0, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4bce7bb7",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_and_test[\"tokens\"] = train_and_test[\"所在地\"].apply(tokenize)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "566570e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# reset_index() adds the row position as an explicit \"index\" column.\n",
    "crime = pd.read_csv(\"./opendata/犯罪データ.csv\").reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0058672a",
   "metadata": {},
   "outputs": [],
   "source": [
    "crime[\"tokens\"] = crime[\"エリア\"].apply(tokenize)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b13d7f41",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the bag-of-words vocabulary with the token_pattern we normally use\n",
    "# (one or more word characters, so single-character tokens are kept).\n",
    "bow_model_1 = CountVectorizer(token_pattern='(?u)\\\\b\\\\w+\\\\b', min_df=1)\n",
    "bow_model_1.fit(train_and_test[\"tokens\"])\n",
    "vec1 = bow_model_1.transform(train_and_test[\"tokens\"])\n",
    "vec2 = bow_model_1.transform(crime[\"tokens\"])\n",
    "\n",
    "# Rows: properties (train_and_test), columns: rows of `crime`.\n",
    "similarity = cosine_similarity(vec1, vec2)\n",
    "# Rounded copy kept for inspection only.\n",
    "cs_array = pd.DataFrame(np.round(similarity, 3))\n",
    "# Choose the best match from the unrounded scores so that rounding cannot\n",
    "# create artificial ties (argmax returns the first maximum, like idxmax).\n",
    "cs_array[\"sim_id\"] = similarity.argmax(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8373e13e",
   "metadata": {},
   "outputs": [],
   "source": [
    "cs_array.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5390ac25",
   "metadata": {},
   "outputs": [],
   "source": [
    "CRIME_COLUMNS = ['凶悪犯', '粗暴犯', '侵入窃盗', '非侵入窃盗', 'その他']\n",
    "\n",
    "# Look up the crime columns of the best-matching crime row for every property.\n",
    "# A left merge keeps the row order of cs_array; validate guards against\n",
    "# duplicated crime keys silently multiplying rows.\n",
    "sample = pd.merge(\n",
    "    cs_array[[\"sim_id\"]],\n",
    "    crime[[\"index\"] + CRIME_COLUMNS],\n",
    "    left_on=\"sim_id\",\n",
    "    right_on=\"index\",\n",
    "    how=\"left\",\n",
    "    validate=\"many_to_one\",\n",
    ")[CRIME_COLUMNS]\n",
    "assert len(sample) == len(train_and_test), \"crime lookup must return one row per property\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b4c6c6c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Append the crime columns side by side. Any crime columns left over from a\n",
    "# previous run of this cell are dropped first so that re-running is safe.\n",
    "train_and_test = pd.concat(\n",
    "    [\n",
    "        train_and_test.drop(columns=CRIME_COLUMNS, errors=\"ignore\").reset_index(drop=True),\n",
    "        sample.reset_index(drop=True),\n",
    "    ],\n",
    "    axis=1,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "876537e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows with a non-null 賃料 go to train, rows with a null 賃料 go to test.\n",
    "has_rent = train_and_test['賃料'].notnull()\n",
    "train = train_and_test[has_rent].copy().reset_index(drop=True)\n",
    "test = train_and_test[~has_rent].copy().reset_index(drop=True)\n",
    "print(train.shape, test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1fdfc64",
   "metadata": {},
   "outputs": [],
   "source": [
    "train.to_csv(\"data/train_processed_add_geocoding_meshcode_crime.csv\", index=False)\n",
    "test.to_csv(\"data/test_processed_add_geocoding_meshcode_crime.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "28db6b23",
   "metadata": {},
   "source": [
    "# trainとtestで似た物件があるか検索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "722d15ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.read_csv(\"data/train.csv\").fillna('No Data')\n",
    "test = pd.read_csv(\"data/test.csv\").fillna('No Data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a2811396",
   "metadata": {},
   "outputs": [],
   "source": [
    "TEXT_COLUMNS = [\"所在地\", \"間取り\", \"築年数\", \"面積\", \"建物構造\"]\n",
    "\n",
    "\n",
    "def build_text(df):\n",
    "    \"\"\"Concatenate TEXT_COLUMNS of every row into one space-separated string.\n",
    "\n",
    "    Each column is cast to str first so the concatenation does not fail when\n",
    "    pandas parsed one of the columns as numeric.\n",
    "    \"\"\"\n",
    "    first, *rest = [df[col].astype(str) for col in TEXT_COLUMNS]\n",
    "    return first.str.cat(rest, sep=\" \")\n",
    "\n",
    "\n",
    "train[\"text\"] = build_text(train)\n",
    "test[\"text\"] = build_text(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad4b6910",
   "metadata": {},
   "outputs": [],
   "source": [
    "train[\"tokens\"] = train[\"text\"].apply(tokenize)\n",
    "test[\"tokens\"] = test[\"text\"].apply(tokenize)\n",
    "train_and_test = pd.concat([train, test], axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17c91f64",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the model.\n",
    "# NOTE(review): no token_pattern is passed here, so scikit-learn's default\n",
    "# (tokens of 2+ word characters) applies and single-character tokens are\n",
    "# dropped, unlike the CountVectorizer used for the crime-data join. Confirm\n",
    "# this difference is intended.\n",
    "vectorizer = TfidfVectorizer(smooth_idf=False)\n",
    "# Compute TF-IDF: fit on train+test, then transform each side separately.\n",
    "tfidf = vectorizer.fit(train_and_test[\"tokens\"])\n",
    "vec1 = tfidf.transform(train[\"tokens\"])\n",
    "vec2 = tfidf.transform(test[\"tokens\"])\n",
    "# Rows: train rows, columns: test rows; scores rounded to 3 decimals.\n",
    "cs_array = pd.DataFrame(np.round(cosine_similarity(vec1, vec2), 3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d4eaadb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Spot check: train rows with the highest similarity to test row 2.\n",
    "cs_array[cs_array[2] == cs_array[2].max()].index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f904422b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimum best-match similarity for a test row to be considered.\n",
    "SIM_THRESHOLD = 0.95\n",
    "\n",
    "d = []\n",
    "# cs_array.max() is the column-wise maximum, i.e. one best score per test row.\n",
    "for i, max_sim in enumerate(cs_array.max()):\n",
    "    if max_sim < SIM_THRESHOLD:\n",
    "        continue\n",
    "    # Train rows tied for the best (rounded) similarity to test row i.\n",
    "    indexs = cs_array.index[(cs_array[i] == max_sim).to_numpy()]\n",
    "    rents = train.loc[indexs, \"賃料\"]\n",
    "    d.append([\n",
    "        i,\n",
    "        max_sim,\n",
    "        test.loc[i, \"text\"],\n",
    "        list(train.loc[indexs, \"text\"]),\n",
    "        list(rents),\n",
    "        rents.mean(),\n",
    "        # Flag the match only when every tied train row has the same rent.\n",
    "        # (Comparing the first rent with the mean is also true for e.g.\n",
    "        # [10, 5, 15], so it does not prove the rents agree.)\n",
    "        rents.nunique() == 1,\n",
    "    ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b893a9e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Passing columns= to the constructor also works when `d` is empty.\n",
    "submission_convert = pd.DataFrame(\n",
    "    d,\n",
    "    columns=[\"test_index\", \"sim_score\", \"test_text\", \"train_text\", \"train_賃料\", \"train_平均賃料\", \"flg\"],\n",
    ")\n",
    "# Pin the dtypes used downstream so an empty frame still merges and subtracts cleanly.\n",
    "submission_convert = submission_convert.astype(\n",
    "    {\"test_index\": \"int64\", \"train_平均賃料\": \"float64\", \"flg\": \"bool\"}\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ed51cbe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the matches whose flg is True.\n",
    "submission_convert = submission_convert[submission_convert[\"flg\"].astype(bool)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "83ae5440",
   "metadata": {},
   "outputs": [],
   "source": [
    "# header=None makes pandas label the columns 0, 1, ...; reset_index() adds the\n",
    "# row position as an \"index\" column, which is the key for the join below.\n",
    "submission = pd.read_csv(\"submission_lgbm.csv\", header=None).reset_index()\n",
    "# The join below is positional, so the submission must have one row per test row.\n",
    "assert len(submission) == len(test), \"submission and test must have the same number of rows\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2960c311",
   "metadata": {},
   "outputs": [],
   "source": [
    "sample = pd.merge(\n",
    "    submission,\n",
    "    submission_convert,\n",
    "    left_on=\"index\",\n",
    "    right_on=\"test_index\",\n",
    "    how=\"left\",\n",
    "    validate=\"one_to_one\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "373d7aa2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Difference between submission column 1 and the mean rent of the matched train rows.\n",
    "sample[\"差額\"] = sample[1] - sample[\"train_平均賃料\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d81b32d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "sample.sort_values(\"差額\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25ee6035",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the matched train mean rent where one exists, otherwise keep submission column 1.\n",
    "sample[\"賃料\"] = sample[\"train_平均賃料\"].fillna(sample[1])\n",
    "# Separate name (instead of overwriting `sample`) so this cell can be re-run.\n",
    "submission_replaced = sample[[0, \"賃料\"]]\n",
    "submission_replaced"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91daf3d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "submission_replaced.to_csv(\"submission_lgbm_replace.csv\", sep=\",\", index=False, header=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}