{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "370b8abb", "metadata": {}, "outputs": [], "source": [ "import googlemaps\n", "import pandas as pd\n", "from tqdm import tqdm\n", "import time\n", "import jismesh.utils as ju\n", "import geopandas as gpd\n", "import matplotlib.pyplot as plt\n", "\n", "googleapikey = \"XXXXX\"\n", "gmaps = googlemaps.Client(key=googleapikey)\n", "\n", "def geolocation(area):\n", " gmap_list = gmaps.geocode(area)\n", "\n", " ll = gmap_list[0][\"geometry\"][\"location\"]\n", " return ll[\"lat\"], ll[\"lng\"]" ] }, { "cell_type": "code", "execution_count": null, "id": "0bb1a362", "metadata": {}, "outputs": [], "source": [ "train = pd.read_csv(\"data/train_processed.csv\")\n", "test = pd.read_csv(\"data/test_processed.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "079ee0b7", "metadata": {}, "outputs": [], "source": [ "train = pd.read_csv(\"data/train_processed_add_groupby.csv\")\n", "test = pd.read_csv(\"data/test_processed_add_groupby.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "38995299", "metadata": {}, "outputs": [], "source": [ "train_and_test = pd.concat([train,test],axis=0)" ] }, { "cell_type": "code", "execution_count": null, "id": "2acd0ba7", "metadata": {}, "outputs": [], "source": [ "train_and_test[\"アクセス\"].str.split(\"\\t\", expand=True)[:3]" ] }, { "cell_type": "code", "execution_count": null, "id": "d42b9cb4", "metadata": {}, "outputs": [], "source": [ "station = train_and_test[\"アクセス\"].str.split(\"\\t\", expand=True).iloc[:,1:12:4].fillna(\"none\")\n", "station.columns = [\"最寄り駅1\", \"最寄り駅2\", \"最寄り駅3\"]\n", "# station = pd.concat([train_and_test[['id']],station],axis=1)\n", "station1 = station[\"最寄り駅1\"].to_list()\n", "station2 = station[\"最寄り駅2\"].to_list()\n", "station3 = station[\"最寄り駅3\"].to_list()\n", "\n", "# 利用率高い駅も追加\n", "# https://shingakunet.com/area/ranking_station-users/tokyo/\n", "s = [\"新宿駅\", \"渋谷駅\", \"池袋駅\", \"北千住駅\", \"東京駅\", \"上野駅\",\"品川駅\"]" ] }, { "cell_type": "code", "execution_count": null, "id": "8f34ad10", "metadata": {}, "outputs": [], "source": [ "stations = set(station1 + station2 + station3 + s) - set([\"none\"])" ] }, { "cell_type": "code", "execution_count": null, "id": "697175d7", "metadata": {}, "outputs": [], "source": [ "loc_dict = []\n", "for loc in tqdm(stations):\n", " try:\n", " lat, lon = geolocation(loc)\n", " loc_dict.append({'loc': loc, 'lat': lat, 'lng': lon})\n", " except:\n", " print(loc)\n", " time.sleep(0.5)\n", "loc_df = pd.DataFrame(data=loc_dict)\n", "loc_df.to_csv(\"location_data/station_loc.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "3be894ce", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "b901e229", "metadata": {}, "outputs": [], "source": [ "address = list(train_and_test[\"所在地\"].unique())" ] }, { "cell_type": "code", "execution_count": null, "id": "ccaba91e", "metadata": {}, "outputs": [], "source": [ "loc_dict = []\n", "for loc in tqdm(address):\n", " lat, lon = geolocation(loc)\n", " loc_dict.append({'loc': loc, 'lat': lat, 'lng': lon})\n", "loc_df = pd.DataFrame(data=loc_dict)\n", "loc_df.to_csv(\"location_data/address_loc.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "604c8ba3", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "e594ee78", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "b7827b41", "metadata": {}, "source": [ "# 距離計算" ] }, { "cell_type": "code", "execution_count": null, "id": "2c9c3907", "metadata": {}, "outputs": [], "source": [ "from pyproj import Geod\n", "obj_altitude = 1000\n", "q = Geod(ellps='WGS84')\n", "station_loc_df = pd.read_csv(\"location_data/station_loc.csv\")\n", "address_loc_df = pd.read_csv(\"location_data/address_loc.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "18682b7f", "metadata": {}, "outputs": [], "source": [ "address_loc_df" ] }, { "cell_type": "code", "execution_count": null, "id": "a269c4fc", "metadata": {}, "outputs": [], "source": [ "s = [\"新宿駅\", \"渋谷駅\", \"池袋駅\", \"北千住駅\", \"東京駅\", \"上野駅\", \"品川駅\"]\n", "s1 = [\"大崎駅\",\n", "\"五反田駅\",\n", "\"目黒駅\",\n", "\"恵比寿駅\",\n", "\"渋谷駅\",\n", "\"原宿駅\",\n", "\"代々木駅\",\n", "\"新宿駅\",\n", "\"新大久保駅\",\n", "\"高田馬場駅\",\n", "\"目白駅\",\n", "\"池袋駅\",\n", "\"大塚駅(東京都)\",\n", "\"巣鴨駅\",\n", "\"駒込駅\",\n", "\"田端駅\",\n", "\"西日暮里駅\",\n", "\"日暮里駅\",\n", "\"鶯谷駅\",\n", "\"上野駅\",\n", "\"御徒町駅\",\n", "\"秋葉原駅\",\n", "\"神田駅(東京都)\",\n", "\"東京駅\",\n", "\"有楽町駅\",\n", "\"新橋駅\",\n", "\"浜松町駅\",\n", "\"田町駅(東京都)\",\n", "\"高輪ゲートウェイ駅\",\n", "\"品川駅\"\n", " ]\n", "main_station = station_loc_df[station_loc_df[\"loc\"].isin(s)]\n", "yamanote_station = station_loc_df[station_loc_df[\"loc\"].isin(s1)]" ] }, { "cell_type": "code", "execution_count": null, "id": "ed5334c5", "metadata": {}, "outputs": [], "source": [ "main_station" ] }, { "cell_type": "code", "execution_count": null, "id": "90fcb0e2", "metadata": {}, "outputs": [], "source": [ "# 港区は高級住宅があるので、追加しておく\n", "main_station = main_station.append({\"loc\":\"港区\", \"lat\":35.649991, \"lng\":139.730715}, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "9fb9caf0", "metadata": {}, "outputs": [], "source": [ "distance = {}\n", "for index, s in main_station.iterrows():\n", " fa, ba, d = q.inv(address_loc_df['lng'], address_loc_df['lat'], [s[\"lng\"]] * 17196, [s[\"lat\"]] * 17196)\n", " distance[f\"{s['loc']}_距離\"] = d" ] }, { "cell_type": "code", "execution_count": null, "id": "b8987b6d", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "b5a23604", "metadata": {}, "outputs": [], "source": [ "dist_df = pd.DataFrame(data=distance)\n", "address_loc_df = pd.concat([address_loc_df, dist_df], axis=1)\n", "address_loc_df = address_loc_df.rename(columns={\"loc\":\"所在地\"})\n", "address_loc_df" ] }, { "cell_type": "code", "execution_count": null, "id": "d23ceaa1", "metadata": {}, "outputs": [], "source": [ "address_loc_df.to_csv(\"location_data/address_loc_add_distance.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "e4333fe7", "metadata": {}, "outputs": [], "source": [ "train_and_test = pd.merge(train_and_test, address_loc_df, on=\"所在地\", how=\"left\")" ] }, { "cell_type": "code", "execution_count": null, "id": "cbfe8009", "metadata": {}, "outputs": [], "source": [ "train = train_and_test[train_and_test['賃料'].notnull()].copy().reset_index(drop=True)\n", "test = train_and_test[train_and_test['賃料'].isnull()].copy().reset_index(drop=True)\n", "del train_and_test\n", "\n", "train.to_csv(\"data/train_processed_add_geocoding_meshcode_crime.csv\", index=False)\n", "test.to_csv(\"data/test_processed_add_geocoding_meshcode_crime.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "11852993", "metadata": {}, "outputs": [], "source": [ "print(train.shape,test.shape)" ] }, { "cell_type": "code", "execution_count": null, "id": "38ad49f3", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "7fb040ed", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "014f8ad1", "metadata": {}, "source": [ "# メッシュコード、公示価格の追加、方角カラムの修正、山手線平均距離の追加" ] }, { "cell_type": "code", "execution_count": null, "id": "d31a0a58", "metadata": {}, "outputs": [], "source": [ "train = pd.read_csv(\"data/train_processed_add_geocoding_meshcode_crime.csv\")\n", "test = pd.read_csv(\"data/test_processed_add_geocoding_meshcode_crime.csv\")\n", "train_and_test = pd.concat([train,test],axis=0)" ] }, { "cell_type": "code", "execution_count": null, "id": "727e949a", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "f7d088f3", "metadata": {}, "outputs": [], "source": [ "meshs = list(train_and_test[\"meshcode\"].unique())" ] }, { "cell_type": "code", "execution_count": null, "id": "e6624b0e", "metadata": {}, "outputs": [], "source": [ "path_shp = \"./opendata/L01-19_13.geojson\"\n", "gdf = gpd.read_file(path_shp)" ] }, { "cell_type": "code", "execution_count": null, "id": "601cc7ee", "metadata": {}, "outputs": [], "source": [ "columns = {\"L01_006\":\"公示価格\", \"L01_088\":\"調査価格_4\",\"L01_089\":\"調査価格_3\", \"L01_090\":\"調査価格_2\",\"L01_091\":\"調査価格_1\"}\n", "c = list(columns.values())\n", "gdf = gdf.rename(columns=columns)[c + [\"geometry\"]]\n", "gdf[c] = gdf[c].astype(int)" ] }, { "cell_type": "code", "execution_count": null, "id": "fbd35402", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "gdf = gdf.replace(0, np.nan)" ] }, { "cell_type": "code", "execution_count": null, "id": "ef19f9cc", "metadata": {}, "outputs": [], "source": [ "gdf['lng'] = gdf.geometry.apply(lambda p: p.x)\n", "gdf['lat'] = gdf.geometry.apply(lambda p: p.y)" ] }, { "cell_type": "code", "execution_count": null, "id": "251459bd", "metadata": {}, "outputs": [], "source": [ "gdf_tmp = gdf[c].copy()\n", "gdf_tmp = gdf_tmp.T.interpolate().T\n", "gdf = pd.concat([gdf[[\"geometry\", \"lng\", \"lat\"]], gdf_tmp], axis=1)\n", "gdf['meshcode'] = ju.to_meshcode(gdf.lat, gdf.lng, 5)\n", "gdf = gdf[gdf['meshcode'].isin(meshs)]\n", "gdf" ] }, { "cell_type": "code", "execution_count": null, "id": "bd9272ee", "metadata": {}, "outputs": [], "source": [ "from japanmap import get_data, pref_points, pref_names\n", "from shapely.geometry import Polygon\n", "import matplotlib.pyplot as plt\n", "# 表示用のfigure作成\n", "fig, ax = plt.subplots(1, 1, figsize=(30, 30))\n", "# 日本地図のポリゴンデータ作成しGeoDataFrameに格納\n", "pref_poly = [Polygon(points) for points in pref_points(get_data())]\n", "gdf_pref = gpd.GeoDataFrame(crs = 'epsg:4612', geometry=pref_poly)\n", "gdf_pref['prefecture'] = pref_names[1:] # 県名を格納\n", "# 滋賀県に絞る\n", "gdf_pref = gdf_pref[gdf_pref['prefecture'] == '東京都']\n", "\n", "# 日本地図をプロット\n", "gdf_pref.plot(ax = ax,\n", " color = 'gray' # 塗りつぶし色を指定\n", " )\n", "# ポイントをプロット\n", "gdf.plot(ax = ax, # 描画先のax\n", " column = '公示価格', # 色分け対象の列\n", " cmap = 'OrRd', # 色分けのカラーマップ\n", " legend = True, # 色分けのカラーバー表示\n", " legend_kwds = {'label': 'dam height', # カラーバーにラベル設定\n", " 'shrink': 0.6}, # カラーバーが長すぎるので短く\n", " s = 6 # 点マーカーのサイズ\n", " )" ] }, { "cell_type": "code", "execution_count": null, "id": "37c2900c", "metadata": {}, "outputs": [], "source": [ "for index, s in train_and_test.iterrows():\n", " print(s[\"lng\"], gdf['lng'])\n", " break" ] }, { "cell_type": "code", "execution_count": null, "id": "d066d159", "metadata": {}, "outputs": [], "source": [ "distance = {}\n", "for index, s in train_and_test.iterrows():\n", " fa, ba, d = q.inv(gdf['lng'], gdf['lat'], [s[\"lng\"]] * 1325, [s[\"lat\"]] * 1325)\n", " distance[f\"{s['id']}\"] = d" ] }, { "cell_type": "code", "execution_count": null, "id": "1a6b6444", "metadata": {}, "outputs": [], "source": [ "dist_df = pd.DataFrame(data=distance)" ] }, { "cell_type": "code", "execution_count": null, "id": "112d558a", "metadata": {}, "outputs": [], "source": [ "sample = pd.DataFrame(dist_df.T.idxmin(axis=1))\n", "gdf_tmp = gdf.reset_index().drop(\"index\", axis=1).reset_index()\n", "kakaku = pd.merge(sample, gdf_tmp, right_on=\"index\", left_on=0, how=\"left\").drop([0, \"index\", \"geometry\", \"meshcode\", \"lng\", \"lat\"], axis=1)\n", "train_and_test = pd.concat([train_and_test.reset_index(), kakaku],axis=1).drop([\"index\"], axis=1)" ] }, { "cell_type": "code", "execution_count": null, "id": "282d30d4", "metadata": {}, "outputs": [], "source": [ "train_and_test" ] }, { "cell_type": "code", "execution_count": null, "id": "c4df4d0c", "metadata": {}, "outputs": [], "source": [ "train = pd.read_csv(\"data/train.csv\")\n", "test = pd.read_csv(\"data/test.csv\")\n", "train_and_test_tmp = pd.concat([train,test],axis=0)" ] }, { "cell_type": "code", "execution_count": null, "id": "f0acefd4", "metadata": {}, "outputs": [], "source": [ "size_mapping = {'北': 0, '北⻄': 1, '北東': 2, '西': 3, '東': 4, '南⻄': 5, '南東': 6, '南': 7}\n", "train_and_test_tmp['方角'] = train_and_test_tmp['方角'].map(size_mapping)\n", "train_and_test_tmp = train_and_test_tmp.reset_index()\n", "train_and_test[\"方角\"] = train_and_test_tmp['方角']" ] }, { "cell_type": "code", "execution_count": null, "id": "bf9a575e", "metadata": {}, "outputs": [], "source": [ "train_and_test" ] }, { "cell_type": "code", "execution_count": null, "id": "35c35cbd", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "2deb9b69", "metadata": {}, "outputs": [], "source": [ "from pyproj import Geod\n", "obj_altitude = 1000\n", "q = Geod(ellps='WGS84')\n", "\n", "station_loc_df = pd.read_csv(\"location_data/station_loc.csv\")\n", "address_loc_df = pd.read_csv(\"location_data/address_loc.csv\")\n", "\n", "s1 = [\"大崎駅\",\n", "\"五反田駅\",\n", "\"目黒駅\",\n", "\"恵比寿駅\",\n", "\"渋谷駅\",\n", "\"原宿駅\",\n", "\"代々木駅\",\n", "\"新宿駅\",\n", "\"新大久保駅\",\n", "\"高田馬場駅\",\n", "\"目白駅\",\n", "\"池袋駅\",\n", "\"大塚駅(東京都)\",\n", "\"巣鴨駅\",\n", "\"駒込駅\",\n", "\"田端駅\",\n", "\"西日暮里駅\",\n", "\"日暮里駅\",\n", "\"鶯谷駅\",\n", "\"上野駅\",\n", "\"御徒町駅\",\n", "\"秋葉原駅\",\n", "\"神田駅(東京都)\",\n", "\"東京駅\",\n", "\"有楽町駅\",\n", "\"新橋駅\",\n", "\"浜松町駅\",\n", "\"田町駅(東京都)\",\n", "\"高輪ゲートウェイ駅\",\n", "\"品川駅\"\n", " ]\n", "yamanote_station = station_loc_df[station_loc_df[\"loc\"].isin(s1)]\n", "\n", "\n", "\n", "distance_yamanote = {}\n", "for index, s in yamanote_station.iterrows():\n", " fa, ba, d = q.inv(address_loc_df['lng'], address_loc_df['lat'], [s[\"lng\"]] * 17196, [s[\"lat\"]] * 17196)\n", " distance_yamanote[f\"{s['loc']}_距離\"] = d\n", "\n", "dist_df = pd.DataFrame(data=distance_yamanote).mean(axis=1)\n", "address_loc_df = pd.concat([address_loc_df, dist_df], axis=1).drop([\"lat\", \"lng\"], axis=1)\n", "address_loc_df = address_loc_df.rename(columns={\"loc\":\"所在地\", 0:\"山手線平均距離\"})\n", "train_and_test = pd.merge(train_and_test, address_loc_df, on=\"所在地\", how=\"left\")\n", "\n", "train = train_and_test[train_and_test['賃料'].notnull()].copy().reset_index(drop=True)\n", "test = train_and_test[train_and_test['賃料'].isnull()].copy().reset_index(drop=True)\n", "del train_and_test\n", "\n", "train.to_csv(\"data/train_processed_add_geocoding_meshcode_crime_landprice.csv\", index=False)\n", "test.to_csv(\"data/test_processed_add_geocoding_meshcode_crime_landprice.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "e2721634", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "17124bf1", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "ff125266", "metadata": {}, "source": [ "# ランドマークまでの距離、最寄り駅の利用者、最寄り駅までの距離を追加" ] }, { "cell_type": "code", "execution_count": null, "id": "acb171ff", "metadata": {}, "outputs": [], "source": [ "train = pd.read_csv(\"data/train_processed_add_geocoding_meshcode_crime_landprice_log_groupby.csv\")\n", "test = pd.read_csv(\"data/test_processed_add_geocoding_meshcode_crime_landprice_log_groupby.csv\")\n", "train_and_test = pd.concat([train,test],axis=0)" ] }, { "cell_type": "code", "execution_count": null, "id": "02a5fa39", "metadata": {}, "outputs": [], "source": [ "train_and_test.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "90d0ba3c", "metadata": {}, "outputs": [], "source": [ "station = train_and_test[\"アクセス\"].str.split(\"\\t\", expand=True).iloc[:,1:12:4].fillna(\"none\")\n", "station.columns = [\"最寄り駅1\", \"最寄り駅2\", \"最寄り駅3\"]\n", "train_and_test[\"駅名\"] = station[\"最寄り駅1\"]" ] }, { "cell_type": "code", "execution_count": null, "id": "df6a99d8", "metadata": {}, "outputs": [], "source": [ "train_and_test[\"駅名\"] = train_and_test[\"駅名\"].str.replace(\"ケ\", \"ヶ\")\n", "train_and_test[\"駅名\"] = train_and_test[\"駅名\"].str.replace(\"\\(.*\\)\", \"\")" ] }, { "cell_type": "code", "execution_count": null, "id": "d3403641", "metadata": {}, "outputs": [], "source": [ "landmark = [\"東京スカイツリー\", \"東京タワー\", \"雷門\", \"表参道ヒルズ\", \"渋谷スクランブルスクエア\", \"東京ドーム\", \"銀座\", \"池袋サンシャインシティー\", \"日本武道館\", \"皇居\", \"赤坂サカス\", \"虎ノ門ヒルズ\", \"調布パルコ\"]\n", "station_people = \"./opendata/交通流動量.geojson\"\n", "gdf = gpd.read_file(station_people)\n", "# 最寄り駅までの距離" ] }, { "cell_type": "code", "execution_count": null, "id": "a7ad1dcf", "metadata": {}, "outputs": [], "source": [ "columns = {\"S12_001\":\"駅名\", \"S12_041\":\"乗降客数2019\",\"S12_037\":\"乗降客数2018\"}\n", "c = list(columns.values())\n", "gdf = gdf.rename(columns=columns)[c]" ] }, { "cell_type": "code", "execution_count": null, "id": "f760bc89", "metadata": {}, "outputs": [], "source": [ "gdf[\"駅名\"] = gdf[\"駅名\"]+\"駅\"\n", "gdf[\"駅名\"] = gdf[\"駅名\"].str.replace(\"\\(.*\\)\", \"\")\n", "gdf = gdf.groupby(\"駅名\").mean().reset_index()" ] }, { "cell_type": "code", "execution_count": null, "id": "0c050c5f", "metadata": {}, "outputs": [], "source": [ "train_and_test = pd.merge(train_and_test, gdf, on=\"駅名\", how=\"inner\")" ] }, { "cell_type": "code", "execution_count": null, "id": "0b7e7219", "metadata": {}, "outputs": [], "source": [ "train_and_test.isnull().sum()" ] }, { "cell_type": "code", "execution_count": null, "id": "d2504a61", "metadata": {}, "outputs": [], "source": [ "train_and_test.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "2eacab34", "metadata": {}, "outputs": [], "source": [ "loc_dict = []\n", "for loc in tqdm(landmark):\n", " lat, lon = geolocation(loc)\n", " loc_dict.append({'loc': loc, 'lat': lat, 'lng': lon})\n", "loc_df = pd.DataFrame(data=loc_dict)" ] }, { "cell_type": "code", "execution_count": null, "id": "4ab31bbe", "metadata": {}, "outputs": [], "source": [ "loc_df" ] }, { "cell_type": "code", "execution_count": null, "id": "c4b83997", "metadata": {}, "outputs": [], "source": [ "from pyproj import Geod\n", "obj_altitude = 1000\n", "q = Geod(ellps='WGS84')\n", "station_loc_df = pd.read_csv(\"location_data/station_loc.csv\")\n", "address_loc_df = pd.read_csv(\"location_data/address_loc.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "b72075c6", "metadata": {}, "outputs": [], "source": [ "distance = {}\n", "for index, s in loc_df.iterrows():\n", " fa, ba, d = q.inv(address_loc_df['lng'], address_loc_df['lat'], [s[\"lng\"]] * 17196, [s[\"lat\"]] * 17196)\n", " distance[f\"{s['loc']}_距離\"] = d" ] }, { "cell_type": "code", "execution_count": null, "id": "179c3d6d", "metadata": {}, "outputs": [], "source": [ "dist_df = pd.DataFrame(data=distance)" ] }, { "cell_type": "code", "execution_count": null, "id": "099b2afe", "metadata": {}, "outputs": [], "source": [ "dist_df" ] }, { "cell_type": "code", "execution_count": null, "id": "1e3c25e0", "metadata": {}, "outputs": [], "source": [ "dist_df = pd.DataFrame(data=distance)\n", "address_loc_df = pd.concat([address_loc_df, dist_df], axis=1).drop([\"lat\", \"lng\"], axis=1)\n", "address_loc_df = address_loc_df.rename(columns={\"loc\":\"所在地\"})" ] }, { "cell_type": "code", "execution_count": null, "id": "71ecd0c2", "metadata": {}, "outputs": [], "source": [ "train_and_test = pd.merge(train_and_test, address_loc_df, on=\"所在地\", how=\"left\")" ] }, { "cell_type": "code", "execution_count": null, "id": "4a0f6f6b", "metadata": {}, "outputs": [], "source": [ "train_and_test.isnull().sum()" ] }, { "cell_type": "code", "execution_count": null, "id": "319d6977", "metadata": {}, "outputs": [], "source": [ "train_and_test.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "59b9c732", "metadata": {}, "outputs": [], "source": [ "sample = train_and_test[[\"id\",\"所在地\",\"駅名\"]]\n", "address_loc_df = pd.read_csv(\"location_data/address_loc.csv\")\n", "\n", "station_loc_df[\"loc\"] = station_loc_df[\"loc\"].str.replace(\"ケ\", \"ヶ\")\n", "station_loc_df[\"loc\"] = station_loc_df[\"loc\"].str.replace(\"\\(.*\\)\", \"\")\n", "station_loc_df.append({'loc': \"西日暮里駅\", 'lat': 35.73254311759141, 'lng': 139.76781815554125}, ignore_index=True)\n", "station_loc_df.append({'loc': \"日暮里駅\", 'lat': 35.72835810149057, 'lng': 139.77064139815946}, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "db2aca31", "metadata": {}, "outputs": [], "source": [ "sample = pd.merge(sample, station_loc_df, left_on=\"駅名\", right_on=\"loc\", how=\"inner\")\n", "sample = pd.merge(sample, address_loc_df, left_on=\"所在地\", right_on=\"loc\", how=\"inner\")" ] }, { "cell_type": "code", "execution_count": null, "id": "e38ea58c", "metadata": {}, "outputs": [], "source": [ "sample = sample.drop_duplicates(\"id\")" ] }, { "cell_type": "code", "execution_count": null, "id": "470ce84e", "metadata": {}, "outputs": [], "source": [ "distance = []\n", "for index, s in sample.iterrows():\n", " fa, ba, d = q.inv(s['lng_x'], s['lat_x'], s[\"lng_y\"], s[\"lat_y\"])\n", " distance.append(d)" ] }, { "cell_type": "code", "execution_count": null, "id": "56e91701", "metadata": {}, "outputs": [], "source": [ "len(sample), len(distance)" ] }, { "cell_type": "code", "execution_count": null, "id": "748417a6", "metadata": {}, "outputs": [], "source": [ "dist_df = pd.DataFrame(data=distance)\n", "sample = pd.concat([sample.reset_index(), dist_df], axis=1).rename(columns={0:\"最寄り駅までの距離\"})" ] }, { "cell_type": "code", "execution_count": null, "id": "b5e2d674", "metadata": {}, "outputs": [], "source": [ "sample = sample[[\"id\", \"最寄り駅までの距離\"]]" ] }, { "cell_type": "code", "execution_count": null, "id": "79489f21", "metadata": {}, "outputs": [], "source": [ "sample.drop_duplicates()" ] }, { "cell_type": "code", "execution_count": null, "id": "6e9f817f", "metadata": {}, "outputs": [], "source": [ "train_and_test = pd.merge(train_and_test, sample, on=\"id\", how=\"inner\")" ] }, { "cell_type": "code", "execution_count": null, "id": "706fa458", "metadata": {}, "outputs": [], "source": [ "train_and_test.isnull().sum()" ] }, { "cell_type": "code", "execution_count": null, "id": "4eeb699a", "metadata": {}, "outputs": [], "source": [ "train_and_test.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "7747de69", "metadata": {}, "outputs": [], "source": [ "labels, uniques = pd.factorize(train_and_test[\"駅名\"])\n", "train_and_test[\"駅名\"] = labels\n", "train_and_test" ] }, { "cell_type": "code", "execution_count": null, "id": "d85c3594", "metadata": {}, "outputs": [], "source": [ "train = train_and_test[train_and_test['賃料'].notnull()].copy().reset_index(drop=True)\n", "test = train_and_test[train_and_test['賃料'].isnull()].copy().reset_index(drop=True)\n", "# del train_and_test\n", "\n", "train.to_csv(\"data/train_processed_add_geocoding_meshcode_crime_landprice_ladmark_log_groupby.csv\", index=False)\n", "test.to_csv(\"data/test_processed_add_geocoding_meshcode_crime_landprice_ladmark_log_groupby.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "40c4bc8f", "metadata": {}, "outputs": [], "source": [ "train.shape, test.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "58a57412", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "3426b489", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" } }, "nbformat": 4, "nbformat_minor": 5 }