In [None]:
import os
import time

import geopandas as gpd
import googlemaps
import jismesh.utils as ju
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

# Read the Google Maps API key from the GOOGLE_MAPS_API_KEY environment variable so
# the secret does not have to be stored in the notebook. The placeholder is kept as
# the fallback so behaviour is unchanged when the variable is not set.
googleapikey = os.environ.get("GOOGLE_MAPS_API_KEY", "XXXXX")
gmaps = googlemaps.Client(key=googleapikey)

def geolocation(area):
    """Geocode a free-text place name or address with the module-level ``gmaps`` client.

    Args:
        area: Query string passed to ``gmaps.geocode``.

    Returns:
        A ``(lat, lng)`` tuple read from ``["geometry"]["location"]`` of the first
        geocoding result.

    Raises:
        IndexError: If the geocoder returns no result for ``area``.
    """
    gmap_list = gmaps.geocode(area)
    if not gmap_list:
        # Same exception type that indexing the empty result used to raise, but the
        # message now tells which query failed.
        raise IndexError(f"no geocoding result for {area!r}")

    ll = gmap_list[0]["geometry"]["location"]
    return ll["lat"], ll["lng"]

In [None]:
# Load the pre-processed train and test tables.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train_processed.csv", "data/test_processed.csv")
)

In [None]:
# Load the "add_groupby" variant of the tables; this rebinds `train` and `test`.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/train_processed_add_groupby.csv",
        "data/test_processed_add_groupby.csv",
    )
)

In [None]:
# Stack train on top of test (row-wise); the original index labels are kept.
train_and_test = pd.concat((train, test), axis="index")

In [None]:
# Preview the first three rows of the tab-split access column.
train_and_test["アクセス"].str.split("\t", expand=True).head(3)

In [None]:
# Split the tab-separated access text and keep the fields at positions 1, 5 and 9,
# which are used as the three nearest-station names; missing ones become "none".
access_fields = train_and_test["アクセス"].str.split("\t", expand=True)
station = access_fields.iloc[:, 1:12:4].fillna("none")
station.columns = ["最寄り駅1", "最寄り駅2", "最寄り駅3"]
# station = pd.concat([train_and_test[['id']],station],axis=1)
station1, station2, station3 = (
    station[column].to_list() for column in station.columns
)

# Also add heavily used stations
# https://shingakunet.com/area/ranking_station-users/tokyo/
s = [
    "新宿駅",
    "渋谷駅",
    "池袋駅",
    "北千住駅",
    "東京駅",
    "上野駅",
    "品川駅",
]

In [None]:
# Unique station names from all three columns plus the extra list, minus the
# "none" placeholder used for missing values.
stations = {*station1, *station2, *station3, *s} - {"none"}

In [None]:
# Geocode every station name and cache the coordinates on disk.
loc_dict = []
for loc in tqdm(stations):
    try:
        lat, lon = geolocation(loc)
    except Exception as exc:
        # Best effort: report the station that could not be geocoded and go on.
        # `Exception` (not a bare `except:`) lets Ctrl-C still stop the loop.
        print(loc, repr(exc))
    else:
        loc_dict.append({'loc': loc, 'lat': lat, 'lng': lon})
    # NOTE(review): the original indentation of this pause was lost; it is applied
    # after every request here so failures are followed by a pause as well.
    time.sleep(0.5)
# Fixing the columns keeps the CSV header stable even if nothing was geocoded.
loc_df = pd.DataFrame(data=loc_dict, columns=["loc", "lat", "lng"])
loc_df.to_csv("location_data/station_loc.csv", index=False)

In [None]:
# Distinct address strings, in order of first appearance.
address = [*train_and_test["所在地"].unique()]

In [None]:
# Geocode every distinct address and cache the coordinates on disk.
# Any geocoding error propagates and stops the cell.
loc_dict = []
for loc in tqdm(address):
    coords = geolocation(loc)
    loc_dict.append(dict(loc=loc, lat=coords[0], lng=coords[1]))
loc_df = pd.DataFrame(loc_dict)
loc_df.to_csv("location_data/address_loc.csv", index=False)

# 距離計算

In [None]:
from pyproj import Geod

obj_altitude = 1000
# Geodesic calculator on the WGS84 ellipsoid, used for all distance features.
q = Geod(ellps='WGS84')

# Reload the cached geocoding results.
station_loc_df, address_loc_df = (
    pd.read_csv(csv_path)
    for csv_path in ("location_data/station_loc.csv", "location_data/address_loc.csv")
)

In [None]:
# Display the geocoded address table for a visual check.
address_loc_df

In [None]:
# Major hub stations.
s = [
    "新宿駅", "渋谷駅", "池袋駅", "北千住駅", "東京駅", "上野駅", "品川駅",
]
# Station names used for the Yamanote-line subset.
s1 = [
    "大崎駅", "五反田駅", "目黒駅", "恵比寿駅", "渋谷駅", "原宿駅",
    "代々木駅", "新宿駅", "新大久保駅", "高田馬場駅", "目白駅", "池袋駅",
    "大塚駅(東京都)", "巣鴨駅", "駒込駅", "田端駅", "西日暮里駅", "日暮里駅",
    "鶯谷駅", "上野駅", "御徒町駅", "秋葉原駅", "神田駅(東京都)", "東京駅",
    "有楽町駅", "新橋駅", "浜松町駅", "田町駅(東京都)", "高輪ゲートウェイ駅", "品川駅",
]
# Keep only the geocoded stations whose name appears in each list.
is_main = station_loc_df["loc"].isin(s)
is_yamanote = station_loc_df["loc"].isin(s1)
main_station = station_loc_df.loc[is_main]
yamanote_station = station_loc_df.loc[is_yamanote]

In [None]:
# Display the hub stations that were found in the geocoded station table.
main_station

In [None]:
# Minato ward has luxury housing, so add it as an extra reference point.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` does the same on every version.
main_station = pd.concat(
    [
        main_station,
        pd.DataFrame([{"loc": "港区", "lat": 35.649991, "lng": 139.730715}]),
    ],
    ignore_index=True,
)

In [None]:
# Geodesic distance from every geocoded address to each reference point.
# The repeat count follows the address table instead of a hard-coded row count,
# so the cell keeps working when the number of geocoded addresses changes.
distance = {}
n_addresses = len(address_loc_df)
for index, ref_point in main_station.iterrows():
    # Geod.inv returns (forward azimuth, back azimuth, distance); only the distance is kept.
    fa, ba, d = q.inv(
        address_loc_df['lng'],
        address_loc_df['lat'],
        [ref_point["lng"]] * n_addresses,
        [ref_point["lat"]] * n_addresses,
    )
    distance[f"{ref_point['loc']}_距離"] = d

In [None]:
# One distance column per reference point, placed next to the address coordinates.
dist_df = pd.DataFrame(distance)
address_loc_df = (
    pd.concat([address_loc_df, dist_df], axis="columns")
    .rename(columns={"loc": "所在地"})
)
address_loc_df

In [None]:
# Save the address table with the distance columns (row index not written).
address_loc_df.to_csv("location_data/address_loc_add_distance.csv", index=False)

In [None]:
# Attach coordinates and distances to each listing by address; a left join keeps
# every listing even when its address has no match.
train_and_test = train_and_test.merge(address_loc_df, how="left", on="所在地")

In [None]:
# Rows with a 賃料 value form the training set; rows without one form the test set.
has_rent = train_and_test['賃料'].notnull()
train = train_and_test.loc[has_rent].copy().reset_index(drop=True)
test = train_and_test.loc[~has_rent].copy().reset_index(drop=True)
del train_and_test

for frame, csv_path in (
    (train, "data/train_processed_add_geocoding_meshcode_crime.csv"),
    (test, "data/test_processed_add_geocoding_meshcode_crime.csv"),
):
    frame.to_csv(csv_path, index=False)

In [None]:
# Print the (rows, columns) of both splits.
print(train.shape,test.shape)

# メッシュコード、公示価格の追加、方角カラムの修正、山手線平均距離の追加

In [None]:
# Reload the tables saved with the geocoding features and stack them again.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/train_processed_add_geocoding_meshcode_crime.csv",
        "data/test_processed_add_geocoding_meshcode_crime.csv",
    )
)
train_and_test = pd.concat((train, test), axis="index")

In [None]:
# Distinct mesh codes that occur in the listings.
meshs = [*train_and_test["meshcode"].unique()]

In [None]:
# Read the GeoJSON file with GeoPandas.
# NOTE(review): presumably the land-price (L01) open data for Tokyo, since its
# L01_* columns are renamed to price names further down — confirm the source.
path_shp = "./opendata/L01-19_13.geojson"
gdf = gpd.read_file(path_shp)

In [None]:
# Raw attribute codes -> readable price column names.
columns = {
    "L01_006": "公示価格",
    "L01_088": "調査価格_4",
    "L01_089": "調査価格_3",
    "L01_090": "調査価格_2",
    "L01_091": "調査価格_1",
}
c = [*columns.values()]
# Keep only the renamed price columns plus the geometry, then cast prices to int.
gdf = gdf.rename(columns=columns)
gdf = gdf[[*c, "geometry"]]
gdf[c] = gdf[c].astype(int)

In [None]:
import numpy as np
# Treat zeros as missing: every 0 in the frame is replaced by NaN.
gdf = gdf.replace(0, np.nan)

In [None]:
# Extract point coordinates with GeoPandas' vectorised accessors instead of a
# per-row Python lambda (x is stored as longitude, y as latitude).
gdf['lng'] = gdf.geometry.x
gdf['lat'] = gdf.geometry.y

In [None]:
# Fill missing prices by interpolating across the price columns of each row
# (transpose, interpolate down the columns, transpose back).
gdf_tmp = gdf[c].copy().T.interpolate().T
gdf = pd.concat([gdf[["geometry", "lng", "lat"]], gdf_tmp], axis="columns")
# Level-5 mesh code of every point, then keep only meshes that contain listings.
gdf['meshcode'] = ju.to_meshcode(gdf.lat, gdf.lng, 5)
in_listing_mesh = gdf['meshcode'].isin(meshs)
gdf = gdf[in_listing_mesh]
gdf

In [None]:
from japanmap import get_data, pref_points, pref_names
from shapely.geometry import Polygon
import matplotlib.pyplot as plt
# Figure used for the map
fig, ax = plt.subplots(1, 1, figsize=(30, 30))
# Build the prefecture polygons of Japan and store them in a GeoDataFrame
pref_poly = [Polygon(points) for points in pref_points(get_data())]
gdf_pref = gpd.GeoDataFrame(crs='epsg:4612', geometry=pref_poly)
gdf_pref['prefecture'] = pref_names[1:]  # prefecture names
# Keep only Tokyo
gdf_pref = gdf_pref[gdf_pref['prefecture'] == '東京都']

# Draw the prefecture outline as the base layer
gdf_pref.plot(ax=ax,
              color='gray'  # fill colour
              )
# Draw the land-price points on top
gdf.plot(ax=ax,  # target axes
         column='公示価格',  # column that drives the colour
         cmap='OrRd',  # colour map
         legend=True,  # show the colour bar
         # The label previously read 'dam height', a leftover from another example;
         # the plotted column is the land price.
         legend_kwds={'label': 'official land price',
                      'shrink': 0.6},  # shorten the colour bar
         s=6  # marker size
         )

In [None]:
# Debug peek: print the longitude of the first listing next to the land-price longitudes.
for index, first_row in train_and_test.head(1).iterrows():
    print(first_row["lng"], gdf['lng'])

In [None]:
# Geodesic distance from every land-price point to each listing, keyed by listing id.
# The repeat count follows `gdf` instead of a hard-coded row count, so the cell keeps
# working when the number of land-price points changes.
distance = {}
n_points = len(gdf)
for index, listing in train_and_test.iterrows():
    # Geod.inv returns (forward azimuth, back azimuth, distance); only the distance is kept.
    fa, ba, d = q.inv(
        gdf['lng'],
        gdf['lat'],
        [listing["lng"]] * n_points,
        [listing["lat"]] * n_points,
    )
    distance[f"{listing['id']}"] = d

In [None]:
# One column per listing id, one row per land-price point.
dist_df = pd.DataFrame(distance)

In [None]:
# For every listing, the row label of the closest land-price point (stored in column 0).
sample = pd.DataFrame(dist_df.T.idxmin(axis=1))
# Renumber the land-price points 0..n-1 and expose that number as an "index" column.
gdf_tmp = gdf.reset_index(drop=True).reset_index()
# Look up the prices of the closest point and drop the helper / geometry columns.
kakaku = (
    pd.merge(sample, gdf_tmp, how="left", left_on=0, right_on="index")
    .drop(columns=[0, "index", "geometry", "meshcode", "lng", "lat"])
)
# Put the price columns next to the listings by position.
train_and_test = (
    pd.concat([train_and_test.reset_index(), kakaku], axis=1)
    .drop(columns=["index"])
)

In [None]:
# Display the combined table after the price columns were added.
train_and_test

In [None]:
# Load the raw train and test files and stack them into a temporary table.
train, test = (
    pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv")
)
train_and_test_tmp = pd.concat((train, test), axis="index")

In [None]:
# Encode the direction strings as integers; values that are not keys of the
# mapping become NaN.
# NOTE(review): the keys '北⻄' and '南⻄' appear to use a different "west"
# character than the key '西' — confirm this matches the raw data.
size_mapping = {'北': 0, '北⻄': 1, '北東': 2, '西': 3, '東': 4, '南⻄': 5, '南東': 6, '南': 7}
train_and_test_tmp['方角'] = train_and_test_tmp['方角'].map(size_mapping)
# Renumber the rows so the column can be copied over by index label.
train_and_test_tmp = train_and_test_tmp.reset_index()
# The assignment aligns on index labels; assumes both frames hold the same rows
# in the same order — TODO confirm.
train_and_test["方角"] = train_and_test_tmp['方角']

In [None]:
# Display the combined table after the direction column was replaced.
train_and_test

In [None]:
from pyproj import Geod
obj_altitude = 1000
# Geodesic calculator on the WGS84 ellipsoid.
q = Geod(ellps='WGS84')

station_loc_df = pd.read_csv("location_data/station_loc.csv")
address_loc_df = pd.read_csv("location_data/address_loc.csv")

# Station names used for the Yamanote-line subset.
s1 = ["大崎駅",
      "五反田駅",
      "目黒駅",
      "恵比寿駅",
      "渋谷駅",
      "原宿駅",
      "代々木駅",
      "新宿駅",
      "新大久保駅",
      "高田馬場駅",
      "目白駅",
      "池袋駅",
      "大塚駅(東京都)",
      "巣鴨駅",
      "駒込駅",
      "田端駅",
      "西日暮里駅",
      "日暮里駅",
      "鶯谷駅",
      "上野駅",
      "御徒町駅",
      "秋葉原駅",
      "神田駅(東京都)",
      "東京駅",
      "有楽町駅",
      "新橋駅",
      "浜松町駅",
      "田町駅(東京都)",
      "高輪ゲートウェイ駅",
      "品川駅"
      ]
yamanote_station = station_loc_df[station_loc_df["loc"].isin(s1)]

# Geodesic distance from every geocoded address to each matched station.
# The repeat count follows the address table instead of a hard-coded row count,
# so the cell keeps working when the number of geocoded addresses changes.
n_addresses = len(address_loc_df)
distance_yamanote = {}
for index, station_row in yamanote_station.iterrows():
    # Geod.inv returns (forward azimuth, back azimuth, distance); only the distance is kept.
    fa, ba, d = q.inv(
        address_loc_df['lng'],
        address_loc_df['lat'],
        [station_row["lng"]] * n_addresses,
        [station_row["lat"]] * n_addresses,
    )
    distance_yamanote[f"{station_row['loc']}_距離"] = d

# Average over the stations per address; the unnamed mean becomes column 0 after
# the concat and is renamed below.
dist_df = pd.DataFrame(data=distance_yamanote).mean(axis=1)
address_loc_df = pd.concat([address_loc_df, dist_df], axis=1).drop(["lat", "lng"], axis=1)
address_loc_df = address_loc_df.rename(columns={"loc":"所在地", 0:"山手線平均距離"})
train_and_test = pd.merge(train_and_test, address_loc_df, on="所在地", how="left")

# Rows with a 賃料 value form the training set; rows without one form the test set.
train = train_and_test[train_and_test['賃料'].notnull()].copy().reset_index(drop=True)
test = train_and_test[train_and_test['賃料'].isnull()].copy().reset_index(drop=True)
del train_and_test

train.to_csv("data/train_processed_add_geocoding_meshcode_crime_landprice.csv", index=False)
test.to_csv("data/test_processed_add_geocoding_meshcode_crime_landprice.csv", index=False)

# ランドマークまでの距離、最寄り駅の利用者、最寄り駅までの距離を追加

In [None]:
# Load the "landprice_log_groupby" tables and stack them.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/train_processed_add_geocoding_meshcode_crime_landprice_log_groupby.csv",
        "data/test_processed_add_geocoding_meshcode_crime_landprice_log_groupby.csv",
    )
)
train_and_test = pd.concat((train, test), axis="index")

In [None]:
# (rows, columns) of the combined table right after loading.
train_and_test.shape

In [None]:
# Split the tab-separated access text and keep the fields at positions 1, 5 and 9
# as the three station names; missing ones become "none".
access_fields = train_and_test["アクセス"].str.split("\t", expand=True)
station = access_fields.iloc[:, 1:12:4].fillna("none")
station.columns = ["最寄り駅1", "最寄り駅2", "最寄り駅3"]
# Only the first station is kept as the listing's station name.
train_and_test["駅名"] = station["最寄り駅1"]

In [None]:
# Normalise station names: unify the small/large "ke" character and strip any
# parenthesised suffix. `regex=True` is required on pandas >= 2.0, where
# `str.replace` treats the pattern literally by default and would strip nothing.
train_and_test["駅名"] = (
    train_and_test["駅名"]
    .str.replace("ケ", "ヶ", regex=False)
    .str.replace(r"\(.*\)", "", regex=True)
)

In [None]:
# Landmark names to geocode later.
landmark = [
    "東京スカイツリー",
    "東京タワー",
    "雷門",
    "表参道ヒルズ",
    "渋谷スクランブルスクエア",
    "東京ドーム",
    "銀座",
    "池袋サンシャインシティー",
    "日本武道館",
    "皇居",
    "赤坂サカス",
    "虎ノ門ヒルズ",
    "調布パルコ",
]
# GeoJSON file read into `gdf` (this rebinds the name used for the land-price data).
station_people = "./opendata/交通流動量.geojson"
gdf = gpd.read_file(station_people)
# Distance to the nearest station

In [None]:
# Raw attribute codes -> station name and passenger-count columns.
columns = {
    "S12_001": "駅名",
    "S12_041": "乗降客数2019",
    "S12_037": "乗降客数2018",
}
c = [*columns.values()]
# Keep only the renamed columns.
gdf = gdf.rename(columns=columns)
gdf = gdf[c]

In [None]:
# Append the "駅" suffix and strip any parenthesised part so the names match the
# listing station names. `regex=True` is required on pandas >= 2.0, where
# `str.replace` treats the pattern literally by default and would strip nothing.
gdf["駅名"] = gdf["駅名"]+"駅"
gdf["駅名"] = gdf["駅名"].str.replace(r"\(.*\)", "", regex=True)
# Several rows can share a station name; average them into one row per station.
gdf = gdf.groupby("駅名").mean().reset_index()

In [None]:
# Attach the passenger counts by station name.
# NOTE(review): an inner join drops listings whose station has no match — confirm
# that losing those rows (including test rows) is intended.
train_and_test = train_and_test.merge(gdf, how="inner", on="駅名")

In [None]:
# Count the missing values per column after the station merge.
train_and_test.isnull().sum()

In [None]:
# (rows, columns) after the station merge.
train_and_test.shape

In [None]:
# Geocode every landmark; any geocoding error propagates and stops the cell.
loc_dict = []
for loc in tqdm(landmark):
    coords = geolocation(loc)
    loc_dict.append(dict(loc=loc, lat=coords[0], lng=coords[1]))
loc_df = pd.DataFrame(loc_dict)

In [None]:
# Display the geocoded landmarks for a visual check.
loc_df

In [None]:
from pyproj import Geod

obj_altitude = 1000
# Geodesic calculator on the WGS84 ellipsoid.
q = Geod(ellps='WGS84')

# Reload the cached geocoding results.
station_loc_df, address_loc_df = (
    pd.read_csv(csv_path)
    for csv_path in ("location_data/station_loc.csv", "location_data/address_loc.csv")
)

In [None]:
# Geodesic distance from every geocoded address to each landmark.
# The repeat count follows the address table instead of a hard-coded row count,
# so the cell keeps working when the number of geocoded addresses changes.
distance = {}
n_addresses = len(address_loc_df)
for index, landmark_row in loc_df.iterrows():
    # Geod.inv returns (forward azimuth, back azimuth, distance); only the distance is kept.
    fa, ba, d = q.inv(
        address_loc_df['lng'],
        address_loc_df['lat'],
        [landmark_row["lng"]] * n_addresses,
        [landmark_row["lat"]] * n_addresses,
    )
    distance[f"{landmark_row['loc']}_距離"] = d

In [None]:
# One distance column per landmark, one row per address.
dist_df = pd.DataFrame(distance)

In [None]:
# Display the landmark distance table for a visual check.
dist_df

In [None]:
# Put the landmark distances next to the addresses, drop the raw coordinates and
# rename the key column so it can be merged on the listing address.
dist_df = pd.DataFrame(distance)
address_loc_df = (
    pd.concat([address_loc_df, dist_df], axis="columns")
    .drop(columns=["lat", "lng"])
    .rename(columns={"loc": "所在地"})
)

In [None]:
# Attach the landmark distances by address; a left join keeps every listing.
train_and_test = train_and_test.merge(address_loc_df, how="left", on="所在地")

In [None]:
# Count the missing values per column after the landmark merge.
train_and_test.isnull().sum()

In [None]:
# (rows, columns) after the landmark merge.
train_and_test.shape

In [None]:
# Key columns needed to pair each listing with its station and its address.
sample = train_and_test[["id","所在地","駅名"]]
address_loc_df = pd.read_csv("location_data/address_loc.csv")

# Normalise the geocoded station names: unify the small/large "ke" character and
# strip any parenthesised suffix. `regex=True` is required on pandas >= 2.0, where
# `str.replace` treats the pattern literally by default.
station_loc_df["loc"] = station_loc_df["loc"].str.replace("ケ", "ヶ", regex=False)
station_loc_df["loc"] = station_loc_df["loc"].str.replace(r"\(.*\)", "", regex=True)

# Hand-entered coordinates for two stations. The former `station_loc_df.append(...)`
# calls discarded their result (and `append` was removed in pandas 2.0), so these
# rows were never added. Only stations not already present are appended, which
# also keeps the cell safe to re-run.
extra_stations = pd.DataFrame([
    {'loc': "西日暮里駅", 'lat': 35.73254311759141, 'lng': 139.76781815554125},
    {'loc': "日暮里駅", 'lat': 35.72835810149057, 'lng': 139.77064139815946},
])
missing_stations = extra_stations[~extra_stations["loc"].isin(station_loc_df["loc"])]
if not missing_stations.empty:
    station_loc_df = pd.concat([station_loc_df, missing_stations], ignore_index=True)

In [None]:
# Station coordinates get the `_x` suffix and address coordinates the `_y` suffix,
# because both lookup tables share the loc / lat / lng column names.
# Inner joins keep only listings found in both lookup tables.
sample = sample.merge(station_loc_df, how="inner", left_on="駅名", right_on="loc")
sample = sample.merge(address_loc_df, how="inner", left_on="所在地", right_on="loc")

In [None]:
# Keep the first row per listing id (duplicate lookup names can multiply rows in the merges).
sample = sample.drop_duplicates(subset="id")

In [None]:
# Geodesic distance between each listing's station (`_x`) and its address (`_y`),
# computed with a single vectorised Geod.inv call instead of one call per row.
# Copies are passed so the coordinate columns of `sample` cannot be modified.
fa, ba, d = q.inv(
    sample['lng_x'].to_numpy(dtype=float, copy=True),
    sample['lat_x'].to_numpy(dtype=float, copy=True),
    sample["lng_y"].to_numpy(dtype=float, copy=True),
    sample["lat_y"].to_numpy(dtype=float, copy=True),
)
# Keep a plain list with one distance per row of `sample`, in row order.
distance = list(d)

In [None]:
# Sanity check: there should be exactly one distance per row of `sample`.
len(sample), len(distance)

In [None]:
# Turn the distances into a single column (named 0) and attach it by position,
# then give the column its final name.
dist_df = pd.DataFrame(distance)
sample = (
    pd.concat([sample.reset_index(), dist_df], axis="columns")
    .rename(columns={0: "最寄り駅までの距離"})
)

In [None]:
# Keep only the listing id and its distance to the station.
keep_columns = ["id", "最寄り駅までの距離"]
sample = sample[keep_columns]

In [None]:
# Displays the de-duplicated rows only; the result is not assigned, so `sample`
# itself is unchanged.
sample.drop_duplicates()

In [None]:
# Attach the distance to the station by listing id.
# NOTE(review): an inner join drops listings that have no row in `sample` — confirm
# that losing those rows is intended.
train_and_test = train_and_test.merge(sample, how="inner", on="id")

In [None]:
# Count the missing values per column after the station-distance merge.
train_and_test.isnull().sum()

In [None]:
# (rows, columns) after the station-distance merge.
train_and_test.shape

In [None]:
# Replace the station names with integer codes; `uniques` keeps the names in code order.
labels, uniques = pd.factorize(train_and_test["駅名"])
train_and_test = train_and_test.assign(**{"駅名": labels})
train_and_test

In [None]:
# Rows with a 賃料 value form the training set; rows without one form the test set.
has_rent = train_and_test['賃料'].notnull()
train = train_and_test.loc[has_rent].copy().reset_index(drop=True)
test = train_and_test.loc[~has_rent].copy().reset_index(drop=True)
# del train_and_test

for frame, csv_path in (
    (train, "data/train_processed_add_geocoding_meshcode_crime_landprice_ladmark_log_groupby.csv"),
    (test, "data/test_processed_add_geocoding_meshcode_crime_landprice_ladmark_log_groupby.csv"),
):
    frame.to_csv(csv_path, index=False)

In [None]:
# (rows, columns) of the final train and test tables.
train.shape, test.shape