{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# **식약처 레시피 데이터 메뉴명과 연동**\n", "# **1 농축산식품 레시피 수집기**\n", "## **01 농축산식품 API 및 CSV 전처리**\n", "**[레시피 재료정보 API](http://data.mafra.go.kr/opendata/data/indexOpenDataDetail.do?data_id=20150827000000000465&filter_ty=O&getBack=G&sort_id=&s_data_nm=&instt_id=&cl_code=&shareYn=)**\n", "1. **[glob 모듈에서 파일목록 sorting](https://redcarrot.tistory.com/222)**" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "./data/레시피_메뉴정보_20191220.csv ./data/레시피_조리법_20191220.csv\n", "./data/레시피_재료정보_20191220.csv ./data/레시피_조리법_원본_20191220.csv\n" ] } ], "source": [ "ls ./data/레시피*.*" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'Grid_20150827000000000227_1': {'totalCnt': 6104,\n", " 'startRow': 1,\n", " 'endRow': 2,\n", " 'result': {'code': 'INFO-000', 'message': '정상 처리되었습니다.'},\n", " 'row': [{'ROW_NUM': 1,\n", " 'RECIPE_ID': 1,\n", " 'IRDNT_SN': 1,\n", " 'IRDNT_NM': '쌀',\n", " 'IRDNT_CPCTY': '4컵',\n", " 'IRDNT_TY_CODE': '3060001',\n", " 'IRDNT_TY_NM': '주재료'},\n", " {'ROW_NUM': 2,\n", " 'RECIPE_ID': 1,\n", " 'IRDNT_SN': 2,\n", " 'IRDNT_NM': '안심',\n", " 'IRDNT_CPCTY': '200g',\n", " 'IRDNT_TY_CODE': '3060001',\n", " 'IRDNT_TY_NM': '주재료'}]}}" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# API 를 활용한 데이터 호출\n", "url = \"http://211.237.50.150:7080/openapi/{API_KEY}/json/{OpenAPI 서비스 URL}/{START_INDEX}/{END_INDEX}\"\n", "apiKey = 'e5e7753740311d44c7c15c7b5744f5723109b9f5c46eedc47fa7e183ceee22fa'\n", "url = \"http://211.237.50.150:7080/openapi/\"+apiKey +\"/json/Grid_20150827000000000227_1/1/2\"\n", "url_sample = \"http://211.237.50.150:7080/openapi/sample/json/Grid_20150827000000000227_1/1/2\"\n", "from urllib import parse, request\n", "import json\n", "json.loads(request.urlopen(url).read().decode('utf-8'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## **02 농축산식품 API 및 CSV 전처리**\n", "CSV 다운파일 중 전처리 작업의 진행" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
레시피 코드요리설명순서요리설명과정 이미지 URL과정팁
01954535모양을 낸 반죽에 밀가루를 묻히고, 계란을 푼 계란물을 묻혀준다.http://file.okdab.com/recipe/14829957727100013...\\n
11954534동그랑땡 반죽을 동글납작하게 모양을 잡아준다.http://file.okdab.com/recipe/14829957727060013...\\n
\n", "
" ], "text/plain": [ " 레시피 코드 요리설명순서 요리설명 \\\n", "0 195453 5 모양을 낸 반죽에 밀가루를 묻히고, 계란을 푼 계란물을 묻혀준다. \n", "1 195453 4 동그랑땡 반죽을 동글납작하게 모양을 잡아준다. \n", "\n", " 과정 이미지 URL 과정팁 \n", "0 http://file.okdab.com/recipe/14829957727100013... \\n \n", "1 http://file.okdab.com/recipe/14829957727060013... \\n " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 조리법 데이터 전처리 재작업 (내부 \",\" 와 구분자 \",\" 를 혼용)\n", "with open(\"data/레시피_조리법_원본_20191220.csv\",\"r\", encoding=\"cp949\") as f:\n", " data = f.readlines()\n", "\n", "# 구분자 재정의 하기\n", "import pandas as pd\n", "data = [_.replace('\",', '\"|') for _ in data]\n", "data = [_.split(\"|\") for _ in data]\n", "df = pd.DataFrame(data[1:]) # 컬럼을 제외한 데이터만 입력\n", "data[0][0] = data[0][0].replace('\\n','')\n", "df.columns = data[0][0].split(\",\")\n", "\n", "# 데이터 내부에 포함된 '\"' 지우기\n", "for _ in df.columns:\n", " df[_] = df[_].fillna('')\n", " df[_] = list(map(lambda x : x.replace('\"',''), df[_]))\n", "df.to_csv(\"data/레시피_조리법_20191220.csv\", sep=\"|\", index=None)\n", "df.head(2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# {df_menus_codes[k]:v for k,v in df_tips.items()}\n", "# df_menu_codes[527]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# **2 CSV 다운로드 파일 살피기 및 합치기**\n", "## **01 레시피 다운로드 CSV**\n", "```\n", "['data/레시피_메뉴정보_20191220.csv',\n", " 'data/레시피_재료정보_20191220.csv',\n", " 'data/레시피_조리법_20191220.csv']\n", "```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 다운로드 파일을 활용한 자료활용\n", "from glob import glob\n", "food_list = sorted(glob(\"data/레시피*.*\"))\n", "food_list" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "df_menus = pd.read_csv(food_list[0], encoding='cp949')\n", "df_menus.head(2)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'food_list' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mdf_menus\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfood_list\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'cp949'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mdf_menus_codes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_menus\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"레시피 코드\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mdf_menus\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"레시피 이름\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_menus\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mdf_tips\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'레시피 코드'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0m_\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m과정팁\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0m_\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m과정팁\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m과정팁\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0m_\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m>\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'food_list' is not defined" ] } ], "source": [ "# 레시피 유용한 팁들\n", "# 자연어 검색엔진을 활용 및 적용하기\n", "\n", "import pandas as pd\n", "df_menus = pd.read_csv(food_list[0], encoding='cp949')\n", "df_menus_codes = {str(df_menus.loc[_,\"레시피 코드\"]):df_menus.loc[_,\"레시피 이름\"] for _ in range(len(df_menus))}\n", "df_tips = {df['레시피 코드'][_]:df.과정팁[_].strip() for _ in range(len(df.과정팁)) if len(df.과정팁[_].strip())>3}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_recipe = pd.read_csv(food_list[1], encoding='cp949')\n", "df_recipe.head(2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_cook = pd.read_csv(food_list[2], sep=\"|\")\n", "# df_cook[\"레시피 코드\"] = list(map(lambda x:int(x), df_cook[\"레시피 코드\"]))\n", "# df_cook[\"요리설명순서\"] = list(map(lambda x:int(x), df_cook[\"요리설명순서\"]))\n", "df_cook.head(2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_menus.head(2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## **02 CSV 파일 합치기**\n", "```\n", "['data/레시피_메뉴정보_20191220.csv',\n", " 'data/레시피_재료정보_20191220.csv',\n", " 'data/레시피_조리법_20191220.csv']\n", "```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "# 메뉴 수집용 Code 를 dict 으로 추출\n", "df_menus = pd.read_csv(food_list[0], encoding='cp949')\n", "df_menu_codes = {df_menus.loc[_][\"레시피 코드\"]:df_menus.loc[_][\"레시피 이름\"] for _ in range(len(df_menus))}\n", "\n", "# 세부 레시피 수집기 작업시작\n", "result = []\n", "for _ in df_menu_codes.keys():\n", " recipe = df_recipe[df_recipe['레시피 코드'] == _]\n", " result.append({recipe.loc[_]['재료명']:recipe.loc[_]['재료용량'] for _ in list(recipe.index)})\n", "\n", "# 메뉴 컬럼내용 수집 및 전처리 작업의 진행\n", "result = list(map(lambda x: json.dumps(x, ensure_ascii=False), result))\n", "df_menus.insert(3,'상세 레시피', result)\n", "df_menus_columns = ['레시피 이름', \"레시피 코드\", '상세 레시피', '음식분류코드',\\\n", " '음식분류','조리시간','분량','대표이미지 URL']\n", "df_menus = df_menus.reindex(columns=df_menus_columns)\n", "# df_menus = df_menus.loc[:, df_menus_columns]\n", "df_menus.to_csv(\"data/food_recipe_marf.csv\", index=None)\n", "df_menus.head(2)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 4 }