{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# **Excel 시트내용 검색**\n",
    "https://wikidocs.net/15103"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %%time\n",
    "# file = 'csv/Food-data.xlsx'\n",
    "# import xlrd\n",
    "# xls        = xlrd.open_workbook(file)\n",
    "# sheet_list = xls.sheet_names()[4:-8]\n",
    "\n",
    "# import pandas as pd\n",
    "# result, no = {}, 0\n",
    "# for sheet in sheet_list:\n",
    "#     df = pd.read_excel(file, sheet_name=sheet)\n",
    "#     df = df.fillna(method='ffill')\n",
    "#     df = df.loc[:,['메뉴명', '단품명', '1인량']]\n",
    "#     no += 1\n",
    "#     if no % 10 == 0: \n",
    "#         print('{:4,} th sheet working'.format(no))\n",
    "#     # 컬럼의 오타를 변경\n",
    "#     df.메뉴명 = list(map(lambda x : x.replace('\\n','') , list(df.메뉴명)))  \n",
    "#     temps   = {}\n",
    "#     for menu in list(set(df.메뉴명)):\n",
    "#         df_temp = df[df.메뉴명 == menu]\n",
    "#         foods   = {df_temp.iloc[idx,1]: df_temp.iloc[idx,2]  for idx in range(len(df_temp.단품명))}\n",
    "#         temps[menu] = foods\n",
    "#     for menu in temps.keys():\n",
    "#         # 기존에 없던 메뉴가 확인시\n",
    "#         if menu not in result.keys():\n",
    "#             result[menu] = temps[menu]\n",
    "#         # 메뉴가 있으면 레시피만 추가\n",
    "#         else:\n",
    "#             # 재료가 없으면 재료이름과 해당 내용을 추라\n",
    "#             for recipie in temps[menu].keys():\n",
    "#                 if recipie not in result[menu].keys():\n",
    "#                     result[menu][recipie] = temps[menu][recipie]\n",
    "# import json\n",
    "# data = json.dumps(result, ensure_ascii=False)\n",
    "# with open('food_recipie.json', 'w', encoding='UTF-8-sig') as file:\n",
    "#      file.write(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# **Menu List Rerange**\n",
    "메뉴 데이터를 Python Pandas 로 불러오기\n",
    "\n",
    "## **1 Read xls to Pandas**\n",
    "메뉴 데이터를 Python Pandas 로 불러오기\n",
    "1. **[공학도를 위한 파이썬 | OpenpyXl](https://wikidocs.net/15103)**\n",
    "1. 후처리 편의성을 위해선 **Pandas** 가 유용하다고 판단"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "file = 'data/2018_muyong_menu.xlsx'\n",
    "\n",
    "import xlrd\n",
    "xls        = xlrd.open_workbook(file)\n",
    "sheet_list = xls.sheet_names()\n",
    "len(sheet_list), sheet_list[::10]\n",
    "\n",
    "# from openpyxl import Workbook, load_workbook\n",
    "# wb        = load_workbook(file_name)\n",
    "# st_names  = [ws.title  for ws in wb.worksheets]`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>월 ( 1월22일)</td>\n",
       "      <td>화 (23일 )</td>\n",
       "      <td>수 (24일 )</td>\n",
       "      <td>목 (25일 )</td>\n",
       "      <td>금 (26일)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>청양콩나물국</td>\n",
       "      <td>조갯살미역</td>\n",
       "      <td>설렁탕/소면</td>\n",
       "      <td>참치김치찌개</td>\n",
       "      <td>훈제오리야채볶음밥</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>돈육맥적구이</td>\n",
       "      <td>허니버터치킨</td>\n",
       "      <td>탕수육</td>\n",
       "      <td>제육고추장불고기</td>\n",
       "      <td>돈까스/소스</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>중</td>\n",
       "      <td>마파두부</td>\n",
       "      <td>난자완스</td>\n",
       "      <td>오징어어묵볶음</td>\n",
       "      <td>생선까스튀김</td>\n",
       "      <td>어묵국</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0    1           2         3         4          5          6\n",
       "0 NaN  NaN         NaN       NaN       NaN        NaN        NaN\n",
       "1 NaN  NaN  월 ( 1월22일)  화 (23일 )  수 (24일 )  목 (25일 )     금 (26일)\n",
       "2 NaN  NaN      청양콩나물국     조갯살미역    설렁탕/소면     참치김치찌개  훈제오리야채볶음밥\n",
       "3 NaN  NaN      돈육맥적구이    허니버터치킨       탕수육   제육고추장불고기     돈까스/소스\n",
       "4 NaN    중        마파두부      난자완스   오징어어묵볶음     생선까스튀김        어묵국"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 대표적인 Sheet 내용 살펴보기\n",
    "import pandas as pd\n",
    "df = pd.read_excel(file, sheet_name=sheet_list[3],header=None)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## **2 Pre Processing**\n",
    "line 별 작업결과를 바탕으로 정리하기\n",
    "1. **Pandas to Series**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[nan, nan, '월 ( 1월22일)', '화 (23일 )', '수 (24일 )', '목 (25일 ) ', '금 (26일)']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 작업의 시작\n",
    "table_row = df.iloc[1,:]\n",
    "list(table_row.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[False, False, True, True, True, True, True]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# notnull() : True Value Count\n",
    "table_row.notnull().tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[True, True, False, False, False, False, False]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# isna() : Null Count\n",
    "table_row.isna().tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['월 ( 1월22일)', '화 (23일 )', '수 (24일 )', '목 (25일 ) ', '금 (26일)']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "table_row.dropna().tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'월 ( 1월22일) 화 (23일 ) 수 (24일 ) 목 (25일 )  금 (26일)'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\" \".join(table_row.dropna().tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['월', '월', '화', '수', '목', '금']\n",
      "주간 데이터 작업시작\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "tokens = re.findall(r\"(월|화|수|목|금)\", \n",
    "                    \" \".join(table_row.dropna().tolist()))\n",
    "print(tokens)\n",
    "if len(tokens) >= 4:\n",
    "    print(\"주간 데이터 작업시작\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## **3 점심/ 저녁 메뉴목록 수집하기**\n",
    "line 별 작업결과를 바탕으로 정리하기\n",
    "1. **Pandas to Series**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[nan, nan, '청양콩나물국', '조갯살미역', '설렁탕/소면', '참치김치찌개', '훈제오리야채볶음밥']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 작업의 시작\n",
    "table_row = df.iloc[2,:]\n",
    "list(table_row.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[nan, '식수', nan, nan, nan, nan, nan]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 작업의 시작\n",
    "table_row = df.iloc[12,:]\n",
    "list(table_row.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# notnull() : True Value Count\n",
    "sum(table_row.isna().tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['식수']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall(r\"식수\", \" \".join(list(table_row.dropna())))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall(r\"식수\", \"알고리즘 수수수\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## **4 세부 수집기 작업하기**\n",
    "pandas 와 regex 활용\n",
    "https://stackoverflow.com/questions/52173161/getting-a-list-of-indices-where-pandas-boolean-series-is-true"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import xlrd, re\n",
    "import pandas as pd\n",
    "\n",
    "# xls 파일에서 sheet list 추출\n",
    "def menu_sheet_names(file):\n",
    "    xls        = xlrd.open_workbook(file)\n",
    "    return file, xls.sheet_names()\n",
    "\n",
    "# 날짜 인덱스로 검색여부 확인\n",
    "def menu_valid_check(file, sheet_list, sheet_num):\n",
    "    df = pd.read_excel(file, sheet_name=sheet_list[sheet_num],header=None)\n",
    "    for _ in range(df.shape[0]): \n",
    "        tokens = re.findall(r\"(월|화|수|목|금)\", \n",
    "                        \" \".join(df.iloc[_,:].dropna().tolist()))\n",
    "        if len(tokens) >= 4:\n",
    "            #print(\"주간 데이터 작업시작\")\n",
    "            return df, _ #break\n",
    "    print(sheet_num, \"작업할 내용이 없습니다\")\n",
    "    return df, None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 유효값 데이터 추출하기\n",
    "def menu_valid_data(df, col_start):\n",
    "\n",
    "    # valid sector 추출함수\n",
    "    def menu_filter(df, rows, cols, col_name):\n",
    "        menu = df.iloc[rows[0]:rows[1], cols]\n",
    "        menu.columns = col_name\n",
    "        return menu.reset_index(drop=True)\n",
    "\n",
    "    # col_start : 요일값 인덱스 주소\n",
    "    # col_valid : 요일값 포함된 컬럼들\n",
    "    # col_data : 요얄값 컬럼 중 12개 이상 값이 있는 컬럼 만 추출\n",
    "    col_valid = list(df.iloc[col_start,:].dropna().index)\n",
    "    col_data = [ _ for _ in col_valid  if sum(df.iloc[:,_].notnull()) > 12] \n",
    "    col_name = df.iloc[col_start, col_data].tolist()\n",
    "    col_name = [_.strip().replace(\" \",\"\") for _ in col_name]\n",
    "\n",
    "    # 요일별 12개 이상 유효값 있는 필드 인덱스 추출\n",
    "    row_valid = df.iloc[:, col_data[0]].isna()\n",
    "    row_valid = list(row_valid[row_valid].index)[:3]\n",
    "    row_valid[0] = col_start\n",
    "    limit_row = [ [row_valid[0]+1, row_valid[1]], [1+row_valid[1], row_valid[2]-1] ]\n",
    "    result = []\n",
    "    for no,_ in enumerate(limit_row):\n",
    "        midname = {0:\"중식\", 1:\"석식\"}\n",
    "        menu_temp = menu_filter(df, _, col_data, col_name)\n",
    "        result += [(_,midname[no],\";\".join(menu_temp[_]))    for _ in list(menu_temp.columns)]\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 53/53 [00:04<00:00, 12.51it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "52 작업할 내용이 없습니다\n",
      "(462, 3)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>화(2일)</td>\n",
       "      <td>중식</td>\n",
       "      <td>사골우거지해장국;제육간장불고기;꽃게양념무침;두부구이&amp;양념장;양상추샐러드/흑임자D;구...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>수(3일)</td>\n",
       "      <td>중식</td>\n",
       "      <td>소고기미역국;묵은지닭찜;오징어까스&amp;어니언소스;어묵간장볶음;멕시칸샐러드;비빔막국수;봄...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>목(4일)</td>\n",
       "      <td>중식</td>\n",
       "      <td>얼갈이된장국;돈육맥적구이;해물누룽지탕;건파래실치볶음;양상추샐러드/키위D;진미채야채초...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       0   1                                                  2\n",
       "0  화(2일)  중식  사골우거지해장국;제육간장불고기;꽃게양념무침;두부구이&양념장;양상추샐러드/흑임자D;구...\n",
       "1  수(3일)  중식  소고기미역국;묵은지닭찜;오징어까스&어니언소스;어묵간장볶음;멕시칸샐러드;비빔막국수;봄...\n",
       "2  목(4일)  중식  얼갈이된장국;돈육맥적구이;해물누룽지탕;건파래실치볶음;양상추샐러드/키위D;진미채야채초..."
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "file = 'data/2018_muyong_menu.xlsx'\n",
    "file_name, sheet_list = menu_sheet_names(file)\n",
    "result = []\n",
    "from tqdm import tqdm\n",
    "for _ in tqdm(range(len(sheet_list))):\n",
    "    table_sheet, i = menu_valid_check(file_name, sheet_list, _)\n",
    "    if i:\n",
    "        try: result += menu_valid_data(table_sheet, i)\n",
    "        except: pass\n",
    "menu_table = pd.DataFrame(result)\n",
    "print(menu_table.shape)\n",
    "menu_table.head(3)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}