{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### 변수명으로 예약어를 사용하는 것은 무조건 감점입니다. 주의해주세요."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 모범답안 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import re\n",
    "import string\n",
    "import html\n",
    "def countWords(url):\n",
    "    req = requests.get(url)\n",
    "    source = html.unescape(req.text).replace('><', '>\\n<')\n",
    "    line = source.split('\\n')\n",
    "    regex = re.compile('>.*?<')\n",
    "    words = []\n",
    "    for i in line:\n",
    "        if ('<script>' in i) or ('<style>' in i): continue\n",
    "        g = regex.search(i)\n",
    "        if g != None:\n",
    "            result = [r for r in g.group()[1:-1].strip().split()]\n",
    "            for r in range(len(result)):\n",
    "                for p in string.punctuation:\n",
    "                     result[r] = result[r].replace(p, '')\n",
    "            words.extend([w for w in result if w != ''])\n",
    "    resultDic = {}\n",
    "    for i in words:\n",
    "        resultDic[i] = len(source.split(i))\n",
    "    return resultDic\n",
    "urls = ['http://cse.koreatech.ac.kr', 'https://www.koreatech.ac.kr', 'http://www.naver.com', 'http://www.daum.net', 'http://www.nytimes.com']\n",
    "for u in urls:\n",
    "    print(countWords(u), end='\\n\\n')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 모범답안2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extractText(source): #HTML태그를 제외하고 순수 텍스트만 추출하는 함수\n",
    "    text = \"\" #순수 텍스트 식별자\n",
    "    while(1) :      \n",
    "        source = source[source.find('<'):] #'<'를 찾는다\n",
    "        if (source.find(\"<!--\") == 0) : \n",
    "                source = source[source.find(\"-->\")+3:] #주석 내용 건너뜀 \n",
    "        elif (source.find(\"<script\") == 0) :        \n",
    "                source = source[source.find(\"</script>\")+9:]  #자바스크립트 내용 건너뜀   \n",
    "        elif (source.find(\"<style\") == 0) :\n",
    "                source = source[source.find(\"</style>\")+9:] #스타일 내용 건너뜀\n",
    "        else :       \n",
    "            source = source[source.find('>')+1:] #'>'를 찾는다\n",
    "            for ch in source :    #'>' 이후부터 한글자씩 가져온다\n",
    "                if ch == '<' : break    # 다시 '<' 문자 등장하면 종료\n",
    "                elif ch == '\\t' or ch == '\\n' : continue    #탭,개행 문자는 순수 텍스트에 포함x\n",
    "                else : text += ch   #순수 텍스트 추가     \n",
    "            if text.endswith(\" \") == False : text += \" \" #공백으로 분리하기 위해 순수 텍스트 추출 후 뒤에 공백문자 추가\n",
    "        if(source.find('<') == -1):break #'<' 문자가 더 이상 없는 경우 무한루프종료 \n",
    "    return text \n",
    "import string\n",
    "\n",
    "def replaceAll(text, dic):  ##문자열을 사전에 맵핑된 문자로 대체로 하는 함수\n",
    "    for i,j in dic.items():\n",
    "        text = text.replace(i,j,text.count(i)) #문자열에 사전에 키와 같은 문자가 존재하면 키에 맵핑된 value값으로 대체한다.\n",
    "    return text   \n",
    "\n",
    "def stripSpecialChar(text): ##특수문자 제거 함수\n",
    "    keys = [ch for ch in string.punctuation] #구두문자를 key리스트로 만듬\n",
    "    keys.append(\"★\") #구두문자 이외의 특수문자를 key리스트에 추가\n",
    "    keys.append(\"☆\")\n",
    "    keys.append(\"※\")\n",
    "    keys.append(\"nbsp\")#html문자\n",
    "    keys.append(\"lt\")\n",
    "    keys.append(\"amp\")\n",
    "    keys.append(\"quot\")\n",
    "    keys.append(\"middot\")\n",
    "    keys.append(\"uarr\")\n",
    "    keys.append(\"rarr\")\n",
    "    keys.append(\"darr\")\n",
    "    keys.append(\"harr\")\n",
    "    \n",
    "    values = ['' for _ in keys]  #key리스트의 원소 수 만큼 ''문자를 원소를 갖는 value리스트를 만듬\n",
    "    replaceDic = dict(zip(keys,values)) #key리스트 key로 value리스트를 value값으로 하는 사전 만듬\n",
    "\n",
    "    return replaceAll(text,replaceDic) #추출한 순수 텍스트에서 사전에 맵핑된 제거할 문자들을 모두 제거\n",
    "def wordOfFreq(l): #단어의 빈도수를 사전으로 반환하는 함수\n",
    "    word = []   \n",
    "    for w in l: #리스트의 각 원소들이\n",
    "        if (w in word) == False : word.append(w) #word리스트에 포함된 단어가 아니면 단어를 리스트에 추가 (단어중복제거)          \n",
    "    freg = [l.count(w) for w in l] #리스트에서 단어의 출현 빈도수를 리스트로 만듬\n",
    "    \n",
    "    return dict(zip(word,freg)) #단어와 단어의 빈도수로 맵핑된 사전 반환\n",
    "import requests\n",
    "\n",
    "req = requests.get('http://cse.koreatech.ac.kr')\n",
    "source = req.text\n",
    "text =  extractText(source)  #순수 텍스트 추출\n",
    "text = stripSpecialChar(text) #특수문자 제거\n",
    "l = text.split()            #공백단위분리\n",
    "print(wordOfFreq(l)) #출현한 단어의 빈도수"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 모범답안3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import re\n",
    "import string  \n",
    "\n",
    "def call(b):  \n",
    "    req = requests.get(b)\n",
    "    source = req.text\n",
    "    text1 = re.sub('<script.*?>.*?</script>', ' ', source, 0, re.I|re.S)\n",
    "    text2 = re.sub('<style.*?>.*?</style>', ' ', text1, 0, re.I|re.S)\n",
    "    text3 = re.sub('<.+?>', ' ', text2, 0, re.I|re.S)\n",
    "    text4 = re.sub('-->', ' ', text3, 0, re.I|re.S)\n",
    "    text5 = re.sub(\"[!|?|<|>|:|\\[|\\]|#|$|%|&|\\(|\\)|*|+|\\-|,|.|/|;|=|@|^|_|`|{|}|~|\\\"|\\'|\\\\\\|/]\",' ', text4, 0, re.I|re.S)\n",
    "    s=text5.split()\n",
    "    dic={}\n",
    "    j=0\n",
    "    for i in s:\n",
    "        if i in dic:\n",
    "            dic[i] += 1\n",
    "        else:\n",
    "            dic[i] = 1\n",
    "    print(dic)\n",
    "\n",
    "print('http://cse.koreatech.ac.kr')    \n",
    "call('http://cse.koreatech.ac.kr')\n",
    "print()\n",
    "\n",
    "print('https://www.koreatech.ac.kr')\n",
    "call('https://www.koreatech.ac.kr')\n",
    "print()\n",
    "\n",
    "print('http://www.naver.com')\n",
    "call('http://www.naver.com')\n",
    "print()\n",
    "\n",
    "print('http://www.daum.net')\n",
    "call('http://www.daum.net')\n",
    "print()\n",
    "\n",
    "print('http://www.nytimes.com')\n",
    "call('http://www.nytimes.com')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}