{ "cells": [ { "cell_type": "markdown", "id": "4e391b33-ae37-4419-9b95-c0c88ecc9ee6", "metadata": {}, "source": [ "# Token 을 벡터로 임베딩\n", "- [pypi kss](https://github.com/hyunwoongko/kss)\n", "- [FastText 단어 유사도 구현하기](https://sunghee2.tistory.com/entry/FastText-%EB%8B%A8%EC%96%B4-%EC%9C%A0%EC%82%AC%EB%8F%84-%EA%B5%AC%ED%98%84%ED%95%98%EA%B8%B0)\n", "- [FastText model을 이용한 감정 분석 모델(pytorch)](https://happy-jihye.github.io/nlp/nlp-3/)\n", "- [Bag of Word with Bag 한글해설](https://m.blog.naver.com/antler07/221476398640)\n", "- [Word2vec from Scratch with NumPy](https://nathanrooy.github.io/posts/2018-03-22/word2vec-from-scratch-with-python-and-numpy/)\n", "- [AwesomeKorean Data](https://github.com/songys/AwesomeKorean_Data)\n", "```bash\n", "! pip install python-mecab-ko -U\n", "```" ] }, { "cell_type": "code", "execution_count": 3, "id": "ac723d00-aab2-4c69-b18e-0cdb5ac7fbcd", "metadata": {}, "outputs": [], "source": [ "%reset -f" ] }, { "cell_type": "code", "execution_count": 7, "id": "f69578f0-23ca-4c51-84fb-5bcd41465042", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['회사 동료 분들과 다녀왔는데 분위기도 좋고 음식도 맛있었어요',\n", " '다만, 강남 토끼정이 강남 쉑쉑버거 골목길로 쭉 올라가야 하는데 다들 쉑쉑버거의 유혹에 넘어갈 뻔 했답니다',\n", " '강남역 맛집 토끼정의 외부 모습.']" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import kss\n", "import mecab\n", "text = \"회사 동료 분들과 다녀왔는데 분위기도 좋고 음식도 맛있었어요 다만, 강남 토끼정이 강남 쉑쉑버거 골목길로 쭉 올라가야 하는데 다들 쉑쉑버거의 유혹에 넘어갈 뻔 했답니다 강남역 맛집 토끼정의 외부 모습.\"\n", "kss.split_morphemes(text, backend=\"mecab\", num_workers=8)\n", "# kss.split_sentences(text, backend=\"mecab\", num_workers=8, ignores=[])" ] }, { "cell_type": "code", "execution_count": 5, "id": "6d123730-a044-4e12-affb-941fb6cd2f5a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('자연주의', 'NNG'),\n", " ('쇼핑몰', 'NNG'),\n", " ('은', 'JX'),\n", " ('어떤', 'MM'),\n", " ('곳', 'NNG'),\n", " ('인가', 'VCP+EF'),\n", " ('?', 'SF')]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from mecab import MeCab\n", "mecab = MeCab()\n", "mecab.pos('자연주의 쇼핑몰은 어떤 곳인가?')" ] }, { "cell_type": "code", "execution_count": 28, "id": "575ee639-3618-4574-880c-d10468caa54f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Morpheme(span=Span(start=0, end=3), surface='즐거운', feature=Feature(pos='VA+ETM', semantic=None, has_jongseong=True, reading='즐거운', type='Inflect', start_pos='VA', end_pos='ETM', expression='즐겁/VA/*+ᆫ/ETM/*')),\n", " Morpheme(span=Span(start=4, end=6), surface='하루', feature=Feature(pos='NNG', semantic=None, has_jongseong=False, reading='하루', type=None, start_pos=None, end_pos=None, expression=None)),\n", " Morpheme(span=Span(start=7, end=9), surface='보내', feature=Feature(pos='VV', semantic=None, has_jongseong=False, reading='보내', type=None, start_pos=None, end_pos=None, expression=None)),\n", " Morpheme(span=Span(start=9, end=11), surface='세요', feature=Feature(pos='EP+EF', semantic=None, has_jongseong=False, reading='세요', type='Inflect', start_pos='EP', end_pos='EF', expression='시/EP/*+어요/EF/*')),\n", " Morpheme(span=Span(start=11, end=12), surface='!', feature=Feature(pos='SF', semantic=None, has_jongseong=None, reading=None, type=None, start_pos=None, end_pos=None, expression=None))]" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mecab.parse('즐거운 하루 보내세요!')" ] }, { "cell_type": "code", "execution_count": 15, "id": "ce4c99ec-7256-4f0f-9497-8cc84efd3361", "metadata": {}, "outputs": [], "source": [ "from gensim.models import FastText\n", "FastText.load_fasttext_format(" ] }, { "cell_type": "code", "execution_count": 1, "id": "ab3a97a8-c233-45b2-b913-f3a6cd7746ed", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "zsh:1: parse error near `&'\n" ] } ], "source": [ "!curl -c ./cookie -s -L \"https://drive.google.com/uc?export=download&id=1CQT4Sear6NKxGiZIW3WpAGkTanO0azrl\" > /dev/null\n", "!curl -Lb ./cookie \\\"https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=1CQT4Sear6NKxGiZIW3WpAGkTanO0azrl\\\" -o wiki_20190620_small.txt" ] }, { "cell_type": "code", "execution_count": 2, "id": "4b68c8db-40b1-4558-a0fb-3a0eb5e6bf27", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "zsh:1: no matches found: \"https://drive.google.com/uc?export=download\n" ] } ], "source": [ "!curl -c ./cookie -s -L \\\"https://drive.google.com/uc?export=download&id=1CQT4Sear6NKxGiZIW3WpAGkTanO0azrl\\\" > /dev/null" ] }, { "cell_type": "code", "execution_count": 12, "id": "9238249a-9fd9-490b-826a-94a067da2487", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['영등포구청역', '에', '있', '는', '맛집', '좀', '알려', '주', '세요', '.']" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from mecab import MeCab\n", "mecab = MeCab()\n", "mecab.morphs('영등포구청역에 있는 맛집 좀 알려주세요.')" ] }, { "cell_type": "code", "execution_count": null, "id": "a06e19f3-fddd-4a58-a643-ed849a854f08", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11" } }, "nbformat": 4, "nbformat_minor": 5 }