"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Image(url='https://git.io/JLdV0', width=700)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 234
},
"execution": {
"iopub.execute_input": "2020-12-31T14:53:05.601333Z",
"iopub.status.busy": "2020-12-31T14:53:05.600692Z",
"iopub.status.idle": "2020-12-31T14:53:05.618008Z",
"shell.execute_reply": "2020-12-31T14:53:05.617346Z"
},
"id": "esPfmEMMt3hb",
"outputId": "4f502652-e04b-46f9-e4e4-9330469cee2f"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.12/dist-packages/keras/src/layers/core/embedding.py:97: UserWarning: Argument `input_length` is deprecated. Just remove it.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/html": [
"Model: \"sequential\"\n",
"\n"
],
"text/plain": [
"\u001b[1mModel: \"sequential\"\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
"┃ Layer (type) ┃ Output Shape ┃ Param # ┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
"│ embed-layer (Embedding) │ ? │ 0 (unbuilt) │\n",
"└─────────────────────────────────┴────────────────────────┴───────────────┘\n",
"\n"
],
"text/plain": [
"┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
"┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
"│ embed-layer (\u001b[38;5;33mEmbedding\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"└─────────────────────────────────┴────────────────────────┴───────────────┘\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Total params: 0 (0.00 B)\n",
"\n"
],
"text/plain": [
"\u001b[1m Total params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Trainable params: 0 (0.00 B)\n",
"\n"
],
"text/plain": [
"\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Non-trainable params: 0 (0.00 B)\n",
"\n"
],
"text/plain": [
"\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from tensorflow.keras.layers import Embedding\n",
"\n",
"\n",
"model = tf.keras.Sequential()\n",
"\n",
"model.add(Embedding(input_dim=100,\n",
" output_dim=6,\n",
" name='embed-layer'))\n",
"\n",
"model.summary()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "rqwKaKkqt3hb"
},
"source": [
"### RNN 모델 만들기\n",
"\n",
"* **케라스 RNN 층:**\n",
" * `tf.keras.layers.SimpleRNN(units, return_sequences=False)`\n",
" * `tf.keras.layers.LSTM(..)`\n",
" * `tf.keras.layers.GRU(..)`\n",
" * `tf.keras.layers.Bidirectional()`\n",
"\n",
"* **`return_sequenes=?` 결정하기**\n",
" * 다층 RNN이면 마지막 층을 제외하고 모든 RNN 층을 `return_sequenes=True`로 지정합니다\n",
" * 마지막 RNN 층은 문제의 종류에 따라 결정됩니다:\n",
" * 다대다: -> `return_sequences=True`\n",
" * 다대일: -> `return_sequenes=False`"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 283
},
"execution": {
"iopub.execute_input": "2020-12-31T14:53:05.626167Z",
"iopub.status.busy": "2020-12-31T14:53:05.625414Z",
"iopub.status.idle": "2020-12-31T14:53:05.706279Z",
"shell.execute_reply": "2020-12-31T14:53:05.707065Z"
},
"id": "oy4iAXCHt3hc",
"outputId": "d132f90c-e516-462b-f9d0-1acd60ae0b49"
},
"outputs": [
{
"data": {
"text/html": [
"Model: \"sequential_1\"\n",
"\n"
],
"text/plain": [
"\u001b[1mModel: \"sequential_1\"\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
"┃ Layer (type) ┃ Output Shape ┃ Param # ┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
"│ embedding (Embedding) │ ? │ 0 (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ simple_rnn_6 (SimpleRNN) │ ? │ 0 (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ simple_rnn_7 (SimpleRNN) │ ? │ 0 (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense (Dense) │ ? │ 0 (unbuilt) │\n",
"└─────────────────────────────────┴────────────────────────┴───────────────┘\n",
"\n"
],
"text/plain": [
"┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
"┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
"│ embedding (\u001b[38;5;33mEmbedding\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ simple_rnn_6 (\u001b[38;5;33mSimpleRNN\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ simple_rnn_7 (\u001b[38;5;33mSimpleRNN\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense (\u001b[38;5;33mDense\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"└─────────────────────────────────┴────────────────────────┴───────────────┘\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Total params: 0 (0.00 B)\n",
"\n"
],
"text/plain": [
"\u001b[1m Total params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Trainable params: 0 (0.00 B)\n",
"\n"
],
"text/plain": [
"\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Non-trainable params: 0 (0.00 B)\n",
"\n"
],
"text/plain": [
"\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"## SimpleRNN 층으로 RNN 모델 만들기\n",
"from tensorflow.keras import Sequential\n",
"from tensorflow.keras.layers import Embedding\n",
"from tensorflow.keras.layers import SimpleRNN\n",
"from tensorflow.keras.layers import Dense\n",
"\n",
"model = Sequential()\n",
"model.add(Embedding(1000, 32))\n",
"model.add(SimpleRNN(32, return_sequences=True))\n",
"model.add(SimpleRNN(32))\n",
"model.add(Dense(1))\n",
"model.summary()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 283
},
"execution": {
"iopub.execute_input": "2020-12-31T14:53:05.715529Z",
"iopub.status.busy": "2020-12-31T14:53:05.714882Z",
"iopub.status.idle": "2020-12-31T14:53:06.010097Z",
"shell.execute_reply": "2020-12-31T14:53:06.009262Z"
},
"id": "hP_dd6GUt3hc",
"outputId": "09d5c531-635b-4cf8-a87c-c9147483ca37"
},
"outputs": [
{
"data": {
"text/html": [
"Model: \"sequential_2\"\n",
"\n"
],
"text/plain": [
"\u001b[1mModel: \"sequential_2\"\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
"┃ Layer (type) ┃ Output Shape ┃ Param # ┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
"│ embedding_1 (Embedding) │ ? │ 0 (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ lstm (LSTM) │ ? │ 0 (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ lstm_1 (LSTM) │ ? │ 0 (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_1 (Dense) │ ? │ 0 (unbuilt) │\n",
"└─────────────────────────────────┴────────────────────────┴───────────────┘\n",
"\n"
],
"text/plain": [
"┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
"┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
"│ embedding_1 (\u001b[38;5;33mEmbedding\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ lstm (\u001b[38;5;33mLSTM\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ lstm_1 (\u001b[38;5;33mLSTM\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_1 (\u001b[38;5;33mDense\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"└─────────────────────────────────┴────────────────────────┴───────────────┘\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Total params: 0 (0.00 B)\n",
"\n"
],
"text/plain": [
"\u001b[1m Total params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Trainable params: 0 (0.00 B)\n",
"\n"
],
"text/plain": [
"\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Non-trainable params: 0 (0.00 B)\n",
"\n"
],
"text/plain": [
"\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"## LSTM 층으로 RNN 모델 만들기\n",
"from tensorflow.keras.layers import LSTM\n",
"\n",
"\n",
"model = Sequential()\n",
"model.add(Embedding(10000, 32))\n",
"model.add(LSTM(32, return_sequences=True))\n",
"model.add(LSTM(32))\n",
"model.add(Dense(1))\n",
"model.summary()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 283
},
"execution": {
"iopub.execute_input": "2020-12-31T14:53:06.020415Z",
"iopub.status.busy": "2020-12-31T14:53:06.018360Z",
"iopub.status.idle": "2020-12-31T14:53:06.272493Z",
"shell.execute_reply": "2020-12-31T14:53:06.271598Z"
},
"id": "6UQfBf4yt3hc",
"outputId": "4d75be4b-ac9f-406d-ad9c-bc892d370f7f"
},
"outputs": [
{
"data": {
"text/html": [
"Model: \"sequential_3\"\n",
"\n"
],
"text/plain": [
"\u001b[1mModel: \"sequential_3\"\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
"┃ Layer (type) ┃ Output Shape ┃ Param # ┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
"│ embedding_2 (Embedding) │ ? │ 0 (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ gru (GRU) │ ? │ 0 (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ gru_1 (GRU) │ ? │ 0 (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_2 (Dense) │ ? │ 0 (unbuilt) │\n",
"└─────────────────────────────────┴────────────────────────┴───────────────┘\n",
"\n"
],
"text/plain": [
"┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
"┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
"│ embedding_2 (\u001b[38;5;33mEmbedding\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ gru (\u001b[38;5;33mGRU\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ gru_1 (\u001b[38;5;33mGRU\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_2 (\u001b[38;5;33mDense\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"└─────────────────────────────────┴────────────────────────┴───────────────┘\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Total params: 0 (0.00 B)\n",
"\n"
],
"text/plain": [
"\u001b[1m Total params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Trainable params: 0 (0.00 B)\n",
"\n"
],
"text/plain": [
"\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Non-trainable params: 0 (0.00 B)\n",
"\n"
],
"text/plain": [
"\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"## GRU 층으로 RNN 모델 만들기\n",
"from tensorflow.keras.layers import GRU\n",
"\n",
"model = Sequential()\n",
"model.add(Embedding(10000, 32))\n",
"model.add(GRU(32, return_sequences=True))\n",
"model.add(GRU(32))\n",
"model.add(Dense(1))\n",
"model.summary()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "TFSW6uDst3hc"
},
"source": [
"### 감성 분석 작업을 위한 RNN 모델 만들기"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 712
},
"execution": {
"iopub.execute_input": "2020-12-31T14:53:06.298278Z",
"iopub.status.busy": "2020-12-31T14:53:06.297321Z",
"iopub.status.idle": "2020-12-31T15:31:18.726835Z",
"shell.execute_reply": "2020-12-31T15:31:18.727434Z"
},
"id": "FszX7_Qft3hc",
"outputId": "72e25857-2fea-48cd-a205-17445889145d"
},
"outputs": [
{
"data": {
"text/html": [
"Model: \"sequential_4\"\n",
"\n"
],
"text/plain": [
"\u001b[1mModel: \"sequential_4\"\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
"┃ Layer (type) ┃ Output Shape ┃ Param # ┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
"│ embed-layer (Embedding) │ ? │ 0 (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ bidir-lstm (Bidirectional) │ ? │ 0 (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_3 (Dense) │ ? │ 0 (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_4 (Dense) │ ? │ 0 (unbuilt) │\n",
"└─────────────────────────────────┴────────────────────────┴───────────────┘\n",
"\n"
],
"text/plain": [
"┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
"┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
"│ embed-layer (\u001b[38;5;33mEmbedding\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ bidir-lstm (\u001b[38;5;33mBidirectional\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_3 (\u001b[38;5;33mDense\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_4 (\u001b[38;5;33mDense\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"└─────────────────────────────────┴────────────────────────┴───────────────┘\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Total params: 0 (0.00 B)\n",
"\n"
],
"text/plain": [
"\u001b[1m Total params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Trainable params: 0 (0.00 B)\n",
"\n"
],
"text/plain": [
"\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Non-trainable params: 0 (0.00 B)\n",
"\n"
],
"text/plain": [
"\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m48s\u001b[0m 70ms/step - accuracy: 0.6306 - loss: 0.6214 - val_accuracy: 0.8212 - val_loss: 0.4148\n",
"Epoch 2/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m42s\u001b[0m 67ms/step - accuracy: 0.8726 - loss: 0.3233 - val_accuracy: 0.8396 - val_loss: 0.3850\n",
"Epoch 3/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m48s\u001b[0m 76ms/step - accuracy: 0.9335 - loss: 0.1872 - val_accuracy: 0.8602 - val_loss: 0.3763\n",
"Epoch 4/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m47s\u001b[0m 75ms/step - accuracy: 0.9536 - loss: 0.1316 - val_accuracy: 0.8546 - val_loss: 0.4389\n",
"Epoch 5/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m42s\u001b[0m 66ms/step - accuracy: 0.9780 - loss: 0.0731 - val_accuracy: 0.8304 - val_loss: 0.5120\n",
"Epoch 6/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m42s\u001b[0m 67ms/step - accuracy: 0.9814 - loss: 0.0554 - val_accuracy: 0.8342 - val_loss: 0.6907\n",
"Epoch 7/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m41s\u001b[0m 66ms/step - accuracy: 0.9880 - loss: 0.0423 - val_accuracy: 0.8338 - val_loss: 0.6170\n",
"Epoch 8/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m43s\u001b[0m 68ms/step - accuracy: 0.9734 - loss: 0.0709 - val_accuracy: 0.8434 - val_loss: 0.6825\n",
"Epoch 9/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m41s\u001b[0m 66ms/step - accuracy: 0.9873 - loss: 0.0380 - val_accuracy: 0.7790 - val_loss: 1.2537\n",
"Epoch 10/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m46s\u001b[0m 74ms/step - accuracy: 0.9843 - loss: 0.0494 - val_accuracy: 0.8054 - val_loss: 1.1899\n",
"\u001b[1m782/782\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m28s\u001b[0m 36ms/step - accuracy: 0.8133 - loss: 1.1650\n",
"테스트 정확도: 81.15%\n"
]
}
],
"source": [
"embedding_dim = 20\n",
"vocab_size = len(token_counts) + 2\n",
"\n",
"tf.random.set_seed(1)\n",
"\n",
"## 모델 생성\n",
"bi_lstm_model = tf.keras.Sequential([\n",
" tf.keras.layers.Embedding(\n",
" input_dim=vocab_size,\n",
" output_dim=embedding_dim,\n",
" name='embed-layer'),\n",
"\n",
" tf.keras.layers.Bidirectional(\n",
" tf.keras.layers.LSTM(64, name='lstm-layer'),\n",
" name='bidir-lstm'),\n",
"\n",
" tf.keras.layers.Dense(64, activation='relu'),\n",
"\n",
" tf.keras.layers.Dense(1, activation='sigmoid')\n",
"])\n",
"\n",
"bi_lstm_model.summary()\n",
"\n",
"## 컴파일과 훈련:\n",
"bi_lstm_model.compile(\n",
" optimizer=tf.keras.optimizers.Adam(1e-3),\n",
" loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),\n",
" metrics=['accuracy'])\n",
"\n",
"history = bi_lstm_model.fit(\n",
" train_data,\n",
" validation_data=valid_data,\n",
" epochs=10)\n",
"\n",
"## 테스트 데이터에서 평가\n",
"test_results= bi_lstm_model.evaluate(test_data)\n",
"print('테스트 정확도: {:.2f}%'.format(test_results[1]*100))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2020-12-31T15:31:18.733388Z",
"iopub.status.busy": "2020-12-31T15:31:18.732717Z",
"iopub.status.idle": "2020-12-31T15:31:19.095760Z",
"shell.execute_reply": "2020-12-31T15:31:19.096539Z"
},
"id": "yAYzzBm2t3hd"
},
"outputs": [],
"source": [
"if not os.path.exists('models'):\n",
" os.mkdir('models')\n",
"\n",
"\n",
"bi_lstm_model.save('models/Bidir-LSTM-full-length-seq.keras')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bOGhajp5t3hd"
},
"source": [
" * **짧은 시퀀스에 SimpleRNN 적용하기**"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"execution": {
"iopub.execute_input": "2020-12-31T15:31:19.114411Z",
"iopub.status.busy": "2020-12-31T15:31:19.113501Z",
"iopub.status.idle": "2020-12-31T15:31:19.116108Z",
"shell.execute_reply": "2020-12-31T15:31:19.115368Z"
},
"id": "TDbCSxpIt3hd"
},
"outputs": [],
"source": [
"def preprocess_datasets(\n",
" ds_raw_train,\n",
" ds_raw_valid,\n",
" ds_raw_test,\n",
" max_seq_length=None,\n",
" batch_size=32):\n",
"\n",
" ## 단계 1: (데이터셋 만들기 이미 완료)\n",
" ## 단계 2: 고유 토큰 찾기\n",
" try:\n",
" tokenizer = tfds.features.text.Tokenizer()\n",
" except AttributeError:\n",
" tokenizer = tfds.deprecated.text.Tokenizer()\n",
"\n",
" token_counts = Counter()\n",
"\n",
" for example in ds_raw_train:\n",
" tokens = tokenizer.tokenize(example[0].numpy()[0])\n",
" if max_seq_length is not None:\n",
" tokens = tokens[-max_seq_length:]\n",
" token_counts.update(tokens)\n",
"\n",
" print('어휘 사전 크기:', len(token_counts))\n",
"\n",
"\n",
" ## 단계 3: 텍스트 인코딩하기\n",
" try:\n",
" encoder = tfds.features.text.TokenTextEncoder(token_counts)\n",
" except AttributeError:\n",
" encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)\n",
"\n",
" def encode(text_tensor, label):\n",
" text = text_tensor.numpy()[0]\n",
" encoded_text = encoder.encode(text)\n",
" if max_seq_length is not None:\n",
" encoded_text = encoded_text[-max_seq_length:]\n",
" return encoded_text, label\n",
"\n",
" def encode_map_fn(text, label):\n",
" return tf.py_function(encode, inp=[text, label],\n",
" Tout=(tf.int64, tf.int64))\n",
"\n",
" ds_train = ds_raw_train.map(encode_map_fn)\n",
" ds_valid = ds_raw_valid.map(encode_map_fn)\n",
" ds_test = ds_raw_test.map(encode_map_fn)\n",
"\n",
" ## 단계 4: 배치 데이터 만들기\n",
" train_data = ds_train.padded_batch(\n",
" batch_size, padded_shapes=([-1],[]))\n",
"\n",
" valid_data = ds_valid.padded_batch(\n",
" batch_size, padded_shapes=([-1],[]))\n",
"\n",
" test_data = ds_test.padded_batch(\n",
" batch_size, padded_shapes=([-1],[]))\n",
"\n",
" return (train_data, valid_data,\n",
" test_data, len(token_counts))"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"execution": {
"iopub.execute_input": "2020-12-31T15:31:19.128046Z",
"iopub.status.busy": "2020-12-31T15:31:19.127068Z",
"iopub.status.idle": "2020-12-31T15:31:19.129259Z",
"shell.execute_reply": "2020-12-31T15:31:19.129831Z"
},
"id": "JVuxsNTRt3hd"
},
"outputs": [],
"source": [
"def build_rnn_model(embedding_dim, vocab_size,\n",
" recurrent_type='SimpleRNN',\n",
" n_recurrent_units=64,\n",
" n_recurrent_layers=1,\n",
" bidirectional=True):\n",
"\n",
" tf.random.set_seed(1)\n",
"\n",
" # 모델 생성\n",
" model = tf.keras.Sequential()\n",
"\n",
" model.add(\n",
" Embedding(\n",
" input_dim=vocab_size,\n",
" output_dim=embedding_dim,\n",
" name='embed-layer')\n",
" )\n",
"\n",
" for i in range(n_recurrent_layers):\n",
" return_sequences = (i < n_recurrent_layers-1)\n",
"\n",
" if recurrent_type == 'SimpleRNN':\n",
" recurrent_layer = SimpleRNN(\n",
" units=n_recurrent_units,\n",
" return_sequences=return_sequences,\n",
" name='simprnn-layer-{}'.format(i))\n",
" elif recurrent_type == 'LSTM':\n",
" recurrent_layer = LSTM(\n",
" units=n_recurrent_units,\n",
" return_sequences=return_sequences,\n",
" name='lstm-layer-{}'.format(i))\n",
" elif recurrent_type == 'GRU':\n",
" recurrent_layer = GRU(\n",
" units=n_recurrent_units,\n",
" return_sequences=return_sequences,\n",
" name='gru-layer-{}'.format(i))\n",
"\n",
" if bidirectional:\n",
" recurrent_layer = Bidirectional(\n",
" recurrent_layer, name='bidir-'+recurrent_layer.name)\n",
"\n",
" model.add(recurrent_layer)\n",
"\n",
" model.add(tf.keras.layers.Dense(64, activation='relu'))\n",
" model.add(tf.keras.layers.Dense(1, activation='sigmoid'))\n",
"\n",
" return model"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 319
},
"execution": {
"iopub.execute_input": "2020-12-31T15:31:19.135494Z",
"iopub.status.busy": "2020-12-31T15:31:19.134630Z",
"iopub.status.idle": "2020-12-31T15:31:23.127554Z",
"shell.execute_reply": "2020-12-31T15:31:23.127922Z"
},
"id": "WaDu41Pht3hd",
"outputId": "9de84334-2957-4ad2-b99a-31a6a18d5711"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"어휘 사전 크기: 58063\n"
]
},
{
"data": {
"text/html": [
"Model: \"sequential_5\"\n",
"\n"
],
"text/plain": [
"\u001b[1mModel: \"sequential_5\"\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
"┃ Layer (type) ┃ Output Shape ┃ Param # ┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
"│ embed-layer (Embedding) │ ? │ 0 (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ bidir-simprnn-layer-0 │ ? │ 0 (unbuilt) │\n",
"│ (Bidirectional) │ │ │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_5 (Dense) │ ? │ 0 (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_6 (Dense) │ ? │ 0 (unbuilt) │\n",
"└─────────────────────────────────┴────────────────────────┴───────────────┘\n",
"\n"
],
"text/plain": [
"┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
"┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
"│ embed-layer (\u001b[38;5;33mEmbedding\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ bidir-simprnn-layer-0 │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"│ (\u001b[38;5;33mBidirectional\u001b[0m) │ │ │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_5 (\u001b[38;5;33mDense\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_6 (\u001b[38;5;33mDense\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"└─────────────────────────────────┴────────────────────────┴───────────────┘\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Total params: 0 (0.00 B)\n",
"\n"
],
"text/plain": [
"\u001b[1m Total params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Trainable params: 0 (0.00 B)\n",
"\n"
],
"text/plain": [
"\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Non-trainable params: 0 (0.00 B)\n",
"\n"
],
"text/plain": [
"\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from tensorflow.keras.layers import Bidirectional\n",
"\n",
"\n",
"batch_size = 32\n",
"embedding_dim = 20\n",
"max_seq_length = 100\n",
"\n",
"train_data, valid_data, test_data, n = preprocess_datasets(\n",
" ds_raw_train, ds_raw_valid, ds_raw_test,\n",
" max_seq_length=max_seq_length,\n",
" batch_size=batch_size\n",
")\n",
"\n",
"\n",
"vocab_size = n + 2\n",
"\n",
"rnn_model = build_rnn_model(\n",
" embedding_dim, vocab_size,\n",
" recurrent_type='SimpleRNN',\n",
" n_recurrent_units=64,\n",
" n_recurrent_layers=1,\n",
" bidirectional=True)\n",
"\n",
"rnn_model.summary()"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"execution": {
"iopub.execute_input": "2020-12-31T15:31:23.140595Z",
"iopub.status.busy": "2020-12-31T15:31:23.140192Z",
"iopub.status.idle": "2020-12-31T15:37:30.416144Z",
"shell.execute_reply": "2020-12-31T15:37:30.416906Z"
},
"id": "Q43zdoSVt3hd",
"outputId": "7834d9ab-2045-49da-e587-e76a29bfbe67"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m34s\u001b[0m 47ms/step - accuracy: 0.5031 - loss: 0.7027 - val_accuracy: 0.5142 - val_loss: 0.6973\n",
"Epoch 2/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m28s\u001b[0m 45ms/step - accuracy: 0.5081 - loss: 0.7022 - val_accuracy: 0.4984 - val_loss: 0.7037\n",
"Epoch 3/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m28s\u001b[0m 45ms/step - accuracy: 0.5892 - loss: 0.6594 - val_accuracy: 0.7336 - val_loss: 0.5540\n",
"Epoch 4/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m41s\u001b[0m 45ms/step - accuracy: 0.7601 - loss: 0.5002 - val_accuracy: 0.7682 - val_loss: 0.5364\n",
"Epoch 5/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m28s\u001b[0m 45ms/step - accuracy: 0.8521 - loss: 0.3677 - val_accuracy: 0.6746 - val_loss: 0.6339\n",
"Epoch 6/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 46ms/step - accuracy: 0.8440 - loss: 0.3842 - val_accuracy: 0.7218 - val_loss: 0.5567\n",
"Epoch 7/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m28s\u001b[0m 45ms/step - accuracy: 0.8799 - loss: 0.2742 - val_accuracy: 0.7806 - val_loss: 0.5639\n",
"Epoch 8/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 46ms/step - accuracy: 0.9374 - loss: 0.1678 - val_accuracy: 0.7756 - val_loss: 0.6095\n",
"Epoch 9/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m30s\u001b[0m 48ms/step - accuracy: 0.9496 - loss: 0.1280 - val_accuracy: 0.7812 - val_loss: 0.6718\n",
"Epoch 10/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m28s\u001b[0m 45ms/step - accuracy: 0.9780 - loss: 0.0645 - val_accuracy: 0.7950 - val_loss: 0.7702\n"
]
}
],
"source": [
"rnn_model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),\n",
" loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),\n",
" metrics=['accuracy'])\n",
"\n",
"\n",
"history = rnn_model.fit(\n",
" train_data,\n",
" validation_data=valid_data,\n",
" epochs=10)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"execution": {
"iopub.execute_input": "2020-12-31T15:37:30.422212Z",
"iopub.status.busy": "2020-12-31T15:37:30.421332Z",
"iopub.status.idle": "2020-12-31T15:37:51.102354Z",
"shell.execute_reply": "2020-12-31T15:37:51.102953Z"
},
"id": "WaiatV_It3he",
"outputId": "4d7ea4d7-96bd-42c7-892b-680704cd733f"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m782/782\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m28s\u001b[0m 35ms/step - accuracy: 0.8008 - loss: 0.7598\n"
]
}
],
"source": [
"# Score the trained model on the test split. With the loss and the single\n",
"# 'accuracy' metric configured at compile time, the result holds the loss\n",
"# followed by the accuracy.\n",
"results = rnn_model.evaluate(x=test_data)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"execution": {
"iopub.execute_input": "2020-12-31T15:37:51.108896Z",
"iopub.status.busy": "2020-12-31T15:37:51.108181Z",
"iopub.status.idle": "2020-12-31T15:37:51.112037Z",
"shell.execute_reply": "2020-12-31T15:37:51.111337Z"
},
"id": "x6QyEwQSt3he",
"outputId": "903cc73c-5b8d-4876-81c7-502f7a55bf95"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"테스트 정확도: 79.82%\n"
]
}
],
"source": [
"# results[1] is the accuracy as a fraction; report it as a percentage.\n",
"test_accuracy = results[1]\n",
"print('테스트 정확도: {:.2f}%'.format(test_accuracy * 100))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "KjPp6Koit3he"
},
"source": [
"## 연습문제:\n",
"\n",
"### 전체 길이를 사용한 시퀀스에 단방향 SimpleRNN 적용하기"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 301
},
"execution": {
"iopub.execute_input": "2020-12-31T15:37:51.118653Z",
"iopub.status.busy": "2020-12-31T15:37:51.118013Z",
"iopub.status.idle": "2020-12-31T15:37:55.390222Z",
"shell.execute_reply": "2020-12-31T15:37:55.389585Z"
},
"id": "VDMMjV1xt3he",
"outputId": "e7008c09-216a-4d81-d9c4-175ad27ef9bc"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"어휘 사전 크기: 87007\n"
]
},
{
"data": {
"text/html": [
"Model: \"sequential_6\"\n",
"\n"
],
"text/plain": [
"\u001b[1mModel: \"sequential_6\"\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
"┃ Layer (type) ┃ Output Shape ┃ Param # ┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
"│ embed-layer (Embedding) │ ? │ 0 (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ simprnn-layer-0 (SimpleRNN) │ ? │ 0 (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_7 (Dense) │ ? │ 0 (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_8 (Dense) │ ? │ 0 (unbuilt) │\n",
"└─────────────────────────────────┴────────────────────────┴───────────────┘\n",
"\n"
],
"text/plain": [
"┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
"┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
"│ embed-layer (\u001b[38;5;33mEmbedding\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ simprnn-layer-0 (\u001b[38;5;33mSimpleRNN\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_7 (\u001b[38;5;33mDense\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_8 (\u001b[38;5;33mDense\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
"└─────────────────────────────────┴────────────────────────┴───────────────┘\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Total params: 0 (0.00 B)\n",
"\n"
],
"text/plain": [
"\u001b[1m Total params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Trainable params: 0 (0.00 B)\n",
"\n"
],
"text/plain": [
"\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Non-trainable params: 0 (0.00 B)\n",
"\n"
],
"text/plain": [
"\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Exercise settings. max_seq_length=None follows the section heading\n",
"# (sequences at their full length) -- presumably preprocess_datasets skips\n",
"# truncation in that case; confirm against its definition.\n",
"batch_size = 32\n",
"embedding_dim = 20\n",
"max_seq_length = None\n",
"\n",
"train_data, valid_data, test_data, n = preprocess_datasets(\n",
"    ds_raw_train, ds_raw_valid, ds_raw_test,\n",
"    max_seq_length=max_seq_length,\n",
"    batch_size=batch_size\n",
")\n",
"\n",
"# NOTE(review): the +2 looks like room for padding/unknown ids -- confirm.\n",
"vocab_size = n + 2\n",
"\n",
"# A single unidirectional SimpleRNN layer with 64 units.\n",
"rnn_options = dict(recurrent_type='SimpleRNN',\n",
"                   n_recurrent_units=64,\n",
"                   n_recurrent_layers=1,\n",
"                   bidirectional=False)\n",
"rnn_model = build_rnn_model(embedding_dim, vocab_size, **rnn_options)\n",
"\n",
"rnn_model.summary()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"execution": {
"iopub.execute_input": "2020-12-31T15:37:55.402376Z",
"iopub.status.busy": "2020-12-31T15:37:55.401627Z",
"iopub.status.idle": "2020-12-31T15:59:01.339172Z",
"shell.execute_reply": "2020-12-31T15:59:01.338258Z"
},
"id": "eR8OJjstt3he",
"outputId": "d04b0a2a-8541-43b2-f14c-309fddad6c58"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m510s\u001b[0m 811ms/step - accuracy: 0.4958 - loss: 0.6984 - val_accuracy: 0.4940 - val_loss: 0.6965\n",
"Epoch 2/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m49s\u001b[0m 78ms/step - accuracy: 0.4987 - loss: 0.6997 - val_accuracy: 0.4992 - val_loss: 0.6942\n",
"Epoch 3/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m45s\u001b[0m 71ms/step - accuracy: 0.5027 - loss: 0.6966 - val_accuracy: 0.5048 - val_loss: 0.6986\n",
"Epoch 4/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m44s\u001b[0m 71ms/step - accuracy: 0.5086 - loss: 0.6977 - val_accuracy: 0.5106 - val_loss: 0.6943\n",
"Epoch 5/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m44s\u001b[0m 70ms/step - accuracy: 0.5022 - loss: 0.6965 - val_accuracy: 0.4880 - val_loss: 0.6963\n",
"Epoch 6/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m44s\u001b[0m 70ms/step - accuracy: 0.5037 - loss: 0.6949 - val_accuracy: 0.4888 - val_loss: 0.6948\n",
"Epoch 7/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m44s\u001b[0m 71ms/step - accuracy: 0.5087 - loss: 0.6945 - val_accuracy: 0.4882 - val_loss: 0.6940\n",
"Epoch 8/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m44s\u001b[0m 70ms/step - accuracy: 0.5089 - loss: 0.6940 - val_accuracy: 0.5038 - val_loss: 0.6934\n",
"Epoch 9/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m44s\u001b[0m 71ms/step - accuracy: 0.5099 - loss: 0.6937 - val_accuracy: 0.5032 - val_loss: 0.6928\n",
"Epoch 10/10\n",
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m48s\u001b[0m 76ms/step - accuracy: 0.5076 - loss: 0.6938 - val_accuracy: 0.5046 - val_loss: 0.6931\n"
]
}
],
"source": [
"# Same training setup as for the earlier models: Adam (learning rate 1e-3)\n",
"# and binary cross-entropy on probabilities (from_logits=False).\n",
"adam_optimizer = tf.keras.optimizers.Adam(1e-3)\n",
"bce_loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)\n",
"rnn_model.compile(optimizer=adam_optimizer,\n",
"                  loss=bce_loss,\n",
"                  metrics=['accuracy'])\n",
"\n",
"# Train for 10 epochs; the validation split is scored after every epoch.\n",
"history = rnn_model.fit(train_data,\n",
"                        validation_data=valid_data,\n",
"                        epochs=10)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-P9l6HEHt3he"
},
"source": [
"# 부록"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bAw6wJ0jt3hf"
},
"source": [
"### A -- 데이터셋을 만드는 다른 방법: tensorflow_datasets 사용하기"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 863,
"referenced_widgets": [
"3b4d9bb0a39941aaa141d7b9b4672de6",
"d388900acb9941a6add1f5f2c01ee8ef",
"08baf40b79164ba58b7f54e7d6894798",
"2910e0e4e0dc40be9b1c4a17a933bba3",
"4aca86d884b54e2285beb4a4c73c151d",
"d5c7575da7034e62b169eb8b34f4ea8f",
"9a745e0ad61d42d6ac3a0cb719837166",
"1d5ab9228d6b41fab6d0d00a5faa78d9",
"233f0d0db2074aff98564dcf22977ca5",
"8ef0174f522e4e3ca3184af544c11ca8",
"2aa4da3ec27145afa5e30832f2e99299",
"9f77bc7ff59c4e579b98051a4d300d19",
"a64182a98a3349a092a9a7b5ffaa2640",
"67757e83c22b485bb71939b81b1608c7",
"2de5f50e8b3d4a6c92f8de801d07b770",
"51d4c6761e2c40ee8877ecde97a355c1",
"d6a54201e92544959af33a3c03b46c8f",
"b468f31529b7443a9a572f8e222aee4a",
"8f644e025858489b8d0394343d60eda0",
"3f53ca8d4f744b948b307dc61872b14d",
"16d9ffc19dd64dbeb68a3820f8425382",
"6bf9833e609740dfbc29593d929ebdbb",
"f692ce7dcd7a43cfb54d0e9fde1f4f91",
"836f0c01fd9744e895575f33788ecd27",
"5bb1c56a5b124a09800668fd26034baa",
"fbcacf466b6e41d7bfb576a2f6d732c7",
"b7cb1b7ea99a4606915e17f5275cbc36",
"1ba7576885e34195ad4fe9390a11a6aa",
"167882450876498eac51362d96c97c88",
"3af7023124d241c88a9b5eb7a4111cf9",
"fdcaec3b012745118c4ddecbfff6b10a",
"6fe89ad14d7b47069c93124af9c7f460",
"d39994dca0594389b8ad69db7f0bf202",
"b203e9e18e7e4b54a9fc69309cfd09f4",
"268a6f187e2e45d2964c0b4c506f8dc3",
"cc1e8e77db27483ead620f57ccc248ed",
"1a568b46ceab401c9afb81a2c67daae9",
"dacb938eb30b4314b19dc2fa0cfe072e",
"ea0660cbabb64e16ba09d6a7f92fb62f",
"af2ba6a0354249e09cdf7507fb0728de",
"d2204c9042954c47b15006eec6ad0cff",
"3c8793f3b76a4530a6a2a4505d21af46",
"d168410e6c674715b9ca091706ab8b10",
"2cf3b102155e4fa5996c457361322d4d",
"5a2e58cc092d45c7b7200c0d7a049de7",
"9b0f434fcbe94b5c8e549d50bba3499e",
"07aefc567ccb4bae8922b5743f0c4263",
"1e1da4ba82a54da9bdf56b0e8e10c0e8",
"fa2bf109fd694dbfbdff3141e32e5fa7",
"ab566920dc4b4b3bb1bf291382f741eb",
"69faee12ad3b4c38834fedba92ae51e0",
"acbb9ad6e76845b4bcd7ad3ed0ed9647",
"9dd8caed747e42ffbf9e51ff03986372",
"3bde1de4cf77402e89789e50c4cbad6b",
"3e0fcf6cca5b47b0bfcee9040a42a727",
"72be02bada8a4cc7a6be75794794a985",
"b7bef5ca120c494a84a38d94d73ff98b",
"52b799c9368e49cda48310b167ef8ae6",
"4b7166e8565844ac86e5866e352364cd",
"fdef134f1928450a8e892da9ece8ee82",
"67ad450b10074a33a965d8b4881c5413",
"151443216db0405eb677ed9657a71af1",
"0ab385a263be4c1a84a9b24ec634aa02",
"23ff081c36884b73bc1cb020e57feeb1",
"918863b92d3d4b289219a6e9fbab4af8",
"bee07060b9204b63912590a89ffe7e26",
"4b5b41463bcf4c33a96fb4c296663fce",
"d8059277d63b44c495b8af4ba2d7eeeb",
"b8de644511704b90a2ea442a73a11e54",
"8d21167554f84463b90675a4636dc907",
"263a5a6b1f0745478dfa028a46bfab00",
"01536dbb4aae4aa1aecf017724d53789",
"7db4cc09fd7a46568ea5eb3b6bf43017",
"efef4457dc9449d893467ee9dcc8199f",
"dd5d9208a0bc4e68be33025848969923",
"a390dbeadae447d797225493b47d7400",
"a283806c122b4cbfa2ec2a46bd731c9f",
"9aec54e386df4794b91c123277d035b3",
"1957865cc7144321935c2f188a0b557c",
"a54c50470b244ed28d44912f62b40d08",
"03655d92ec814cc283431c2a4f92d7d2",
"e2a730f5f91744beb7264665ce15873c",
"0876206cfb004d8892bdf3f703702dfe",
"0f7df469d1474060830ed8aa943263c0",
"2050f6d6b3b54a14bfa818b36ba0936d",
"1dc5f20241054618a11e1fc5cb908fd4",
"24badc9959c44d2e927b312db19d8ecc",
"5e62306381f343089aebba5ff826d317",
"90ea25e4ac97458486463badfea6886f",
"5d575bdcafc34b5e9e5a635cc702dff2",
"7dec95d73176435ba14f8f55c1494608",
"34a78cb3a96749dd90c7776cea62f12f",
"38137f68a5074c8a81a8856f4bed7475",
"d85e96a921c74059924b206039f76672",
"eeb6650decd046af9453c70d580f54c9",
"57a6e5acfd254342b5fff7d6a918fca9",
"31676bd879df434cabba7ca33326a1b6",
"4d9a47867d614d748ac2147da8f93c68",
"94be84906ad6484f842ee2313325a72f"
]
},
"execution": {
"iopub.execute_input": "2020-12-31T15:59:01.345251Z",
"iopub.status.busy": "2020-12-31T15:59:01.344304Z",
"iopub.status.idle": "2020-12-31T16:00:03.995238Z",
"shell.execute_reply": "2020-12-31T16:00:03.994443Z"
},
"id": "Bm4eEIM7t3hf",
"outputId": "b89f50e0-294f-40d9-b918-678d5f63e808"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tfds.core.DatasetInfo(\n",
" name='imdb_reviews',\n",
" full_name='imdb_reviews/plain_text/1.0.0',\n",
" description=\"\"\"\n",
" Large Movie Review Dataset. This is a dataset for binary sentiment\n",
" classification containing substantially more data than previous benchmark\n",
" datasets. We provide a set of 25,000 highly polar movie reviews for training,\n",
" and 25,000 for testing. There is additional unlabeled data for use as well.\n",
" \"\"\",\n",
" config_description=\"\"\"\n",
" Plain text\n",
" \"\"\",\n",
" homepage='http://ai.stanford.edu/~amaas/data/sentiment/',\n",
" data_dir='/root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0',\n",
" file_format=tfrecord,\n",
" download_size=Unknown size,\n",
" dataset_size=Unknown size,\n",
" features=FeaturesDict({\n",
" 'label': ClassLabel(shape=(), dtype=int64, num_classes=2),\n",
" 'text': Text(shape=(), dtype=string),\n",
" }),\n",
" supervised_keys=('text', 'label'),\n",
" disable_shuffling=False,\n",
" nondeterministic_order=False,\n",
" splits={\n",
" },\n",
" citation=\"\"\"@InProceedings{maas-EtAl:2011:ACL-HLT2011,\n",
" author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},\n",
" title = {Learning Word Vectors for Sentiment Analysis},\n",
" booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},\n",
" month = {June},\n",
" year = {2011},\n",
" address = {Portland, Oregon, USA},\n",
" publisher = {Association for Computational Linguistics},\n",
" pages = {142--150},\n",
" url = {http://www.aclweb.org/anthology/P11-1015}\n",
" }\"\"\",\n",
")\n",
"Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3b4d9bb0a39941aaa141d7b9b4672de6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dl Completed...: 0 url [00:00, ? url/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9f77bc7ff59c4e579b98051a4d300d19",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dl Size...: 0 MiB [00:00, ? MiB/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f692ce7dcd7a43cfb54d0e9fde1f4f91",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating splits...: 0%| | 0/3 [00:00, ? splits/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b203e9e18e7e4b54a9fc69309cfd09f4",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating train examples...: 0 examples [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5a2e58cc092d45c7b7200c0d7a049de7",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.K9J6IH_1.0.0/imdb_reviews-train.tfrecor…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "72be02bada8a4cc7a6be75794794a985",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating test examples...: 0 examples [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4b5b41463bcf4c33a96fb4c296663fce",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.K9J6IH_1.0.0/imdb_reviews-test.tfrecord…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9aec54e386df4794b91c123277d035b3",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating unsupervised examples...: 0 examples [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "90ea25e4ac97458486463badfea6886f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.K9J6IH_1.0.0/imdb_reviews-unsupervised.…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.\n"
]
},
{
"data": {
"text/plain": [
"dict_keys([Split('train'), Split('test'), Split('unsupervised')])"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Inspect the builder metadata first, then download and prepare the data.\n",
"imdb_bldr = tfds.builder('imdb_reviews')\n",
"dataset_info = imdb_bldr.info\n",
"print(dataset_info)\n",
"\n",
"imdb_bldr.download_and_prepare()\n",
"\n",
"# shuffle_files=False: do not shuffle the input files when reading.\n",
"datasets = imdb_bldr.as_dataset(shuffle_files=False)\n",
"\n",
"# Show which splits are available.\n",
"datasets.keys()"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"execution": {
"iopub.execute_input": "2020-12-31T16:00:04.000279Z",
"iopub.status.busy": "2020-12-31T16:00:03.999377Z",
"iopub.status.idle": "2020-12-31T16:00:04.002868Z",
"shell.execute_reply": "2020-12-31T16:00:04.001894Z"
},
"id": "YHn1LXgkt3hf"
},
"outputs": [],
"source": [
"# Keep the train and test splits under separate names so that imdb_train\n",
"# really is the training split when it is tokenized further below.\n",
"imdb_train = datasets['train']\n",
"imdb_test = datasets['test']"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "2THlQCVWt3hf"
},
"source": [
"### B -- Tokenizer와 Encoder\n",
"\n",
" * `tfds.deprecated.text.Tokenizer`: https://www.tensorflow.org/datasets/api_docs/python/tfds/deprecated/text/Tokenizer\n",
" * `tfds.deprecated.text.TokenTextEncoder`: https://www.tensorflow.org/datasets/api_docs/python/tfds/deprecated/text/TokenTextEncoder"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"execution": {
"iopub.execute_input": "2020-12-31T16:00:04.012615Z",
"iopub.status.busy": "2020-12-31T16:00:04.011509Z",
"iopub.status.idle": "2020-12-31T16:00:04.015817Z",
"shell.execute_reply": "2020-12-31T16:00:04.015226Z"
},
"id": "tZrY4Fcnt3hf",
"outputId": "7dcdff9c-4fb7-458e-842e-abe605ffdec9"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"[4, 3, 1, 2]\n",
"[4, 3, 1, 2, 5, 5, 5, 5, 5, 5]\n"
]
}
],
"source": [
"# NOTE(review): the vocabulary is a set, so the token-to-id assignment may\n",
"# depend on set iteration order -- confirm if stable ids matter.\n",
"vocab_set = {'a', 'b', 'c', 'd'}\n",
"encoder = tfds.deprecated.text.TokenTextEncoder(vocab_set)\n",
"print(encoder)\n",
"\n",
"# First sample: the four vocabulary tokens plus punctuation.\n",
"# Second sample: the same tokens followed by six tokens outside the vocabulary.\n",
"for sample in (b'a b c d, , : .', b'a b c d e f g h i z'):\n",
"    print(encoder.encode(sample))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "NY6z1aTit3hg"
},
"source": [
"### C -- 케라스로 텍스트 전처리하기"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"execution": {
"iopub.execute_input": "2020-12-31T16:00:04.023781Z",
"iopub.status.busy": "2020-12-31T16:00:04.022913Z",
"iopub.status.idle": "2020-12-31T16:00:04.028002Z",
"shell.execute_reply": "2020-12-31T16:00:04.027228Z"
},
"id": "LRO2xhDKt3hg",
"outputId": "146c3d22-8b38-4847-95b8-5c0a3402a40e"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[1, 2, 3, 4], [5, 6, 7, 8]]\n"
]
},
{
"data": {
"text/plain": [
"array([[0, 0, 0, 0, 0, 0, 1, 2, 3, 4],\n",
" [0, 0, 0, 0, 0, 0, 5, 6, 7, 8]], dtype=int32)"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"TOP_K = 200\n",
"MAX_LEN = 10\n",
"\n",
"# The same two sentences are used to fit the tokenizer and to encode.\n",
"sample_texts = ['this is an example', 'je suis en forme ']\n",
"\n",
"tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=TOP_K)\n",
"tokenizer.fit_on_texts(sample_texts)\n",
"\n",
"sequences = tokenizer.texts_to_sequences(sample_texts)\n",
"print(sequences)\n",
"\n",
"# Pad every sequence to MAX_LEN entries.\n",
"tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_LEN)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"execution": {
"iopub.execute_input": "2020-12-31T16:00:04.036271Z",
"iopub.status.busy": "2020-12-31T16:00:04.035409Z",
"iopub.status.idle": "2020-12-31T16:00:15.234946Z",
"shell.execute_reply": "2020-12-31T16:00:15.234078Z"
},
"id": "vePZ3UTot3hg",
"outputId": "d781aa43-956a-4927-d6eb-90d22f998e79",
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"25000\n",
"(25000, 500)\n"
]
}
],
"source": [
"TOP_K = 20000\n",
"MAX_LEN = 500\n",
"\n",
"# Decode the review texts once and reuse the list: iterating the dataset is\n",
"# the expensive part, and both fitting and encoding need the same texts.\n",
"train_texts = [example['text'].numpy().decode('utf-8')\n",
"               for example in imdb_train]\n",
"\n",
"tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=TOP_K)\n",
"tokenizer.fit_on_texts(train_texts)\n",
"\n",
"x_train = tokenizer.texts_to_sequences(train_texts)\n",
"\n",
"print(len(x_train))\n",
"\n",
"# Pad or truncate every encoded review to MAX_LEN entries.\n",
"x_train_padded = tf.keras.preprocessing.sequence.pad_sequences(\n",
"    x_train, maxlen=MAX_LEN)\n",
"\n",
"print(x_train_padded.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "9FYRIc-St3hg"
},
"source": [
"### D -- 임베딩"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"execution": {
"iopub.execute_input": "2020-12-31T16:00:15.250708Z",
"iopub.status.busy": "2020-12-31T16:00:15.249808Z",
"iopub.status.idle": "2020-12-31T16:00:15.258749Z",
"shell.execute_reply": "2020-12-31T16:00:15.257884Z"
},
"id": "fEuT0TEut3hh",
"outputId": "04df2b6d-56e8-4f64-ceda-ee963f7a88bd"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[-9.69059765e-05 -0.0153358951 -0.0443523638 -0.00675549358]\n",
" [-0.04278313 0.015962366 -0.0261999015 0.0153260566]\n",
" [0.0448536389 0.0304714181 -0.022705555 -0.0474398397]\n",
" [-0.0213439222 -0.00352634117 -0.0117569789 -0.0493103862]\n",
" [-0.00115455315 -0.0452225 -0.0263084769 -0.0268569719]\n",
" [0.0242638923 0.0228262581 0.0192052834 -0.00259136036]]\n",
"TensorShape([6, 4])\n",
"[[-9.69059765e-05 -0.0153358951 -0.0443523638 -0.00675549358]]\n"
]
}
],
"source": [
"from tensorflow.keras.layers import Embedding\n",
"\n",
"\n",
"tf.random.set_seed(1)\n",
"# 100 possible input ids, each mapped to a 4-dimensional vector.\n",
"embed = Embedding(input_dim=100, output_dim=4)\n",
"\n",
"inp_arr = np.array([1, 98, 5, 6, 67, 45])\n",
"embedded = embed(inp_arr)\n",
"tf.print(embedded)\n",
"tf.print(embedded.shape)\n",
"\n",
"# Looking up id 1 on its own returns the same vector as the first row above.\n",
"tf.print(embed(np.array([1])))"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"name": "ch16_part1.ipynb",
"provenance": []
},
"gpuClass": "standard",
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"state": {},
"version_major": 2,
"version_minor": 0
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}