In [1]:
import keras
# Show which Keras version this notebook is running against.
keras.__version__

Using TensorFlow backend.


'2.2.4'

# LSTM으로 텍스트 생성하기

이 노트북은 [케라스 창시자에게 배우는 딥러닝](https://tensorflow.blog/deep-learning-with-python/) 책의 8장 1절의 코드 예제입니다. 책에는 더 많은 내용과 그림이 있습니다. 이 노트북에는 소스 코드에 관련된 설명만 포함합니다. 이 노트북의 설명은 케라스 버전 2.2.2에 맞추어져 있습니다. 케라스 최신 버전이 릴리스되면 노트북을 다시 테스트하기 때문에 설명과 코드의 결과가 조금 다를 수 있습니다.

----

[...]

## 글자 수준의 LSTM 텍스트 생성 모델 구현

이런 아이디어를 케라스로 구현해 보죠. 먼저 언어 모델을 학습하기 위해 많은 텍스트 데이터가 필요합니다. 위키피디아나 반지의 제왕처럼 아주 큰 텍스트 파일이나 텍스트 파일의 묶음을 사용할 수 있습니다. 이 예에서는 19세기 후반 독일의 철학자 니체의 글을 사용하겠습니다(영어로 번역된 글입니다). 학습할 언어 모델은 일반적인 영어 모델이 아니라 니체의 문체와 특정 주제를 따르는 모델일 것입니다.

## 데이터 전처리

먼저 말뭉치를 다운로드하고 소문자로 바꿉니다:

In [2]:
import keras
import numpy as np

# Fetch the Nietzsche corpus via keras.utils.get_file, which returns the
# path of the local copy of the file.
path = keras.utils.get_file(
    'nietzsche.txt',
    origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')

# Read through a context manager so the file handle is closed as soon as the
# text is loaded, and pin the encoding so the result does not depend on the
# platform's default locale. The whole corpus is lower-cased.
with open(path, encoding='utf-8') as f:
    text = f.read().lower()
print('말뭉치 크기:', len(text))

말뭉치 크기: 600893


In [3]:
# Check the Python type of the loaded corpus.
type(text)

str

그 다음 `maxlen` 길이를 가진 시퀀스를 중복하여 추출합니다. 추출된 시퀀스를 원-핫 인코딩으로 변환하고 크기가 `(sequences, maxlen, unique_characters)`인 3D 넘파이 배열 `x`로 합칩니다. 동시에 훈련 샘플에 상응하는 타깃을 담은 배열 `y`를 준비합니다. 타깃은 추출된 시퀀스 다음에 오는 원-핫 인코딩된 글자입니다.

In [4]:
# Length, in characters, of each extracted input sequence.
maxlen = 60

# Stride between the start positions of consecutive sequences; because
# step < maxlen, neighbouring sequences overlap.
step = 3

# Extracted input sequences.
sentences = []

# Targets: for each sequence, the single character that follows it.
next_chars = []

for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('시퀀스 개수:', len(sentences))

# Sorted list of the distinct characters that occur in the corpus.
chars = sorted(set(text))
print('고유한 글자:', len(chars))
# Map each character to its position in `chars`. enumerate() builds the
# mapping in one pass instead of calling chars.index() for every character.
char_indices = {char: index for index, char in enumerate(chars)}

# One-hot encode the sequences and targets into boolean arrays:
#   x has shape (number of sequences, maxlen, number of distinct characters)
#   y has shape (number of sequences, number of distinct characters)
# The builtin `bool` is used as dtype; `np.bool` was only an alias for it and
# has been removed from recent NumPy releases.
print('벡터화...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

시퀀스 개수: 200278
고유한 글자: 57
벡터화...


## 네트워크 구성

이 네트워크는 하나의 `LSTM` 층과 그 뒤를 잇는 `Dense` 분류기로 구성됩니다. 분류기는 가능한 모든 글자에 대한 소프트맥스 출력을 만듭니다. 순환 신경망이 시퀀스 데이터를 생성하는 유일한 방법은 아닙니다. 최근에는 1D 컨브넷도 이런 작업에 아주 잘 들어맞는다는 것이 밝혀졌습니다.

In [5]:
from keras import layers

# One LSTM layer with 128 units reading (maxlen, len(chars)) one-hot windows,
# followed by a softmax Dense layer with one output per distinct character.
model = keras.models.Sequential([
    layers.LSTM(128, input_shape=(maxlen, len(chars))),
    layers.Dense(len(chars), activation='softmax'),
])

타깃이 원-핫 인코딩되어 있기 때문에 모델을 훈련하기 위해 `categorical_crossentropy` 손실을 사용합니다:

In [6]:
# RMSprop with a learning rate of 0.01. The loss is categorical
# cross-entropy because the targets are one-hot encoded.
optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

## 언어 모델 훈련과 샘플링

훈련된 모델과 시드로 쓰일 간단한 텍스트가 주어지면 다음과 같이 반복하여 새로운 텍스트를 생성할 수 있습니다.

1.	지금까지 생성된 텍스트를 주입하여 모델에서 다음 글자에 대한 확률 분포를 뽑습니다.
2.	특정 온도로 이 확률 분포의 가중치를 조정합니다.
3.	가중치가 조정된 분포에서 무작위로 새로운 글자를 샘플링합니다.
4.	새로운 글자를 생성된 텍스트의 끝에 추가합니다.

다음 코드는 모델에서 나온 원본 확률 분포의 가중치를 조정하고 새로운 글자의 인덱스를 추출합니다(샘플링 함수입니다):

In [7]:
def sample(preds, temperature=1.0):
    """Draw one index from `preds` after reweighting it by `temperature`.

    The distribution is reweighted as ``softmax(log(preds) / temperature)``
    and a single index is drawn from the result.

    Args:
        preds: 1D array-like of probabilities (e.g. a softmax output).
        temperature: Reweighting factor, expected to be non-negative.
            Values below 1 sharpen the distribution, values above 1
            flatten it. A value of exactly 0 selects the most probable
            index without random sampling.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype('float64')

    if temperature == 0:
        # Limit of the reweighting as temperature -> 0: all probability mass
        # moves to the most likely entry. Handling it here avoids dividing
        # by zero below.
        return np.argmax(preds)

    # A zero probability legitimately maps to log(0) = -inf (and back to a
    # zero probability after exp), so silence the divide-by-zero warning
    # rather than letting it surface on every call.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Shift by the largest logit before exponentiating. The normalised result
    # is mathematically unchanged, but at least one term becomes exp(0) = 1,
    # so the sum cannot underflow to 0 (which would yield NaN) at low
    # temperatures.
    exp_preds = np.exp(logits - np.max(logits))
    preds = exp_preds / np.sum(exp_preds)

    # One multinomial trial yields a one-hot row; argmax recovers its index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

마지막으로 다음 반복문은 반복적으로 훈련하고 텍스트를 생성합니다. 에포크마다 학습이 끝난 후 여러가지 온도를 사용해 텍스트를 생성합니다. 이렇게 하면 모델이 수렴하면서 생성된 텍스트가 어떻게 진화하는지 볼 수 있습니다. 온도가 샘플링 전략에 미치는 영향도 보여 줍니다.

In [8]:
import random
import sys

# Pick the seed window once, with a fixed RNG seed, so that every epoch
# continues the same prompt and the generated samples can be compared as
# training progresses.
random.seed(42)
start_index = random.randint(0, len(text) - maxlen - 1)
seed_text = text[start_index: start_index + maxlen]

# Train for 60 epochs (1..60 inclusive), generating sample text after each.
for epoch in range(1, 61):
    print('에포크', epoch)
    # Fit the model for a single pass over the training data.
    model.fit(x, y, batch_size=128, epochs=1)

    print('--- 시드 텍스트: "' + seed_text + '"')

    # Generate text at several sampling temperatures.
    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print('------ 온도:', temperature)
        generated_text = seed_text
        sys.stdout.write(generated_text)

        # Starting from the seed text, generate 400 characters.
        for i in range(400):
            # One-hot encode the current window of maxlen characters.
            sampled = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(generated_text):
                sampled[0, t, char_indices[char]] = 1.

            # Predict the next-character distribution and sample from it.
            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]

            # Slide the window: append the new character and drop the
            # oldest one so the window stays maxlen characters long.
            generated_text += next_char
            generated_text = generated_text[1:]

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

에포크 1
Epoch 1/1
--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the slowly ascending ranks and classes, in which,
through for the sermally the self the serting the suppority the sertion the fall the present the and the sertion the serting the present the such the proble the serting the subtersal the self the self-the self--the expression and serpinion an one the serming the feeling and the self--the sertion of the self the self-destray and the self-the sense the self the the serming the subter the self-the self-the sel
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for the experience of the rasure as the worl and conception of the will feel the world the proble the serming to the woll the supters in the man and the call pereal in the serpor to the self--an into the nothing the sungerman the most subtertation of the enour to we deself of the self? even the called in the would the sert of
the seated the man entration an 

to medivence be aninc polation from unsourilite--alsose" origh them and was rich-predoorhal "pretame
" capacity
in of the sriles,
than centaprobsewd cronds endles. like whithe "prefuicati
에포크 5
Epoch 1/1
--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the slowly ascending ranks and classes, in which,
through for the sense the the men and the present and the artistic and the sense more and all the sufference of the sense and the sense and the sufference of the present the sense and the sense and and all the sense of the and all the sense the sense of the sense of the same to the end to the sense of the sense and present of the sense and the sense and the sense of the sense of the same and the artistic pr
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for the philosophers and in the conscience and the same to mentic and one call other the morality and eventured and short, the possible and religions and for the sund and t

 This is separate from the ipykernel package so we can avoid doing imports until


ledge, in account by the steaksness a an causinexs, at the is ge's
belled never between we are bescately, in ay instince, as the quick, or german flany enjoymous avistable, hungs, enk and brothers (everyourourse mabone and for perfecting on
them.


------ 온도: 1.2
the slowly ascending ranks and classes, in which,
through for a acture,
caste in the greettes
along-ull1!

an
here the cyish now every depth" to
truth pay up would is over
vain:--a
been sympathy
in avimying, and
whounlines,
yot which the plentenenent of
evythidd and germans
extenday back
science, wgovily--amougd!--ay warm her, a
mgatual will the ready papalo the prevailly does the name were, the davedus me snow acal
rivemom and the unprude
pleasing by musl
에포크 6
Epoch 1/1
--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the slowly ascending ranks and classes, in which,
through former to the sense of the sense of the sense of the sentiment, and the conceive to the sense and the seriousne

through for the strange cautave. indemitality" and came--for the wolly allow place be maynch fined"
say one induced more oprruse that
it was another of roust de some attainrature of the morely, they a eventional coarse heinial cofnete to the variety so is work, but self-to have these waslise philosopher we hinger nearsing invasa, with personality soun and having lift, it is not kill to be coursely his oppos
------ 온도: 1.2
the slowly ascending ranks and classes, in which,
through foreth well? to inssmact of broughteter than account stage--"free;
andcically have platoness in oretheri the suberament more originate from the science sounds promised. the edellugent of
pain intrustate a
lack in
the educa
uncecesc, nerr body peryonal woll will also could knowledge could hese as taithougixty. ruthed of the
causes to namely--hese self-choeragian above the religion whethic quigion and
에포크 10
Epoch 1/1
--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the sl

through for the compligety with the free spirit, and because at other contate to be ancient been the determine and the soul with the struggle his own one of stepstility, the barbary to the formult in sense of a those, all the learned will to all the moral value of which they be the opinion which of all the sense of account to self-twould for the end of mentives of the even himself in order to all the servic
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through for thereby
invade of history gentle of ancient ut!
noot. if period is these--and others to means of oldece is of
all way for the cast, falsion, and good wigh )zersing and itself though,
whose have charficed and delights of religion" is" in order
and the high un-ruled germany
asalpsure. it is certain the soul.


22heds? vhiritable action to be kepty the spirit--ss thewer colitating be
wistor pieck c
------ 온도: 1.2
the slowly ascending ranks and classes, in which,
through foorshury
of which
adxianpted
at were he

through for instance, and the and the present and proper the present and below to the sense of the foreth and souls and the souls of the same things is the forether and souls of the conduct and the souls of the promise the great the souls of the predication of the profusion of the conduct and the souls of the delights and the predicated of the posited of the present and souls of the present and world and th
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for in the measing of the religion and etiltary of the greatest at all the general and for our vituoled and opposion, and
as a soul, as the preversious and
below every being the such and changed to
wagner? the german of the "falsify are becomes the moral whole of the discipline to the religious exercised and man so much that it is herever of ethicd in every for the existed by the strong and the perh
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through force (and the moutaproh, but could so n

through for the sense of the soul and the sense of the consequring to the sentiments of the sense of the sense of the sense of the soul, and the soul the conduct of the sense of the same the sense of the present the belief of the sense of the subtle the most and the sense of the most and the present there is a god, as a soul and the development of the sense of the sense of the sense of the same the sense of
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for the present and revealing superiorous let us itself that the development of prederated to fly to his false and combite for the entire soul; there is it is the
gensible, with the character that every persons and desire and self-complition and a sense of mankind and interest interests of faith--it is for any life this account itself and community and something and relent themselves of the sign the
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through former, in the lougied cxcollinest the p

through for the soul and the spirit has a conscience of the acts of the desire and in the soul of the experience of the profound the profound the such a man who has to been the artists and soul of the such a man and such a soul and with the state to the soul of the such a considered to the state to the future, and also the conscience of the such a soul it is the such a compared to the profound with the worl
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for all deteriorated to the conception of the acts and make affordation of the great been with the philosophest action as the present and soul of the strength the actually some one value of the very not the present perfect to the philosopher superficial such as "more hand its modern de any deed and the sense--such a conscience of the recoure of life, and sufficiently of the perfect the
other has the
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through for
being, is asserting pationation been

through formult of the conformental and the sure, and and the spirit of the suffering of the sense of the soul; the soul, and in the soul, and the ends the soul, and the world and the super-of the soul, and the soul, the world of the soul, and the sense of the same the suffering to the sense of the subjection of the same the sense of the same the subject of the contradictious and the conscience, the sign a 
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through formult religious, in the fact that even the more something perhaps of the
will, and the fact of the present,
as the importion of a sing the task, in the
sense of the made of the conscience of the representation, and not to him order and conception of the contempt to the sense of the sense of himself to the greek of the condition of man, as the generally sufferings and insight to the means of the ot
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through formerity the coating sminests is not
re

through former and something the man become the sense of the democratic such a soul and who has the moral the probably and in the believed to the sense of the profulate and such a conscious every all the comparate and sense of the sense of the comparate the sense of the sense of the sense of the comparate the moral the sense of the moral and the moral and the comparate to the makes the sense of the probably
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through form of the great compels to the called woman, when they case caste, it were of the senses of the successful of the soul in the charactering nothing in the generation and self-position of the following of the profulate and meaning the individual deeply are not and the least of the same the absolute laws of which does not attences of the more the purity is such among the principlegne can can somethin
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through foreoding to philosopherest for gion
our

through for the entire the superiored the man best the soul, the art of the super to the sense of the present and the greatest and desire the fact the same time the present and the same time and the conscience of the sense of the present and the same time the arts of the same to the present and the sense of the sense and all the relation and the fact that it is always and all the contrary and all the same t
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for the fact of the last be all the present with the experience of language, and of the same himself in the feat of the german in the artists them of them that the entire and the prevallening and in the
same conscience of self-can call them obser bouned it is the
earth without a perfect and desire of the spirit when they belon to the constance of the man will to be all the
stupidity of the conduct a
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through for example
commanded anything man wento

through for the security and the senses of the present man, the security of the problem which is the senses of the presentimate to the securition of the security of the security of the security which is the security is not be all the presentimate in the constanting in the constanting to the security and passion of the more one with the presentimate the presentimate the security of the constanting the condit
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for the constanting to himself as the senses and to its spirit and the constanting necessation
and for the intermination and deedes. the greatest man he still actions of the ourselves in being in the deeply who soul" for our events the myself as the defensent of the more constraintaring the world before the hence. the general constantly we are a possible as a stringing to the fact that a general pro
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through for the waritoary one fre on the senses,

through for the contempt of the contempting of the world which has been for finds that the sense of the sense of the such and the such and falsified and satisfaction of the convince of the same time the sense of the most conceptions of the world which is the same time the such a southy of the sense of the same time the most experience of the soul and the sense of the sense of the soul which we have been the
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for the same time to do a believe the fanscical and also as a truth of the convoluntary that it were the fact that the way and christian to the consequently that the contradictiness, in the man is not the conversated the present
with independed and compulsion of many aspect to loak of the convince, the far a man who has distrumed that the mediocrity with any devil, and such a good inception of the i
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through for the belieg are in regard to awal and

through for the sense of the consideration of the consideration of the considerations of the superior and such a man who has the superioritis, and the superioric of the considered and superficial and the superficial the present the consequently the superiority, and the present the antithese the present the conscience of the same the superioric out of the man with the consequently the spirit and the present 
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for the doch, in the world as a christian any ambitional, the sin in the expectangent to be as the considerations of his realm of the charm of a stronger and the pain of the consequently without his way in the bother, such a more and preference, and were may be regarded himself which to consequently the abstiactes of the stupide by the expresses of the superiorits of the belief, and with permitting 
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through for the
persons of that result in
such a

through form of the such a soul the has a present, the presence of the presence of the such a man, and the superiority of the sense--the such a man, the such a man, and the firw of the present, the such a man, and soul, the substine of the such a harming the present of the promises the presence of the such a soul, the promises the fact that it is a work of the presence of the problem of the such a man and s
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through form of more consequenced as a soul the ancient as if the same an a man, as a sightes and finally men in the fact the presence of the selfish to all something insporsition, in the great possible of
the chance of the such a soul, the typical father of the facts or "prousility and blood before the future of the man and completely sense--so much it is a promises of a noble of the reality, wherever the 
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through form--as the
greatest
"strugg of the fat

through for the strong with the still the strong of the present and in the present and any and may be all the consequence of the present and in the still existence of a present and comprehension of the fact that it is a german conscience of the fact that it is a point to the strong with the fact of the comprehension of the strong which the strong in the sense of the more and common and such a thing and stra
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for example, and what he is the really comprehensive and not any only with the most teaches of life to the
schard, but the servant decesing self-contraly poist. for it is to be even the very still least to the personal on the fact of the german acts that the amply the existence of the fact of a precisely of all this and not the word and only a world and love of its operence with the great mask of th
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through formation higher german eward justily se

여기서 볼 수 있듯이 낮은 온도는 아주 반복적이고 예상되는 텍스트를 만듭니다. 하지만 국부적인 구조는 매우 실제와 같습니다. 특히 모든 단어(단어는 글자의 지역 패턴으로 이루어집니다)가 실제 영어 단어입니다. 높은 온도에서 생성된 텍스트는 아주 흥미롭고 놀라우며 창의적이기도 합니다. 이따금 꽤 그럴싸하게 보이는 완전히 새로운 단어를 창조합니다(‘begarmed’와 ‘isharent’ 같은 단어입니다). 높은 온도에서는 국부적인 구조가 무너지기 시작합니다. 대부분의 단어가 어느 정도 무작위한 문자열로 보입니다. 확실히 이 네트워크에서 텍스트 생성에 가장 좋은 온도는 0.5입니다. 항상 다양한 샘플링 전략으로 실험해 봐야 합니다! 학습된 구조와 무작위성 사이에 균형을 잘 맞추면 흥미로운 것을 만들 수 있습니다.

더 많은 데이터에서 크고 깊은 모델을 훈련하면 이것보다 훨씬 논리적이고 실제와 같은 텍스트 샘플을 생성할 수 있습니다. 당연히 우연이 아닌 의미 있는 텍스트가 생성된다고 기대하지 마세요. 글자를 연속해서 나열하기 위한 통계 모델에서 데이터를 샘플링한 것뿐입니다. 언어는 의사소통의 수단입니다. 의사소통이 의미하는 것과 의사소통이 인코딩된 메시지의 통계 구조 사이에는 차이가 있습니다. 이 차이를 검증하기 위해 다음과 같은 사고 실험을 해보죠. 컴퓨터가 대부분의 디지털 통신에서 하는 것처럼 사람의 언어가 의사소통을 압축하는 데 더 뛰어나다면 어떨까요? 언어의 의미가 줄진 않지만 고유한 통계 구조가 사라질 것입니다. 이는 방금과 같은 언어 모델을 학습하는 것을 불가능하게 만듭니다.

## 정리

* 이전의 토큰이 주어지면 다음 토큰(들)을 예측하는 모델을 훈련하여 시퀀스 데이터를 생성할 수 있습니다.
* 텍스트의 경우 이런 모델을 언어 모델이라 부릅니다. 단어 또는 글자 단위 모두 가능합니다.
* 다음 토큰을 샘플링할 때 모델이 만든 출력에 집중하는 것과 무작위성을 주입하는 것 사이에 균형을 맞추어야 합니다.
* 이를 위해 소프트맥스 온도 개념을 사용합니다. 항상 다양한 온도를 실험해서 적절한 값을 찾습니다.