In [1]:
import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model

Using TensorFlow backend.


In [2]:
def read_20_newgroup_files(path_to_data_directory):
    """Load the 20 Newsgroups posts stored under a directory tree.

    Every sub-directory of ``path_to_data_directory`` is treated as one class.
    Sub-directories are visited in sorted order and numbered 0, 1, 2, ... in
    that order. Inside each one, only files whose name consists solely of
    digits are read, also in sorted order; everything else is ignored.

    The header of each post (everything up to and including the first blank
    line) is removed. A file that contains no blank line is kept whole.

    Args:
        path_to_data_directory: directory holding one sub-directory per class.

    Returns:
        A tuple ``(texts, labels_index, labels)``:
        ``texts`` is the list of post bodies, ``labels_index`` maps each
        sub-directory name to its integer id, and ``labels[k]`` is the id of
        the class that ``texts[k]`` was read from.
    """
    texts = []
    labels_index = {}
    labels = []

    # Python 2's built-in open() does not accept an ``encoding`` argument.
    open_kwargs = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}

    for name in sorted(os.listdir(path_to_data_directory)):
        path = os.path.join(path_to_data_directory, name)
        if not os.path.isdir(path):
            continue

        label_id = len(labels_index)
        labels_index[name] = label_id

        for fname in sorted(os.listdir(path)):
            if not fname.isdigit():
                continue

            # ``with`` guarantees the handle is released even if read() fails.
            with open(os.path.join(path, fname), **open_kwargs) as f:
                t = f.read()

            # Skip the header. find() returns -1 when there is no blank line,
            # so the result must be checked *before* the 2-character offset
            # is added; otherwise the first character of the post is dropped.
            header_end = t.find('\n\n')
            if header_end >= 0:
                t = t[header_end + 2:]

            texts.append(t)
            labels.append(label_id)

    return (texts, labels_index, labels)

# Directory that holds the extracted corpus, one sub-directory per class.
path = "/home/felipe/data/20_newsgroup/20_newsgroup/"

# Unpack the loader's result: the post bodies, the mapping from sub-directory
# name to integer id, and one integer id per post.
(texts, labels_index, labels) = read_20_newgroup_files(path)

In [3]:
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = 1000
VALIDATION_SPLIT = 0.2
SPLIT_SEED = 42  # fixes the train/validation shuffle so runs are comparable

# Fit the tokenizer on the corpus and turn every post into a list of word ids.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

# Stack the variable-length id lists into one array with
# MAX_SEQUENCE_LENGTH columns.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# NOTE: this overwrites `labels` (and, below, `data`). Re-run the loading cell
# before re-running this one, otherwise the already-encoded labels are fed
# back into to_categorical.
labels = to_categorical(np.asarray(labels))

# Apply one shared permutation to samples and labels. A private RandomState
# makes the split reproducible without reseeding numpy's global generator.
indices = np.arange(data.shape[0])
np.random.RandomState(SPLIT_SEED).shuffle(indices)

data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice at an explicit boundary. Negative-offset slices break when the
# validation count is 0: data[:-0] is empty and data[-0:] is the whole array.
num_train_samples = data.shape[0] - num_validation_samples

X_train = data[:num_train_samples]
y_train = labels[:num_train_samples]
X_val = data[num_train_samples:]
y_val = labels[num_train_samples:]

In [4]:
GLOVE_DIR = "/media/felipe/SAMSUNG/GloVe"
EMBEDDING_DIM = 100
embeddings_index = {}

glove_path = os.path.join(GLOVE_DIR, "glove.6B.{0}d.txt".format(EMBEDDING_DIM))

# The GloVe vector files are UTF-8 text. On Python 3, decode them explicitly
# instead of relying on the locale's default encoding, which fails or
# mis-decodes on non-UTF-8 systems. Python 2's open() has no ``encoding``
# argument, so nothing is passed there.
glove_open_kwargs = {} if sys.version_info < (3,) else {'encoding': 'utf-8'}

# Each line is "<word> <c1> <c2> ... <cN>"; build a word -> vector lookup.
with open(glove_path, 'r', **glove_open_kwargs) as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue

        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')

        embeddings_index[word] = coefs

In [5]:
# Display the number of entries in the tokenizer's word index.
len(word_index)

174074

In [6]:
# One row per word-index entry plus row 0. Row i receives the pretrained
# vector of the word whose index is i; rows for words that have no pretrained
# vector, and rows at index >= MAX_NB_WORDS, are left as zeros.
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, EMBEDDING_DIM))

for word, i in word_index.items():
    # Only look up words whose index falls below the vocabulary cap.
    vector = embeddings_index.get(word) if i < MAX_NB_WORDS else None
    if vector is not None:
        embedding_matrix[i] = vector

In [7]:
# Embedding layer initialised from the precomputed matrix and frozen
# (trainable=False), so its weights are not updated during training.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False
)

In [8]:

# Input: one row of MAX_SEQUENCE_LENGTH integer word ids per sample.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Three Conv1D -> MaxPooling1D stages with identical convolutions.
# NOTE(review): the final pool size of 35 looks chosen to span the whole
# remaining sequence when MAX_SEQUENCE_LENGTH is 1000 -- confirm before
# changing the sequence length.
x = embedded_sequences
for pool_size in (5, 5, 35):
    x = Conv1D(filters=128, kernel_size=5, activation='relu')(x)
    x = MaxPooling1D(pool_size=pool_size)(x)

# Classifier head: flatten, one hidden layer, then a softmax with one output
# per entry in labels_index.
x = Flatten()(x)
x = Dense(units=128, activation='relu')(x)
preds = Dense(units=len(labels_index), activation='softmax')(x)

In [9]:
# Wrap the layer graph into a trainable model and configure training:
# categorical cross-entropy loss, RMSprop optimizer, accuracy reported as 'acc'.
model = Model(inputs=sequence_input, outputs=preds)
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc']
)

In [10]:
# Train on the training split and evaluate on the held-out validation split
# after every epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=20,
    validation_data=(X_val, y_val)
)

Train on 15998 samples, validate on 3999 samples
Epoch 1/20

KeyboardInterrupt: 