# Import basic libraries and Keras
import os
import json

import keras
import numpy as np
import keras.preprocessing.text as kpt
from keras.layers import Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, load_model

# Load input data; column 1 holds the sentiment label, column 3 the tweet text
training = np.genfromtxt('15000tweets.csv', delimiter=',', skip_header=1,
                         usecols=(1, 3), dtype=None, encoding='utf-8')

# Get tweets and sentiments (0 or 1)
train_x = [str(x[1]) for x in training]
train_y = np.asarray([x[0] for x in training])

# Use the 3000 most popular words found in our dataset
max_words = 3000

# Tokenize the data
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_x)
dictionary = tokenizer.word_index

# Save the tokenizer dictionary to file so it can be reused at prediction time
if not os.path.exists('dictionary.json'):
    with open('dictionary.json', 'w') as outfile:
        json.dump(tokenizer.word_index, outfile)

# For each tweet, replace each token with its ID in the tokenizer's word_index
allWordIndices = []
for text in train_x:
    words = kpt.text_to_word_sequence(text)
    wordIndices = [dictionary[word] for word in words]
    allWordIndices.append(wordIndices)

# Create a binary bag-of-words matrix from the indexed tweets and a categorical target
# (sequences_to_matrix accepts the list of variable-length index lists directly)
train_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')
train_y = keras.utils.to_categorical(train_y, 2)

# Check if there is a pre-trained model
if not os.path.exists('model.h5'):
    # Create a neural network with 3 dense layers
    model = Sequential()
    model.add(Dense(512, input_shape=(max_words,), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256, activation='sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # Train the model
    model.fit(train_x, train_y,
              batch_size=32,
              epochs=5,
              verbose=1,
              validation_split=0.1,
              shuffle=True)

    # Save the model
    model.save('model.h5')
else:
    # Load the model from disk
    model = load_model('model.h5')
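
# A minimal inference sketch, assuming we want to classify one new tweet with the
# model above; the sample text is a hypothetical example, not taken from the dataset.
sample_tweet = 'I really enjoyed this'
sample_words = kpt.text_to_word_sequence(sample_tweet)
# Keep only words the tokenizer saw during fitting, to avoid a KeyError on unknown words
sample_indices = [dictionary[word] for word in sample_words if word in dictionary]
# Encode the tweet with the same binary bag-of-words scheme used for training
sample_matrix = tokenizer.sequences_to_matrix([sample_indices], mode='binary')
prediction = model.predict(sample_matrix)
# prediction[0][i] is the probability the model assigns to label i (0 or 1)
print('Predicted label:', int(np.argmax(prediction[0])))
print('Confidence:', float(np.max(prediction[0])))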