"""Fork of Sentiment-Analyzer-Tool by Paras Patidar. Improvements by Marc Skov Madsen Original Source: https://github.com/patidarparas13/Sentiment-Analyzer-Tool Original Author: https://github.com/patidarparas13, """ import itertools import matplotlib.pyplot as plt import numpy as np import pandas as pd import requests import streamlit as st from sklearn.feature_extraction.text import CountVectorizer from sklearn.metrics import confusion_matrix from sklearn.naive_bayes import BernoulliNB ROOT_URL = "https://raw.githubusercontent.com/MarcSkovMadsen/awesome-streamlit/master/gallery/sentiment_analyzer/data/" FILES = ["imdb_labelled.txt", "amazon_cells_labelled.txt", "yelp_labelled.txt"] @st.cache def get_all_data(): """Loads the source data""" data = [] for file in FILES: data += requests.get(ROOT_URL + file).text.split("\n") return data @st.cache def preprocessing_data(data): processing_data = [] for single_data in data: if len(single_data.split("\t")) == 2 and single_data.split("\t")[1] != "": processing_data.append(single_data.split("\t")) return processing_data @st.cache def split_data(data): total = len(data) training_ratio = 0.75 training_data = [] evaluation_data = [] for indice in range(0, total): if indice < total * training_ratio: training_data.append(data[indice]) else: evaluation_data.append(data[indice]) return training_data, evaluation_data @st.cache def preprocessing_step(): data = get_all_data() processing_data = preprocessing_data(data) return split_data(processing_data) def training_step(data, vectorizer): training_text = [data[0] for data in data] training_result = [data[1] for data in data] training_text = vectorizer.fit_transform(training_text) return BernoulliNB().fit(training_text, training_result) def analyse_text(classifier, vectorizer, text): return text, classifier.predict(vectorizer.transform([text])) def print_result(result): text, analysis_result = result print_text = "Positive" if analysis_result[0] == "1" else "Negative" return text, print_text st.title("Sentiment Algorithm") st.info( "This is an **improved version** of the " "awesome [**original**](https://github.com/patidarparas13/Sentiment-Analyzer-Tool) " "developed by awesome [**Paras Patidar**](https://github.com/patidarparas13). Kudos!\n\n" ) st.write( "The algorithm is trained on a collection of movie reviews and you can test it below." ) st.subheader("Extract the data") file_markdown = "Source: \n" file_markdown += ", ".join([f"[{file}]({ROOT_URL+file})" for file in FILES]) st.markdown(file_markdown) with st.spinner("Extracting source data..."): all_data = get_all_data() source_data = pd.DataFrame( preprocessing_data(all_data), columns=["review", "sentiment"] ) source_data["sentiment"] = source_data["sentiment"].map( {"0": "Negative", "1": "Positive"} ) st.info(f"{len(source_data)} rows where extract with **succes**!") top = st.selectbox( "Select number of rows to show", [5, 10, 25, 50, 100, len(source_data)] ) st.table(source_data.head(top)) st.subheader("Train the algorithm") with st.spinner("Training algorithm..."): training_data, evaluation_data = preprocessing_step() vectorizer = CountVectorizer(binary="true") classifier = training_step(training_data, vectorizer) st.info("The algorithm was trained with **success**!") st.title("Try the algorithm here!") write_here = "Write Here..." 
review = st.text_input("Enter a review for classification by the algorithm", write_here)

if st.button("Predict Sentiment"):
    result = print_result(analyse_text(classifier, vectorizer, review))
    # Only show a prediction if the user replaced the placeholder text
    if review != write_here:
        st.success(result[1])
        st.error("For illustrative purposes only! :-)")
    else:
        st.error("You need to input a review for classification!")
else:
    st.info(
        "**Enter a review** above and **press the button** to predict the sentiment."
    )
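
# A minimal sketch of scoring the classifier on the held-out evaluation_data split
# produced by preprocessing_step(). Reporting accuracy via
# sklearn.metrics.accuracy_score is an assumed metric choice, and the variable
# names below are illustrative only.
from sklearn.metrics import accuracy_score

evaluation_text = [sample[0] for sample in evaluation_data]
evaluation_labels = [sample[1] for sample in evaluation_data]
predictions = classifier.predict(vectorizer.transform(evaluation_text))
st.write(f"Hold-out accuracy: {accuracy_score(evaluation_labels, predictions):.1%}")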