# Interactive spaCy visualizer built with Streamlit.
#
# Lets the user pick one of the models in SPACY_MODEL_NAMES, enter some text,
# and then renders (depending on which pipeline components the model has) the
# dependency parse, named entities, text categories, vector similarity, token
# attributes and the JSON form of the processed document and model meta.
import pandas as pd
import spacy
import streamlit as st
from spacy import displacy

SPACY_MODEL_NAMES = ["en_core_web_sm", "en_core_web_md", "de_core_news_sm"]
DEFAULT_TEXT = "Mark Zuckerberg is the CEO of Facebook."
# Template the displaCy markup is formatted into before being written out.
# NOTE(review): the template currently contains only blank lines around the
# placeholder; if a surrounding HTML container was intended, confirm against
# the upstream version of this script.
HTML_WRAPPER = """

{}

"""


@st.cache(allow_output_mutation=True)
def load_model(name: str):
    """Load the spaCy pipeline called *name* and return it.

    Wrapped in ``st.cache`` so repeated calls with the same name can be
    served from Streamlit's cache; ``allow_output_mutation=True`` is passed
    through unchanged from the original script.
    """
    return spacy.load(name)


@st.cache(allow_output_mutation=True)
def process_text(model_name: str, text: str):
    """Run *text* through the pipeline named *model_name* and return the Doc.

    The model is obtained via ``load_model`` so the cached pipeline is reused.
    """
    nlp = load_model(model_name)
    return nlp(text)


st.sidebar.title("Interactive spaCy visualizer")
st.sidebar.markdown(
    " Process text with [spaCy](https://spacy.io) models and visualize named"
    " entities, dependencies and more. Uses spaCy's built-in"
    " [displaCy](http://spacy.io/usage/visualizers) visualizer under the hood. "
)

spacy_model = st.sidebar.selectbox("Model name", SPACY_MODEL_NAMES)
# Show a temporary notice while the model loads, then clear it again.
model_load_state = st.info(f"Loading model '{spacy_model}'...")
nlp = load_model(spacy_model)
model_load_state.empty()

text = st.text_area("Text to analyze", DEFAULT_TEXT)
doc = process_text(spacy_model, text)

# --- Dependency parse (only when the pipeline has a parser) -----------------
if "parser" in nlp.pipe_names:
    st.header("Dependency Parse & Part-of-speech tags")
    st.sidebar.header("Dependency Parse")
    split_sents = st.sidebar.checkbox("Split sentences", value=True)
    collapse_punct = st.sidebar.checkbox("Collapse punctuation", value=True)
    collapse_phrases = st.sidebar.checkbox("Collapse phrases")
    compact = st.sidebar.checkbox("Compact mode")
    options = {
        "collapse_punct": collapse_punct,
        "collapse_phrases": collapse_phrases,
        "compact": compact,
    }
    # Render one diagram per sentence, or a single diagram for the whole doc.
    docs = [span.as_doc() for span in doc.sents] if split_sents else [doc]
    for sent in docs:
        html = displacy.render(sent, options=options)
        # Double newlines seem to mess with the rendering
        html = html.replace("\n\n", "\n")
        if split_sents and len(docs) > 1:
            st.markdown(f"> {sent.text}")
        st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)

# --- Named entities (only when the pipeline has an entity recognizer) -------
if "ner" in nlp.pipe_names:
    st.header("Named Entities")
    st.sidebar.header("Named Entities")
    available_labels = nlp.get_pipe("ner").labels
    # Preselect only the preferred labels this model actually provides.
    # Passing a default that is not among the options makes the multiselect
    # widget raise, which broke models with a different label scheme.
    default_labels = [
        label
        for label in ["PERSON", "ORG", "GPE", "LOC"]
        if label in available_labels
    ]
    labels = st.sidebar.multiselect(
        "Entity labels", available_labels, default_labels
    )
    html = displacy.render(doc, style="ent", options={"ents": labels})
    # Newlines seem to mess with the rendering
    html = html.replace("\n", " ")
    st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
    attrs = ["text", "label_", "start", "end", "start_char", "end_char"]
    if "entity_linker" in nlp.pipe_names:
        attrs.append("kb_id_")
    # One table row per entity whose label is currently selected.
    data = [
        [str(getattr(ent, attr)) for attr in attrs]
        for ent in doc.ents
        if ent.label_ in labels
    ]
    df = pd.DataFrame(data, columns=attrs)
    st.dataframe(df)

# --- Text classification (only when the pipeline has a textcat) -------------
if "textcat" in nlp.pipe_names:
    st.header("Text Classification")
    st.markdown(f"> {text}")
    df = pd.DataFrame(doc.cats.items(), columns=("Label", "Score"))
    st.dataframe(df)

# --- Vectors & similarity (only when the model meta reports a vector width) -
vector_size = nlp.meta.get("vectors", {}).get("width", 0)
if vector_size:
    st.header("Vectors & Similarity")
    st.code(nlp.meta["vectors"])
    text1 = st.text_input("Text or word 1", "apple")
    text2 = st.text_input("Text or word 2", "orange")
    doc1 = process_text(spacy_model, text1)
    doc2 = process_text(spacy_model, text2)
    similarity = doc1.similarity(doc2)
    # Scores above 0.5 are shown as success, everything else as error.
    if similarity > 0.5:
        st.success(similarity)
    else:
        st.error(similarity)

# --- Token attributes -------------------------------------------------------
st.header("Token attributes")

if st.button("Show token attributes"):
    attrs = [
        "idx",
        "text",
        "lemma_",
        "pos_",
        "tag_",
        "dep_",
        "head",
        "ent_type_",
        "ent_iob_",
        "shape_",
        "is_alpha",
        "is_ascii",
        "is_digit",
        "is_punct",
        "like_num",
    ]
    # Every attribute is stringified so the table has uniform cell types.
    data = [[str(getattr(token, attr)) for attr in attrs] for token in doc]
    df = pd.DataFrame(data, columns=attrs)
    st.dataframe(df)

# --- Raw JSON views ---------------------------------------------------------
st.header("JSON Doc")
if st.button("Show JSON Doc"):
    st.json(doc.to_json())

st.header("JSON model meta")
if st.button("Show JSON model meta"):
    st.json(nlp.meta)