# coding: utf8 from __future__ import unicode_literals import hug from hug_middleware_cors import CORSMiddleware import waitress import spacy import plac MODELS = {} @plac.annotations( models=("Comma-separated list of spaCy models", "positional", None, str), host=("Host to serve API", "option", "ho", str), port=("Port to serve API", "option", "p", int), ) def main(models=None, host="0.0.0.0", port=8080): if not models: models = ["en_core_web_sm"] else: models = [m.strip() for m in models.split(",")] for model in models: load_model(model) # Serving Hug API app = hug.API(__name__) app.http.add_middleware(CORSMiddleware(app)) waitress.serve(__hug_wsgi__, port=port) def load_model(model): print("Loading model '{}'...".format(model)) MODELS[model] = spacy.load(model) def doc2json(doc: spacy.tokens.Doc, model: str): json_doc = { "text": doc.text, "text_with_ws": doc.text_with_ws, "cats": doc.cats, "is_tagged": doc.is_tagged, "is_parsed": doc.is_parsed, # "is_nered": doc.is_nered, "is_sentenced": doc.is_sentenced, } ents = [ {"start": ent.start, "end": ent.end, "label": ent.label_} for ent in doc.ents ] if doc.is_sentenced: sents = [{"start": sent.start, "end": sent.end} for sent in doc.sents] else: sents = [] if doc.is_tagged and doc.is_parsed: noun_chunks = [ {"start": chunk.start, "end": chunk.end} for chunk in doc.noun_chunks ] else: noun_chunks = [] tokens = [ { "text": token.text, "text_with_ws": token.text_with_ws, "whitespace": token.whitespace_, "orth": token.orth, "i": token.i, "ent_type": token.ent_type_, "ent_iob": token.ent_iob_, "lemma": token.lemma_, "norm": token.norm_, "lower": token.lower_, "shape": token.shape_, "prefix": token.prefix_, "suffix": token.suffix_, "pos": token.pos_, "tag": token.tag_, "dep": token.dep_, "is_alpha": token.is_alpha, "is_ascii": token.is_ascii, "is_digit": token.is_digit, "is_lower": token.is_lower, "is_upper": token.is_upper, "is_title": token.is_title, "is_punct": token.is_punct, "is_left_punct": token.is_left_punct, "is_right_punct": token.is_right_punct, "is_space": token.is_space, "is_bracket": token.is_bracket, "is_currency": token.is_currency, "like_url": token.like_url, "like_num": token.like_num, "like_email": token.like_email, "is_oov": token.is_oov, "is_stop": token.is_stop, "is_sent_start": token.is_sent_start, "head": token.head.i, } for token in doc ] return { "model": model, "doc": json_doc, "ents": ents, "sents": sents, "noun_chunks": noun_chunks, "tokens": tokens, } @hug.post("/parse") def parse(model: str, text: str): nlp = MODELS[model] doc = nlp(text) return doc2json(doc, model) @hug.post("/similarity") def similarity(model: str, text1: str, text2: str): # We can always create Doc objects here, because the result is the same nlp = MODELS[model] doc1 = nlp(text1) doc2 = nlp(text2) return {"similarity": doc1.similarity(doc2)} if __name__ == "__main__": plac.call(main)