"""
Visualize the data with Streamlit and spaCy.
https://github.com/explosion/projects/blob/master/ner-drugs/streamlit_visualizer.py

Usage example:
$ streamlit run visualizer.py visualize ./dataset.jsonl
"""
import streamlit as st
from spacy import displacy
import srsly
import sys
import plac
from wasabi import msg


@plac.annotations(
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
)
def visualize(dataset_path):
    """Render the accepted examples of a JSONL dataset in a Streamlit app.

    Every record whose ``"answer"`` is ``"accept"`` is rendered with displaCy
    (entity style, manual mode); all other records are only counted. A table
    with the counts is written to the sidebar.

    dataset_path (str): Path to the JSONL file to load and display.
    """
    # Use the parsed argument rather than peeking at sys.argv, so the function
    # also works when it is called directly.
    FILES = [dataset_path]
    MISC = "MISC"
    # NOTE(review): both wrappers contain only a format slot surrounded by
    # newlines; they look like they were meant to hold HTML markup around
    # "{}" -- confirm the intended markup and restore it here if so.
    HTML_WRAPPER = "\n{}\n"
    HTML_WRAPPER1 = "\n{}\n"
    SETTINGS = {"style": "ent", "manual": True, "options": {"colors": {MISC: "#d1bcff"}}}

    @st.cache(allow_output_mutation=True)
    def load_data(filepath):
        """Read all records of the JSONL file at `filepath` into a list."""
        return list(srsly.read_jsonl(filepath))

    st.sidebar.title("Data visualizer")
    st.sidebar.markdown(
        "Visualize the annotations using [displaCy](https://spacy.io/usage/visualizers) "
        "and view stats about the datasets."
    )
    data_file = st.sidebar.selectbox("Dataset", FILES)
    data = load_data(data_file)

    n_no_ents = 0
    n_total_ents = 0
    accepted = 0
    rejected = 0

    st.header(f"Dataset: {data_file} ({len(data)})")
    st.markdown(
        HTML_WRAPPER1.format("Visualize only accepted examples and their annotations."),
        unsafe_allow_html=True,
    )
    for eg in data:
        # Anything that is not an explicit "accept" (including a record with
        # no "answer" key at all) is counted as rejected and not rendered.
        if eg.get("answer") != "accept":
            rejected += 1
            continue
        accepted += 1
        row = {"text": eg["text"], "ents": eg.get("spans", [])}
        n_total_ents += len(row["ents"])
        if not row["ents"]:
            n_no_ents += 1
        html = displacy.render(row, **SETTINGS).replace("\n\n", "\n")
        st.markdown(HTML_WRAPPER.format(html), unsafe_allow_html=True)

    # Join the rows explicitly so the Markdown table never picks up the
    # indentation of the surrounding source code.
    stats_table = "\n".join(
        [
            f"| `{data_file}` | |",
            "| --- | ---: |",
            f"| Total examples | {len(data):,} |",
            f"| Accepted examples | {accepted:,} |",
            f"| Rejected examples | {rejected:,} |",
            f"| Total entities | {n_total_ents:,} |",
            f"| Examples with no entities | {n_no_ents:,} |",
        ]
    )
    st.sidebar.markdown(stats_table, unsafe_allow_html=True)


if __name__ == "__main__":
    opts = {"visualize": visualize}
    # Fail with a readable message instead of an IndexError when the script
    # is started without a command.
    if len(sys.argv) < 2:
        msg.fail("No command given", f"Available: {', '.join(opts)}", exits=1)
    cmd = sys.argv[1]
    if cmd not in opts:
        msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
    try:
        # Hand plac the arguments after the command explicitly so sys.argv
        # does not have to be mutated.
        plac.call(opts[cmd], sys.argv[2:])
    except KeyboardInterrupt:
        msg.warn("Stopped.", exits=1)