"""
Visualize the data with Streamlit and spaCy.
https://github.com/explosion/projects/blob/master/ner-drugs/streamlit_visualizer.py
Usage example:
$ streamlit run visualizer.py visualize ./dataset.jsonl
"""
import streamlit as st
from spacy import displacy
import srsly
import sys
import plac
from wasabi import msg
@plac.annotations(
dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
)
def visualize(
dataset_path
):
FILES = [sys.argv[1]]
MISC = "MISC"
HTML_WRAPPER = "
{}
"
HTML_WRAPPER1 = "{}
"
SETTINGS = {"style": "ent", "manual": True, "options": {"colors": {MISC: "#d1bcff"}}}
@st.cache(allow_output_mutation=True)
def load_data(filepath):
return list(srsly.read_jsonl(filepath))
st.sidebar.title("Data visualizer")
st.sidebar.markdown(
"Visualize the annotations using [displaCy](https://spacy.io/usage/visualizers) "
"and view stats about the datasets."
)
data_file = st.sidebar.selectbox("Dataset", FILES)
data = load_data(data_file)
n_no_ents = 0
n_total_ents = 0
accepted = 0
rejected = 0
st.header(f"Dataset: {data_file} ({len(data)})")
st.markdown(HTML_WRAPPER1.format("Visualize only accepted examples and their annotations."), unsafe_allow_html=True)
for eg in data:
if eg["answer"] == "accept":
accepted += 1
if eg["answer"] != "accept":
rejected += 1
continue
row = {"text": eg["text"], "ents": eg.get("spans", [])}
answer = {"answer": eg.get("answer", [])}
n_total_ents += len(row["ents"])
if not row["ents"]:
n_no_ents += 1
html = displacy.render(row, **SETTINGS).replace("\n\n", "\n")
st.markdown(HTML_WRAPPER.format(html), unsafe_allow_html=True)
st.sidebar.markdown(
f"""
| `{data_file}` | |
| --- | ---: |
| Total examples | {len(data):,} |
| Accepted examples | {accepted:,} |
| Rejected examples | {rejected:,} |
| Total entities | {n_total_ents:,} |
| Examples with no entities | {n_no_ents:,} |
""", unsafe_allow_html=True
)
if __name__ == "__main__":
opts = {"visualize": visualize}
cmd = sys.argv.pop(1)
if cmd not in opts:
msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
try:
plac.call(opts[cmd])
except KeyboardInterrupt:
msg.warn("Stopped.", exits=1)