80 lines
2.5 KiB
Python
80 lines
2.5 KiB
Python
|
"""
|
||
|
Visualize the data with Streamlit and spaCy.
|
||
|
https://github.com/explosion/projects/blob/master/ner-drugs/streamlit_visualizer.py
|
||
|
|
||
|
Usage example:
|
||
|
$ streamlit run visualizer.py visualize ./dataset.jsonl
|
||
|
"""
|
||
|
import streamlit as st
|
||
|
from spacy import displacy
|
||
|
import srsly
|
||
|
import sys
|
||
|
import plac
|
||
|
from wasabi import msg
|
||
|
|
||
|
@plac.annotations(
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
)
def visualize(dataset_path):
    """Render a Streamlit page for the accepted examples of a JSONL dataset.

    Every record whose ``"answer"`` is ``"accept"`` is rendered with displaCy's
    manual entity style; all other records are only counted as rejected.
    A summary table (total / accepted / rejected examples, total entities,
    accepted examples without entities) is written to the sidebar.

    Args:
        dataset_path: Path to the JSONL file to load. Each record is read as a
            dict with a ``"text"`` key and optional ``"spans"`` and ``"answer"``
            keys (presumably Prodigy's export format, as the CLI help states).
    """
    # Use the parsed CLI argument rather than re-reading sys.argv, so the
    # function does not depend on the caller having mutated sys.argv first.
    files = [dataset_path]
    misc_label = "MISC"

    example_wrapper = "<div style='border-bottom: 1px solid #ccc; padding: 20px 0'>{}</div>"
    intro_wrapper = "<div style='border-bottom: 2px solid #000; padding: 0 0 20px 0'>{}</div>"
    # "manual": True makes displaCy render the given dict instead of a Doc.
    render_settings = {
        "style": "ent",
        "manual": True,
        "options": {"colors": {misc_label: "#d1bcff"}},
    }

    @st.cache(allow_output_mutation=True)
    def load_data(filepath):
        """Read all records of the JSONL file at ``filepath`` into a list."""
        return list(srsly.read_jsonl(filepath))

    st.sidebar.title("Data visualizer")
    st.sidebar.markdown(
        "Visualize the annotations using [displaCy](https://spacy.io/usage/visualizers) "
        "and view stats about the datasets."
    )
    data_file = st.sidebar.selectbox("Dataset", files)
    data = load_data(data_file)

    n_no_ents = 0
    n_total_ents = 0
    accepted = 0
    rejected = 0

    st.header(f"Dataset: {data_file} ({len(data)})")
    st.markdown(
        intro_wrapper.format("Visualize only accepted examples and their annotations."),
        unsafe_allow_html=True,
    )
    for eg in data:
        # Anything that is not explicitly accepted (including records with no
        # "answer" key) is counted as rejected and not rendered.
        if eg.get("answer") != "accept":
            rejected += 1
            continue
        accepted += 1

        # A missing or null "spans" entry means the example has no entities.
        ents = eg.get("spans") or []
        n_total_ents += len(ents)
        if not ents:
            n_no_ents += 1

        html = displacy.render({"text": eg["text"], "ents": ents}, **render_settings)
        # Collapse blank lines so Markdown does not split the HTML into paragraphs.
        html = html.replace("\n\n", "\n")
        st.markdown(example_wrapper.format(html), unsafe_allow_html=True)

    # Build the table line by line so its Markdown never picks up source
    # indentation (indented rows would be rendered as a code block).
    stats_table = "\n".join(
        (
            "",
            f"| `{data_file}` | |",
            "| --- | ---: |",
            f"| Total examples | {len(data):,} |",
            f"| Accepted examples | {accepted:,} |",
            f"| Rejected examples | {rejected:,} |",
            f"| Total entities | {n_total_ents:,} |",
            f"| Examples with no entities | {n_no_ents:,} |",
            "",
        )
    )
    st.sidebar.markdown(stats_table, unsafe_allow_html=True)
|
||
|
|
||
|
if __name__ == "__main__":
    # Maps each sub-command name to the function plac should invoke for it.
    opts = {"visualize": visualize}

    # Without this guard a missing sub-command surfaces as a bare IndexError
    # from the pop() below instead of a readable message.
    if len(sys.argv) < 2:
        msg.fail("No command given", f"Available: {', '.join(opts)}", exits=1)

    # Remove the sub-command from sys.argv so that plac (which parses
    # sys.argv[1:] by default) only sees the command's own arguments.
    cmd = sys.argv.pop(1)
    if cmd not in opts:
        msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
    try:
        plac.call(opts[cmd])
    except KeyboardInterrupt:
        msg.warn("Stopped.", exits=1)
|