# Forked from KEMT/zpwiki.
"""
Visualize the data with Streamlit and spaCy.

See:
https://github.com/explosion/projects/blob/master/ner-drugs/streamlit_visualizer.py

Usage example:

    $ streamlit run visualizer.py visualize ./dataset.jsonl
"""
|
|
import streamlit as st
|
|
from spacy import displacy
|
|
import srsly
|
|
import sys
|
|
import plac
|
|
from wasabi import msg
|
|
|
|
@plac.annotations(
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
)
def visualize(dataset_path):
    """Render a Streamlit page showing the accepted examples of a dataset.

    The JSONL file is loaded, every example whose ``"answer"`` is
    ``"accept"`` is rendered with displaCy's manual entity style, and a
    summary table (totals, accepted/rejected counts, entity counts) is
    written to the sidebar.

    Args:
        dataset_path: Path to the JSONL file to visualize. Each record is
            read through its ``"answer"``, ``"text"`` and ``"spans"`` keys.

    Raises:
        KeyError: If an accepted example has no ``"text"`` key.
    """
    # Use the parsed argument instead of re-reading sys.argv[1]: when run via
    # the __main__ dispatcher both hold the same value, but the parameter
    # also works when this function is called directly.
    files = [dataset_path]
    misc = "MISC"

    html_wrapper = "<div style='border-bottom: 1px solid #ccc; padding: 20px 0'>{}</div>"
    html_wrapper1 = "<div style='border-bottom: 2px solid #000; padding: 0 0 20px 0'>{}</div>"
    # "manual": True lets displaCy render plain dicts ({"text", "ents"})
    # instead of spaCy Doc objects.
    settings = {"style": "ent", "manual": True, "options": {"colors": {misc: "#d1bcff"}}}

    @st.cache(allow_output_mutation=True)
    def load_data(filepath):
        """Read the whole JSONL file into a list (cached by Streamlit)."""
        return list(srsly.read_jsonl(filepath))

    st.sidebar.title("Data visualizer")
    st.sidebar.markdown(
        "Visualize the annotations using [displaCy](https://spacy.io/usage/visualizers) "
        "and view stats about the datasets."
    )
    data_file = st.sidebar.selectbox("Dataset", files)
    data = load_data(data_file)

    n_no_ents = 0
    n_total_ents = 0
    accepted = 0
    rejected = 0

    st.header(f"Dataset: {data_file} ({len(data)})")
    st.markdown(
        html_wrapper1.format("Visualize only accepted examples and their annotations."),
        unsafe_allow_html=True,
    )

    for eg in data:
        # Anything not explicitly accepted -- including records with no
        # "answer" key at all -- is counted as rejected and not rendered.
        if eg.get("answer") != "accept":
            rejected += 1
            continue
        accepted += 1

        # Treat a missing or null "spans" value as "no entities".
        row = {"text": eg["text"], "ents": eg.get("spans") or []}
        n_total_ents += len(row["ents"])
        if not row["ents"]:
            n_no_ents += 1

        html = displacy.render(row, **settings).replace("\n\n", "\n")
        st.markdown(html_wrapper.format(html), unsafe_allow_html=True)

    # Build the markdown table row by row so that no source indentation leaks
    # into the string (indented lines would not render as a table).
    summary_rows = [
        f"| `{data_file}` | |",
        "| --- | ---: |",
        f"| Total examples | {len(data):,} |",
        f"| Accepted examples | {accepted:,} |",
        f"| Rejected examples | {rejected:,} |",
        f"| Total entities | {n_total_ents:,} |",
        f"| Examples with no entities | {n_no_ents:,} |",
    ]
    st.sidebar.markdown("\n" + "\n".join(summary_rows) + "\n", unsafe_allow_html=True)
|
|
|
|
if __name__ == "__main__":
    # Sub-command dispatch table: command name -> plac-annotated function.
    opts = {"visualize": visualize}

    # Without this check, a missing command would surface as a bare
    # IndexError from sys.argv.pop(1) instead of a usage message.
    if len(sys.argv) < 2:
        msg.fail("No command given", f"Available: {', '.join(opts)}", exits=1)

    # Remove the command name so that plac only parses the command's own
    # arguments from the remaining sys.argv entries.
    cmd = sys.argv.pop(1)
    if cmd not in opts:
        msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)

    try:
        plac.call(opts[cmd])
    except KeyboardInterrupt:
        msg.warn("Stopped.", exits=1)