# zpwiki/pages/students/2016/jakub_maruniak/dp2021/annotation/visualizer.py

"""
Visualize the data with Streamlit and spaCy.
https://github.com/explosion/projects/blob/master/ner-drugs/streamlit_visualizer.py

Usage example:
$ streamlit run visualizer.py visualize ./dataset.jsonl
"""
import streamlit as st
from spacy import displacy
import srsly
import sys
import plac
from wasabi import msg

@plac.annotations(
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
)
def visualize(
    dataset_path
):
    """Render the accepted examples of a JSONL dataset in a Streamlit page.

    Each record whose ``answer`` is ``"accept"`` is drawn with displaCy in
    manual entity mode, using the record's ``text`` and ``spans``. Every other
    record (including one with no ``answer`` key) is counted as rejected and
    not drawn. A summary table with the counts is written to the sidebar.

    Args:
        dataset_path: Path of the JSONL file to load and display.
    """
    # Use the parsed argument rather than re-reading sys.argv, so the function
    # also works when it is called directly instead of via the CLI wrapper.
    FILES = [dataset_path]
    MISC = "MISC"

    HTML_WRAPPER = "<div style='border-bottom: 1px solid #ccc; padding: 20px 0'>{}</div>"
    HTML_WRAPPER1 = "<div style='border-bottom: 2px solid #000; padding: 0 0 20px 0'>{}</div>"
    SETTINGS = {"style": "ent", "manual": True, "options": {"colors": {MISC: "#d1bcff"}}}

    @st.cache(allow_output_mutation=True)
    def load_data(filepath):
        """Read every record of the JSONL file at *filepath* into a list."""
        return list(srsly.read_jsonl(filepath))

    st.sidebar.title("Data visualizer")
    st.sidebar.markdown(
        "Visualize the annotations using [displaCy](https://spacy.io/usage/visualizers) "
        "and view stats about the datasets."
    )
    data_file = st.sidebar.selectbox("Dataset", FILES)
    data = load_data(data_file)
    n_no_ents = 0
    n_total_ents = 0
    accepted = 0
    rejected = 0

    st.header(f"Dataset: {data_file} ({len(data)})")
    st.markdown(HTML_WRAPPER1.format("Visualize only accepted examples and their annotations."), unsafe_allow_html=True)
    for eg in data:
        # .get() keeps a record without an "answer" key from aborting the
        # whole page; such a record is simply not treated as accepted.
        if eg.get("answer") != "accept":
            rejected += 1
            continue
        accepted += 1
        row = {"text": eg["text"], "ents": eg.get("spans", [])}
        n_total_ents += len(row["ents"])
        if not row["ents"]:
            n_no_ents += 1
        html = displacy.render(row, **SETTINGS).replace("\n\n", "\n")
        st.markdown(HTML_WRAPPER.format(html), unsafe_allow_html=True)

    st.sidebar.markdown(
        f"""
    | `{data_file}` | |
    | --- | ---: |
    | Total examples | {len(data):,} |
    | Accepted examples | {accepted:,} |
    | Rejected examples | {rejected:,} |
    | Total entities | {n_total_ents:,} |
    | Examples with no entities | {n_no_ents:,} |
    """, unsafe_allow_html=True
    )

if __name__ == "__main__":
    # Map of sub-command name -> function implementing it.
    opts = {"visualize": visualize}
    # Without this guard a missing command surfaces as a bare IndexError
    # from pop(1) instead of a readable usage message.
    if len(sys.argv) < 2:
        msg.fail("Missing command", f"Available: {', '.join(opts)}", exits=1)
    # Remove the sub-command from argv so that plac only sees the arguments
    # that belong to the selected function.
    cmd = sys.argv.pop(1)
    if cmd not in opts:
        msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
    try:
        plac.call(opts[cmd])
    except KeyboardInterrupt:
        msg.warn("Stopped.", exits=1)