forked from KEMT/zpwiki
		
	
		
			
				
	
	
		
			80 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			80 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
"""
 | 
						|
Visualize the data with Streamlit and spaCy.
 | 
						|
https://github.com/explosion/projects/blob/master/ner-drugs/streamlit_visualizer.py
 | 
						|
 | 
						|
Usage example:
 | 
						|
$ streamlit run visualizer.py visualize ./dataset.jsonl
 | 
						|
"""
 | 
						|
import streamlit as st
 | 
						|
from spacy import displacy
 | 
						|
import srsly
 | 
						|
import sys
 | 
						|
import plac
 | 
						|
from wasabi import msg
 | 
						|
 | 
						|
@plac.annotations(
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
)
def visualize(
    dataset_path
):
    """Render the accepted examples of a JSONL dataset in a Streamlit page.

    Each record is read with ``srsly.read_jsonl``. Records whose ``"answer"``
    is ``"accept"`` are rendered with displaCy's manual entity view (using the
    record's ``"text"`` and ``"spans"``); every other record is only counted
    as rejected. A summary table with the totals is written to the sidebar.

    Args:
        dataset_path: Path of the JSONL file to load and display.

    Raises:
        KeyError: If an accepted record has no ``"text"`` key.
    """
    # Use the parsed argument instead of re-reading sys.argv, so the function
    # also works when it is called directly rather than through plac.
    FILES = [dataset_path]
    MISC = "MISC"

    HTML_WRAPPER = "<div style='border-bottom: 1px solid #ccc; padding: 20px 0'>{}</div>"
    HTML_WRAPPER1 = "<div style='border-bottom: 2px solid #000; padding: 0 0 20px 0'>{}</div>"
    SETTINGS = {"style": "ent", "manual": True, "options": {"colors": {MISC: "#d1bcff"}}}

    # NOTE(review): st.cache is deprecated in newer Streamlit releases in
    # favour of st.cache_data -- confirm against the pinned Streamlit version.
    @st.cache(allow_output_mutation=True)
    def load_data(filepath):
        """Read every record of the JSONL file at *filepath* into a list."""
        return list(srsly.read_jsonl(filepath))

    st.sidebar.title("Data visualizer")
    st.sidebar.markdown(
        "Visualize the annotations using [displaCy](https://spacy.io/usage/visualizers) "
        "and view stats about the datasets."
    )
    data_file = st.sidebar.selectbox("Dataset", FILES)
    data = load_data(data_file)
    n_no_ents = 0
    n_total_ents = 0
    accepted = 0
    rejected = 0

    st.header(f"Dataset: {data_file} ({len(data)})")
    st.markdown(HTML_WRAPPER1.format("Visualize only accepted examples and their annotations."), unsafe_allow_html=True)
    for eg in data:
        # .get() keeps a record without an "answer" key from raising KeyError;
        # such a record is counted with the non-accepted ones.
        if eg.get("answer") != "accept":
            rejected += 1
            continue
        accepted += 1
        # "spans" may be absent or null for examples without entities.
        row = {"text": eg["text"], "ents": eg.get("spans") or []}
        n_total_ents += len(row["ents"])
        if not row["ents"]:
            n_no_ents += 1
        # Collapse blank lines so the markdown renderer does not break up the HTML.
        html = displacy.render(row, **SETTINGS).replace("\n\n", "\n")
        st.markdown(HTML_WRAPPER.format(html), unsafe_allow_html=True)

    st.sidebar.markdown(
        f"""        
    | `{data_file}` | |
    | --- | ---: |
    | Total examples | {len(data):,} |
    | Accepted examples | {accepted:,} |
    | Rejected examples | {rejected:,} |
    | Total entities | {n_total_ents:,} |
    | Examples with no entities | {n_no_ents:,} |
    """, unsafe_allow_html=True
    )
 | 
						|
 | 
						|
if __name__ == "__main__":
    # Sub-command name -> callable; the first CLI argument selects one.
    opts = {"visualize": visualize}
    # Fail with a readable message instead of an IndexError from pop(1)
    # when the script is started without a sub-command.
    if len(sys.argv) < 2:
        msg.fail("No command provided", f"Available: {', '.join(opts)}", exits=1)
    # Remove the sub-command so plac only parses the command's own arguments.
    cmd = sys.argv.pop(1)
    if cmd not in opts:
        msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
    try:
        plac.call(opts[cmd])
    except KeyboardInterrupt:
        msg.warn("Stopped.", exits=1)