forked from KEMT/zpwiki
		
	
		
			
				
	
	
		
			80 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			80 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| """
 | |
| Visualize the data with Streamlit and spaCy.
 | |
| https://github.com/explosion/projects/blob/master/ner-drugs/streamlit_visualizer.py
 | |
| 
 | |
| Usage example:
 | |
| $ streamlit run visualizer.py visualize ./dataset.jsonl
 | |
| """
 | |
| import streamlit as st
 | |
| from spacy import displacy
 | |
| import srsly
 | |
| import sys
 | |
| import plac
 | |
| from wasabi import msg
 | |
| 
 | |
@plac.annotations(
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
)
def visualize(dataset_path):
    """Render the accepted examples of a JSONL dataset in a Streamlit page.

    Each record whose ``"answer"`` is ``"accept"`` is rendered with displaCy
    in manual entity mode, using the record's ``"text"`` and ``"spans"``.
    Every other record (including one with no ``"answer"`` key) is counted as
    rejected and skipped. A summary table with the counts is written to the
    sidebar.

    dataset_path: Path to the JSONL file to load; it is passed to
        ``srsly.read_jsonl``.
    """
    # Use the argument parsed by plac rather than peeking at sys.argv, so the
    # function does not depend on how the caller has arranged the argv list.
    FILES = [dataset_path]
    MISC = "MISC"

    HTML_WRAPPER = "<div style='border-bottom: 1px solid #ccc; padding: 20px 0'>{}</div>"
    HTML_WRAPPER1 = "<div style='border-bottom: 2px solid #000; padding: 0 0 20px 0'>{}</div>"
    # "manual": True makes displaCy render from plain dicts ("text"/"ents")
    # instead of spaCy Doc objects.
    SETTINGS = {"style": "ent", "manual": True, "options": {"colors": {MISC: "#d1bcff"}}}

    @st.cache(allow_output_mutation=True)
    def load_data(filepath):
        """Read the whole JSONL file into a list (cached by Streamlit)."""
        return list(srsly.read_jsonl(filepath))

    st.sidebar.title("Data visualizer")
    st.sidebar.markdown(
        "Visualize the annotations using [displaCy](https://spacy.io/usage/visualizers) "
        "and view stats about the datasets."
    )
    data_file = st.sidebar.selectbox("Dataset", FILES)
    data = load_data(data_file)
    n_no_ents = 0
    n_total_ents = 0
    accepted = 0
    rejected = 0

    st.header(f"Dataset: {data_file} ({len(data)})")
    st.markdown(
        HTML_WRAPPER1.format("Visualize only accepted examples and their annotations."),
        unsafe_allow_html=True,
    )
    for eg in data:
        # .get(): a record without an "answer" key is treated as not accepted
        # instead of aborting the whole page with a KeyError.
        if eg.get("answer") != "accept":
            rejected += 1
            continue
        accepted += 1
        row = {"text": eg["text"], "ents": eg.get("spans", [])}
        n_total_ents += len(row["ents"])
        if not row["ents"]:
            n_no_ents += 1
        # Collapse blank lines in the generated HTML before embedding it in
        # the markdown wrapper.
        html = displacy.render(row, **SETTINGS).replace("\n\n", "\n")
        st.markdown(HTML_WRAPPER.format(html), unsafe_allow_html=True)

    st.sidebar.markdown(
        f"""        
    | `{data_file}` | |
    | --- | ---: |
    | Total examples | {len(data):,} |
    | Accepted examples | {accepted:,} |
    | Rejected examples | {rejected:,} |
    | Total entities | {n_total_ents:,} |
    | Examples with no entities | {n_no_ents:,} |
    """, unsafe_allow_html=True
    )
 | |
| 
 | |
if __name__ == "__main__":
    # Command dispatch: the first CLI argument selects the function to run;
    # the remaining arguments are parsed by plac for that function.
    opts = {"visualize": visualize}
    if len(sys.argv) < 2:
        # Without this guard, sys.argv.pop(1) would raise a bare IndexError.
        msg.fail("No command provided", f"Available: {', '.join(opts)}", exits=1)
    # Remove the command name so plac only sees the command's own arguments.
    cmd = sys.argv.pop(1)
    if cmd not in opts:
        msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
    try:
        plac.call(opts[cmd])
    except KeyboardInterrupt:
        msg.warn("Stopped.", exits=1)