bolete / app.py
apjanco's picture
readme syntax
f62f59a
raw
history blame
1.53 kB
import tempfile
from pathlib import Path

import spacy
import srsly
import streamlit as st
import textract
from spacy.tokens import DocBin
# Page title for the Streamlit app.
st.title('Index and Search a Collection of Documents')
@st.cache
def download_model(select_model: str) -> bool:
    """Ensure the spaCy pipeline package ``select_model`` is installed.

    The function is decorated with ``st.cache`` so that Streamlit can reuse
    the result for a given model name across script reruns.

    Args:
        select_model: Name of the spaCy pipeline package to make available.

    Returns:
        ``True`` once the package is installed (either it already was, or
        ``spacy.cli.download`` has been called for it).
    """
    # The Streamlit cache is empty again after every app restart; without
    # this check the download would be triggered for packages that are
    # already installed in the environment.
    if spacy.util.is_package(select_model):
        return True

    with st.spinner(f'Loading model {select_model}'):
        spacy.cli.download(select_model)
    return True
# Container for processed documents.
# NOTE(review): nothing is added to doc_bin in this script yet - confirm
# whether each parsed `doc` is meant to be stored here.
doc_bin = DocBin()

# models.json is used as a mapping: its keys are offered as languages and
# models[language] is offered as the list of models for that language.
models = srsly.read_json('models.json')
models[''] = []  # require the user to choose a language
languages = list(models)
language = st.selectbox(
    "Language",
    languages,
    # The blank entry was appended last, so this preselects "no language".
    index=len(languages) - 1,
    help="Select the language of your materials.",
)

if language:
    select_model = st.selectbox("Model", models[language], help="spaCy model")
    if select_model:
        model_downloaded = download_model(select_model)
        if model_downloaded:
            nlp = spacy.load(select_model)
            nlp.max_length = 1200000
            uploaded_files = st.file_uploader(
                "Select files to process", accept_multiple_files=True
            )
            for uploaded_file in uploaded_files:
                # textract selects its parser from the file extension, so the
                # temporary copy keeps the extension of the uploaded file
                # instead of being created without one.
                suffix = Path(uploaded_file.name).suffix
                # The context manager closes (and thereby removes) the
                # temporary file once this upload has been handled.
                with tempfile.NamedTemporaryFile(suffix=suffix) as temp:
                    temp.write(uploaded_file.getvalue())
                    # textract reopens the file by path; flush so the bytes
                    # are on disk rather than still in the write buffer.
                    temp.flush()
                    try:
                        text = textract.process(temp.name)
                        text = text.decode('utf-8')
                        doc = nlp(text)
                        st.write(text)
                    except Exception as e:
                        # Per-file boundary: show the failure in the UI and
                        # continue with the remaining uploads.
                        st.error(e)
#st.download_button('Download', '', 'text/plain')