|
|
import streamlit as st |
|
|
import textract |
|
|
import tempfile |
|
|
import spacy |
|
|
from spacy.tokens import DocBin |
|
|
import srsly |
|
|
|
|
|
# Page heading rendered at the top of the Streamlit app.
page_title = 'Index and Search a Collection of Documents'
st.title(page_title)
|
|
|
|
|
@st.cache
def download_model(select_model:str):
    """Download the spaCy model named ``select_model``.

    A Streamlit spinner naming the model is shown while
    ``spacy.cli.download`` runs. The function is wrapped in ``st.cache``.

    Args:
        select_model: Name of the spaCy model package to download.

    Returns:
        ``True`` once the download call has returned.
    """
    spinner_text = f'Loading model {select_model}'
    with st.spinner(spinner_text):
        spacy.cli.download(select_model)
        return True
|
|
|
|
|
# Container for processed spaCy docs.
# NOTE(review): nothing in this section adds to it yet; confirm intended use.
doc_bin = DocBin()

# models.json is used as a mapping: its keys are offered as languages and
# models[language] is offered as the list of model names for that language.
models = srsly.read_json('models.json')

# Add an empty entry and pre-select the last key, so that (assuming '' is not
# already a key in models.json) nothing is downloaded until a language is picked.
models[''] = []
languages = models.keys()
language = st.selectbox(
    "Language",
    languages,
    index=len(models) - 1,
    help="Select the language of your materials.",
)

if language:
    select_model = st.selectbox("Model", models[language], help="spaCy model")
    if select_model:
        model_downloaded = download_model(select_model)

        if model_downloaded:
            nlp = spacy.load(select_model)

            # Accept texts of up to 1.2M characters.
            nlp.max_length = 1200000

            uploaded_files = st.file_uploader(
                "Select files to process", accept_multiple_files=True
            )

            for uploaded_file in uploaded_files:
                file_type = uploaded_file.type

                # textract chooses its parser from the file extension, so the
                # temp file must carry the extension of the uploaded file.
                # Only a plain alphanumeric extension is accepted, because the
                # name comes from the client; anything else gets no suffix.
                dot_at = uploaded_file.name.rfind('.')
                suffix = uploaded_file.name[dot_at:] if dot_at > 0 else ''
                if not suffix[1:].isalnum():
                    suffix = ''

                # The context manager closes (and deletes) the temp file even
                # when extraction fails.
                # NOTE(review): reopening an open NamedTemporaryFile by name
                # does not work on Windows; confirm the deployment platform.
                with tempfile.NamedTemporaryFile(suffix=suffix) as temp:
                    temp.write(uploaded_file.getvalue())
                    # textract reopens the file by path, so push the buffered
                    # bytes to disk before handing over the name.
                    temp.flush()
                    try:
                        text = textract.process(temp.name)
                        text = text.decode('utf-8')
                        doc = nlp(text)
                        st.write(text)
                    except Exception as e:
                        # Report the failure for this file and carry on with
                        # the remaining uploads.
                        st.error(e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|