File size: 1,526 Bytes
4c042d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import tempfile
from pathlib import Path

import spacy
import srsly
import streamlit as st
import textract
from spacy.tokens import DocBin

# Page heading for the Streamlit app.
st.title('Index and Search a Collection of Documents')

@st.cache
def download_model(select_model:str):
    """Download the spaCy model package named ``select_model``.

    A Streamlit spinner is displayed while ``spacy.cli.download`` runs.
    The function is decorated with ``st.cache``, so Streamlit can reuse the
    result for a model name it has already seen instead of downloading again.

    Args:
        select_model: Name of the spaCy model package to download.

    Returns:
        ``True`` once the download call has returned.
    """
    spinner_label = f'Loading model {select_model}'
    with st.spinner(spinner_label):
        spacy.cli.download(select_model)
    return True

# Main app flow: pick a language and a spaCy model, then extract and parse
# the text of every uploaded file.
doc_bin = DocBin()

# models.json is indexed by language; each entry is passed to the "Model"
# selectbox as its list of options.
models = srsly.read_json('models.json')
models[''] = []  # blank entry, preselected below, so the user must choose a language
languages = list(models)
language = st.selectbox(
    "Language",
    languages,
    index=len(languages) - 1,  # the blank entry added above
    help="Select the language of your materials.",
)
if language:
    select_model = st.selectbox("Model", models[language], help="spaCy model")
    if select_model:
        model_downloaded = download_model(select_model)

        if model_downloaded:
            nlp = spacy.load(select_model)
            nlp.max_length = 1200000

            uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)

            for uploaded_file in uploaded_files:
                # Keep the original extension on the temp file: textract
                # selects its parser from the file name's extension.
                suffix = Path(uploaded_file.name).suffix

                # The context manager closes (and thereby removes) the temp
                # file even when extraction or parsing fails.
                with tempfile.NamedTemporaryFile(suffix=suffix) as temp:
                    temp.write(uploaded_file.getvalue())
                    # textract re-opens the file by path, so buffered bytes
                    # must reach the disk before it reads.
                    temp.flush()
                    try:
                        text = textract.process(temp.name)
                        text = text.decode('utf-8')
                        # NOTE(review): the parsed Doc is not stored anywhere
                        # yet (doc_bin stays empty) -- confirm intended use.
                        doc = nlp(text)
                        st.write(text)
                    except Exception as e:
                        # Report the failure in the UI and continue with the
                        # remaining uploads.
                        st.error(e)

            #st.download_button('Download', '', 'text/plain')