import streamlit as st

# Page-level settings, gathered in one mapping and handed to Streamlit in a
# single call before anything else is rendered.
page_settings = {
    "page_title": "About",
    "page_icon": "👋",
    "layout": "wide",
}
st.set_page_config(**page_settings)

# Main area: overview figure first, then the page heading.
st.image("cleaning_overview.png")
st.title("Document Image Cleaning for Black-Box OCR Engines")

# Sidebar: prompt rendered in a success-styled message box.
with st.sidebar:
    st.success("Select a page.")


# Introductory text for the page, rendered as Markdown.
#
# The literal is a raw string because the text contains `\%`, which is not a
# valid Python escape sequence: in a normal string literal it triggers an
# invalid-escape warning (and is slated to become an error). The raw prefix
# keeps the backslashes, so the text passed to Streamlit is unchanged.
st.markdown(
    r"""
    Black-Box open-source OCR engines ([Tesseract](https://github.com/tesseract-ocr/tesseract)) and commercial OCR APIs ([Google Vision API](https://cloud.google.com/vision/docs/ocr)) 
    are difficult to retrain with new data. We can [train a document image preprocessor](https://arxiv.org/abs/2105.07983)
    for black-box OCR engines by approximating the gradient of the black-box using a proxy model. However, the OCR engine
    needs to be queried for all samples which is computationally/financially [expensive](https://cloud.google.com/vision/pricing). Here, we show that the documents
    can be preprocessed using just 4% of the total OCR queries. 
    
    👈 Select  **Denoise** in the sidebar to see document preprocessing with 100\%, 8\% and 4\% OCR query budget. 
    """
)

    # Want to learn more?
    # - Check out [streamlit.io](https://streamlit.io)
    # - Jump into our [documentation](https://docs.streamlit.io)
    # - Ask a question in our [community
    #     forums](https://discuss.streamlit.io)
    # ### See more complex demos
    # - Use a neural net to [analyze the Udacity Self-driving Car Image
    #     Dataset](https://github.com/streamlit/demo-self-driving)
    # - Explore a [New York City rideshare dataset](https://github.com/streamlit/demo-uber-nyc-pickups)