Commit a145e37
Parent(s): 94dfdfd
Commit message: req

Files changed:
- app.py +145 -77
- vouchervision/OCR_Florence_2.py +41 -15
- vouchervision/OCR_GPT4oMini.py +94 -0
- vouchervision/OCR_google_cloud_vision.py +63 -41
- vouchervision/VoucherVision_Config_Builder.py +6 -3
- vouchervision/general_utils.py +23 -6
- vouchervision/model_maps.py +34 -14
- vouchervision/utils_LLM.py +25 -9
- vouchervision/utils_VoucherVision.py +16 -3
- vouchervision/vouchervision_main.py +2 -2
app.py
CHANGED
@@ -218,10 +218,10 @@ if 'dir_images_local_TEMP' not in st.session_state:
     st.session_state['dir_images_local_TEMP'] = False
 if 'dir_uploaded_images' not in st.session_state:
     st.session_state['dir_uploaded_images'] = os.path.join(st.session_state.dir_home,'uploads')
-    validate_dir(st.session_state
+    validate_dir(os.path.join(st.session_state.dir_home,'uploads'))
 if 'dir_uploaded_images_small' not in st.session_state:
     st.session_state['dir_uploaded_images_small'] = os.path.join(st.session_state.dir_home,'uploads_small')
-    validate_dir(st.session_state
+    validate_dir(os.path.join(st.session_state.dir_home,'uploads_small'))



@@ -264,16 +264,18 @@ def handle_image_upload_and_gallery_hf(uploaded_files):

     ind_small = 0
     for uploaded_file in uploaded_files:
+
         if SAFE.check_for_inappropriate_content(uploaded_file):
             clear_image_uploads()
             report_violation(uploaded_file.name, is_hf=st.session_state['is_hf'])
             st.error("Warning: You uploaded an image that violates our terms of service.")
+            return True


         # Determine the file type
         if uploaded_file.name.lower().endswith('.pdf'):
             # Handle PDF files
-            file_path = save_uploaded_file(st.session_state['dir_uploaded_images'], uploaded_file
+            file_path = save_uploaded_file(st.session_state['dir_uploaded_images'], uploaded_file)
             # Convert each page of the PDF to an image
             n_pages = convert_pdf_to_jpg(file_path, st.session_state['dir_uploaded_images'], dpi=200)#st.session_state.config['leafmachine']['project']['dir_images_local'])
             # Update the input list for each page image
@@ -288,27 +290,22 @@ def handle_image_upload_and_gallery_hf(uploaded_files):
                 # Optionally, create a thumbnail for the gallery
                 img = Image.open(jpg_file_path)
                 img.thumbnail((GALLERY_IMAGE_SIZE, GALLERY_IMAGE_SIZE), Image.Resampling.LANCZOS)
-
+                try:
                 file_path_small = save_uploaded_file(st.session_state['dir_uploaded_images_small'], file_name, img)
-
+                except:
                 file_path_small = save_uploaded_file_local(st.session_state['dir_uploaded_images_small'],st.session_state['dir_uploaded_images_small'], file_name, img)
                 st.session_state['input_list_small'].append(file_path_small)

         else:
             ind_small += 1
             # Handle JPG/JPEG files (existing process)
-
-            # file_path = os.path.join(st.session_state['dir_uploaded_images'], uploaded_file.name)
-            image = Image.open(uploaded_file)
-            file_path = os.path.join(st.session_state['dir_uploaded_images'], uploaded_file.name)
-            image.save(file_path, "JPEG")
-
+            file_path = save_uploaded_file(st.session_state['dir_uploaded_images'], uploaded_file)
             st.session_state['input_list'].append(file_path)
-
-
-
-
-
+            if ind_small < MAX_GALLERY_IMAGES +5:
+                img = Image.open(file_path)
+                img.thumbnail((GALLERY_IMAGE_SIZE, GALLERY_IMAGE_SIZE), Image.Resampling.LANCZOS)
+                file_path_small = save_uploaded_file(st.session_state['dir_uploaded_images_small'], uploaded_file, img)
+                st.session_state['input_list_small'].append(file_path_small)

     # After processing all files
     st.session_state.config['leafmachine']['project']['dir_images_local'] = st.session_state['dir_uploaded_images']
@@ -396,7 +393,7 @@ def content_input_images(col_left, col_right):

     with col_right:
         if st.session_state.is_hf:
-            handle_image_upload_and_gallery_hf(uploaded_files)
+            result = handle_image_upload_and_gallery_hf(uploaded_files)

         else:
             st.session_state['view_local_gallery'] = st.toggle("View Image Gallery",)
@@ -1767,12 +1764,47 @@ def content_prompt_and_llm_version():
     st.page_link(os.path.join(os.path.dirname(__file__),"pages","prompt_builder.py"), label="Prompt Builder", icon="🚧")


-    st.header('LLM Version')
-    col_llm_1, col_llm_2 = st.columns([4,2])
+    # st.header('LLM Version')
+    # col_llm_1, col_llm_2 = st.columns([4,2])

+    # with col_llm_1:
+    #     GUI_MODEL_LIST = ModelMaps.get_models_gui_list()
+    #     st.session_state.config['leafmachine']['LLM_version'] = st.selectbox("LLM version", GUI_MODEL_LIST, index=GUI_MODEL_LIST.index(st.session_state.config['leafmachine'].get('LLM_version', ModelMaps.MODELS_GUI_DEFAULT)))
+
+
+    # Determine the default family based on the default model
+    default_model = ModelMaps.MODELS_GUI_DEFAULT
+    default_family = None
+    for family, models in ModelMaps.MODEL_FAMILY.items():
+        if default_model in models:
+            default_family = family
+            break
+
+    st.header("LLM Version")
+
+    col_llm_1, col_llm_2 = st.columns([4, 2])
     with col_llm_1:
-
-
+        # Step 1: Select Model Family with default family pre-selected
+        family_list = list(ModelMaps.MODEL_FAMILY.keys())
+        selected_family = st.selectbox("Select Model Family", family_list, index=family_list.index(default_family) if default_family else 0)
+
+        # Step 2: Display Models based on selected family
+        GUI_MODEL_LIST = ModelMaps.get_models_gui_list_family(selected_family)
+
+        # Ensure the selected model is part of the current family; if not, use the default of this family
+        selected_model_default = st.session_state.config['leafmachine'].get('LLM_version', default_model)
+        if selected_model_default not in GUI_MODEL_LIST:
+            selected_model_default = GUI_MODEL_LIST[0]
+
+        selected_model = st.selectbox("LLM version", GUI_MODEL_LIST, index=GUI_MODEL_LIST.index(selected_model_default))
+
+        # Update the session state with the selected model
+        st.session_state.config['leafmachine']['LLM_version'] = selected_model
+
+
+
+
+
     st.markdown("""
     Based on preliminary results, the following models perform the best. We are currently running tests of all possible OCR + LLM + Prompt combinations to create recipes for different workflows.
     - Any Mistral model e.g., `Mistral Large`
@@ -1815,25 +1847,43 @@ def content_api_check():



-def adjust_ocr_options_based_on_capability(capability_score):
-
-
-
-
-
-
-
-
-
-
-
-
+def adjust_ocr_options_based_on_capability(capability_score, model_name='llava'):
+    if model_name == 'llava':
+        llava_models_requirements = {
+            "liuhaotian/llava-v1.6-mistral-7b": {"full": 18, "4bit": 9},
+            "liuhaotian/llava-v1.6-34b": {"full": 70, "4bit": 25},
+            "liuhaotian/llava-v1.6-vicuna-13b": {"full": 33, "4bit": 15},
+            "liuhaotian/llava-v1.6-vicuna-7b": {"full": 20, "4bit": 10},
+        }
+        if capability_score == 'no_gpu':
+            return False
+        else:
+            capability_score_n = int(capability_score.split("_")[1].split("GB")[0])
+            supported_models = [model for model, reqs in llava_models_requirements.items()
+                                if reqs["full"] <= capability_score_n or reqs["4bit"] <= capability_score_n]
+
+            # If no models are supported, disable the LLaVA option
+            if not supported_models:
+                # Assuming the LLaVA option is the last in your list
+                return False # Indicate LLaVA is not supported
+            return True # Indicate LLaVA is supported
+    elif model_name == 'florence-2':
+        florence_models_requirements = {
+            "microsoft/Florence-2-large": {"full": 16,},
+            "microsoft/Florence-2-base": {"full": 12,},
+        }
+        if capability_score == 'no_gpu':
+            return False
+        else:
+            capability_score_n = int(capability_score.split("_")[1].split("GB")[0])
+            supported_models = [model for model, reqs in florence_models_requirements.items()
+                                if reqs["full"] <= capability_score_n]

-
-
-
-
-
+            # If no models are supported, disable the model option
+            if not supported_models:
+                # Assuming the model option is the last in your list
+                return False # Indicate model is not supported
+            return True # Indicate model is supported



@@ -1867,12 +1917,22 @@ def content_ocr_method():

     c1, c2 = st.columns([4,4])

-
-
-
-
-
-
+    with c2:
+        st.subheader("Local Methods")
+        st.write("Local methods are free, but require a capable GPU. ")
+        # Check if LLaVA models are supported based on capability score
+        llava_supported = adjust_ocr_options_based_on_capability(st.session_state.capability_score, model_name='llava')
+        florence_supported = adjust_ocr_options_based_on_capability(st.session_state.capability_score, model_name='florence-2')
+
+        if llava_supported:
+            st.success("LLaVA models are supported on this computer. A GPU with at least 12 GB of VRAM is available.")
+        else:
+            st.warning("LLaVA models are NOT supported on this computer. Requires a GPU with at least 12 GB of VRAM.")
+
+        if llava_supported:
+            st.success("Florence-2 models are supported on this computer. A GPU with at least 12 GB of VRAM is available.")
+        else:
+            st.warning("Florence-2 models are NOT supported on this computer. Requires a GPU with at least 12 GB of VRAM.")

     demo_text_h = f"Google_OCR_Handwriting:\nHERBARIUM OF MARCUS W. LYON , JR . Tracaulon sagittatum Indiana : Porter Co. incal Springs edge wet subdunal woods 1927 TX 11 Ilowers pink UNIVERSITE HERBARIUM MICH University of Michigan Herbarium 1439649 copyright reserved PERSICARIA FEB 2 6 1965 cm "
     demo_text_tr = f"trOCR:\nherbarium of marcus w. lyon jr. : : : tracaulon sagittatum indiana porter co. incal springs TX 11 Ilowers pink 1439649 copyright reserved D H U Q "
@@ -1882,7 +1942,7 @@ def content_ocr_method():
     demo_text_trh = demo_text_h + '\n' + demo_text_tr
     demo_text_trp = demo_text_p + '\n' + demo_text_tr

-    options = ["Google Vision Handwritten", "Google Vision Printed", "CRAFT + trOCR","LLaVA",
+    options = ["Google Vision Handwritten", "Google Vision Printed", "Florence-2", "GPT-4o-mini", "CRAFT + trOCR","LLaVA", ]
     options_llava = ["llava-v1.6-mistral-7b", "llava-v1.6-34b", "llava-v1.6-vicuna-13b", "llava-v1.6-vicuna-7b",]
     options_llava_bit = ["full", "4bit",]
     captions_llava = [
@@ -1905,7 +1965,7 @@ def content_ocr_method():
     default_index_llava_bit = 0
     with c1:
         st.subheader("API Methods (Google Vision)")
-        st.write("Using APIs for OCR allows VoucherVision to run on most computers.")
+        st.write("Using APIs for OCR allows VoucherVision to run on most computers. You can use multiple OCR engines simultaneously.")

         st.session_state.config['leafmachine']['project']['double_OCR'] = st.checkbox(label="Send 2 copies of the OCR to the LLM",
                                                                                       help="This can help the LLMs focus attention on the OCR and not get lost in the longer instruction text",
@@ -1934,6 +1994,7 @@ def content_ocr_method():
         "CRAFT + trOCR": 'CRAFT',
         "LLaVA": 'LLaVA',
         "Florence-2": 'Florence-2',
+        "GPT-4o-mini": "GPT-4o-mini",
     }

     # Map selected options to their corresponding internal representations
@@ -1943,45 +2004,52 @@ def content_ocr_method():
     st.session_state.config['leafmachine']['project']['OCR_option'] = selected_OCR_options


-    with c2:
-        st.subheader("Local Methods")
-        st.write("Local methods are free, but require a capable GPU. ")
-

-
+
     if 'CRAFT' in selected_OCR_options:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        st.subheader('Options for :blue[CRAFT + trOCR]')
+        st.write("Supplement Google Vision OCR with :blue[trOCR] (handwriting OCR) using `microsoft/trocr-base-handwritten`. This option requires Google Vision API and a GPU.")
+        if 'CRAFT' in selected_OCR_options:
+            do_use_trOCR = st.checkbox("Enable :blue[trOCR]", value=True, key="Enable trOCR1",disabled=True)#,disabled=st.session_state['lacks_GPU'])
+        else:
+            do_use_trOCR = st.checkbox("Enable :blue[trOCR]", value=st.session_state.config['leafmachine']['project']['do_use_trOCR'],key="Enable trOCR2")#,disabled=st.session_state['lacks_GPU'])
+        st.session_state.config['leafmachine']['project']['do_use_trOCR'] = do_use_trOCR
+
+        if do_use_trOCR:
+            # st.session_state.config['leafmachine']['project']['trOCR_model_path'] = "microsoft/trocr-large-handwritten"
+            default_trOCR_model_path = st.session_state.config['leafmachine']['project']['trOCR_model_path']
+            user_input_trOCR_model_path = st.text_input(":blue[trOCR] Hugging Face model path. MUST be a fine-tuned version of 'microsoft/trocr-base-handwritten' or 'microsoft/trocr-large-handwritten', or a microsoft :blue[trOCR] model.", value=default_trOCR_model_path)
+            if st.session_state.config['leafmachine']['project']['trOCR_model_path'] != user_input_trOCR_model_path:
+                is_valid_mp = is_valid_huggingface_model_path(user_input_trOCR_model_path)
+                if not is_valid_mp:
+                    st.error(f"The Hugging Face model path {user_input_trOCR_model_path} is not valid. Please revise.")
+                else:
+                    st.session_state.config['leafmachine']['project']['trOCR_model_path'] = user_input_trOCR_model_path


     if "Florence-2" in selected_OCR_options:
+        st.subheader('Options for :green[Florence-2]')
         default_florence_model_path = st.session_state.config['leafmachine']['project']['florence_model_path']
-        user_input_florence_model_path = st.text_input("Florence-2 Hugging Face model path. MUST be a Florence-2 version based on 'microsoft/Florence-2-large' or similar.", value=default_florence_model_path)

-
-
-
-
-
-
+        st.session_state.config['leafmachine']['project']['florence_model_path'] = st.radio(
+            "Select :green[Florence-2] version.",
+            ["microsoft/Florence-2-large", "microsoft/Florence-2-base", ],
+            captions=["'large' requires at least 16GB of VRAM", "'base' requires 12GB of VRAM."])
+
+    if "GPT-4o-mini" in selected_OCR_options:
+        st.subheader('Options for :violet[GPT-4o-mini]')
+        default_resolution = st.session_state.config['leafmachine']['project']['OCR_GPT_4o_mini_resolution']
+
+        st.session_state.config['leafmachine']['project']['OCR_GPT_4o_mini_resolution'] = st.radio(
+            "Select level of detail for :violet[GPT-4o-mini] OCR. We only recommend 'high' detail in most scenarios.",
+            ["high", "low", ],
+            captions=["$0.50 per 1,000", "\$5 - \$10 per 1,000"])


     if 'LLaVA' in selected_OCR_options:
+        st.subheader('Options for :red[LLaVA]')
         OCR_option_llava = st.radio(
-            "Select the LLaVA version",
+            "Select the :red[LLaVA] version",
             options_llava,
             index=default_index_llava,
             help="",captions=captions_llava,
@@ -1989,12 +2057,13 @@ def content_ocr_method():
         st.session_state.config['leafmachine']['project']['OCR_option_llava'] = OCR_option_llava

         OCR_option_llava_bit = st.radio(
-            "Select the LLaVA quantization level",
+            "Select the :red[LLaVA] quantization level",
             options_llava_bit,
             index=default_index_llava_bit,
             help="",captions=captions_llava_bit,
         )
         st.session_state.config['leafmachine']['project']['OCR_option_llava_bit'] = OCR_option_llava_bit
+        st.write('---')



@@ -2045,7 +2114,6 @@ def show_ocr():
     # st.image(st.session_state["demo_overlay"], caption='OCR Overlay Images', output_format = "JPEG")

 def content_collage_overlay():
-    st.markdown("---")
     col_collage, col_overlay = st.columns([4,4])

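Note on the new adjust_ocr_options_based_on_capability helper: it gates the local OCR options (LLaVA, Florence-2) on a GPU capability string and per-model VRAM requirements. A minimal sketch of the parsing and check, assuming the capability strings look like 'no_gpu' or 'GPU_16GB' (the diff only shows them being split on '_' and 'GB'):

# Hedged sketch; the 'GPU_16GB' format of st.session_state.capability_score is an assumption.
def parse_capability_gb(capability_score: str) -> int:
    # Mirrors the parsing in the diff: "GPU_16GB" -> 16
    return int(capability_score.split("_")[1].split("GB")[0])

llava_requirements = {"full": 18, "4bit": 9}  # VRAM in GB, values copied from the diff

score = "GPU_16GB"  # assumed example value
if score == "no_gpu":
    supported = False
else:
    vram = parse_capability_gb(score)
    supported = llava_requirements["full"] <= vram or llava_requirements["4bit"] <= vram

print(supported)  # True: 16 GB is enough for the 4-bit variant of llava-v1.6-mistral-7b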
vouchervision/OCR_Florence_2.py
CHANGED
@@ -6,12 +6,18 @@ import matplotlib.patches as patches
 from PIL import Image, ImageDraw, ImageFont
 import numpy as np
 import warnings
-from transformers import AutoProcessor, AutoModelForCausalLM, AutoTokenizer
-
+from transformers import AutoProcessor, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+try:
+    from vouchervision.utils_LLM import SystemLoadMonitor
+except:
+    from utils_LLM import SystemLoadMonitor
+

 warnings.filterwarnings("ignore", category=UserWarning, message="TypedStorage is deprecated")

 class FlorenceOCR:
+    # def __init__(self, logger, model_id='microsoft/Florence-2-base'):
     def __init__(self, logger, model_id='microsoft/Florence-2-large'):
         self.MAX_TOKENS = 1024
         self.logger = logger
@@ -25,7 +31,15 @@ class FlorenceOCR:
         # self.model_id_clean = "mistralai/Mistral-7B-v0.3"
         self.model_id_clean = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
         self.tokenizer_clean = AutoTokenizer.from_pretrained(self.model_id_clean)
-
+        # Configuring the BitsAndBytesConfig for quantization
+        quant_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            quant_method="bnb",
+        )
+        self.model_clean = AutoModelForCausalLM.from_pretrained(
+            self.model_id_clean,
+            quantization_config=quant_config,
+            low_cpu_mem_usage=True,)


     def ocr_florence(self, image, task_prompt='<OCR>', text_input=None):
@@ -54,34 +68,46 @@ class FlorenceOCR:
             num_beams=3,
         )
         generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
-
+        parsed_answer_dict = self.processor.post_process_generation(
             generated_text,
             task=task_prompt,
             image_size=(image.width, image.height)
         )

-
-
+        parsed_answer_text = parsed_answer_dict[task_prompt]
+
+        # Prepare input for the second model
+        inputs_clean = self.tokenizer_clean(
+            f"Insert spaces into this text to make all the words valid. This text contains scientific names of plants, locations, habitat, coordinate words: {parsed_answer_text}",
+            return_tensors="pt"
+        )
+        inputs_clean = {key: value.to(self.model_clean.device) for key, value in inputs_clean.items()}
+
+        outputs_clean = self.model_clean.generate(**inputs_clean, max_new_tokens=self.MAX_TOKENS)
+        text_with_spaces = self.tokenizer_clean.decode(outputs_clean[0], skip_special_tokens=True)
+
+        # Extract only the LLM response from the decoded text
+        response_start = text_with_spaces.find(parsed_answer_text)
+        if response_start != -1:
+            text_with_spaces = text_with_spaces[response_start + len(parsed_answer_text):].strip()

-
-        parsed_answer = self.tokenizer_clean.decode(outputs[0], skip_special_tokens=True)
-        print(parsed_answer_dirty)
-        print(parsed_answer)
+        print(text_with_spaces)

         self.monitor.stop_inference_timer() # Starts tool timer too
         usage_report = self.monitor.stop_monitoring_report_usage()

-        return
+        return text_with_spaces, parsed_answer_text, parsed_answer_dict, usage_report


 def main():
-    img_path = '/home/brlab/Downloads/gem_2024_06_26__02-26-02/Cropped_Images/By_Class/label/1.jpg'
-
+    # img_path = '/home/brlab/Downloads/gem_2024_06_26__02-26-02/Cropped_Images/By_Class/label/1.jpg'
+    img_path = 'D:/D_Desktop/BR_1839468565_Ochnaceae_Campylospermum_reticulatum_label.jpg'

     image = Image.open(img_path)

-    ocr = FlorenceOCR(logger = None)
-
+    # ocr = FlorenceOCR(logger = None, model_id='microsoft/Florence-2-base')
+    ocr = FlorenceOCR(logger = None, model_id='microsoft/Florence-2-large')
+    results_text, results_all, results_dirty, usage_report = ocr.ocr_florence(image, task_prompt='<OCR>', text_input=None)
     print(results_text)

 if __name__ == '__main__':
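Usage note: after this change ocr_florence() runs a two-stage pipeline, Florence-2 produces the raw OCR string and the 4-bit Mistral model re-inserts word spacing, and it returns four values. A hedged usage sketch (the import path and image path are assumptions; a CUDA-capable GPU is required for the quantized cleanup model):

from PIL import Image
from vouchervision.OCR_Florence_2 import FlorenceOCR  # assumed module path

ocr = FlorenceOCR(logger=None, model_id='microsoft/Florence-2-large')
image = Image.open('label.jpg')  # placeholder path

# Return order per the new code: cleaned text (spaces re-inserted by Mistral),
# raw Florence-2 OCR text, the full post-processed dict, and a usage report.
text_clean, text_raw, result_dict, usage_report = ocr.ocr_florence(image, task_prompt='<OCR>')
print(text_clean)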
vouchervision/OCR_GPT4oMini.py
ADDED
@@ -0,0 +1,94 @@
+import os, base64, requests, yaml
+from PIL import Image
+from openai import OpenAI
+
+from general_utils import calculate_cost
+
+# PROMPT = """Please perform OCR on this scientific image and extract the printed and handwritten text verbatim. Do not explain your answer, only return the verbatim text in this JSON dictionary format: {'printed_text': '', 'handwritten_text': ''}"""
+PROMPT = """Please perform OCR on this scientific image and extract all of the words and text verbatim. Do not explain your answer, only return the verbatim text:"""
+
+class GPT4oMiniOCR:
+    def __init__(self, api_key):
+        self.api_key = api_key
+        self.path_api_cost = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'api_cost', 'api_cost.yaml')
+
+
+    def encode_image(self, image_path):
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode('utf-8')
+
+    def ocr_gpt4o(self, image_path, resolution="low", max_tokens=512):
+        # Getting the base64 string
+        base64_image = self.encode_image(image_path)
+
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}"
+        }
+
+        payload = {
+            "model": "gpt-4o-mini",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": PROMPT,
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{base64_image}",
+                                "detail": resolution,
+                            }
+                        }
+                    ]
+                }
+            ],
+            "max_tokens": max_tokens
+        }
+
+        response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
+        response_json = response.json()
+
+        if "choices" in response_json :
+            parsed_answer = response_json["choices"][0]["message"]["content"]
+        else:
+            parsed_answer = None
+
+        usage_report = response_json.get('usage', {})
+        tokens_in = usage_report["prompt_tokens"]
+        tokens_out = usage_report["completion_tokens"]
+
+        total_cost = calculate_cost('GPT_4o_mini_2024_07_18', self.path_api_cost, tokens_in, tokens_out)
+        cost_in, cost_out, total_cost, rates_in, rates_out = total_cost
+
+        return parsed_answer, cost_in, cost_out, total_cost, rates_in, rates_out, tokens_in, tokens_out
+
+
+
+
+def main():
+    # img_path = '/home/brlab/Downloads/gem_2024_06_26__02-26-02/Cropped_Images/By_Class/label/1.jpg'
+    img_path = 'D:/D_Desktop/BR_1839468565_Ochnaceae_Campylospermum_reticulatum_label.jpg'
+
+    # $env:OPENAI_API_KEY="KEY"
+    API_KEY = "sk-proj-DxHlMH1H6jZzs8V12qbLT3BlbkFJIJnAVzt4kquOfhGURGW0"
+
+
+    ocr = GPT4oMiniOCR(API_KEY)
+
+    parsed_answer, cost_in, cost_out, total_cost, rates_in, rates_out, tokens_in, tokens_out = ocr.ocr_gpt4o(img_path, resolution="low", max_tokens=512)
+    print(f"Parsed Answer: {parsed_answer}")
+    print(f"Total Cost: {total_cost}")
+
+    parsed_answer, cost_in, cost_out, total_cost, rates_in, rates_out, tokens_in, tokens_out = ocr.ocr_gpt4o(img_path, resolution="high", max_tokens=512)
+    print(f"Parsed Answer: {parsed_answer}")
+    print(f"Total Cost: {total_cost}")
+
+
+
+
+if __name__ == '__main__':
+    main()
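Usage note: GPT4oMiniOCR base64-encodes the image and posts it to the OpenAI Chat Completions endpoint with the detail field set to the selected resolution. The main() above hard-codes a key for testing; in the pipeline the key is read from the environment (see init_gpt_4o_mini() in OCR_google_cloud_vision.py). A minimal sketch of that environment-based usage (the import path and image path are assumptions):

import os
from vouchervision.OCR_GPT4oMini import GPT4oMiniOCR  # assumed module path

ocr = GPT4oMiniOCR(api_key=os.getenv('OPENAI_API_KEY'))  # same pattern as init_gpt_4o_mini()

text, cost_in, cost_out, total_cost, rate_in, rate_out, tokens_in, tokens_out = ocr.ocr_gpt4o(
    'label.jpg',        # placeholder image path
    resolution='high',  # 'high' or 'low', matching the new GUI option
    max_tokens=512,
)
print(text, total_cost)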
vouchervision/OCR_google_cloud_vision.py
CHANGED
@@ -8,6 +8,7 @@ import colorsys
 from tqdm import tqdm
 from google.oauth2 import service_account
 from OCR_Florence_2 import FlorenceOCR
+from OCR_GPT4oMini import GPT4oMiniOCR
 ### LLaVA should only be installed if the user will actually use it.
 ### It requires the most recent pytorch/Python and can mess with older systems

@@ -56,6 +57,11 @@ class OCREngine:

         self.OCR_JSON_to_file = {}

+        # for paid vLM OCR like GPT-vision
+        self.cost = 0.0
+        self.tokens_in = 0
+        self.tokens_out = 0
+
         self.hand_cleaned_text = None
         self.hand_organized_text = None
         self.hand_bounds = None
@@ -84,6 +90,7 @@ class OCREngine:
         self.trOCR_characters = None
         self.set_client()
         self.init_florence()
+        self.init_gpt_4o_mini()
         self.init_craft()

         self.multimodal_prompt = """I need you to transcribe all of the text in this image.
@@ -125,6 +132,10 @@ class OCREngine:
         if 'Florence-2' in self.OCR_option:
             self.Florence = FlorenceOCR(logger=self.logger, model_id=self.cfg['leafmachine']['project']['florence_model_path'])

+    def init_gpt_4o_mini(self):
+        if 'GPT-4o-mini' in self.OCR_option:
+            self.GPTmini = GPT4oMiniOCR(api_key = os.getenv('OPENAI_API_KEY'))
+
     def init_llava(self):
         if 'LLaVA' in self.OCR_option:
             from vouchervision.OCR_llava import OCRllava
@@ -701,7 +712,7 @@ class OCREngine:

         if 'LLaVA' in self.OCR_option: # This option does not produce an OCR helper image
             if self.json_report:
-                self.json_report.set_text(text_main=f'Working on LLaVA {self.Llava.model_path}
+                self.json_report.set_text(text_main=f'Working on LLaVA {self.Llava.model_path} OCR :construction:')

             image, json_output, direct_output, str_output, usage_report = self.Llava.transcribe_image(self.path, self.multimodal_prompt)
             self.logger.info(f"LLaVA Usage Report for Model {self.Llava.model_path}:\n{usage_report}")
@@ -716,7 +727,7 @@ class OCREngine:

         if 'Florence-2' in self.OCR_option: # This option does not produce an OCR helper image
             if self.json_report:
-                self.json_report.set_text(text_main=f'Working on Florence-2 [{self.Florence.model_id}]
+                self.json_report.set_text(text_main=f'Working on Florence-2 [{self.Florence.model_id}] OCR :construction:')

             self.logger.info(f"Florence-2 Usage Report for Model [{self.Florence.model_id}]")
             results_text, results_text_dirty, results, usage_report = self.Florence.ocr_florence(self.path, task_prompt='<OCR>', text_input=None)
@@ -728,6 +739,21 @@ class OCREngine:
             else:
                 self.OCR = self.OCR + f"\nFlorence-2 OCR:\n{results_text}"

+        if 'GPT-4o-mini' in self.OCR_option: # This option does not produce an OCR helper image
+            if self.json_report:
+                self.json_report.set_text(text_main=f'Working on GPT-4o-mini OCR :construction:')
+
+            self.logger.info(f"GPT-4o-mini Usage Report")
+            results_text, cost_in, cost_out, total_cost, rates_in, rates_out, self.tokens_in, self.tokens_out = self.GPTmini.ocr_gpt4o(self.path, resolution=self.cfg['leafmachine']['project']['OCR_GPT_4o_mini_resolution'], max_tokens=512)
+            self.cost += total_cost
+
+            self.OCR_JSON_to_file['OCR_GPT_4o_mini'] = results_text
+
+            if self.double_OCR:
+                self.OCR = self.OCR + f"\nGPT-4o-mini OCR:\n{results_text}" + f"\nGPT-4o-mini OCR:\n{results_text}"
+            else:
+                self.OCR = self.OCR + f"\nGPT-4o-mini OCR:\n{results_text}"
+
         if 'normal' in self.OCR_option or 'hand' in self.OCR_option:
             if 'normal' in self.OCR_option:
                 if self.double_OCR:
@@ -824,48 +850,44 @@ class SafetyCheck():
         else:
             self.client = vision.ImageAnnotatorClient(credentials=self.get_google_credentials())

+
     def get_google_credentials(self):
         creds_json_str = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
         credentials = service_account.Credentials.from_service_account_info(json.loads(creds_json_str))
         return credentials

     def check_for_inappropriate_content(self, file_stream):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        #
-
-
-
-
-
-
-
-
-
-        print("Found NO violation")
-        return False  # The image is considered safe.
-    except:
-        return False  # The image is considered safe. TEMPOROARY FIX TODO
+        LEVEL = 2
+        content = file_stream.read()
+        image = vision.Image(content=content)
+        response = self.client.safe_search_detection(image=image)
+        safe = response.safe_search_annotation
+
+        likelihood_name = (
+            "UNKNOWN",
+            "VERY_UNLIKELY",
+            "UNLIKELY",
+            "POSSIBLE",
+            "LIKELY",
+            "VERY_LIKELY",
+        )
+        print("Safe search:")
+
+        print(f" adult*: {likelihood_name[safe.adult]}")
+        print(f" medical*: {likelihood_name[safe.medical]}")
+        print(f" spoofed: {likelihood_name[safe.spoof]}")
+        print(f" violence*: {likelihood_name[safe.violence]}")
+        print(f" racy: {likelihood_name[safe.racy]}")
+
+        # Check the levels of adult, violence, racy, etc. content.
+        if (safe.adult > LEVEL or
+            safe.medical > LEVEL or
+            # safe.spoof > LEVEL or
+            safe.violence > LEVEL #or
+            # safe.racy > LEVEL
+            ):
+            print("Found violation")
+            return True  # The image violates safe search guidelines.
+
+        print("Found NO violation")
+        return False  # The image is considered safe.
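Note on the restored check_for_inappropriate_content(): the SafeSearch scores are enum values ordered like the likelihood_name tuple, so LEVEL = 2 ("UNLIKELY") means any adult, medical, or violence rating of "POSSIBLE" or higher flags the image. A small illustration of that threshold logic (the score values below are made up):

likelihood_name = ("UNKNOWN", "VERY_UNLIKELY", "UNLIKELY", "POSSIBLE", "LIKELY", "VERY_LIKELY")
LEVEL = 2  # index of "UNLIKELY"

adult, medical, violence = 3, 1, 0  # hypothetical SafeSearch scores (indices into the tuple above)
flagged = adult > LEVEL or medical > LEVEL or violence > LEVEL
print(flagged)  # True: a "POSSIBLE" adult rating exceeds the "UNLIKELY" threshold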
vouchervision/VoucherVision_Config_Builder.py
CHANGED
@@ -42,6 +42,7 @@ def build_VV_config(loaded_cfg=None):
     OCR_option = 'hand'
     OCR_option_llava = 'llava-v1.6-mistral-7b' # "llava-v1.6-mistral-7b", "llava-v1.6-34b", "llava-v1.6-vicuna-13b", "llava-v1.6-vicuna-7b",
     OCR_option_llava_bit = 'full' # full or 4bit
+    OCR_GPT_4o_mini_resolution = 'high'
     double_OCR = False

     tool_GEO = True
@@ -73,7 +74,7 @@ def build_VV_config(loaded_cfg=None):
            prefix_removal,suffix_removal,catalog_numerical_only,LLM_version_user,batch_size,num_workers,
            path_domain_knowledge,embeddings_database_name,use_LeafMachine2_collage_images,
            prompt_version, do_create_OCR_helper_image, do_use_trOCR, do_use_florence, trOCR_model_path, florence_model_path, OCR_option, OCR_option_llava,
-            OCR_option_llava_bit, double_OCR, save_cropped_annotations,
+            OCR_option_llava_bit, OCR_GPT_4o_mini_resolution, double_OCR, save_cropped_annotations,
            tool_GEO, tool_WFO, tool_wikipedia,
            check_for_illegal_filenames, skip_vertical, pdf_conversion_dpi, use_domain_knowledge=False)
     else:
@@ -95,6 +96,7 @@ def build_VV_config(loaded_cfg=None):
         OCR_option = loaded_cfg['leafmachine']['project']['OCR_option']
         OCR_option_llava = loaded_cfg['leafmachine']['project']['OCR_option_llava']
         OCR_option_llava_bit = loaded_cfg['leafmachine']['project']['OCR_option_llava_bit']
+        OCR_GPT_4o_mini_resolution = loaded_cfg['leafmachine']['project']['OCR_GPT_4o_mini_resolution']
         double_OCR = loaded_cfg['leafmachine']['project']['double_OCR']

         tool_GEO = loaded_cfg['leafmachine']['project']['tool_GEO']
@@ -122,7 +124,7 @@ def build_VV_config(loaded_cfg=None):
            prefix_removal,suffix_removal,catalog_numerical_only,LLM_version_user,batch_size,num_workers,
            path_domain_knowledge,embeddings_database_name,use_LeafMachine2_collage_images,
            prompt_version, do_create_OCR_helper_image, do_use_trOCR, do_use_florence, trOCR_model_path, florence_model_path, OCR_option, OCR_option_llava,
-            OCR_option_llava_bit, double_OCR, save_cropped_annotations,
+            OCR_option_llava_bit, OCR_GPT_4o_mini_resolution, double_OCR, save_cropped_annotations,
            tool_GEO, tool_WFO, tool_wikipedia,
            check_for_illegal_filenames, skip_vertical, pdf_conversion_dpi, use_domain_knowledge=False)

@@ -131,7 +133,7 @@ def assemble_config(dir_home, run_name, dir_images_local,dir_output,
        prefix_removal,suffix_removal,catalog_numerical_only,LLM_version_user,batch_size,num_workers,
        path_domain_knowledge,embeddings_database_name,use_LeafMachine2_collage_images,
        prompt_version, do_create_OCR_helper_image_user, do_use_trOCR, do_use_florence, trOCR_model_path, florence_model_path, OCR_option, OCR_option_llava,
-        OCR_option_llava_bit, double_OCR, save_cropped_annotations,
+        OCR_option_llava_bit, OCR_GPT_4o_mini_resolution, double_OCR, save_cropped_annotations,
        tool_GEO, tool_WFO, tool_wikipedia,
        check_for_illegal_filenames, skip_vertical, pdf_conversion_dpi, use_domain_knowledge=False):

@@ -183,6 +185,7 @@ def assemble_config(dir_home, run_name, dir_images_local,dir_output,
         'OCR_option': OCR_option,
         'OCR_option_llava': OCR_option_llava,
         'OCR_option_llava_bit': OCR_option_llava_bit,
+        'OCR_GPT_4o_mini_resolution': OCR_GPT_4o_mini_resolution,
         'double_OCR': double_OCR,
         'pdf_conversion_dpi': pdf_conversion_dpi,
         'tool_GEO': tool_GEO,
vouchervision/general_utils.py
CHANGED
@@ -10,7 +10,11 @@ import concurrent.futures
 from time import perf_counter
 import torch

-
+try:
+    from vouchervision.model_maps import ModelMaps
+except:
+    from model_maps import ModelMaps
+

 '''
 TIFF --> DNG
@@ -65,12 +69,12 @@ def add_to_expense_report(dir_home, data):

         # If the file does not exist, write the header first
         if not file_exists:
-            writer.writerow(['run','date','api_version','total_cost', 'n_images', 'tokens_in', 'tokens_out', 'rate_in', 'rate_out', 'cost_in', 'cost_out',])
+            writer.writerow(['run','date','api_version','total_cost', 'n_images', 'tokens_in', 'tokens_out', 'rate_in', 'rate_out', 'cost_in', 'cost_out','ocr_cost','ocr_tokens_in', 'ocr_tokens_out',])

         # Write the data row
         writer.writerow(data)

-def save_token_info_as_csv(Dirs, LLM_version0, path_api_cost, total_tokens_in, total_tokens_out, n_images, dir_home, logger):
+def save_token_info_as_csv(Dirs, LLM_version0, path_api_cost, total_tokens_in, total_tokens_out, OCR_cost, OCR_tokens_in, OCR_tokens_out, n_images, dir_home, logger):
     if path_api_cost:
         LLM_version = ModelMaps.get_version_mapping_cost(LLM_version0)

@@ -78,16 +82,18 @@ def save_token_info_as_csv(Dirs, LLM_version0, path_api_cost, total_tokens_in, t
         csv_file_path = os.path.join(Dirs.path_cost, Dirs.run_name + '.csv')

         cost_in, cost_out, total_cost, rate_in, rate_out = calculate_cost(LLM_version, path_api_cost, total_tokens_in, total_tokens_out)
+
+        total_cost += OCR_cost

         # The data to be written to the CSV file
-        data = [Dirs.run_name, get_datetime(),LLM_version, total_cost, n_images, total_tokens_in, total_tokens_out, rate_in, rate_out, cost_in, cost_out,]
+        data = [Dirs.run_name, get_datetime(),LLM_version, total_cost, n_images, total_tokens_in, total_tokens_out, rate_in, rate_out, cost_in, cost_out,OCR_cost, OCR_tokens_in, OCR_tokens_out,]

         # Open the file in write mode
         with open(csv_file_path, mode='w', newline='') as file:
             writer = csv.writer(file)

             # Write the header
-            writer.writerow(['run','date','api_version','total_cost', 'n_images', 'tokens_in', 'tokens_out', 'rate_in', 'rate_out', 'cost_in', 'cost_out',])
+            writer.writerow(['run','date','api_version','total_cost', 'n_images', 'tokens_in', 'tokens_out', 'rate_in', 'rate_out', 'cost_in', 'cost_out','ocr_cost','ocr_tokens_in', 'ocr_tokens_out'])

             # Write the data
             writer.writerow(data)
@@ -119,6 +125,11 @@ def summarize_expense_report(path_expense_report):
     cost_in_sum = 0
     cost_out_sum = 0
     n_images_sum = 0
+    # ,'ocr_cost','ocr_tokens_in', 'ocr_tokens_out'
+    ocr_cost_sum = 0
+    ocr_tokens_in_sum = 0
+    ocr_tokens_out_sum = 0
+
     api_version_counts = Counter()

     # Try to read the CSV file into a DataFrame
@@ -128,7 +139,7 @@
         # Process each row in the DataFrame
         for index, row in df.iterrows():
             run_count += 1
-            total_cost_sum += row['total_cost']
+            total_cost_sum += row['total_cost'] + row['ocr_cost']
             tokens_in_sum += row['tokens_in']
             tokens_out_sum += row['tokens_out']
             rate_in_sum += row['rate_in']
@@ -136,6 +147,9 @@
             cost_in_sum += row['cost_in']
             cost_out_sum += row['cost_out']
             n_images_sum += row['n_images']
+            ocr_cost_sum += row['ocr_cost']
+            ocr_tokens_in_sum += row['ocr_tokens_in']
+            ocr_tokens_out_sum += row['ocr_tokens_out']
             api_version_counts[row['api_version']] += 1

     except FileNotFoundError:
@@ -163,6 +177,9 @@
         'rate_out_sum': rate_out_sum,
         'cost_in_sum': cost_in_sum,
         'cost_out_sum': cost_out_sum,
+        'ocr_cost_sum': ocr_cost_sum,
+        'ocr_tokens_in_sum': ocr_tokens_in_sum,
+        'ocr_tokens_out_sum': ocr_tokens_out_sum,
         'n_images_sum':n_images_sum,
         'api_version_percentages': api_version_percentages,
         'cost_per_image': cost_per_image_dict
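Note on the expense-report changes: save_token_info_as_csv() now takes OCR_cost, OCR_tokens_in, and OCR_tokens_out, folds the OCR cost into total_cost, and writes three new CSV columns, which summarize_expense_report() then accumulates. A small sketch of how one report row is summed after this change (the numbers are made up):

row = {
    'total_cost': 0.042,    # total_cost written by save_token_info_as_csv (already includes OCR_cost)
    'ocr_cost': 0.0125,     # new column
    'ocr_tokens_in': 900,   # new column
    'ocr_tokens_out': 300,  # new column
}
total_cost_sum = row['total_cost'] + row['ocr_cost']  # mirrors the new accumulation in summarize_expense_report()
print(total_cost_sum)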
vouchervision/model_maps.py
CHANGED
@@ -40,23 +40,27 @@ class ModelMaps:
        'phyloforfun/mistral-7b-instruct-v2-bnb-4bit__HLT_MICH_Angiospermae_SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05': '#bababa', # Gray
    }

-    MODELS_OPENAI = [
        # "Azure GPT 4 32k",
        # "Azure GPT 4 Turbo 0125-preview",
        # "Azure GPT 4 Turbo 1106-preview",
        # "Azure GPT 3.5 Turbo",
        # "Azure GPT 3.5 Instruct",

    MODELS_GOOGLE = [
        # "PaLM 2 text-bison@001",

@@ -79,7 +83,14 @@ class ModelMaps:
        "LOCAL CPU Mistral 7B Instruct v0.2 GGUF",
        'phyloforfun/mistral-7b-instruct-v2-bnb-4bit__HLT_MICH_Angiospermae_SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05']

-    MODELS_GUI_DEFAULT = "Azure GPT 4" # "GPT 4 Turbo 1106-preview"

    version_mapping_cost = {
        'GPT 4 32k': 'GPT_4_32K',

@@ -316,7 +327,16 @@ class ModelMaps:

    @classmethod
    def get_models_gui_list(cls):
-        return cls.MODELS_LOCAL + cls.MODELS_GOOGLE + cls.MODELS_OPENAI + cls.MODELS_MISTRAL

    @classmethod
    def get_version_mapping_cost(cls, key):
        'phyloforfun/mistral-7b-instruct-v2-bnb-4bit__HLT_MICH_Angiospermae_SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05': '#bababa', # Gray
    }

+    MODELS_OPENAI = [
+        "GPT 4o 2024-05-13", #GPT_4o_2024_05_13
+        "GPT 4o mini 2024-07-18",
+        "GPT 4 Turbo 2024-04-09", #GPT_4_TURBO_2024_04_09
+        "GPT 4",
+        "GPT 4 32k",
+        "GPT 4 Turbo 0125-preview",
+        "GPT 4 Turbo 1106-preview",
+        "GPT 3.5 Turbo",
+        "GPT 3.5 Instruct",
+    ]
+
+
+    MODELS_OPENAI_AZURE = [
+        "Azure GPT 4",
        # "Azure GPT 4 32k",
        # "Azure GPT 4 Turbo 0125-preview",
        # "Azure GPT 4 Turbo 1106-preview",
        # "Azure GPT 3.5 Turbo",
        # "Azure GPT 3.5 Instruct",
+    ]

    MODELS_GOOGLE = [
        # "PaLM 2 text-bison@001",

        "LOCAL CPU Mistral 7B Instruct v0.2 GGUF",
        'phyloforfun/mistral-7b-instruct-v2-bnb-4bit__HLT_MICH_Angiospermae_SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05']

+    MODELS_GUI_DEFAULT = "Gemini 1.5 Flash" #"Azure GPT 4" # "GPT 4 Turbo 1106-preview"
+
+    MODEL_FAMILY = {
+        'OpenAI': MODELS_OPENAI,
+        'OpenAI Azure': MODELS_OPENAI_AZURE,
+        'Google': MODELS_GOOGLE,
+        'Mistral': MODELS_MISTRAL,
+        'Local': MODELS_LOCAL}

    version_mapping_cost = {
        'GPT 4 32k': 'GPT_4_32K',

    @classmethod
    def get_models_gui_list(cls):
+        return cls.MODELS_LOCAL + cls.MODELS_GOOGLE + cls.MODELS_OPENAI + cls.MODELS_OPENAI_AZURE + cls.MODELS_MISTRAL
+
+    @classmethod
+    def get_models_gui_list_family(cls, family=None):
+        if family and family in cls.MODEL_FAMILY:
+            return cls.MODEL_FAMILY[family]
+        all_models = []
+        for family_models in cls.MODEL_FAMILY.values():
+            all_models.extend(family_models)
+        return all_models

    @classmethod
    def get_version_mapping_cost(cls, key):
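
The new MODEL_FAMILY map plus get_models_gui_list_family() let a GUI offer a family picker before the model picker, while passing no family (or an unknown one) still yields the full flattened list. A small sketch of how the Space's Streamlit UI could use it (the widget code here is illustrative, not part of this commit):

import streamlit as st
from vouchervision.model_maps import ModelMaps

family = st.selectbox("Model family", list(ModelMaps.MODEL_FAMILY.keys()))
# Falls back to every model across all families if family is None or unknown
model = st.selectbox("Model", ModelMaps.get_models_gui_list_family(family))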
vouchervision/utils_LLM.py
CHANGED
@@ -8,11 +8,16 @@ import psutil
import threading
import torch
from datetime import datetime
-from vouchervision.tool_taxonomy_WFO import validate_taxonomy_WFO, WFONameMatcher
-from vouchervision.tool_geolocate_HERE import validate_coordinates_here
-from vouchervision.tool_wikipedia import validate_wikipedia
from concurrent.futures import ThreadPoolExecutor, as_completed

def run_tools(output, tool_WFO, tool_GEO, tool_wikipedia, json_file_path_wiki):
    # Define a function that will catch and return the results of your functions

@@ -179,15 +184,26 @@ class SystemLoadMonitor():
        }

-        self.logger
        if self.has_GPU:
            report.update({'max_gpu_load': str(round(self.gpu_usage['max_load'] * 100, 2))})
            report.update({'max_gpu_vram_gb': str(round(self.gpu_usage['max_vram_usage'], 2))})
-        self.logger
        else:
            report.update({'max_gpu_load': '0'})
            report.update({'max_gpu_vram_gb': '0'})
import threading
import torch
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

+try:
+    from vouchervision.tool_taxonomy_WFO import validate_taxonomy_WFO, WFONameMatcher
+    from vouchervision.tool_geolocate_HERE import validate_coordinates_here
+    from vouchervision.tool_wikipedia import validate_wikipedia
+except:
+    from tool_taxonomy_WFO import validate_taxonomy_WFO, WFONameMatcher
+    from tool_geolocate_HERE import validate_coordinates_here
+    from tool_wikipedia import validate_wikipedia

def run_tools(output, tool_WFO, tool_GEO, tool_wikipedia, json_file_path_wiki):
    # Define a function that will catch and return the results of your functions

        }

+        if self.logger:
+            self.logger.info(f"Inference Time: {round(self.inference_time,2)} seconds")
+            self.logger.info(f"Tool Time: {round(tool_time,2)} seconds")
+            self.logger.info(f"Max CPU Usage: {round(self.gpu_usage['max_cpu_usage'],2)}%")
+            self.logger.info(f"Max RAM Usage: {round(self.gpu_usage['max_ram_usage'],2)}GB")
+        else:
+            print(f"Inference Time: {round(self.inference_time,2)} seconds")
+            print(f"Tool Time: {round(tool_time,2)} seconds")
+            print(f"Max CPU Usage: {round(self.gpu_usage['max_cpu_usage'],2)}%")
+            print(f"Max RAM Usage: {round(self.gpu_usage['max_ram_usage'],2)}GB")
+
        if self.has_GPU:
            report.update({'max_gpu_load': str(round(self.gpu_usage['max_load'] * 100, 2))})
            report.update({'max_gpu_vram_gb': str(round(self.gpu_usage['max_vram_usage'], 2))})
+            if self.logger:
+                self.logger.info(f"Max GPU Load: {round(self.gpu_usage['max_load'] * 100, 2)}%")
+                self.logger.info(f"Max GPU Memory Usage: {round(self.gpu_usage['max_vram_usage'], 2)}GB")
+            else:
+                print(f"Max GPU Load: {round(self.gpu_usage['max_load'] * 100, 2)}%")
+                print(f"Max GPU Memory Usage: {round(self.gpu_usage['max_vram_usage'], 2)}GB")
        else:
            report.update({'max_gpu_load': '0'})
            report.update({'max_gpu_vram_gb': '0'})
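
The same import fallback now appears in general_utils.py and utils_LLM.py: package-qualified imports are tried first, and bare-module imports are used when the file is executed from inside vouchervision/ (for example on the Space). A generic sketch of the pattern; catching ImportError explicitly, rather than the bare except the diff uses, is a suggested tightening:

try:
    # Normal case: vouchervision is importable as a package
    from vouchervision.model_maps import ModelMaps
except ImportError:
    # Fallback: running a module directly from within the vouchervision/ directory
    from model_maps import ModelMaps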
vouchervision/utils_VoucherVision.py
CHANGED
@@ -43,6 +43,10 @@ class VoucherVision():
        self.prompt_version = None
        self.is_hf = is_hf

        ### config_vals_for_permutation allows you to set the starting temp, top_k, top_p, seed....
        self.config_vals_for_permutation = config_vals_for_permutation

@@ -649,11 +653,19 @@ class VoucherVision():
    def perform_OCR_and_save_results(self, image_index, json_report, jpg_file_path_OCR_helper, txt_file_path_OCR, txt_file_path_OCR_bounds):
        self.logger.info(f'Working on {image_index + 1}/{len(self.img_paths)} --- Starting OCR')
        # self.OCR - None

        ### Process_image() runs the OCR for text, handwriting, trOCR AND creates the overlay image
        ocr_google = OCREngine(self.logger, json_report, self.dir_home, self.is_hf, self.path_to_crop, self.cfg, self.trOCR_model_version, self.trOCR_model, self.trOCR_processor, self.device)
        ocr_google.process_image(self.do_create_OCR_helper_image, self.logger)
        self.OCR = ocr_google.OCR
        self.logger.info(f"Complete OCR text for LLM prompt:\n\n{self.OCR}\n\n")

        self.write_json_to_file(txt_file_path_OCR, ocr_google.OCR_JSON_to_file)

@@ -774,7 +786,8 @@ class VoucherVision():
        self.update_progress_report_final(progress_report)
        final_JSON_response = self.parse_final_json_response(final_JSON_response)

    ##################################################################################################################################

@@ -905,9 +918,9 @@ class VoucherVision():
            if is_real_run:
                progress_report.update_overall(f"Transcribing Labels")

-                final_json_response, final_WFO_record, final_GEO_record, total_tokens_in, total_tokens_out = self.send_to_LLM(self.is_azure, progress_report, json_report, self.model_name)

-                return final_json_response, final_WFO_record, final_GEO_record, total_tokens_in, total_tokens_out

        except Exception as e:
            self.logger.error(f"LLM call failed in process_specimen_batch: {e}")
        self.prompt_version = None
        self.is_hf = is_hf

+        self.OCR_cost = 0.0
+        self.OCR_tokens_in = 0
+        self.OCR_tokens_out = 0
+
        ### config_vals_for_permutation allows you to set the starting temp, top_k, top_p, seed....
        self.config_vals_for_permutation = config_vals_for_permutation

    def perform_OCR_and_save_results(self, image_index, json_report, jpg_file_path_OCR_helper, txt_file_path_OCR, txt_file_path_OCR_bounds):
        self.logger.info(f'Working on {image_index + 1}/{len(self.img_paths)} --- Starting OCR')
        # self.OCR - None
+        self.OCR_cost = 0.0
+        self.OCR_tokens_in = 0
+        self.OCR_tokens_out = 0

        ### Process_image() runs the OCR for text, handwriting, trOCR AND creates the overlay image
        ocr_google = OCREngine(self.logger, json_report, self.dir_home, self.is_hf, self.path_to_crop, self.cfg, self.trOCR_model_version, self.trOCR_model, self.trOCR_processor, self.device)
        ocr_google.process_image(self.do_create_OCR_helper_image, self.logger)
        self.OCR = ocr_google.OCR
+
+        self.OCR_cost = ocr_google.cost
+        self.OCR_tokens_in = ocr_google.tokens_in
+        self.OCR_tokens_out = ocr_google.tokens_out
+
        self.logger.info(f"Complete OCR text for LLM prompt:\n\n{self.OCR}\n\n")

        self.write_json_to_file(txt_file_path_OCR, ocr_google.OCR_JSON_to_file)

        self.update_progress_report_final(progress_report)
        final_JSON_response = self.parse_final_json_response(final_JSON_response)
+
+        return final_JSON_response, final_WFO_record, final_GEO_record, self.total_tokens_in, self.total_tokens_out, self.OCR_cost, self.OCR_tokens_in, self.OCR_tokens_out

    ##################################################################################################################################

            if is_real_run:
                progress_report.update_overall(f"Transcribing Labels")

+                final_json_response, final_WFO_record, final_GEO_record, total_tokens_in, total_tokens_out, OCR_cost, OCR_tokens_in, OCR_tokens_out = self.send_to_LLM(self.is_azure, progress_report, json_report, self.model_name)

+                return final_json_response, final_WFO_record, final_GEO_record, total_tokens_in, total_tokens_out, OCR_cost, OCR_tokens_in, OCR_tokens_out

        except Exception as e:
            self.logger.error(f"LLM call failed in process_specimen_batch: {e}")
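
perform_OCR_and_save_results now reads cost, tokens_in, and tokens_out off the OCR engine in addition to OCR and OCR_JSON_to_file. A stub showing the minimal interface this code assumes of OCREngine (attribute names taken from the diff; the defaults are placeholders for engines that report no usage, such as the local Florence-2 path):

class StubOCREngine:
    """Minimal stand-in exposing what perform_OCR_and_save_results reads."""
    def __init__(self):
        self.OCR = ""                # concatenated OCR text fed to the LLM prompt
        self.OCR_JSON_to_file = {}   # per-engine results written next to the image
        self.cost = 0.0              # USD cost for token-metered OCR (e.g. GPT-4o mini)
        self.tokens_in = 0
        self.tokens_out = 0

    def process_image(self, do_create_OCR_helper_image, logger):
        pass  # a real engine runs OCR and fills in the fields above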
vouchervision/vouchervision_main.py
CHANGED
@@ -65,9 +65,9 @@ def voucher_vision(cfg_file_path, dir_home, path_custom_prompts, cfg_test, progr
    # Process labels
    Voucher_Vision = VoucherVision(cfg, logger, dir_home, path_custom_prompts, Project, Dirs, is_hf)
    n_images = len(Voucher_Vision.img_paths)
-    last_JSON_response, final_WFO_record, final_GEO_record, total_tokens_in, total_tokens_out = Voucher_Vision.process_specimen_batch(progress_report, json_report, is_real_run)

-    total_cost = save_token_info_as_csv(Dirs, cfg['leafmachine']['LLM_version'], path_api_cost, total_tokens_in, total_tokens_out, n_images, dir_home, logger)

    t_overall_s = perf_counter()
    logger.name = 'Run Complete! :)'
    # Process labels
    Voucher_Vision = VoucherVision(cfg, logger, dir_home, path_custom_prompts, Project, Dirs, is_hf)
    n_images = len(Voucher_Vision.img_paths)
+    last_JSON_response, final_WFO_record, final_GEO_record, total_tokens_in, total_tokens_out, OCR_cost, OCR_tokens_in, OCR_tokens_out = Voucher_Vision.process_specimen_batch(progress_report, json_report, is_real_run)

+    total_cost = save_token_info_as_csv(Dirs, cfg['leafmachine']['LLM_version'], path_api_cost, total_tokens_in, total_tokens_out, OCR_cost, OCR_tokens_in, OCR_tokens_out, n_images, dir_home, logger)

    t_overall_s = perf_counter()
    logger.name = 'Run Complete! :)'
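
Taken together, OCR accounting now travels the full pipeline: the OCR engine reports cost and tokens, VoucherVision stores them per image and returns them from process_specimen_batch, and voucher_vision() forwards them into the cost CSV. A condensed sketch of the hand-off, mirroring the updated calls above:

# vouchervision_main.voucher_vision(), after processing the batch
(last_JSON_response, final_WFO_record, final_GEO_record,
 total_tokens_in, total_tokens_out,
 OCR_cost, OCR_tokens_in, OCR_tokens_out) = Voucher_Vision.process_specimen_batch(
    progress_report, json_report, is_real_run)

total_cost = save_token_info_as_csv(
    Dirs, cfg['leafmachine']['LLM_version'], path_api_cost,
    total_tokens_in, total_tokens_out,
    OCR_cost, OCR_tokens_in, OCR_tokens_out,
    n_images, dir_home, logger)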