# Spaces status when this page was captured: Runtime error
| import gradio as gr | |
| import fitz | |
| import re | |
| import os | |
| import requests | |
# Thai month names in calendar order.
_THAI_MONTH_NAMES = (
    "มกราคม",
    "กุมภาพันธ์",
    "มีนาคม",
    "เมษายน",
    "พฤษภาคม",
    "มิถุนายน",
    "กรกฎาคม",
    "สิงหาคม",
    "กันยายน",
    "ตุลาคม",
    "พฤศจิกายน",
    "ธันวาคม",
)

# Month name -> calendar month number. Numbering starts at 1 so that the value
# can be written directly into a date (January must not become month 0).
months = {name: number for number, name in enumerate(_THAI_MONTH_NAMES, start=1)}
def download_pdf(url):
    """
    Download a PDF from ``url`` into the current working directory.

    The local file name is the last path component of the URL (query string
    and fragment are ignored). If a file with that name already exists it is
    reused and no request is made.

    Args:
        url (str): The URL of the PDF file to download.

    Returns:
        str: Absolute path of the local file.

    Raises:
        ValueError: If no usable file name can be derived from the URL.
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlsplit

    # Use only the path part so "file.pdf?x=1#p" is cached as "file.pdf".
    filename = os.path.basename(urlsplit(url).path)
    if not filename or filename in {".", ".."}:
        raise ValueError(f"Cannot derive a file name from URL: {url!r}")

    local_path = os.path.abspath(filename)
    if os.path.exists(local_path):
        return local_path

    response = requests.get(url, timeout=60)
    # Fail before writing: otherwise an error page would be stored under the
    # PDF's name and returned from the cache on every later call.
    response.raise_for_status()

    with open(local_path, "wb") as f:
        f.write(response.content)
    return local_path
def greet(pdf_file: "gr.File", pdf_url: str, replacer_string):
    """
    Convert a PDF (uploaded or fetched by URL) into a Markdown string.

    Args:
        pdf_file: Uploaded file object; only its ``name`` attribute (the path
            on disk) is read. May be ``None`` when a URL is given instead.
        pdf_url (str): URL of the PDF, used only when no file was uploaded.
        replacer_string (str): Comma-separated ``from=to`` pairs applied to
            the extracted text in order. A built-in default is used when empty.

    Returns:
        tuple[str, str]: The same Markdown text twice (one value per output
        component), or a prompt pair when neither a file nor a URL was given.
    """
    # An empty text box arrives as "" rather than None, so test truthiness.
    if not pdf_file and not pdf_url:
        return "# Please upload file or link to ratchakitcha file", "Please add file"
    if not replacer_string:
        replacer_string = ' ำ=ำ, า=ำ,้หนา=หน้า,่เลม=เล่ม,๐=0,๑=1,๒=2,๓=3,๔=4,๕=5,๖=6,๗=7,๘=8,๙=9'

    pdf_path = pdf_file.name if pdf_file else download_pdf(pdf_url)

    # Close the document as soon as the text has been extracted.
    with fitz.open(pdf_path) as doc:
        md_string = read_lines(doc)

    for s_from, s_to in re.findall(r'([^=]*)=([^,]*),?', replacer_string):
        # str.replace('', x) would insert x between every character.
        if s_from:
            md_string = md_string.replace(s_from, s_to)

    md_string = get_metainfo(md_string)
    return md_string, md_string
# Page header of the gazette text. Group 1 is the whole header; groups 2-7 are
# the values that follow "เล่ม", "ตอน…", the Thai-letter code, and the three
# date parts (number, month word, number) after "ราชกิจจานุเบกษา".
_HEADER_RE = re.compile(
    r'(หน้า \d+\s+เล่ม (\d+) ตอน[^\s]+ (\d+) ([ก-ฮ]+) ราชกิจจานุเบกษา (\d+) ([^\s]+) (\d+))\s+'
)


def get_metainfo(md_string):
    """
    Prepend a front-matter block built from the first page header found.

    Every header occurrence in the text is rewritten as a Markdown comment
    (``[//]: # (...)``). The values of the first occurrence fill the
    front-matter fields; the month word is replaced through the module-level
    ``months`` mapping when it is a known name and kept verbatim otherwise.

    Args:
        md_string (str): Text extracted from the PDF.

    Returns:
        str: ``md_string`` unchanged when no header is found, otherwise the
        front matter followed by the text with headers commented out.
    """
    match = _HEADER_RE.search(md_string)
    if match is None:
        return md_string

    # Skip group 1 (the whole header); the rest are the individual fields.
    volume, section, kind, day, month, year = match.groups()[1:]
    month = months.get(month, month)

    body = _HEADER_RE.sub(r'\n[//]: # (\1)\n\n', md_string)
    front_matter = (
        "---\n"
        "เล่ม: {}\n"
        "ตอนที่: {}\n"
        "ประเภท: {}\n"
        "date: {}-{}-{}\n"
        "---\n"
    ).format(volume, section, kind, day, month, year)
    return front_matter + body
def read_lines(doc):
    """
    Flatten the words of every page of ``doc`` into one text string.

    Words are visited top-to-bottom, then left-to-right (sorted by ``y0``,
    then ``x0``). Each word is stripped and followed by a single space.
    Whenever ``y0`` changes:

    * while still in the page header, a newline is emitted;
    * after the header, a blank line is emitted only if the word starts at
      ``x0 > 100`` (presumably an indented paragraph start);
    * if a drawing's ``rect.y0`` lies strictly between the previous and the
      current ``y0``, a ``----`` rule is emitted, the header ends, and no
      further drawings are considered on that page.

    Args:
        doc: Object whose ``pages()`` yields pages providing
            ``get_text_words()`` (a list of 8-tuples starting with
            ``x0, y0, x1, y1, text``) and ``get_drawings()`` (dicts with a
            ``'rect'`` entry exposing ``y0``).

    Returns:
        str: The assembled text; each page ends with a newline.
    """
    # Collect fragments and join once instead of growing a string with +=.
    parts = []
    for page in doc.pages():
        words = page.get_text_words()
        words.sort(key=lambda w: (w[1], w[0]))
        drawings = page.get_drawings()
        prev_y = 0
        in_header = True
        for x0, y0, _x1, _y1, text, _, _, _ in words:
            if y0 != prev_y:
                if in_header:
                    parts.append('\n')
                elif x0 > 100:
                    parts.append('\n\n')
                if any(prev_y < d['rect'].y0 < y0 for d in drawings):
                    parts.append('\n----\n\n')
                    # Only the first rule on a page separates the header.
                    drawings = []
                    in_header = False
            parts.append(text.strip() + ' ')
            prev_y = y0
        parts.append('\n')
    return ''.join(parts)
# Input widgets, in the order of greet()'s parameters:
# uploaded file, PDF URL, replacement rules.
_input_widgets = [gr.File(), gr.Text(), gr.Text(interactive=True)]

# greet() returns the same text twice: once for a text area, once for a
# Markdown view.
_output_widgets = [gr.TextArea(), gr.Markdown()]

# One example row: no upload, a document URL, and a replacement-rule string.
_example_row = [
    None,
    'https://ratchakitcha.soc.go.th/documents/140A014N0000000002600.pdf',
    ' ำ=ำ, า=ำ,้หนา=หน้า,่เลม=เล่ม,๐=0,๑=1,๒=2,๓=3,๔=4,๕=5,๖=6,๗=7,๘=8,๙=9',
]

demo = gr.Interface(
    fn=greet,
    inputs=_input_widgets,
    outputs=_output_widgets,
    examples=[_example_row],
)

if __name__ == "__main__":
    demo.launch(debug=True)