Spaces:

markqiu
/

prinvest_mate

Sleeping

App Files Files Community

Tuchuanhuhuhu committed on Apr 4, 2023

Commit

c87878a

1 Parent(s): aebda89

支持Excel文件

Browse files

Files changed (3) hide show

modules/llama_func.py +18 -12
modules/utils.py +33 -0
requirements.txt +1 -0

modules/llama_func.py CHANGED Viewed

@@ -40,34 +40,40 @@ def get_documents(file_src):
     logging.debug("Loading documents...")
     logging.debug(f"file_src: {file_src}")
     for file in file_src:
-        logging.info(f"loading file: {file.name}")
-        if os.path.splitext(file.name)[1] == ".pdf":
             logging.debug("Loading PDF...")
             try:
                 from modules.pdf_func import parse_pdf
                 from modules.config import advance_docs
                 two_column = advance_docs["pdf"].get("two_column", False)
-                pdftext = parse_pdf(file.name, two_column).text
             except:
                 pdftext = ""
-                with open(file.name, 'rb') as pdfFileObj:
                     pdfReader = PyPDF2.PdfReader(pdfFileObj)
                     for page in tqdm(pdfReader.pages):
                         pdftext += page.extract_text()
             text_raw = pdftext
-        elif os.path.splitext(file.name)[1] == ".docx":
-            logging.debug("Loading DOCX...")
             DocxReader = download_loader("DocxReader")
             loader = DocxReader()
-            text_raw = loader.load_data(file=file.name)[0].text
-        elif os.path.splitext(file.name)[1] == ".epub":
             logging.debug("Loading EPUB...")
             EpubReader = download_loader("EpubReader")
             loader = EpubReader()
-            text_raw = loader.load_data(file=file.name)[0].text
         else:
             logging.debug("Loading text file...")
-            with open(file.name, "r", encoding="utf-8") as f:
                 text_raw = f.read()
         text = add_space(text_raw)
         # text = block_split(text)
@@ -89,7 +95,7 @@ def construct_index(
 ):
     from langchain.chat_models import ChatOpenAI
     from llama_index import GPTSimpleVectorIndex, ServiceContext
     os.environ["OPENAI_API_KEY"] = api_key
     chunk_size_limit = None if chunk_size_limit == 0 else chunk_size_limit
     embedding_limit = None if embedding_limit == 0 else embedding_limit
@@ -122,7 +128,7 @@ def construct_index(
             logging.error("索引构建失败！", e)
             print(e)
             return None
 def add_space(text):
     punctuations = {"，": "， ", "。": "。 ", "？": "？ ", "！": "！ ", "：": "： ", "；": "； "}

     logging.debug("Loading documents...")
     logging.debug(f"file_src: {file_src}")
     for file in file_src:
+        filepath = file.name
+        filename = os.path.basename(filepath)
+        file_type = os.path.splitext(filepath)[1]
+        logging.info(f"loading file: {filename}")
+        if file_type == ".pdf":
             logging.debug("Loading PDF...")
             try:
                 from modules.pdf_func import parse_pdf
                 from modules.config import advance_docs
                 two_column = advance_docs["pdf"].get("two_column", False)
+                pdftext = parse_pdf(filepath, two_column).text
             except:
                 pdftext = ""
+                with open(filepath, 'rb') as pdfFileObj:
                     pdfReader = PyPDF2.PdfReader(pdfFileObj)
                     for page in tqdm(pdfReader.pages):
                         pdftext += page.extract_text()
             text_raw = pdftext
+        elif file_type == ".docx":
+            logging.debug("Loading Word...")
             DocxReader = download_loader("DocxReader")
             loader = DocxReader()
+            text_raw = loader.load_data(file=filepath)[0].text
+        elif file_type == ".epub":
             logging.debug("Loading EPUB...")
             EpubReader = download_loader("EpubReader")
             loader = EpubReader()
+            text_raw = loader.load_data(file=filepath)[0].text
+        elif file_type == ".xlsx":
+            logging.debug("Loading Excel...")
+            text_raw = excel_to_string(filepath)
         else:
             logging.debug("Loading text file...")
+            with open(filepath, "r", encoding="utf-8") as f:
                 text_raw = f.read()
         text = add_space(text_raw)
         # text = block_split(text)
 ):
     from langchain.chat_models import ChatOpenAI
     from llama_index import GPTSimpleVectorIndex, ServiceContext
     os.environ["OPENAI_API_KEY"] = api_key
     chunk_size_limit = None if chunk_size_limit == 0 else chunk_size_limit
     embedding_limit = None if embedding_limit == 0 else embedding_limit
             logging.error("索引构建失败！", e)
             print(e)
             return None
 def add_space(text):
     punctuations = {"，": "， ", "。": "。 ", "？": "？ ", "！": "！ ", "：": "： ", "；": "； "}

modules/utils.py CHANGED Viewed

@@ -21,6 +21,7 @@ from markdown import markdown
 from pygments import highlight
 from pygments.lexers import get_lexer_by_name
 from pygments.formatters import HtmlFormatter
 from modules.presets import *
 from . import shared
@@ -498,3 +499,35 @@ def add_details(lst):
             f"<details><summary>{brief}...</summary><p>{txt}</p></details>"
         )
     return nodes

 from pygments import highlight
 from pygments.lexers import get_lexer_by_name
 from pygments.formatters import HtmlFormatter
+import pandas as pd
 from modules.presets import *
 from . import shared
             f"<details><summary>{brief}...</summary><p>{txt}</p></details>"
         )
     return nodes
def sheet_to_string(sheet):
    """Render each row of a worksheet as one line of ``column: value`` pairs.

    Args:
        sheet: A table object exposing ``iterrows()`` and ``columns``
            (a pandas ``DataFrame`` when called from ``excel_to_string``).

    Returns:
        A string with one line per row. Each line lists every column as
        ``"<column>: <value>"`` separated by ``", "``, ends with ``"."`` and
        is terminated by a newline. A sheet without rows yields ``""``.
    """
    columns = list(sheet.columns)
    lines = []
    for _, row in sheet.iterrows():
        # Join the pairs instead of appending ", " and trimming afterwards:
        # str.rstrip(", ") removes *any* trailing commas/spaces, which used to
        # eat characters belonging to the last cell's value.
        cells = ", ".join(f"{column}: {row[column]}" for column in columns)
        lines.append(f"{cells}.\n")
    return "".join(lines)
def excel_to_string(file_path):
    """Convert every worksheet of an Excel workbook into one text blob.

    Args:
        file_path: Path of the workbook, passed to ``pd.read_excel`` with
            the ``openpyxl`` engine.

    Returns:
        For each worksheet, in the order pandas returns them: a
        ``"Sheet: <name>"`` header line, the rows rendered by
        ``sheet_to_string``, and a separator made of 20 dashes surrounded
        by blank lines. All sections are concatenated into a single string.
    """
    # sheet_name=None asks pandas for all worksheets at once; the result is
    # iterated below as (sheet name, sheet data) pairs.
    workbook = pd.read_excel(file_path, engine='openpyxl', sheet_name=None)
    # Separator appended after every worksheet, including the last one.
    divider = "\n" + ("-" * 20) + "\n\n"
    sections = [
        f"Sheet: {title}\n" + sheet_to_string(frame) + divider
        for title, frame in workbook.items()
    ]
    return "".join(sections)

requirements.txt CHANGED Viewed

@@ -12,3 +12,4 @@ langchain
 markdown
 PyPDF2
 pdfplumber

 markdown
 PyPDF2
 pdfplumber
+pandas