Spaces:

markqiu
/

prinvest_mate

Sleeping

App Files Community

charles0519 committed on Mar 28, 2023

Commit

b741e5b

unverified ·

1 Parent(s): 3f820aa

feature: 优化上传索引文件时空格分割 (#427)

Browse files

* feature: 优化上传索引文件时空格分割

* feature: 优化上传索引文件时空格分割

Files changed (2) hide show

.gitignore +1 -0
modules/llama_func.py +32 -32

.gitignore CHANGED Viewed

@@ -136,3 +136,4 @@ dmypy.json
 api_key.txt
 auth.json

 api_key.txt
 auth.json
+.idea

modules/llama_func.py CHANGED Viewed

@@ -13,7 +13,6 @@ from llama_index import (
 from langchain.llms import OpenAI
 import colorama
 from modules.presets import *
 from modules.utils import *
@@ -30,37 +29,38 @@ def get_documents(file_src):
             logging.debug("Loading PDF...")
             CJKPDFReader = download_loader("CJKPDFReader")
             loader = CJKPDFReader()
-            documents += loader.load_data(file=file.name)
         elif os.path.splitext(file.name)[1] == ".docx":
             logging.debug("Loading DOCX...")
             DocxReader = download_loader("DocxReader")
             loader = DocxReader()
-            documents += loader.load_data(file=file.name)
         elif os.path.splitext(file.name)[1] == ".epub":
             logging.debug("Loading EPUB...")
             EpubReader = download_loader("EpubReader")
             loader = EpubReader()
-            documents += loader.load_data(file=file.name)
         else:
             logging.debug("Loading text file...")
             with open(file.name, "r", encoding="utf-8") as f:
-                text = add_space(f.read())
-                documents += [Document(text)]
     index_name = sha1sum(index_name)
     return documents, index_name
 def construct_index(
-    api_key,
-    file_src,
-    max_input_size=4096,
-    num_outputs=1,
-    max_chunk_overlap=20,
-    chunk_size_limit=600,
-    embedding_limit=None,
-    separator=" ",
-    num_children=10,
-    max_keywords_per_chunk=10,
 ):
     os.environ["OPENAI_API_KEY"] = api_key
     chunk_size_limit = None if chunk_size_limit == 0 else chunk_size_limit
@@ -97,12 +97,12 @@ def construct_index(
 def chat_ai(
-    api_key,
-    index,
-    question,
-    context,
-    chatbot,
-    reply_language,
 ):
     os.environ["OPENAI_API_KEY"] = api_key
@@ -133,15 +133,15 @@ def chat_ai(
 def ask_ai(
-    api_key,
-    index,
-    question,
-    prompt_tmpl,
-    refine_tmpl,
-    sim_k=1,
-    temprature=0,
-    prefix_messages=[],
-    reply_language="中文",
 ):
     os.environ["OPENAI_API_KEY"] = api_key
@@ -174,7 +174,7 @@ def ask_ai(
         for index, node in enumerate(response.source_nodes):
             brief = node.source_text[:25].replace("\n", "")
             nodes.append(
-                f"<details><summary>[{index+1}]\t{brief}...</summary><p>{node.source_text}</p></details>"
             )
         new_response = ret_text + "\n----------\n" + "\n\n".join(nodes)
         logging.info(

 from langchain.llms import OpenAI
 import colorama
 from modules.presets import *
 from modules.utils import *
             logging.debug("Loading PDF...")
             CJKPDFReader = download_loader("CJKPDFReader")
             loader = CJKPDFReader()
+            text_raw = loader.load_data(file=file.name)[0].text
         elif os.path.splitext(file.name)[1] == ".docx":
             logging.debug("Loading DOCX...")
             DocxReader = download_loader("DocxReader")
             loader = DocxReader()
+            text_raw = loader.load_data(file=file.name)[0].text
         elif os.path.splitext(file.name)[1] == ".epub":
             logging.debug("Loading EPUB...")
             EpubReader = download_loader("EpubReader")
             loader = EpubReader()
+            text_raw = loader.load_data(file=file.name)[0].text
         else:
             logging.debug("Loading text file...")
             with open(file.name, "r", encoding="utf-8") as f:
+                text_raw = f.read()
+        text = add_space(text_raw)
+        documents += [Document(text)]
     index_name = sha1sum(index_name)
     return documents, index_name
 def construct_index(
+        api_key,
+        file_src,
+        max_input_size=4096,
+        num_outputs=1,
+        max_chunk_overlap=20,
+        chunk_size_limit=600,
+        embedding_limit=None,
+        separator=" ",
+        num_children=10,
+        max_keywords_per_chunk=10,
 ):
     os.environ["OPENAI_API_KEY"] = api_key
     chunk_size_limit = None if chunk_size_limit == 0 else chunk_size_limit
 def chat_ai(
+        api_key,
+        index,
+        question,
+        context,
+        chatbot,
+        reply_language,
 ):
     os.environ["OPENAI_API_KEY"] = api_key
 def ask_ai(
+        api_key,
+        index,
+        question,
+        prompt_tmpl,
+        refine_tmpl,
+        sim_k=1,
+        temprature=0,
+        prefix_messages=[],
+        reply_language="中文",
 ):
     os.environ["OPENAI_API_KEY"] = api_key
         for index, node in enumerate(response.source_nodes):
             brief = node.source_text[:25].replace("\n", "")
             nodes.append(
+                f"<details><summary>[{index + 1}]\t{brief}...</summary><p>{node.source_text}</p></details>"
             )
         new_response = ret_text + "\n----------\n" + "\n\n".join(nodes)
         logging.info(