MarshallCN committed on
Commit bd5ce6f
1 Parent(s): b943737
Files changed (6)
  1. .gitignore +15 -0
  2. Chat_RAG_vecDB.py +212 -0
  3. README.md +1 -1
  4. ggufv2.py +412 -0
  5. requirements.txt +4 -0
  6. utils.py +151 -0
.gitignore ADDED
@@ -0,0 +1,15 @@
1
+ # Jupyter
2
+ **/.ipynb_checkpoints/
3
+ # any hidden Jupyter aux files like .ipynb_foo (gitignore comments must start the line)
+ .ipynb_*
4
+
5
+ # Python cache/bytecode
6
+ **/__pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+ /old/
10
+ /old/*
11
+ models/
12
+ models/*
13
+ export/
14
+ msgs/
15
+ msgs/*
Chat_RAG_vecDB.py ADDED
@@ -0,0 +1,212 @@
1
+ import gradio as gr
2
+ import stat
3
+ import os, shutil, pickle, torch, json, hashlib
4
+ import faiss, numpy as np
5
+ from pathlib import Path
6
+ from FlagEmbedding import BGEM3FlagModel
7
+ from sentence_transformers import CrossEncoder
8
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
9
+ from langchain_community.document_loaders import TextLoader, PyPDFLoader
10
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
11
+ from langchain_huggingface import HuggingFaceEmbeddings
12
+ from utils import mk_msg_dir
13
+
14
+ # === Model loading ===
15
+ if gr.NO_RELOAD:
16
+ BASE_DIR = r"C:\Users\c1052689\hug_models\Qwen2.5-0.5B-Instruct"
17
+ tok = AutoTokenizer.from_pretrained(BASE_DIR, use_fast=False, local_files_only=True)
18
+ bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
19
+ bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.bfloat16)
20
+ model = AutoModelForCausalLM.from_pretrained(BASE_DIR, quantization_config=bnb, device_map="auto", local_files_only=True)
21
+ pipe = pipeline("text-generation", model=model, tokenizer=tok, max_new_tokens=512)
22
+ BGEM3 = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
23
+ reranker = CrossEncoder("BAAI/bge-reranker-large")
24
+
25
+ # === Vector store globals ===
26
+ corpus = []
27
+ index = None
28
+ current_db_dir = None
29
+
30
+ vec_dir_base = './vectorstore/bgem3/'
31
+ embedding_model_id = 'BAAI/bge-m3'
32
+
33
+ # === Document loading & vector building ===
34
+ def load_documents(folder: str):
35
+ docs = []
36
+ for path in Path(folder).rglob("*"):
37
+ if path.suffix == ".txt":
38
+ docs += TextLoader(str(path), encoding="utf-8").load()
39
+ elif path.suffix == ".pdf":
40
+ docs += PyPDFLoader(str(path)).load()
41
+ return docs
42
+
43
+ def split_docs(docs, chunk_size=512, chunk_overlap=64):
44
+ splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
45
+ return splitter.split_documents(docs)
46
+
47
+ def on_rm_error(func, path, exc_info):
48
+ os.chmod(path, stat.S_IWRITE)
49
+ func(path)
50
+
51
+ def hash_text(text):
52
+ return hashlib.md5(text.encode('utf-8')).hexdigest()
53
+
54
+ def list_vector_dbs():
55
+ db_list = [f.name for f in Path(vec_dir_base).iterdir() if f.is_dir()]
56
+ return ["<New Vector DB>"] + db_list
57
+
58
+ def create_or_extend_index(docs, selected_db):
59
+ global corpus, index, current_db_dir
60
+
61
+ temp_dir = "temp_docs"
62
+ if os.path.exists(temp_dir):
63
+ shutil.rmtree(temp_dir, onerror=on_rm_error)
64
+ os.makedirs(temp_dir, exist_ok=True)
65
+
66
+ for file in docs:
67
+ src_path = file.name if hasattr(file, "name") else str(file)
68
+ dst_path = os.path.join(temp_dir, os.path.basename(src_path))
69
+ shutil.copy(src_path, dst_path)
70
+
71
+ raw_docs = load_documents(temp_dir)
72
+ chunks = split_docs(raw_docs)
73
+ new_corpus = [t.page_content for t in chunks]
74
+ new_hashes = [hash_text(t) for t in new_corpus]
75
+
76
+ if selected_db == "<New Vector DB>":
77
+ db_id = mk_msg_dir(Path(vec_dir_base))
78
+ current_db_dir = os.path.join(vec_dir_base, db_id)
79
+ os.makedirs(current_db_dir, exist_ok=True)
80
+ index = faiss.IndexFlatIP(BGEM3.encode(["test"])["dense_vecs"].shape[1])
81
+ corpus = []
82
+ existing_hashes = set()
83
+ else:
84
+ current_db_dir = os.path.join(vec_dir_base, selected_db)
85
+ index = faiss.read_index(os.path.join(current_db_dir, "index.faiss"))
86
+ with open(os.path.join(current_db_dir, "corpus.pkl"), "rb") as f:
87
+ corpus = pickle.load(f)
88
+ with open(os.path.join(current_db_dir, "meta.json"), "r", encoding="utf-8") as f:
89
+ meta = json.load(f)
90
+ existing_hashes = set(meta.get("hashes", []))
91
+
92
+ # 去重
93
+ filtered = [(c, h) for c, h in zip(new_corpus, new_hashes) if h not in existing_hashes]
94
+ if not filtered:
95
+ return "✅ No new (non-duplicate) chunks to add."
96
+
97
+ add_corpus, add_hashes = zip(*filtered)
98
+ dense = BGEM3.encode(add_corpus, batch_size=64)["dense_vecs"]
99
+ if isinstance(dense, torch.Tensor):
100
+ dense = dense.detach().cpu().numpy()
101
+ dense = np.ascontiguousarray(dense, dtype=np.float32)
102
+ faiss.normalize_L2(dense)
103
+
104
+ index.add(dense)
105
+ corpus.extend(add_corpus)
106
+ all_hashes = list(existing_hashes) + list(add_hashes)
107
+
108
+ faiss.write_index(index, os.path.join(current_db_dir, "index.faiss"))
109
+ with open(os.path.join(current_db_dir, "corpus.pkl"), "wb") as f:
110
+ pickle.dump(corpus, f)
111
+ meta = {
112
+ "model": embedding_model_id,
113
+ "dim": int(dense.shape[1]),
114
+ "total_chunks": len(corpus),
115
+ "raw_docs": len(raw_docs),
116
+ "hashes": all_hashes,
117
+ }
118
+ with open(os.path.join(current_db_dir, "meta.json"), "w", encoding="utf-8") as f:
119
+ json.dump(meta, f, indent=2)
120
+ db_stats = f"✅ Added {len(add_corpus)} new chunks to DB `{os.path.basename(current_db_dir)}`."
121
+ db_list_update = gr.update(choices=list_vector_dbs())
122
+ return db_stats, db_list_update
123
+
124
+ def build_prompt_corpus(top_docs, question):
125
+ context_text = "\n\n".join(top_docs)
126
+ user_prompt = f"""Answer the question based on the following context:
127
+
128
+ {context_text}
129
+
130
+ Question: {question}
131
+ Answer:"""
132
+
133
+ full_prompt = (
134
+ "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
135
+ f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
136
+ "<|im_start|>assistant\n"
137
+ )
138
+ return full_prompt
139
+
140
+ def ask_question(query):
141
+ if not query.strip():
142
+ return "❌ Please enter your questions", [], ""
143
+ if index is None or len(corpus) == 0:
144
+ return "⚠️ Please upload your documents", [], ""
145
+
146
+ qv = np.array(BGEM3.encode([query])["dense_vecs"], dtype="float32")
147
+ faiss.normalize_L2(qv)
148
+ D, I = index.search(qv, 8)
149
+ results = [corpus[i] for i in I[0]]
150
+ pairs = [[query, c] for c in results]
151
+ scores = reranker.predict(pairs)
152
+ top_docs = [c for _, c in sorted(zip(scores, results), reverse=True)][:3]
153
+
154
+ prompt = build_prompt_corpus(top_docs, query)
155
+ out = pipe(
156
+ prompt,
157
+ max_new_tokens=1024,
158
+ eos_token_id=tok.eos_token_id,
159
+ pad_token_id=tok.eos_token_id,
160
+ return_full_text=False,
161
+ )
162
+ reply = out[0]["generated_text"]
163
+ context_display = "\n\n".join(
164
+ f"[{i+1}] {doc.strip()[:1000]}" for i, doc in enumerate(top_docs)
165
+ )
166
+ return reply.strip(), context_display
167
+
168
+ def show_db_stats(selected_db):
169
+ if selected_db == "<New Vector DB>":
170
+ return "🆕 New vector DB will be created on next upload."
171
+ try:
172
+ db_dir = os.path.join(vec_dir_base, selected_db)
173
+ with open(os.path.join(db_dir, "meta.json"), "r", encoding="utf-8") as f:
174
+ meta = json.load(f)
175
+ chunk_num = int(meta.get("total_chunks", 0))
176
+ docs_num = int(meta.get("raw_docs", 0))
177
+ return f"📊 DB `{selected_db}`: {docs_num} docs, {chunk_num} chunks"
178
+ except Exception as e:
179
+ return f"⚠️ Failed to load DB `{selected_db}`: {e}"
180
+
181
+ with gr.Blocks(title="Qwen2.5 RAG Chat") as demo:
182
+ gr.Markdown("## 🧠 Qwen2.5 BGEM3-RAG QA")
183
+
184
+ with gr.Row():
185
+ with gr.Column():
186
+ file_box = gr.File(label="Upload documents (PDF or TXT)", file_types=[".pdf", ".txt"], file_count="multiple")
187
+ db_selector = gr.Dropdown(label="Select or create vector DB", choices=list_vector_dbs(), value="<New Vector DB>")
188
+ upload_btn = gr.Button("📚 Add to Vector DB")
189
+ status = gr.Textbox(label="Status")
190
+
191
+ with gr.Column():
192
+ query = gr.Textbox(label="Enter your question")
193
+ ask_btn = gr.Button("Send")
194
+ answer = gr.Textbox(label="🧠 Answer", lines=5)
195
+ context = gr.Textbox(
196
+ label="📄 Top-3 Reference Contexts",
197
+ lines=10,
198
+ interactive=False,
199
+ show_copy_button=True,
200
+ max_lines=20
201
+ )
202
+ db_selector.change(fn=show_db_stats, inputs=db_selector, outputs=status)
203
+ upload_btn.click(fn=create_or_extend_index, inputs=[file_box, db_selector], outputs=[status, db_selector])
204
+ ask_btn.click(fn=ask_question, inputs=query, outputs=[answer, context])
205
+ demo.load(
206
+ fn=lambda: gr.update(choices=list_vector_dbs()),
207
+ inputs=None,
208
+ outputs=db_selector
209
+ )
210
+
211
+ if __name__ == '__main__':
212
+ demo.launch(debug=True)
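
For reference, a minimal sketch of how a vector DB written by create_or_extend_index could be queried outside the Gradio app. It relies only on the index.faiss / corpus.pkl layout persisted above and on the normalize-then-inner-product trick (L2-normalised vectors make inner product equal cosine similarity); the DB id in the path is hypothetical.

```python
import os, pickle
import faiss
import numpy as np
from FlagEmbedding import BGEM3FlagModel

db_dir = "./vectorstore/bgem3/20250101-000000-abc123"  # hypothetical DB id created by mk_msg_dir
index = faiss.read_index(os.path.join(db_dir, "index.faiss"))
with open(os.path.join(db_dir, "corpus.pkl"), "rb") as f:
    corpus = pickle.load(f)

model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)
qv = np.ascontiguousarray(model.encode(["example question"])["dense_vecs"], dtype="float32")
faiss.normalize_L2(qv)                     # cosine similarity via inner product
scores, ids = index.search(qv, 3)          # top-3 chunks
for rank, (s, i) in enumerate(zip(scores[0], ids[0]), start=1):
    print(f"[{rank}] score={s:.3f} {corpus[i][:120]}")
```
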
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: indigo
5
  colorTo: blue
6
  sdk: gradio
7
  sdk_version: 5.49.1
8
- app_file: app.py
9
  pinned: false
10
  license: mit
11
  short_description: Qwen2.5-0.5B-Q4 RAG demo
 
5
  colorTo: blue
6
  sdk: gradio
7
  sdk_version: 5.49.1
8
+ app_file: ggufv2.py
9
  pinned: false
10
  license: mit
11
  short_description: Qwen2.5-0.5B-Q4 RAG demo
ggufv2.py ADDED
@@ -0,0 +1,412 @@
1
+ # ggufv2.py — Qwen GGUF chat with multi-session (load/save) via utils.py
2
+ import os
3
+ import json
4
+ from pathlib import Path
5
+ from datetime import datetime
6
+ from typing import List, Dict, Optional, Tuple
7
+ import shutil
+ import stat
8
+ import gradio as gr
9
+ from llama_cpp import Llama
10
+
11
+ # Multi-session helpers from utils.py
12
+ from utils import mk_msg_dir, _as_dir, persist_messages, trim_by_tokens
13
+ # ===================== Model =====================
14
+ # You can swap to another GGUF by changing repo_id/filename.
15
+ model = Llama.from_pretrained(
16
+ repo_id="bartowski/Qwen2.5-0.5B-Instruct-GGUF",
17
+ filename="Qwen2.5-0.5B-Instruct-Q4_K_M.gguf",
18
+ )
19
+
20
+ assistant_name = "Nova"
21
+ user_name = "Marshall"
22
+ persona = f"""Your name is {assistant_name}. Address the user as "{user_name}". Use Markdown; put code in fenced blocks with a language tag.""".strip()
23
+
24
+ # Where each conversation (session) persists its messages
25
+ BASE_MSG_DIR = Path("./msgs/msgs_QwenGGUF")
26
+ BASE_MSG_DIR.mkdir(parents=True, exist_ok=True)
27
+
28
+ # ---------- Qwen chat template (no tools) ----------
29
+ # def render_qwen(messages: List[Dict[str, str]], add_generation_prompt: bool = True) -> str:
30
+ # """
31
+ # Convert OpenAI-style messages to Qwen2.5 Instruct format:
32
+ # <|im_start|>system ... <|im_end|>
33
+ # <|im_start|>user ... <|im_end|>
34
+ # <|im_start|>assistant (generation continues here)
35
+ # """
36
+ # # System prompt
37
+ # if messages and messages[0].get("role") == "system":
38
+ # sys_txt = messages[0]["content"]
39
+ # rest = messages[1:]
40
+ # else:
41
+ # sys_txt = persona
42
+ # rest = messages
43
+
44
+ # parts = [f"<|im_start|>system\n{sys_txt}<|im_end|>\n"]
45
+ # for m in rest:
46
+ # role = m.get("role")
47
+ # if role not in ("user", "assistant"):
48
+ # continue
49
+ # parts.append(f"<|im_start|>{role}\n{m['content']}<|im_end|>\n")
50
+
51
+ # if add_generation_prompt:
52
+ # parts.append("<|im_start|>assistant\n")
53
+ # return "".join(parts)
54
+
55
+ def render_qwen_trim(
56
+ messages: List[Dict[str, str]],
57
+ model, # llama_cpp.Llama instance (used for token counting)
58
+ n_ctx: Optional[int] = None, # defaults to model.n_ctx() when omitted
59
+ add_generation_prompt: bool = True,
60
+ persona: str = "",
61
+ reserve_new: int = 256, # desired budget (upper bound) for newly generated tokens
62
+ pad: int = 8, # safety margin to avoid overrunning the context
63
+ hard_user_tail_chars: int = 2000, # hard character cap for the last user message when trimming is still not enough
64
+ ) -> Tuple[str, int]:
65
+ """
66
+ - Keep only the system message plus the most recent turns, so that total_tokens + reserve_new + pad <= n_ctx.
67
+ - If that is still not enough, truncate the last user message.
68
+ - Return (prompt, safe_max_new); safe_max_new is guaranteed to stay within the context window.
69
+ """
70
+ def _tok_len(txt: str) -> int:
71
+ # keep token counting consistent with llama_cpp
72
+ return len(model.tokenize(txt.encode("utf-8"), add_bos=True))
73
+
74
+ if n_ctx is None:
75
+ n_ctx = getattr(model, "n_ctx")() if callable(getattr(model, "n_ctx", None)) else model.n_ctx
76
+
77
+ # 1) Split out the system message from the rest
78
+ if messages and messages[0].get("role") == "system":
79
+ sys_txt = messages[0]["content"]
80
+ rest = messages[1:]
81
+ else:
82
+ sys_txt = persona
83
+ rest = messages
84
+
85
+ # keep only user / assistant messages
86
+ rest = [m for m in rest if m.get("role") in ("user", "assistant")]
87
+
88
+ # 2) Render the system message plus the kept turns into a Qwen (ChatML) prompt
89
+ def _render(sys_text: str, turns: List[Dict[str, str]], add_gen: bool) -> str:
90
+ parts = [f"<|im_start|>system\n{sys_text}<|im_end|>\n"]
91
+ for m in turns:
92
+ parts.append(f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n")
93
+ if add_gen:
94
+ parts.append("<|im_start|>assistant\n")
95
+ return "".join(parts)
96
+
97
+ # 3) Start with all turns, then drop the oldest until the prompt fits
98
+ kept = rest[:] # shallow copy of the message list
99
+ while True:
100
+ prompt = _render(sys_txt, kept, add_generation_prompt)
101
+ used = _tok_len(prompt)
102
+
103
+ # how many tokens can still be generated safely
104
+ safe_max_new = max(1, n_ctx - used - pad)
105
+ # we want reserve_new tokens, but cannot exceed safe_max_new
106
+ if used + reserve_new + pad <= n_ctx:
107
+ # enough headroom: cap generation at reserve_new
108
+ return prompt, min(reserve_new, safe_max_new)
109
+
110
+ # No headroom: trim history. If nothing is left to drop, fall through to hard truncation.
111
+ if len(kept) <= 1:
112
+ break # only the last message remains; fall through to hard truncation
113
+
114
+ # Drop from the oldest message; drop two at a time (user+assistant) to avoid splitting a pair,
115
+ # but if the head is not a pair, drop just one.
116
+ drop_count = 2 if len(kept) >= 2 else 1
117
+ # always keep at least one message (the last user turn) for context
118
+ while drop_count > 0 and len(kept) > 1:
119
+ kept.pop(0)
120
+ drop_count -= 1
121
+
122
+ # 4) Still too long: hard-truncate the tail of the last user message
124
+ # goal: keep the most recent content while immediately freeing token space
124
+ if kept and kept[-1]["role"] == "user":
125
+ kept[-1] = {
126
+ "role": "user",
127
+ "content": kept[-1]["content"][-hard_user_tail_chars:]
128
+ }
129
+ elif kept:
130
+ # the last message is not a user turn (usually assistant); truncate it instead
131
+ kept[-1] = {
132
+ "role": kept[-1]["role"],
133
+ "content": kept[-1]["content"][-hard_user_tail_chars:]
134
+ }
135
+
136
+ # re-render and compute the final safe max_new
137
+ prompt = _render(sys_txt, kept, add_generation_prompt)
138
+ used = _tok_len(prompt)
139
+ safe_max_new = max(1, n_ctx - used - pad)
140
+
141
+ # if the prompt still overflows (e.g. an extremely long system prompt), truncate the system text as well
142
+ if used + pad > n_ctx:
143
+ trimmed_sys = sys_txt[-hard_user_tail_chars:]
144
+ prompt = _render(trimmed_sys, kept, add_generation_prompt)
145
+ used = _tok_len(prompt)
146
+ safe_max_new = max(1, n_ctx - used - pad)
147
+
148
+ # never return zero or a negative generation budget
149
+ return prompt, max(1, safe_max_new)
150
+
151
+
152
+ STOP_TOKENS = ["<|im_end|>", "<|endoftext|>"]
153
+
154
+ # ---------- Helpers for system + display ----------
155
+ def ensure_system(messages: Optional[List[Dict[str, str]]], sys_prompt: str) -> List[Dict[str, str]]:
156
+ """Guarantee a system message at index 0 and keep it in sync with the UI textbox."""
157
+ sys_prompt = (sys_prompt or persona).strip()
158
+ if not messages or messages[0].get("role") != "system":
159
+ return [{"role": "system", "content": sys_prompt}]
160
+ messages = list(messages)
161
+ messages[0] = {"role": "system", "content": sys_prompt}
162
+ return messages
163
+
164
+
165
+ def visible_chat(messages: List[Dict[str, str]]) -> List[Dict[str, str]]:
166
+ """Hide system from chat display for gr.Chatbot(type='messages')."""
167
+ return [m for m in (messages or []) if m.get("role") in ("user", "assistant")]
168
+
169
+
170
+ # ---------- Session I/O ----------
171
+ def _load_latest(msg_id: str) -> List[Dict[str, str]]:
172
+ p = Path(_as_dir(BASE_MSG_DIR, msg_id), "trimmed.json")
173
+ if p.exists():
174
+ try:
175
+ return json.loads(p.read_text(encoding="utf-8"))
176
+ except Exception:
177
+ return []
178
+ return []
179
+
180
+
181
+ def _init_sessions():
182
+ sessions = [p.name for p in BASE_MSG_DIR.iterdir() if p.is_dir()] if BASE_MSG_DIR.exists() else []
183
+ if len(sessions) == 0:
184
+ # No history
185
+ return gr.update(choices=[], value=None), [], "", [], []
186
+ sessions.sort(reverse=True)
187
+ msg_id = sessions[0]
188
+ messages = _load_latest(msg_id)
189
+ chat_hist = visible_chat(messages)
190
+ return gr.update(choices=sessions, value=msg_id), sessions, msg_id, messages, chat_hist
191
+
192
+
193
+ def load_session(session_list, sessions):
194
+ msg_id = session_list
195
+ messages = _load_latest(msg_id)
196
+ chat_hist = visible_chat(messages)
197
+ return msg_id, messages, chat_hist, gr.update(choices=sessions, value=msg_id)
198
+
199
+
200
+ def start_new_session(sessions):
201
+ msg_id = mk_msg_dir(BASE_MSG_DIR)
202
+ sessions = list(sessions or []) + [msg_id]
203
+ return [], [], "", msg_id, gr.update(choices=sessions, value=msg_id), sessions
204
+
205
+
206
+ def _on_rm_error(func, path, exc_info):
207
+ try:
208
+ if os.name == "nt": # Windows
209
+ os.chmod(path, stat.S_IWRITE) # clear the read-only flag
210
+ else: # Linux / macOS
211
+ mode = os.stat(path).st_mode
212
+ os.chmod(
213
+ path,
214
+ mode | stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR # grant the owner r, w, x
215
+ )
216
+ func(path) # retry the original operation, e.g. os.remove or os.rmdir
217
+ except Exception:
218
+ pass
219
+
220
+ def delete_session(msg_id, sessions):
221
+ """Delete the currently selected session directory and refresh the list."""
222
+ # Remove directory for current session
223
+ if msg_id:
224
+ try:
225
+ shutil.rmtree(_as_dir(BASE_MSG_DIR, msg_id), onerror=_on_rm_error)
226
+ except Exception:
227
+ shutil.rmtree(_as_dir(BASE_MSG_DIR, msg_id), ignore_errors=True)
228
+ # Re-scan sessions on disk
229
+ if BASE_MSG_DIR.exists():
230
+ sess = [p.name for p in BASE_MSG_DIR.iterdir() if p.is_dir()]
231
+ else:
232
+ sess = []
233
+ sess.sort(reverse=True)
234
+ if sess:
235
+ new_id = sess[0]
236
+ msgs = _load_latest(new_id)
237
+ chat_hist = visible_chat(msgs)
238
+ return msgs, chat_hist, "", new_id, gr.update(choices=sess, value=new_id), sess
239
+ else:
240
+ return [], [], "", "", gr.update(choices=[], value=None), []
241
+
242
+ def export_messages_to_json(messages, msg_id):
243
+ base = Path("/data/exports") if Path("/data").exists() else Path("./exports")
244
+ base.mkdir(parents=True, exist_ok=True)
245
+ stamp = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
246
+ fname = f"chat_{stamp}.json"
247
+ path = base / fname
248
+ path.write_text(json.dumps(messages or [], ensure_ascii=False, indent=2), encoding="utf-8")
249
+ return str(path)
250
+
251
+ def on_click_download(messages, msg_id):
252
+ path = export_messages_to_json(messages, msg_id)
253
+ return gr.update(value=path, visible=True)
254
+
255
+
256
+ # ---------- Generation callback ----------
257
+ def on_send(user_text: str,
258
+ messages: List[Dict[str, str]],
259
+ msg_id: str,
260
+ sessions: List[str],
261
+ sys_prompt: str,
262
+ temperature: float,
263
+ top_p: float,
264
+ max_new_tokens: int,
265
+ repetition_penalty: float):
266
+ user_text = (user_text or "").strip()
267
+ if not user_text:
268
+ return gr.update(), messages, visible_chat(messages), msg_id, gr.update(choices=sessions, value=(msg_id or None)), sessions
269
+
270
+ # 1) ensure system
271
+ messages = ensure_system(messages, sys_prompt)
272
+
273
+ # 2) session bookkeeping
274
+ new_session = (len(messages) <= 1) # only system exists
275
+ if new_session and not msg_id:
276
+ msg_id = mk_msg_dir(BASE_MSG_DIR)
277
+ sessions = list(sessions or []) + [msg_id]
278
+ if msg_id and msg_id not in (sessions or []):
279
+ sessions = list(sessions or []) + [msg_id]
280
+ sessions_update = gr.update(choices=sessions, value=msg_id)
281
+
282
+ # 3) append user, render, generate
283
+ messages = messages + [{"role": "user", "content": user_text}]
284
+ # prompt = render_qwen(messages, add_generation_prompt=True)
285
+ prompt, max_new = render_qwen_trim(
286
+ messages=messages,
287
+ model=model, # llama_cpp.Llama instance
288
+ n_ctx=None, # fall back to model.n_ctx()
289
+ add_generation_prompt=True,
290
+ persona=persona, # the persona string defined above
291
+ reserve_new=max_new_tokens, # desired generation length
292
+ pad=16
293
+ )
294
+
295
+
296
+ try:
297
+ result = model.create_completion(
298
+ prompt=prompt,
299
+ temperature=float(temperature),
300
+ top_p=float(top_p),
301
+ max_tokens=int(max_new),
302
+ repeat_penalty=float(repetition_penalty),
303
+ stop=STOP_TOKENS,
304
+ )
305
+ reply = result['choices'][0]['text'].strip()
306
+ except Exception:
307
+ _out = model(
308
+ prompt,
309
+ temperature=float(temperature),
310
+ top_p=float(top_p),
311
+ max_tokens=int(max_new),
312
+ repeat_penalty=float(repetition_penalty),
313
+ stop=STOP_TOKENS,
314
+ )
315
+ if isinstance(_out, dict):
316
+ reply = _out.get('choices', [{}])[0].get('text', '').strip()
317
+ else:
318
+ reply = str(_out).strip()
319
+
320
+ # 4) append assistant + persist
321
+ messages = messages + [{"role": "assistant", "content": reply}]
322
+
323
+ if msg_id:
324
+ msg_dir = _as_dir(BASE_MSG_DIR, msg_id)
325
+ persist_messages(messages, msg_dir, archive_last_turn=True)
326
+
327
+ return "", messages, visible_chat(messages), msg_id, sessions_update, sessions
328
+
329
+
330
+ # ===================== UI =====================
331
+ with gr.Blocks(title="Qwen GGUF — multi-session") as demo:
332
+ gr.Markdown("## 🧠 Qwen Chat")
333
+
334
+ with gr.Row():
335
+ with gr.Column(scale=3):
336
+ sys_prompt = gr.Textbox(
337
+ label="System prompt",
338
+ value=persona,
339
+ lines=6,
340
+ show_label=True,
341
+ )
342
+ with gr.Accordion("Generation settings", open=False):
343
+ temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="temperature")
344
+ top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.01, label="top_p")
345
+ max_new_tokens = gr.Slider(16, 512, value=256, step=16, label="max_new_tokens")
346
+ repetition_penalty = gr.Slider(1.0, 2.0, value=1.07, step=0.01, label="repetition_penalty")
347
+
348
+ session_list = gr.Radio(choices=[], value=None, label="Conversations", interactive=True)
349
+ new_btn = gr.Button("New session", variant="secondary")
350
+ del_btn = gr.Button("Delete session", variant="stop")
351
+ dl_btn = gr.Button("Download JSON", variant="secondary")
352
+ dl_file = gr.File(label="", interactive=False, visible=False)
353
+
354
+ with gr.Column(scale=9):
355
+ chat = gr.Chatbot(
356
+ label="Chat",
357
+ height=560,
358
+ render_markdown=True,
359
+ type="messages",
360
+ )
361
+ user_box = gr.Textbox(
362
+ label="Your message",
363
+ placeholder="Type and press Enter…",
364
+ autofocus=True,
365
+ )
366
+ send = gr.Button("Send", variant="primary")
367
+
368
+ # States
369
+ messages = gr.State([]) # includes system
370
+ msg_id = gr.State("")
371
+ sessions = gr.State([])
372
+
373
+ # Events
374
+ user_box.submit(
375
+ on_send,
376
+ inputs=[user_box, messages, msg_id, sessions, sys_prompt, temperature, top_p, max_new_tokens, repetition_penalty],
377
+ outputs=[user_box, messages, chat, msg_id, session_list, sessions],
378
+ )
379
+ send.click(
380
+ on_send,
381
+ inputs=[user_box, messages, msg_id, sessions, sys_prompt, temperature, top_p, max_new_tokens, repetition_penalty],
382
+ outputs=[user_box, messages, chat, msg_id, session_list, sessions],
383
+ )
384
+
385
+ new_btn.click(
386
+ start_new_session,
387
+ inputs=[sessions],
388
+ outputs=[messages, chat, user_box, msg_id, session_list, sessions],
389
+ )
390
+
391
+ del_btn.click(
392
+ delete_session,
393
+ inputs=[msg_id, sessions],
394
+ outputs=[messages, chat, user_box, msg_id, session_list, sessions],
395
+ )
396
+
397
+ session_list.change(
398
+ load_session,
399
+ inputs=[session_list, sessions],
400
+ outputs=[msg_id, messages, chat, session_list],
401
+ )
402
+
403
+ dl_btn.click(
404
+ on_click_download,
405
+ inputs=[messages, msg_id],
406
+ outputs=[dl_file],
407
+ )
408
+
409
+ demo.load(_init_sessions, None, outputs=[session_list, sessions, msg_id, messages, chat])
410
+
411
+ if __name__ == "__main__":
412
+ demo.queue().launch()
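
The heart of ggufv2.py is render_qwen_trim: it renders messages into Qwen's ChatML layout and keeps total_tokens + reserve_new + pad within the model's context window. A minimal, self-contained sketch of that layout and budget arithmetic follows; DummyModel is a hypothetical stand-in so the snippet runs without downloading a GGUF file, but its tokenize()/n_ctx() mirror the llama_cpp.Llama calls used above.

```python
from typing import Dict, List

class DummyModel:
    """Hypothetical stand-in for llama_cpp.Llama (token counting only)."""
    def tokenize(self, data: bytes, add_bos: bool = True) -> List[int]:
        return list(range(len(data.split())))  # crude whitespace "tokens"
    def n_ctx(self) -> int:
        return 512

def render_chatml(messages: List[Dict[str, str]]) -> str:
    # Same ChatML shape render_qwen_trim produces: role blocks, ending with an
    # open assistant block for generation.
    parts = [f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n" for m in messages]
    return "".join(parts) + "<|im_start|>assistant\n"

model = DummyModel()
msgs = [{"role": "system", "content": "You are Nova."},
        {"role": "user", "content": "Summarise FAISS inner-product search in one line."}]
prompt = render_chatml(msgs)
used = len(model.tokenize(prompt.encode("utf-8"), add_bos=True))
pad, reserve_new = 16, 256
safe_max_new = max(1, model.n_ctx() - used - pad)   # same bound as render_qwen_trim
print(f"prompt tokens={used}, generation cap={min(reserve_new, safe_max_new)}")
```
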
requirements.txt ADDED
@@ -0,0 +1,4 @@
1
+ gradio==5.49.1
2
+ huggingface_hub>=0.23
3
+ orjson
4
+ llama-cpp-python==0.2.90
utils.py ADDED
@@ -0,0 +1,151 @@
1
+ # from __future__ import annotations
2
+ from pathlib import Path
3
+ import uuid
4
+ from datetime import datetime, timezone
5
+ import json, os
6
+ from typing import List, Dict, Tuple, Optional
7
+
8
+ # ============ Utility functions ============
9
+ def mk_msg_dir(BASE_MSG_DIR) -> str:
10
+ m_id = datetime.now().strftime("%Y%m%d-%H%M%S-") + uuid.uuid4().hex[:6]
11
+ Path(BASE_MSG_DIR, m_id).mkdir(parents=True, exist_ok=True)
12
+ return m_id # return only the ID
13
+
14
+ def _as_dir(BASE_MSG_DIR, m_id: str) -> Path:
15
+ # normalize the given id into a path under BASE_MSG_DIR (e.g. ./msgs/<ID>)
16
+ return Path(BASE_MSG_DIR, m_id)
17
+
18
+ def msg2hist(persona, msg):
19
+ chat_history = []
20
+ if msg is not None:
21
+ if len(msg) > 0:
22
+ chat_history = msg.copy() # shallow-copy the outer list
23
+ chat_history[0] = msg[0].copy() # copy the first dict separately
24
+ chat_history[0]['content'] = chat_history[0]['content'][len(persona):]
25
+ return chat_history
26
+
27
+ def render(tok, messages: List[Dict[str, str]]) -> str:
28
+ """按 chat_template 渲染成最终提示词文本(不分词)。"""
29
+ return tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
30
+
31
+ def _ensure_alternating(messages):
32
+ if not messages:
33
+ return
34
+ if messages[0]["role"] != "user":
35
+ raise ValueError("messages[0] 必须是 'user'(你的模板要求从 user 开始)")
36
+ for i, m in enumerate(messages):
37
+ expect_user = (i % 2 == 0)
38
+ if (m["role"] == "user") != expect_user:
39
+ raise ValueError(f"对话必须严格交替 user/assistant,在索引 {i} 处发现 {m['role']}")
40
+
41
+ def trim_by_tokens(tok, messages, prompt_budget):
42
+ """
43
+ Keep only messages[0] (the persona user message) plus a suffix starting at an odd index,
44
+ found by binary search as the longest suffix that fits. This keeps the alternation intact.
45
+ """
46
+ if not messages:
47
+ return []
48
+
49
+ # _ensure_alternating(messages)
50
+
51
+ # only the persona message exists: return it unchanged
52
+ if len(messages) == 1:
53
+ return messages
54
+
55
+ # Allowed suffix start points: odd indices (index 1, 3, 5, ... are assistant messages),
56
+ # so that appending them after index 0 (user) keeps the roles alternating.
57
+ cand_idx = [k for k in range(1, len(messages)) if k % 2 == 1]
58
+
59
+ # if nothing fits, keep only the persona message
60
+ best = [messages[0]]
61
+
62
+ # Binary search: an earlier start point keeps more messages and costs more tokens (monotonic)
63
+ lo, hi = 0, len(cand_idx) - 1
64
+ while lo <= hi:
65
+ mid = (lo + hi) // 2
66
+ k = cand_idx[mid]
67
+ candidate = [messages[0]] + messages[k:]
68
+ toks = len(tok(tok.apply_chat_template(candidate, tokenize=False),
69
+ add_special_tokens=False).input_ids)
70
+ if toks <= prompt_budget:
71
+ best = candidate # fits: try to keep more (move left)
72
+ hi = mid - 1
73
+ else:
74
+ lo = mid + 1 # does not fit: drop more old messages (move right)
75
+
76
+ return best
77
+
78
+ # ============ Atomic write (may conflict with OneDrive sync) ============
79
+ # def atomic_write_json(path: Path, data) -> None:
80
+ # tmp = path.with_suffix(path.suffix + ".tmp")
81
+ # with open(tmp, "w", encoding="utf-8") as f:
82
+ # json.dump(data, f, ensure_ascii=False, indent=2)
83
+ # f.flush()
84
+ # os.fsync(f.fileno())
85
+ # os.replace(tmp, path) # atomic replace within the same directory
86
+
87
+ # plain overwrite
88
+ def write_json_overwrite(path: Path, data) -> None:
89
+ with open(path, "w", encoding="utf-8", newline="\n") as f:
90
+ json.dump(data, f, ensure_ascii=False, indent=2)
91
+
92
+ # ============ 存储层 ============
93
+ class MsgStore:
94
+ def __init__(self, base_dir: str | Path = "./msgs"):
95
+ self.base = Path(base_dir)
96
+ self.base.mkdir(parents=True, exist_ok=True)
97
+ self.archive = self.base / "archive.jsonl" # append-only log
98
+ self.trimmed = self.base / "trimmed.json" # current context
99
+ if not self.archive.exists():
100
+ self.archive.write_text("", encoding="utf-8")
101
+ if not self.trimmed.exists():
102
+ self.trimmed.write_text("[]", encoding="utf-8")
103
+
104
+ def load_trimmed(self) -> List[Dict[str, str]]:
105
+ try:
106
+ return json.loads(self.trimmed.read_text(encoding="utf-8"))
107
+ except Exception:
108
+ return []
109
+
110
+ def save_trimmed(self, messages: List[Dict[str, str]]) -> None:
111
+ write_json_overwrite(self.trimmed, messages)
112
+
113
+ def append_archive(self, role: str, content: str, meta: dict | None = None) -> None:
114
+ rec = {"ts": datetime.now(timezone.utc).isoformat(), "role": role, "content": content}
115
+ if meta: rec["meta"] = meta
116
+ with open(self.archive, "a", encoding="utf-8") as f:
117
+ f.write(json.dumps(rec, ensure_ascii=False) + "\n")
118
+ f.flush(); os.fsync(f.fileno())
119
+
120
+ # ============ Explicit save (only persists when called manually) ============
121
+ def persist_messages(
122
+ messages: List[Dict[str, str]],
123
+ store_dir: str | Path = "./msgs",
124
+ archive_last_turn: bool = True,
125
+ ) -> None:
126
+ store = MsgStore(store_dir)
127
+ # _ensure_alternating(messages)
128
+
129
+ # 1) Overwrite trimmed.json (plain overwrite; the atomic variant above is commented out)
130
+ store.save_trimmed(messages)
131
+
132
+ # 2) Append the most recent turn to archive.jsonl (optional)
133
+ if not archive_last_turn:
134
+ return
135
+
136
+ # scan backwards for the most recent (user, assistant) pair
137
+ pair = None
138
+ for i in range(len(messages) - 2, -1, -1):
139
+ if (
140
+ messages[i]["role"] == "user"
141
+ and i + 1 < len(messages)
142
+ and messages[i + 1]["role"] == "assistant"
143
+ ):
144
+ pair = (messages[i]["content"], messages[i + 1]["content"])
145
+ break
146
+
147
+ if pair:
148
+ u, a = pair
149
+ store.append_archive("user", u)
150
+ store.append_archive("assistant", a)
151
+ # If no pair is found (e.g. persist was called before generation), only trimmed.json is written and nothing is archived.
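
Session persistence above amounts to one directory per conversation holding trimmed.json (the current context) and archive.jsonl (an append-only turn log). A minimal usage sketch, with an illustrative base directory:

```python
from pathlib import Path
from utils import MsgStore, _as_dir, mk_msg_dir, persist_messages

BASE = Path("./msgs/demo")                     # illustrative base dir
msg_id = mk_msg_dir(BASE)                      # creates ./msgs/demo/<timestamp-uuid>/
msgs = [
    {"role": "system", "content": "You are Nova."},
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi Marshall, how can I help?"},
]
# Overwrites trimmed.json and appends the latest (user, assistant) pair to archive.jsonl
persist_messages(msgs, _as_dir(BASE, msg_id))

store = MsgStore(_as_dir(BASE, msg_id))
print(store.load_trimmed())                    # round-trips the saved context
```
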