Spaces:

ActiveYixiao
/

automatic_coding

Sleeping

App Files Files Community

ActiveYixiao committed on Aug 29

Commit

7660c81

verified ·

1 Parent(s): fe02206

Update app.py

Browse files

Files changed (1) hide show

app.py +80 -64

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import logging
 import textwrap
 from typing import Literal, Optional
-import os
 import gradio as gr
 import outlines
 import pandas as pd
@@ -17,16 +17,9 @@ from transformers import (
     BitsAndBytesConfig,
 )
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-MODEL_ID = "rshwndsz/ft-longformer-base-4096"
-DEVICE_MAP = "auto"
-QUANTIZATION_BITS = None
-TEMPERATURE = 0.0
 AVAILABLE_MODELS = [
     "rshwndsz/ft-longformer-base-4096",
     "rshwndsz/ft-hermes-3-llama-3.2-3b",
@@ -41,20 +34,18 @@ AVAILABLE_MODELS = [
 ]
 DEFAULT_MODEL_ID = AVAILABLE_MODELS[0]
-# Define response model
-class ResponseModel(BaseModel):
     # Pre-change schema: a single field, `score`, restricted to the strings "0" or "1".
-    score: Literal["0", "1"]
 SYSTEM_PROMPT = textwrap.dedent("""
 You are an assistant tasked with grading answers to a mind reading ability test. You will be provided with the following information:
 1. A story that was presented to participants as context
 2. The question that participants were asked to answer
 3. A grading scheme to evaluate the answers (Correct Responses:1, incorrect response:0, Incomplete response:0, Irrelevant:0)
-4. Grading examples
-5. A participant answer
 Your task is to grade each answer according to the grading scheme. For each answer, you should:
 1. Carefully read and understand the answer and compare it to the grading criteria
-2. Assigning an score 1 or 0 for each answer.
 """).strip()
 PROMPT_TEMPLATE = textwrap.dedent("""
@@ -72,28 +63,15 @@ PROMPT_TEMPLATE = textwrap.dedent("""
 </Answer>
 Score:""").strip()
-if is_huggingface_space():
     # Chooses CPU without quantization when is_huggingface_space() is true,
     # otherwise device_map "auto" with quantization_bits 4.
     # NOTE(review): as listed, is_huggingface_space() is called here but its `def`
     # only appears after this if/else; confirm it is not defined earlier, otherwise
     # this statement raises NameError at import time.
-    DEVICE_MAP = "cpu"
-    QUANTIZATION_BITS = None
-else:
-    DEVICE_MAP = "auto"
-    QUANTIZATION_BITS = 4  # or whatever you prefer for local deployment
-def is_huggingface_space():
     # True whenever the SPACE_ID environment variable is present (even if empty),
     # because the lookup result is only compared against None.
-    return os.environ.get('SPACE_ID') is not None
 def get_outlines_model(
-    model_id: str, device_map: str = "cpu", quantization_bits: Optional[int] = None
 ):
-     # Skip quantization on CPU
-    if device_map == "cpu":
-        quantization_config = None
-    else:
-        # Your existing quantization logic
-        pass
     if quantization_bits == 4:
         quantization_config = BitsAndBytesConfig(
             load_in_4bit=True,
@@ -118,15 +96,14 @@ def get_outlines_model(
         base_model_id,
         device_map=device_map,
         quantization_config=quantization_config,
-        torch_dtype=torch.bfloat16,
     )
     hf_model = PeftModel.from_pretrained(base_model, model_id)
     hf_tokenizer = AutoTokenizer.from_pretrained(
         base_model_id, use_fast=True, clean_up_tokenization_spaces=True
     )
-    hf_tokenizer.pad_token = hf_tokenizer.eos_token
-    return hf_model, hf_tokenizer
 def format_prompt(story: str, question: str, grading_scheme: str, answer: str) -> str:
@@ -140,56 +117,95 @@ def format_prompt(story: str, question: str, grading_scheme: str, answer: str) -
     return full_prompt
-# @spaces.GPU
 def label_single_response_with_model(model_id, story, question, criteria, response):
     # Pre-change (left-hand side) version of the single-response grader.
     # Builds the full prompt with format_prompt, logs it, then branches on whether
     # the substring "longformer" occurs in model_id.
     prompt = format_prompt(story, question, criteria, response)
-    logger.info(f"Prompt: {prompt}")
     if "longformer" in model_id:
         model, tokenizer = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
         # The whole prompt is tokenized, truncated to at most 4096 tokens.
-        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=4096)
         with torch.no_grad():
             logits = model(**inputs).logits
         # Score is the argmax index over dim 1, returned as a string.
-        predicted_class = torch.argmax(logits, dim=1).item()
-        logger.info(f"Predicted class: {predicted_class}")
-        return str(predicted_class)
     else:
-        model, tokenizer = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
-        # Use structured generation with outlines
         generator = generate.json(model, ResponseModel)
         # Generation is capped at 20 tokens; the `score` attribute of the result is returned.
-        result = generator(prompt, max_tokens=20)
-        logger.info(f"Generated result: {result}")
-        return result.score
-# @spaces.GPU
-def label_multi_responses_with_model(
-    model_id, story, question, criteria, response_file
-):
     # Pre-change (left-hand side) version of the batch grader.
     # Reads the CSV at response_file.name, requires a 'response' column, adds a
     # 'score' column and returns the DataFrame.
     df = pd.read_csv(response_file.name)
     assert "response" in df.columns, "CSV must contain a 'response' column."
     if "longformer" in model_id:
         model, tokenizer = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
         # One formatted prompt per response; all prompts are tokenized in a single
         # padded call, truncated to at most 4096 tokens.
-        prompts = [
-            format_prompt(story, question, criteria, resp) for resp in df["response"]
-        ]
-        inputs = tokenizer(prompts, return_tensors="pt", truncation=True, padding=True, max_length=4096)
         with torch.no_grad():
             logits = model(**inputs).logits
         # Per-row argmax over dim 1, converted to strings.
-        predicted_classes = torch.argmax(logits, dim=1).tolist()
-        scores = [str(cls) for cls in predicted_classes]
     else:
-        model, tokenizer = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
         generator = generate.json(model, ResponseModel)
         # Responses are graded one at a time, each generation capped at 20 tokens.
-        scores = []
-        for resp in df["response"]:
-            prompt = format_prompt(story, question, criteria, resp)
-            result = generator(prompt, max_tokens=20)
-            scores.append(result.score)
     df["score"] = scores
     return df
-# Rest of the code remains the same...

 import logging
 import textwrap
 from typing import Literal, Optional
 import gradio as gr
 import outlines
 import pandas as pd
     BitsAndBytesConfig,
 )
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 AVAILABLE_MODELS = [
     "rshwndsz/ft-longformer-base-4096",
     "rshwndsz/ft-hermes-3-llama-3.2-3b",
 ]
 DEFAULT_MODEL_ID = AVAILABLE_MODELS[0]
+DEVICE_MAP = "auto"  # passed as the device_map argument in every get_outlines_model call in the labeling functions
+QUANTIZATION_BITS = None  # passed as quantization_bits; None does not match the `== 4` check in get_outlines_model
 SYSTEM_PROMPT = textwrap.dedent("""
 You are an assistant tasked with grading answers to a mind reading ability test. You will be provided with the following information:
 1. A story that was presented to participants as context
 2. The question that participants were asked to answer
 3. A grading scheme to evaluate the answers (Correct Responses:1, incorrect response:0, Incomplete response:0, Irrelevant:0)
+4. A participant answer
 Your task is to grade each answer according to the grading scheme. For each answer, you should:
 1. Carefully read and understand the answer and compare it to the grading criteria
+2. Assign a score 1 or 0 for each answer.
 """).strip()
 PROMPT_TEMPLATE = textwrap.dedent("""
 </Answer>
 Score:""").strip()
+class ResponseModel(BaseModel):
     # Schema passed to generate.json(...) by both labeling functions.
     # model_config sets extra="forbid"; `score` is restricted to the strings "0" or "1".
+    model_config = ConfigDict(extra="forbid")
+    score: Literal["0", "1"]
 def get_outlines_model(
+    model_id: str, device_map: str = "auto", quantization_bits: Optional[int] = 4
 ):
     if quantization_bits == 4:
         quantization_config = BitsAndBytesConfig(
             load_in_4bit=True,
         base_model_id,
         device_map=device_map,
         quantization_config=quantization_config,
     )
     hf_model = PeftModel.from_pretrained(base_model, model_id)
     hf_tokenizer = AutoTokenizer.from_pretrained(
         base_model_id, use_fast=True, clean_up_tokenization_spaces=True
     )
+    model = outlines.from_transformers(hf_model, hf_tokenizer)
+    return model
 def format_prompt(story: str, question: str, grading_scheme: str, answer: str) -> str:
     return full_prompt
+@spaces.GPU
 def label_single_response_with_model(model_id, story, question, criteria, response):
     # Grade one response. Returns str(score) on the longformer path and
     # result["score"] on the generative path. The path is chosen by whether the
     # substring "longformer" occurs in model_id.
     prompt = format_prompt(story, question, criteria, response)
     if "longformer" in model_id:
         # NOTE(review): this branch unpacks two values, while the other branch takes a
         # single value from the same call and the visible end of get_outlines_model
         # returns one model object; confirm the longformer path there returns
         # (model, tokenizer).
         model, tokenizer = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
         # Only the raw response text is tokenized; `prompt` is not used on this branch.
         # NOTE(review): confirm the classifier is meant to see the response alone,
         # without story, question or criteria.
+        inputs = tokenizer(response, return_tensors="pt", truncation=True, padding=True)
         with torch.no_grad():
             logits = model(**inputs).logits
         # The width of logits along dim 1 decides how the score is derived.
+        if logits.shape[1] == 1:
+            # Regression-style: apply sigmoid threshold at 0.5
+            score = int(torch.sigmoid(logits).item() > 0.5)
+        else:
+            # Classification-style: argmax over 2 labels
+            score = torch.argmax(logits, dim=1).item()
+        return str(score)
     else:
+        model = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
         generator = generate.json(model, ResponseModel)
+        result = generator(prompt)
         # NOTE(review): the result is subscripted like a mapping; confirm the generator
         # returns a dict here rather than a ResponseModel instance (which would need
         # attribute access).
+        return result["score"]
+@spaces.GPU
+def label_multi_responses_with_model(model_id, story, question, criteria, response_file):
     # Grade every row of a CSV. Reads the file at response_file.name, requires a
     # 'response' column, writes the results into a new 'score' column and returns
     # the DataFrame.
     df = pd.read_csv(response_file.name)
     # NOTE(review): assert statements are removed when Python runs with -O, so this
     # check would be skipped there; consider raising an explicit exception instead.
     assert "response" in df.columns, "CSV must contain a 'response' column."
     # Prompts are built for every row up front, but only the generative branch reads them.
+    prompts = [
+        format_prompt(story, question, criteria, resp) for resp in df["response"]
+    ]
     if "longformer" in model_id:
         # NOTE(review): two values are unpacked here while the other branch takes a
         # single value from the same call; confirm get_outlines_model returns
         # (model, tokenizer) for longformer ids.
         model, tokenizer = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
         # All raw responses are tokenized in one padded call, so the model runs once
         # over the whole CSV.
+        inputs = tokenizer(df["response"].tolist(), return_tensors="pt", truncation=True, padding=True)
         with torch.no_grad():
             logits = model(**inputs).logits
         # Width 1 along dim 1: sigmoid thresholded at 0.5 per row; otherwise per-row argmax.
+        if logits.shape[1] == 1:
+            scores = [str(int(torch.sigmoid(l) > 0.5)) for l in logits]
+        else:
+            scores = [str(cls) for cls in torch.argmax(logits, dim=1).tolist()]
     else:
+        model = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
         generator = generate.json(model, ResponseModel)
         # Prompts are generated one at a time.
+        results = [generator(p) for p in prompts]
         # NOTE(review): each result is subscripted like a mapping; confirm the
         # generator returns dicts rather than ResponseModel instances.
+        scores = [r["score"] for r in results]
     df["score"] = scores
     return df
+with gr.Blocks(title="Zero-Shot Evaluation Grader") as iface:
     # One model dropdown, declared once and passed as the first input to both tabs below.
+    model_selector = gr.Dropdown(
+        label="Select Model",
+        choices=AVAILABLE_MODELS,
+        value=DEFAULT_MODEL_ID,
+    )
+    with gr.Tabs():
         # Tab 1: grade one typed-in response with label_single_response_with_model.
         # Input order matches that function's parameters:
         # model_id, story, question, criteria, response.
+        with gr.Tab("Single Response"):
+            gr.Interface(
+                fn=label_single_response_with_model,
+                inputs=[
+                    model_selector,
+                    gr.Textbox(label="Story", lines=6),
+                    gr.Textbox(label="Question", lines=2),
+                    gr.Textbox(label="Criteria (Grading Scheme)", lines=4),
+                    gr.Textbox(label="Single Response", lines=3),
+                ],
+                outputs=gr.Textbox(label="Score"),
+                live=False,
+            )
         # Tab 2: grade an uploaded CSV with label_multi_responses_with_model.
         # The upload is limited to .csv files and the result is shown as a dataframe.
+        with gr.Tab("Batch (CSV)"):
+            gr.Interface(
+                fn=label_multi_responses_with_model,
+                inputs=[
+                    model_selector,
+                    gr.Textbox(label="Story", lines=6),
+                    gr.Textbox(label="Question", lines=2),
+                    gr.Textbox(label="Criteria (Grading Scheme)", lines=4),
+                    gr.File(
+                        label="Responses CSV (.csv with 'response' column)",
+                        file_types=[".csv"]
+                    ),
+                ],
+                outputs=gr.Dataframe(label="Labeled Responses", type="pandas"),
+                live=False,
+            )
 # The app is launched, with share=True, only when this file is run as a script.
+if __name__ == "__main__":
+    iface.launch(share=True)