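# Gradio Space: zero-shot grading of mind-reading test answers with fine-tuned
# models — either a Longformer sequence classifier or PEFT-adapted causal LLMs
# decoded through outlines' constrained JSON generation.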
import logging
import textwrap
from typing import Literal, Optional

import gradio as gr
import outlines
import pandas as pd
import spaces
import torch
from peft import PeftConfig, PeftModel
from pydantic import BaseModel, ConfigDict
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

MODEL_ID = "rshwndsz/ft-longformer-base-4096"
DEVICE_MAP = "auto"
QUANTIZATION_BITS = 4
TEMPERATURE = 0.0

AVAILABLE_MODELS = [
    "rshwndsz/ft-longformer-base-4096",
    "rshwndsz/ft-hermes-3-llama-3.2-3b",
    "rshwndsz/ft-phi-3.5-mini-instruct",
    "rshwndsz/ft-mistral-7b-v0.3-instruct",
    "rshwndsz/ft-phi-4",
    "rshwndsz/ft_paraphrased-hermes-3-llama-3.2-3b",
    "rshwndsz/ft_paraphrased-longformer-base-4096",
    "rshwndsz/ft_paraphrased-phi-3.5-mini-instruct",
    "rshwndsz/ft_paraphrased-mistral-7b-v0.3-instruct",
    "rshwndsz/ft_paraphrased-phi-4",
]
DEFAULT_MODEL_ID = AVAILABLE_MODELS[0]

# Exact SYSTEM_PROMPT from training data (kept verbatim, including phrasing)
SYSTEM_PROMPT = textwrap.dedent("""
You are an assistant tasked with grading answers to a mind reading ability test. You will be provided with the following information:
1. A story that was presented to participants as context
2. The question that participants were asked to answer
3. A grading scheme to evaluate the answers (Correct Responses:1, incorrect response:0, Incomplete response:0, Irrelevant:0)
4. Grading examples
5. A participant answer
Your task is to grade each answer according to the grading scheme. For each answer, you should:
1. Carefully read and understand the answer and compare it to the grading criteria
2. Assigning an score 1 or 0 for each answer.
""").strip()

# Exact PROMPT_TEMPLATE from training data
PROMPT_TEMPLATE = textwrap.dedent("""
<Story>
{story}
</Story>
<Question>
{question}
</Question>
<GradingScheme>
{grading_scheme}
</GradingScheme>
<Answer>
{answer}
</Answer>
Score:""").strip()

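# Response schema for constrained decoding: the generator may only emit a JSON
# object with a single "score" field whose value is "0" or "1".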
class ResponseModel(BaseModel):
    model_config = ConfigDict(extra="forbid")
    score: Literal["0", "1"]

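# Load a grading model. Longformer checkpoints are plain sequence classifiers;
# all other checkpoints are PEFT (LoRA) adapters on a causal-LM base that get
# wrapped in outlines for constrained generation.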
def get_outlines_model(
    model_id: str, device_map: str = "auto", quantization_bits: Optional[int] = 4
):
    if quantization_bits == 4:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization_bits == 8:
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    else:
        quantization_config = None

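    # Sequence-classification path: return the raw HF model and tokenizer
    # (the caller runs argmax over the logits instead of text generation).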
| if "longformer" in model_id: | |
| hf_model = AutoModelForSequenceClassification.from_pretrained(model_id) | |
| hf_tokenizer = AutoTokenizer.from_pretrained(model_id) | |
| return hf_model, hf_tokenizer | |
| peft_config = PeftConfig.from_pretrained(model_id) | |
| base_model_id = peft_config.base_model_name_or_path | |
| base_model = AutoModelForCausalLM.from_pretrained( | |
| base_model_id, | |
| device_map=device_map, | |
| quantization_config=quantization_config, | |
| ) | |
| hf_model = PeftModel.from_pretrained(base_model, model_id) | |
| hf_tokenizer = AutoTokenizer.from_pretrained( | |
| base_model_id, use_fast=True, clean_up_tokenization_spaces=True | |
| ) | |
| # Updated for new outlines API | |
| model = outlines.models.Transformers(hf_model, hf_tokenizer) | |
| return model | |
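# Build the full prompt exactly as during training: the system prompt followed
# by the filled-in story/question/grading-scheme/answer template.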
def format_prompt(story: str, question: str, grading_scheme: str, answer: str) -> str:
    # Exact format used during training
    prompt = PROMPT_TEMPLATE.format(
        story=story.strip(),
        question=question.strip(),
        grading_scheme=grading_scheme.strip(),
        answer=answer.strip(),
    )
    # Exact concatenation used during training
    full_prompt = SYSTEM_PROMPT + "\n" + prompt
    return full_prompt

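# Grade a single free-text response with the selected model and return the
# score as a string ("0" or "1").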
def label_single_response_with_model(model_id, story, question, criteria, response):
    prompt = format_prompt(story, question, criteria, response)
    if "longformer" in model_id:
        model, tokenizer = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            logits = model(**inputs).logits
        predicted_class = torch.argmax(logits, dim=1).item()
        return str(predicted_class)
    else:
        model = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
        # Updated for new outlines API
        generator = outlines.generate.json(model, ResponseModel)
        result = generator(prompt)
        return result.score

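# Grade every row of an uploaded CSV (one answer per row in a 'response'
# column) and return the dataframe with an added 'score' column.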
def label_multi_responses_with_model(
    model_id, story, question, criteria, response_file
):
    df = pd.read_csv(response_file.name)
    assert "response" in df.columns, "CSV must contain a 'response' column."
    prompts = [
        format_prompt(story, question, criteria, resp) for resp in df["response"]
    ]
    if "longformer" in model_id:
        model, tokenizer = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
        inputs = tokenizer(prompts, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            logits = model(**inputs).logits
        predicted_classes = torch.argmax(logits, dim=1).tolist()
        scores = [str(cls) for cls in predicted_classes]
    else:
        model = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
        # Updated for new outlines API
        generator = outlines.generate.json(model, ResponseModel)
        results = generator(prompts)
        scores = [r.score for r in results]
    df["score"] = scores
    return df

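# Gradio Interface for grading one response at a time. Note: model_id is the
# gr.State component, so model_id.value is its initial value (DEFAULT_MODEL_ID)
# rather than the per-session dropdown selection.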
def single_response_ui(model_id):
    return gr.Interface(
        fn=lambda story, question, criteria, response: label_single_response_with_model(
            model_id.value, story, question, criteria, response
        ),
        inputs=[
            gr.Textbox(label="Story", lines=6),
            gr.Textbox(label="Question", lines=2),
            gr.Textbox(label="Criteria (Grading Scheme)", lines=4),
            gr.Textbox(label="Single Response", lines=3),
        ],
        outputs=gr.Textbox(label="Score"),
        live=False,
    )

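# Gradio Interface for batch grading from a CSV upload; same model_id.value
# caveat as above.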
def multi_response_ui(model_id):
    return gr.Interface(
        fn=lambda story, question, criteria, response_file: label_multi_responses_with_model(
            model_id.value, story, question, criteria, response_file
        ),
        inputs=[
            gr.Textbox(label="Story", lines=6),
            gr.Textbox(label="Question", lines=2),
            gr.Textbox(label="Criteria (Grading Scheme)", lines=4),
            gr.File(
                label="Responses CSV (.csv with 'response' column)", file_types=[".csv"]
            ),
        ],
        outputs=gr.Dataframe(label="Labeled Responses", type="pandas"),
        live=False,
    )

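# Top-level app: a model selector dropdown whose choice is mirrored into a
# gr.State, plus one tab per grading mode.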
with gr.Blocks(title="Zero-Shot Evaluation Grader") as iface:
    model_selector = gr.Dropdown(
        label="Select Model",
        choices=AVAILABLE_MODELS,
        value=AVAILABLE_MODELS[0],
    )
    selected_model_id = gr.State(value=DEFAULT_MODEL_ID)

    def update_model_id(choice):
        return choice

    model_selector.change(
        fn=update_model_id, inputs=model_selector, outputs=selected_model_id
    )

    with gr.Tabs():
        with gr.Tab("Single Response"):
            single_response_ui(selected_model_id)
        with gr.Tab("Batch (CSV)"):
            multi_response_ui(selected_model_id)


if __name__ == "__main__":
    iface.launch(share=True)