ActiveYixiao committed on
Commit 7660c81 · verified · 1 Parent(s): fe02206

Update app.py

Files changed (1): app.py (+80 -64)
app.py CHANGED
@@ -1,7 +1,7 @@
 import logging
 import textwrap
 from typing import Literal, Optional
-import os
+
 import gradio as gr
 import outlines
 import pandas as pd
@@ -17,16 +17,9 @@ from transformers import (
     BitsAndBytesConfig,
 )
 
-
-
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-MODEL_ID = "rshwndsz/ft-longformer-base-4096"
-DEVICE_MAP = "auto"
-QUANTIZATION_BITS = None
-TEMPERATURE = 0.0
-
 AVAILABLE_MODELS = [
     "rshwndsz/ft-longformer-base-4096",
     "rshwndsz/ft-hermes-3-llama-3.2-3b",
@@ -41,20 +34,18 @@ AVAILABLE_MODELS = [
 ]
 DEFAULT_MODEL_ID = AVAILABLE_MODELS[0]
 
-# Define response model
-class ResponseModel(BaseModel):
-    score: Literal["0", "1"]
+DEVICE_MAP = "auto"
+QUANTIZATION_BITS = None
 
 SYSTEM_PROMPT = textwrap.dedent("""
 You are an assistant tasked with grading answers to a mind reading ability test. You will be provided with the following information:
 1. A story that was presented to participants as context
 2. The question that participants were asked to answer
 3. A grading scheme to evaluate the answers (Correct Responses:1, incorrect response:0, Incomplete response:0, Irrelevant:0)
-4. Grading examples
-5. A participant answer
+4. A participant answer
 Your task is to grade each answer according to the grading scheme. For each answer, you should:
 1. Carefully read and understand the answer and compare it to the grading criteria
-2. Assigning an score 1 or 0 for each answer.
+2. Assign a score 1 or 0 for each answer.
 """).strip()
 
 PROMPT_TEMPLATE = textwrap.dedent("""
@@ -72,28 +63,15 @@ PROMPT_TEMPLATE = textwrap.dedent("""
 </Answer>
 Score:""").strip()
 
-if is_huggingface_space():
-    DEVICE_MAP = "cpu"
-    QUANTIZATION_BITS = None
-else:
-    DEVICE_MAP = "auto"
-    QUANTIZATION_BITS = 4  # or whatever you prefer for local deployment
-
-def is_huggingface_space():
-    return os.environ.get('SPACE_ID') is not None
 
+class ResponseModel(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+    score: Literal["0", "1"]
 
 
 def get_outlines_model(
-    model_id: str, device_map: str = "cpu", quantization_bits: Optional[int] = None
+    model_id: str, device_map: str = "auto", quantization_bits: Optional[int] = 4
 ):
-    # Skip quantization on CPU
-    if device_map == "cpu":
-        quantization_config = None
-    else:
-        # Your existing quantization logic
-        pass
-
     if quantization_bits == 4:
         quantization_config = BitsAndBytesConfig(
             load_in_4bit=True,
@@ -118,15 +96,14 @@ def get_outlines_model(
         base_model_id,
         device_map=device_map,
         quantization_config=quantization_config,
-        torch_dtype=torch.bfloat16,
     )
     hf_model = PeftModel.from_pretrained(base_model, model_id)
     hf_tokenizer = AutoTokenizer.from_pretrained(
         base_model_id, use_fast=True, clean_up_tokenization_spaces=True
     )
-    hf_tokenizer.pad_token = hf_tokenizer.eos_token
 
-    return hf_model, hf_tokenizer
+    model = outlines.from_transformers(hf_model, hf_tokenizer)
+    return model
 
 
 def format_prompt(story: str, question: str, grading_scheme: str, answer: str) -> str:
@@ -140,56 +117,95 @@ def format_prompt(story: str, question: str, grading_scheme: str, answer: str) -
     return full_prompt
 
 
-# @spaces.GPU
+@spaces.GPU
 def label_single_response_with_model(model_id, story, question, criteria, response):
     prompt = format_prompt(story, question, criteria, response)
-    logger.info(f"Prompt: {prompt}")
 
     if "longformer" in model_id:
         model, tokenizer = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
-        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=4096)
+        inputs = tokenizer(response, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            logits = model(**inputs).logits
-        predicted_class = torch.argmax(logits, dim=1).item()
-        logger.info(f"Predicted class: {predicted_class}")
-        return str(predicted_class)
+
+        if logits.shape[1] == 1:
+            # Regression-style: apply sigmoid threshold at 0.5
+            score = int(torch.sigmoid(logits).item() > 0.5)
+        else:
+            # Classification-style: argmax over 2 labels
+            score = torch.argmax(logits, dim=1).item()
+        return str(score)
+
    else:
-        model, tokenizer = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
-
-        # Use structured generation with outlines
+        model = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
        generator = generate.json(model, ResponseModel)
-        result = generator(prompt, max_tokens=20)
-        logger.info(f"Generated result: {result}")
-        return result.score
+        result = generator(prompt)
+        return result["score"]
 
 
-# @spaces.GPU
-def label_multi_responses_with_model(
-    model_id, story, question, criteria, response_file
-):
+@spaces.GPU
+def label_multi_responses_with_model(model_id, story, question, criteria, response_file):
     df = pd.read_csv(response_file.name)
     assert "response" in df.columns, "CSV must contain a 'response' column."
-
+    prompts = [
+        format_prompt(story, question, criteria, resp) for resp in df["response"]
+    ]
+
    if "longformer" in model_id:
        model, tokenizer = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
-        prompts = [
-            format_prompt(story, question, criteria, resp) for resp in df["response"]
-        ]
-        inputs = tokenizer(prompts, return_tensors="pt", truncation=True, padding=True, max_length=4096)
+        inputs = tokenizer(df["response"].tolist(), return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            logits = model(**inputs).logits
-        predicted_classes = torch.argmax(logits, dim=1).tolist()
-        scores = [str(cls) for cls in predicted_classes]
+        if logits.shape[1] == 1:
+            scores = [str(int(torch.sigmoid(l) > 0.5)) for l in logits]
+        else:
+            scores = [str(cls) for cls in torch.argmax(logits, dim=1).tolist()]
    else:
-        model, tokenizer = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
+        model = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
        generator = generate.json(model, ResponseModel)
-        scores = []
-        for resp in df["response"]:
-            prompt = format_prompt(story, question, criteria, resp)
-            result = generator(prompt, max_tokens=20)
-            scores.append(result.score)
+        results = [generator(p) for p in prompts]
+        scores = [r["score"] for r in results]
 
    df["score"] = scores
    return df
 
-# Rest of the code remains the same...
+
+with gr.Blocks(title="Zero-Shot Evaluation Grader") as iface:
+    model_selector = gr.Dropdown(
+        label="Select Model",
+        choices=AVAILABLE_MODELS,
+        value=DEFAULT_MODEL_ID,
+    )
+
+    with gr.Tabs():
+        with gr.Tab("Single Response"):
+            gr.Interface(
+                fn=label_single_response_with_model,
+                inputs=[
+                    model_selector,
+                    gr.Textbox(label="Story", lines=6),
+                    gr.Textbox(label="Question", lines=2),
+                    gr.Textbox(label="Criteria (Grading Scheme)", lines=4),
+                    gr.Textbox(label="Single Response", lines=3),
+                ],
+                outputs=gr.Textbox(label="Score"),
+                live=False,
+            )
+        with gr.Tab("Batch (CSV)"):
+            gr.Interface(
+                fn=label_multi_responses_with_model,
+                inputs=[
+                    model_selector,
+                    gr.Textbox(label="Story", lines=6),
+                    gr.Textbox(label="Question", lines=2),
+                    gr.Textbox(label="Criteria (Grading Scheme)", lines=4),
+                    gr.File(
+                        label="Responses CSV (.csv with 'response' column)",
+                        file_types=[".csv"]
+                    ),
+                ],
+                outputs=gr.Dataframe(label="Labeled Responses", type="pandas"),
+                live=False,
+            )
+
+if __name__ == "__main__":
+    iface.launch(share=True)
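
For the longformer branch, the updated code accepts either classifier head shape: a single-logit head is squashed with a sigmoid and thresholded at 0.5, while a two-logit head is argmaxed. Below is a minimal standalone sketch of that scoring step; loading the checkpoint directly with AutoModelForSequenceClassification is an assumption made only for illustration, since app.py actually attaches it as a PEFT adapter on a base model inside get_outlines_model.

# Sketch of the logits-to-score step from label_single_response_with_model.
# Assumption: the checkpoint loads as a plain sequence-classification model;
# in app.py it is wrapped via PeftModel inside get_outlines_model instead.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_id = "rshwndsz/ft-longformer-base-4096"  # first entry of AVAILABLE_MODELS
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)
model.eval()

def score_response(text: str) -> str:
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits  # shape: (1, num_labels)
    if logits.shape[1] == 1:
        # Regression-style head: sigmoid, then threshold at 0.5
        return str(int(torch.sigmoid(logits).item() > 0.5))
    # Classification-style head: argmax over the label logits
    return str(torch.argmax(logits, dim=1).item())

print(score_response("She thinks the gift is still in the drawer."))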
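
The non-longformer branch constrains generation to the ResponseModel schema. Below is a minimal sketch of that schema on its own, assuming pydantic v2; whether outlines hands back an already-parsed object or a raw JSON string depends on the outlines version, so the explicit model_validate_json call is illustrative rather than what generate.json necessarily returns.

# Sketch of the constrained output schema used by the generator branch.
# Assumption: pydantic v2; the raw JSON string stands in for a model completion.
from typing import Literal
from pydantic import BaseModel, ConfigDict

class ResponseModel(BaseModel):
    model_config = ConfigDict(extra="forbid")  # reject unexpected keys
    score: Literal["0", "1"]                   # only "0" or "1" validate

raw = '{"score": "1"}'  # e.g. a structured completion from the LLM
parsed = ResponseModel.model_validate_json(raw)
print(parsed.score)  # -> "1"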