Spaces:

kgourgou
/

llm-decoders

Running

App Files Files Community

kgourgou committed on Feb 16

Commit

c816679

verified ·

1 Parent(s): eab7f9b

Update app.py

Browse files

Files changed (1) hide show

app.py +117 -5

app.py CHANGED Viewed

@@ -14,7 +14,8 @@ torch.set_num_threads(2)
 def min_p_sampling(logits, pbase=0.1):
     """
-    Perform min-p sampling on the logits.
     Args:
         logits (torch.Tensor): 1D tensor of logits for the next token.
@@ -47,6 +48,96 @@ def min_p_sampling(logits, pbase=0.1):
     return sampled_index.item()
 def generate_completion(prompt, strategy, params):
     """
     Generate a complete answer using model.generate with specified parameters.
@@ -59,12 +150,12 @@ def generate_completion(prompt, strategy, params):
         # Generate the output.
         output_ids = model.generate(
-            input_ids, attention_mask=attention_mask, max_length=50, **params
         )
     return tokenizer.decode(output_ids[0], skip_special_tokens=True)
-def generate_min_p_completion(prompt, pbase=0.1, max_length=50):
     input_ids = tokenizer.encode(prompt, return_tensors="pt")
     past = None
     with torch.no_grad():
@@ -94,7 +185,7 @@ def generate_all(prompt):
         "Greedy": {"type": "default", "params": {"do_sample": False}},
         "Top-k Sampling": {
             "type": "default",
-            "params": {"do_sample": True, "top_k": 50},
         },
         "Top-p Sampling": {
             "type": "default",
@@ -113,6 +204,14 @@ def generate_all(prompt):
             "params": {"do_sample": True, "epsilon_cutoff": 0.2},
         },
         "Min-p Sampling": {"type": "min_p", "pbase": 0.1},
     }
     # Define the order for display.
@@ -124,6 +223,8 @@ def generate_all(prompt):
         "Min-p Sampling",
         "Eta Sampling",
         "Epsilon Sampling",
     ]
     results = {method: None for method in methods}
@@ -142,6 +243,11 @@ def generate_all(prompt):
                 future = executor.submit(
                     generate_min_p_completion, prompt, info["pbase"]
                 )
             future_to_method[future] = method
         # As each future completes, update its result and yield the current state.
@@ -169,9 +275,15 @@ interface = gr.Interface(
         gr.Textbox(label="Top-k Sampling"),
         gr.Textbox(label="Top-p Sampling"),
         gr.Textbox(label="Beam Search"),
-        gr.Textbox(label="Min-p Sampling"),
         gr.Textbox(label="Eta Sampling"),
         gr.Textbox(label="Epsilon Sampling"),
     ],
     title="Decoding Methods Comparison",
     description="Each decoding method's final answer is printed as soon as it is done. Model used: GPT-2.",

 def min_p_sampling(logits, pbase=0.1):
     """
+    Perform min-p sampling on the logits. As described in
+    https://arxiv.org/abs/2407.01082
     Args:
         logits (torch.Tensor): 1D tensor of logits for the next token.
     return sampled_index.item()
+def generate_laconic_completion(prompt: str, n: int = 5, max_length: int = 100):
+    """
+    Laconic decoding: sample ``n`` completions and return the shortest one.
+
+    Args:
+        prompt (str): Text to continue.
+        n (int): Number of completions to sample. Must be at least 1.
+        max_length (int): Passed to ``model.generate`` as ``max_length``.
+
+    Returns:
+        str: The decoded completion with the fewest characters.
+
+    Raises:
+        ValueError: If ``n`` is smaller than 1.
+    """
+    if n < 1:
+        raise ValueError(f"n must be at least 1, got {n}")
+    # Encode the prompt once; the attention mask is forwarded explicitly.
+    encoded = tokenizer(prompt, return_tensors="pt")
+    with torch.no_grad():
+        # The candidates are sampled (do_sample=True), not decoded greedily;
+        # sampling is what makes the n returned sequences differ.
+        outputs = model.generate(
+            encoded["input_ids"],
+            attention_mask=encoded["attention_mask"],
+            max_length=max_length,
+            num_return_sequences=n,
+            do_sample=True,
+        )
+    completions = [
+        tokenizer.decode(output, skip_special_tokens=True) for output in outputs
+    ]
+    # "Shortest" is measured on the decoded text, i.e. in characters.
+    return min(completions, key=len)
+def generate_with_confidence(input_ids, max_length, attention_mask=None):
+    """
+    Generate a sequence using greedy decoding while returning the scores.
+
+    Args:
+        input_ids (torch.Tensor): Prompt token ids handed to ``model.generate``.
+        max_length (int): Passed to ``model.generate`` as ``max_length``.
+        attention_mask (torch.Tensor, optional): Mask matching ``input_ids``.
+            When omitted, every position is treated as a real token.
+
+    Returns:
+        The object returned by ``model.generate`` when called with
+        ``output_scores=True`` and ``return_dict_in_generate=True``.
+    """
+    if attention_mask is None:
+        # A missing mask means the prompt is assumed to be unpadded; passing
+        # an explicit all-ones mask spares generate() from guessing it.
+        attention_mask = torch.ones_like(input_ids)
+    return model.generate(
+        input_ids,
+        attention_mask=attention_mask,
+        max_length=max_length,
+        do_sample=False,
+        output_scores=True,
+        return_dict_in_generate=True,
+    )
+def compute_answer_confidence(outputs):
+    """
+    Average gap between the best and second-best score of every generated step.
+
+    Args:
+        outputs: Object exposing ``scores``, an iterable with one score tensor
+            per generated token. Only row 0 of each tensor is read
+            (presumably the single sequence of a batch of one -- confirm
+            against the caller).
+
+    Returns:
+        float: Mean of (top-1 score - top-2 score) over all steps, or 0.0
+        when ``outputs.scores`` is empty.
+    """
+    gaps = []
+    for step_scores in outputs.scores:
+        # The two largest entries of the first row, in descending order.
+        best, runner_up = torch.topk(step_scores[0], 2).values
+        gaps.append((best - runner_up).item())
+    if not gaps:
+        return 0.0
+    return sum(gaps) / len(gaps)
+def cot_decoding(prompt, k=5, max_length=100):
+    """
+    Perform Chain-of-Thought (CoT) decoding by exploring top-k alternative paths.
+
+    Each of the k highest-scoring next tokens after the prompt starts its own
+    continuation via ``generate_with_confidence``; the continuation with the
+    highest ``compute_answer_confidence`` value is returned.
+
+    Args:
+        prompt (str): Text to continue.
+        k (int): Number of alternative first tokens to explore. Must be at
+            least 1; values above the vocabulary size are capped.
+        max_length (int): Added to the length of the extended prompt to form
+            the ``max_length`` handed to ``generate_with_confidence``.
+
+    Returns:
+        str: Decoded text of the most confident path.
+
+    Raises:
+        ValueError: If ``k`` is smaller than 1.
+    """
+    if k < 1:
+        raise ValueError(f"k must be at least 1, got {k}")
+    input_ids = tokenizer.encode(prompt, return_tensors="pt")
+    # Get logits for the next token
+    with torch.no_grad():
+        outputs = model(input_ids)
+    logits = outputs.logits[0, -1, :]
+    # torch.topk raises when k exceeds the number of entries, so cap it.
+    candidate_tokens = torch.topk(logits, min(k, logits.shape[-1])).indices
+    paths = []
+    for token in candidate_tokens:
+        # Append the candidate token to the prompt
+        new_input_ids = torch.cat([input_ids, token.view(1, 1)], dim=1)
+        # Generate a full sequence with output scores; the length cap is
+        # expressed relative to the already extended prompt.
+        gen_outputs = generate_with_confidence(
+            new_input_ids, max_length=new_input_ids.shape[1] + max_length
+        )
+        # Decode the generated sequence
+        generated_text = tokenizer.decode(
+            gen_outputs.sequences[0], skip_special_tokens=True
+        )
+        # Compute answer confidence
+        confidence = compute_answer_confidence(gen_outputs)
+        paths.append({"text": generated_text, "confidence": confidence})
+    # On ties, max() keeps the earliest (highest-ranked) candidate.
+    return max(paths, key=lambda path: path["confidence"])["text"]
 def generate_completion(prompt, strategy, params):
     """
     Generate a complete answer using model.generate with specified parameters.
         # Generate the output.
         output_ids = model.generate(
+            input_ids, attention_mask=attention_mask, max_length=100, **params
         )
     return tokenizer.decode(output_ids[0], skip_special_tokens=True)
+def generate_min_p_completion(prompt, pbase=0.1, max_length=100):
     input_ids = tokenizer.encode(prompt, return_tensors="pt")
     past = None
     with torch.no_grad():
         "Greedy": {"type": "default", "params": {"do_sample": False}},
         "Top-k Sampling": {
             "type": "default",
+            "params": {"do_sample": True, "top_k": 100},
         },
         "Top-p Sampling": {
             "type": "default",
             "params": {"do_sample": True, "epsilon_cutoff": 0.2},
         },
         "Min-p Sampling": {"type": "min_p", "pbase": 0.1},
+        "laconic": {
+            "type": "default",
+            "params": {"do_sample": True, "num_return_sequences": 5},
+        },
+        "COT Decoding": {
+            "type": "cot_decoding",
+            "params": {"k": 5, "max_length": 100},
+        },
     }
     # Define the order for display.
         "Min-p Sampling",
         "Eta Sampling",
         "Epsilon Sampling",
+        "laconic",
+        "COT Decoding",
     ]
     results = {method: None for method in methods}
                 future = executor.submit(
                     generate_min_p_completion, prompt, info["pbase"]
                 )
+            elif method == "laconic":
+                future = executor.submit(generate_laconic_completion, prompt)
+            elif method == "COT Decoding":
+                future = executor.submit(cot_decoding, prompt, **info["params"])
             future_to_method[future] = method
         # As each future completes, update its result and yield the current state.
         gr.Textbox(label="Top-k Sampling"),
         gr.Textbox(label="Top-p Sampling"),
         gr.Textbox(label="Beam Search"),
+        gr.Textbox(label="Min-p Sampling (as in https://arxiv.org/abs/2407.01082)"),
         gr.Textbox(label="Eta Sampling"),
         gr.Textbox(label="Epsilon Sampling"),
+        gr.Textbox(
+            label="laconic decoding (by Alex Dimakis, 2025, search for twitter thread)"
+        ),
+        gr.Textbox(
+            label="COT Decoding (Chain-of-Thought Reasoning without Prompting, Wang, Zhou, 2024)"
+        ),
     ],
     title="Decoding Methods Comparison",
     description="Each decoding method's final answer is printed as soon as it is done. Model used: GPT-2.",