from dataclasses import dataclass
from typing import Any, ClassVar, List

from llmdataparser.base_parser import (
    DatasetDescription,
    EvaluationMetric,
    HuggingFaceDatasetParser,
    HuggingFaceParseEntry,
)


@dataclass(frozen=True, kw_only=True, slots=True)
class BBHParseEntry(HuggingFaceParseEntry):
    """Custom entry class for BBH (Big Bench Hard), with fields specific to this dataset."""

    @classmethod
    def create(
        cls,
        question: str,
        answer: str,
        raw_question: str,
        raw_answer: str,
        task_name: str,
    ) -> "BBHParseEntry":
        return cls(
            question=question,
            answer=answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            task_name=task_name,
        )

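# Hedged illustration (the values are invented, not drawn from the dataset) of the
# entry that process_entry() below produces: the raw target keeps its parentheses,
# while the cleaned answer has them stripped.
#
#   BBHParseEntry.create(
#       question="Which option names the red object?",
#       answer="A",
#       raw_question="Which option names the red object?",
#       raw_answer="(A)",
#       task_name="reasoning_about_colored_objects",
#   )
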
class BBHDatasetParser(HuggingFaceDatasetParser[BBHParseEntry]):
    """Parser for the Big Bench Hard dataset."""

    _data_source: ClassVar[str] = "lukaemon/bbh"
    _task_names: ClassVar[list[str]] = [
        "boolean_expressions",
        "causal_judgement",
        "date_understanding",
        "disambiguation_qa",
        "dyck_languages",
        "formal_fallacies",
        "geometric_shapes",
        "hyperbaton",
        "logical_deduction_five_objects",
        "logical_deduction_seven_objects",
        "logical_deduction_three_objects",
        "movie_recommendation",
        "multistep_arithmetic_two",
        "navigate",
        "object_counting",
        "penguins_in_a_table",
        "reasoning_about_colored_objects",
        "ruin_names",
        "salient_translation_error_detection",
        "snarks",
        "sports_understanding",
        "temporal_sequences",
        "tracking_shuffled_objects_five_objects",
        "tracking_shuffled_objects_seven_objects",
        "tracking_shuffled_objects_three_objects",
        "web_of_lies",
        "word_sorting",
    ]
    _default_task: ClassVar[str] = "reasoning_about_colored_objects"

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> BBHParseEntry:
        """Process a single BBH entry."""
        raw_question = row["input"]
        raw_answer = row["target"]

        # Remove surrounding parentheses from the target, e.g. "(A)" -> "A"
        clean_answer = raw_answer.strip("()")

        question = str(raw_question)

        # Prefer an explicitly supplied task name, otherwise fall back to the
        # task the parser is currently loaded with
        task = task_name or self._get_current_task(row)

        return BBHParseEntry.create(
            question=question,
            answer=clean_answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            task_name=task,
        )

    def get_dataset_description(self) -> DatasetDescription:
| | """Returns a description of the Big Bench Hard dataset.""" |
| | return DatasetDescription.create( |
| | name="Big Bench Hard (BBH)", |
| | purpose="A curated subset of 23 challenging BIG-Bench tasks where language models initially performed below average human-rater performance", |
| | source="https://github.com/suzgunmirac/BIG-Bench-Hard", |
| | language="English", |
| | format="Multiple choice questions with single correct answers", |
| | characteristics=( |
| | "Tasks require complex multi-step reasoning and were selected based on " |
| | "initial model performance below human baseline. Performance can be " |
| | "significantly improved through chain-of-thought prompting. The dataset " |
| | "includes 23 core tasks plus additional related tasks." |
| | ), |
| | category=["Advanced Reasoning"], |
| | citation=( |
| | "@article{suzgun2022challenging,\n" |
| | " title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them},\n" |
| | ' author={Suzgun, Mirac and Scales, Nathan and Sch{"a}rli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc V and Chi, Ed H and Zhou, Denny and Wei, Jason},\n' |
| | " journal={arXiv preprint arXiv:2210.09261},\n" |
| | " year={2022}\n" |
| | "}" |
| | ), |
| | additional_info={ |
| | "model_performance": ( |
| | "With chain-of-thought prompting, PaLM surpassed human performance on " |
| | "10/23 tasks, while Codex surpassed human performance on 17/23 tasks" |
| | ), |
| | "size": "6.5k examples across 27 tasks (23 core + 4 related)", |
| | }, |
| | ) |
| |
|
| | def get_evaluation_metrics(self) -> List[EvaluationMetric]: |
| | """Returns the recommended evaluation metrics for BBH dataset.""" |
| | return [ |
| | EvaluationMetric.create( |
| | name="accuracy", |
| | type="classification", |
| | description="Proportion of exactly correct answers (after stripping parentheses)", |
| | implementation="evaluate.load('accuracy')", |
| | primary=True, |
| | ), |
| | EvaluationMetric.create( |
| | name="human_eval_delta", |
| | type="comparison", |
| | description="Difference between model accuracy and average human-rater performance baseline", |
| | implementation="custom_human_baseline_comparison", |
| | primary=True, |
| | ), |
| | EvaluationMetric.create( |
| | name="per_task_accuracy", |
| | type="classification", |
| | description="Accuracy broken down by individual reasoning tasks", |
| | implementation="custom_task_accuracy", |
| | primary=False, |
| | ), |
| | EvaluationMetric.create( |
| | name="exact_match", |
| | type="string_match", |
| | description="Strict exact match between predicted and target answers", |
| | implementation="evaluate.load('exact_match')", |
| | primary=False, |
| | ), |
| | ] |
| |
|
| |
|
| | if __name__ == "__main__": |
    # Example usage: load and parse one BBH task, then inspect the first entry
    parser = BBHDatasetParser()

    # Load the dataset for a specific task
    parser.load(task_name="reasoning_about_colored_objects")

    # Parse the loaded data
    parser.parse()

    # Retrieve the parsed entries
    parsed_data = parser.get_parsed_data

    # Print an example entry
    if parsed_data:
        example = parsed_data[0]
        print("\nExample parsed entry:")
        print(f"Task: {example.task_name}")
        print(f"Question: {example.question}")
        print(f"Answer: {example.answer}")
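    # A minimal, hedged sketch of scoring parsed entries with the exact_match
    # metric that get_evaluation_metrics() points at (evaluate.load('exact_match')).
    # It assumes the `evaluate` package is installed; real predictions would come
    # from a model, so the gold answers are echoed here purely as a placeholder.
    if parsed_data:
        import evaluate

        exact_match = evaluate.load("exact_match")
        predictions = [entry.answer for entry in parsed_data]  # placeholder for model outputs
        references = [entry.answer for entry in parsed_data]
        scores = exact_match.compute(predictions=predictions, references=references)
        print(f"Exact match (gold vs. gold sanity check): {scores['exact_match']:.3f}")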