|  |  | 
					
						
						|  |  | 
					
						
						|  | import gradio as gr | 
					
						
						|  | import pandas as pd | 
					
						
						|  | import requests | 
					
						
						|  | import io | 
					
						
						|  | import dask.dataframe as dd | 
					
						
						|  | from datasets import load_dataset, Image | 
					
						
						|  | from mlcroissant import Dataset as CroissantDataset | 
					
						
						|  | from huggingface_hub import get_token | 
					
						
						|  | import polars as pl | 
					
						
						|  | import warnings | 
					
						
						|  | import traceback | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | warnings.filterwarnings("ignore") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
# Registry of the datasets exposed in the UI, keyed by a short dataset key.
# The key doubles as the tab title (capitalized) and is passed to fetch_data().
#
# Per-entry fields:
#   "name"       - Hugging Face repo id used to build every URL / hf:// path.
#   "emoji"      - decoration shown in the tab title and heading.
#   "search_col" - column name (str) or list of column names searched for the query.
#   "methods"    - access-method labels offered in the radio; the first is the default.
#                  fetch_data() dispatches on substrings of these labels
#                  ("API", "Pandas", "Polars", "Datasets", "Dask", "Croissant").
#   "is_public"  - when False, auth headers are sent on HTTP requests and the tab
#                  shows a "gated dataset" login note.
DATASET_CONFIG = {
    "caselaw": {
        "name": "common-pile/caselaw_access_project",
        "emoji": "βοΈ",
        "search_col": "text",
        "methods": ["π¨ API (requests)", "π§ Dask", "π₯ Croissant"],
        "is_public": True,
    },
    "prompts": {
        "name": "fka/awesome-chatgpt-prompts",
        "emoji": "π€",
        "search_col": ["act", "prompt"],
        "methods": ["πΌ Pandas", "π¨ API (requests)", "π₯ Croissant"],
        "is_public": True,
    },
    "finance": {
        "name": "snorkelai/agent-finance-reasoning",
        "emoji": "π°",
        "search_col": ["question", "answer"],
        "methods": ["πΌ Pandas", "π§ Polars", "π¨ API (requests)", "π₯ Croissant"],
        "is_public": False,
    },
    "medical": {
        "name": "FreedomIntelligence/medical-o1-reasoning-SFT",
        "emoji": "π©Ί",
        # fetch_data() special-cases this key and searches the second element of
        # each "conversations" list rather than doing a plain substring match.
        "search_col": "conversations",
        "methods": ["πΌ Pandas", "π§ Polars", "π¨ API (requests)", "π₯ Croissant"],
        "is_public": False,
    },
    "inscene": {
        "name": "peteromallet/InScene-Dataset",
        "emoji": "πΌοΈ",
        "search_col": "text",
        # This is the only entry whose tab shows the image gallery.
        "methods": ["π€ Datasets", "πΌ Pandas", "π§ Polars", "π¨ API (requests)", "π₯ Croissant"],
        "is_public": False,
    },
}
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def get_auth_headers(): | 
					
						
						|  | """π Creates authorization headers if a Hugging Face token is available.""" | 
					
						
						|  | token = get_token() | 
					
						
						|  | return {"Authorization": f"Bearer {token}"} if token else {} | 
					
						
						|  |  | 
					
						
						|  | def dataframe_to_outputs(df: pd.DataFrame): | 
					
						
						|  | """ | 
					
						
						|  | π Takes a DataFrame and magically transforms it into various formats for your viewing pleasure. | 
					
						
						|  | Like a data chameleon! | 
					
						
						|  | """ | 
					
						
						|  | if df.empty: | 
					
						
						|  | return "No results found. π€·", None, None, "No results to copy." | 
					
						
						|  |  | 
					
						
						|  | df_str = df.astype(str) | 
					
						
						|  | markdown_output = df_str.to_markdown(index=False) | 
					
						
						|  |  | 
					
						
						|  | csv_buffer = io.StringIO() | 
					
						
						|  | df.to_csv(csv_buffer, index=False) | 
					
						
						|  | csv_buffer.seek(0) | 
					
						
						|  |  | 
					
						
						|  | excel_buffer = io.BytesIO() | 
					
						
						|  | df.to_excel(excel_buffer, index=False, engine='openpyxl') | 
					
						
						|  | excel_buffer.seek(0) | 
					
						
						|  |  | 
					
						
						|  | tab_delimited_output = df.to_csv(sep='\t', index=False) | 
					
						
						|  |  | 
					
						
						|  | return markdown_output, gr.File.from_bytes(csv_buffer.getvalue(), "results.csv"), gr.File.from_bytes(excel_buffer.getvalue(), "results.xlsx"), tab_delimited_output | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def handle_error(e: Exception): | 
					
						
						|  | """ | 
					
						
						|  | π± Oh no! An error! This function catches it and displays it nicely. | 
					
						
						|  | Because even errors deserve to look good. | 
					
						
						|  | """ | 
					
						
						|  | error_message = f"π¨ An error occurred: {str(e)}\n\n" | 
					
						
						|  | auth_tip = "π For gated datasets, did you log in? Try `huggingface-cli login` in your terminal." | 
					
						
						|  | full_trace = traceback.format_exc() | 
					
						
						|  | print(full_trace) | 
					
						
						|  |  | 
					
						
						|  | if "401" in str(e) or "Gated" in str(e): | 
					
						
						|  | error_message += auth_tip | 
					
						
						|  |  | 
					
						
						|  | return ( | 
					
						
						|  | pd.DataFrame(), | 
					
						
						|  | gr.Gallery(None, label="πΌοΈ Image Results"), | 
					
						
						|  | f"```\n{error_message}\n\n{full_trace}\n```", | 
					
						
						|  | None, | 
					
						
						|  | None, | 
					
						
						|  | error_message, | 
					
						
						|  | f"```python\n# π¨ Error during code generation:\n# {e}\n```" | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def fetch_data(dataset_key: str, access_method: str, query: str): | 
					
						
						|  | """ | 
					
						
						|  | π The main mission control function! It fetches, searches, and formats data. | 
					
						
						|  | It's the brains of the operation. | 
					
						
						|  | """ | 
					
						
						|  | try: | 
					
						
						|  | config = DATASET_CONFIG[dataset_key] | 
					
						
						|  | repo_id = config["name"] | 
					
						
						|  | search_cols = [config["search_col"]] if isinstance(config["search_col"], str) else config["search_col"] | 
					
						
						|  | df = pd.DataFrame() | 
					
						
						|  | code_snippet = "" | 
					
						
						|  |  | 
					
						
						|  | if "API" in access_method: | 
					
						
						|  | url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset=0&length=100" | 
					
						
						|  | headers = get_auth_headers() if not config["is_public"] else {} | 
					
						
						|  | response = requests.get(url, headers=headers) | 
					
						
						|  | response.raise_for_status() | 
					
						
						|  | data = response.json() | 
					
						
						|  | df = pd.json_normalize(data['rows'], record_path='row', meta=['row_idx', 'truncated_cells']) | 
					
						
						|  | df = df.drop(columns=['row_idx', 'truncated_cells'], errors='ignore') | 
					
						
						|  |  | 
					
						
						|  | code_snippet = f""" | 
					
						
						|  | # π» Generated Code: API (requests) | 
					
						
						|  | import requests | 
					
						
						|  | import pandas as pd | 
					
						
						|  |  | 
					
						
						|  | # For gated datasets, get your token from https://huggingface.co/settings/tokens | 
					
						
						|  | # Make sure to `huggingface-cli login` first. | 
					
						
						|  | headers = {{"Authorization": "Bearer YOUR_HF_TOKEN"}} | 
					
						
						|  | url = "{url}" | 
					
						
						|  | response = requests.get(url, headers=headers) # Pass headers for gated datasets | 
					
						
						|  | data = response.json() | 
					
						
						|  | df = pd.json_normalize(data['rows'], record_path='row') | 
					
						
						|  | print(df.head()) | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | elif "Pandas" in access_method: | 
					
						
						|  | file_path = f"hf://datasets/{repo_id}/" | 
					
						
						|  | if repo_id == "fka/awesome-chatgpt-prompts": | 
					
						
						|  | file_path += "prompts.csv" | 
					
						
						|  | df = pd.read_csv(file_path) | 
					
						
						|  | else: | 
					
						
						|  | try: | 
					
						
						|  | df = pd.read_parquet(f"{file_path}data/train-00000-of-00001.parquet") | 
					
						
						|  | except: | 
					
						
						|  | try: | 
					
						
						|  | df = pd.read_parquet(f"{file_path}train.parquet") | 
					
						
						|  | except: | 
					
						
						|  | df = pd.read_json(f"{file_path}medical_o1_sft.json") | 
					
						
						|  |  | 
					
						
						|  | code_snippet = f""" | 
					
						
						|  | # π» Generated Code: Pandas | 
					
						
						|  | import pandas as pd | 
					
						
						|  |  | 
					
						
						|  | # Make sure to `huggingface-cli login` for gated datasets. | 
					
						
						|  | file_path = "{file_path}" | 
					
						
						|  | df = pd.{'read_csv' if '.csv' in file_path else ('read_json' if '.json' in file_path else 'read_parquet')}(file_path) | 
					
						
						|  | print(df.head()) | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | elif "Polars" in access_method: | 
					
						
						|  | file_path = f"hf://datasets/{repo_id}/" | 
					
						
						|  | try: | 
					
						
						|  | df = pl.read_parquet(f"{file_path}data/train-00000-of-00001.parquet").to_pandas() | 
					
						
						|  | except: | 
					
						
						|  | try: | 
					
						
						|  | df = pl.read_parquet(f"{file_path}train.parquet").to_pandas() | 
					
						
						|  | except: | 
					
						
						|  | df = pl.read_json(f"{file_path}medical_o1_sft.json").to_pandas() | 
					
						
						|  |  | 
					
						
						|  | code_snippet = f""" | 
					
						
						|  | # π» Generated Code: Polars | 
					
						
						|  | import polars as pl | 
					
						
						|  |  | 
					
						
						|  | # Make sure to `huggingface-cli login` for gated datasets. | 
					
						
						|  | file_path = "{'hf://datasets/' + repo_id + '/data/train-00000-of-00001.parquet'}" | 
					
						
						|  | df = pl.read_parquet(file_path) | 
					
						
						|  | print(df.head()) | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | elif "Datasets" in access_method: | 
					
						
						|  | ds = load_dataset(repo_id, split='train[:100]') | 
					
						
						|  | df = ds.to_pandas() | 
					
						
						|  | code_snippet = f""" | 
					
						
						|  | # π» Generated Code: Datasets | 
					
						
						|  | from datasets import load_dataset | 
					
						
						|  |  | 
					
						
						|  | # Make sure to `huggingface-cli login` for gated datasets. | 
					
						
						|  | ds = load_dataset("{repo_id}", split='train') | 
					
						
						|  | print(ds) | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | elif "Dask" in access_method: | 
					
						
						|  | df = dd.read_json(f"hf://datasets/{repo_id}/**/*.jsonl.gz").head(100) | 
					
						
						|  | code_snippet = f""" | 
					
						
						|  | # π» Generated Code: Dask | 
					
						
						|  | import dask.dataframe as dd | 
					
						
						|  |  | 
					
						
						|  | # Make sure to `huggingface-cli login` for gated datasets. | 
					
						
						|  | ddf = dd.read_json("hf://datasets/{repo_id}/**/*.jsonl.gz") | 
					
						
						|  | print(ddf.head()) | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | elif "Croissant" in access_method: | 
					
						
						|  | headers = get_auth_headers() if not config["is_public"] else {} | 
					
						
						|  | jsonld_url = f"https://huggingface.co/api/datasets/{repo_id}/croissant" | 
					
						
						|  | jsonld = requests.get(jsonld_url, headers=headers).json() | 
					
						
						|  | ds = CroissantDataset(jsonld=jsonld) | 
					
						
						|  | records = ds.records("default") | 
					
						
						|  | data_rows = [row for _, row in zip(range(100), records)] | 
					
						
						|  | df = pd.DataFrame(data_rows) | 
					
						
						|  | code_snippet = f""" | 
					
						
						|  | # π» Generated Code: Croissant | 
					
						
						|  | import requests | 
					
						
						|  | from mlcroissant import Dataset as CroissantDataset | 
					
						
						|  | import pandas as pd | 
					
						
						|  |  | 
					
						
						|  | # For gated datasets, get your token from https://huggingface.co/settings/tokens | 
					
						
						|  | headers = {{"Authorization": "Bearer YOUR_HF_TOKEN"}} | 
					
						
						|  | jsonld_url = "{jsonld_url}" | 
					
						
						|  | jsonld = requests.get(jsonld_url, headers=headers).json() | 
					
						
						|  | ds = CroissantDataset(jsonld=jsonld) | 
					
						
						|  | records = ds.records("default") # This is a generator | 
					
						
						|  |  | 
					
						
						|  | # To preview data: | 
					
						
						|  | preview_rows = [row for _, row in zip(range(100), records)] | 
					
						
						|  | df = pd.DataFrame(preview_rows) | 
					
						
						|  | print(df.head()) | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | if query and not df.empty: | 
					
						
						|  | if dataset_key == 'medical': | 
					
						
						|  | df = df[df['conversations'].apply(lambda x: isinstance(x, list) and len(x) > 1 and query.lower() in str(x[1].get('value', '')).lower())] | 
					
						
						|  | else: | 
					
						
						|  | combined_mask = pd.Series([False] * len(df)) | 
					
						
						|  | for col in search_cols: | 
					
						
						|  | if col in df.columns and pd.api.types.is_string_dtype(df[col]): | 
					
						
						|  | combined_mask |= df[col].str.contains(query, case=False, na=False) | 
					
						
						|  | df = df[combined_mask] | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | gallery_output = None | 
					
						
						|  | if dataset_key == 'inscene' and not df.empty: | 
					
						
						|  | gallery_data = [] | 
					
						
						|  | for _, row in df.iterrows(): | 
					
						
						|  | if isinstance(row.get('image'), Image.Image): | 
					
						
						|  | gallery_data.append((row['image'], row.get('text', ''))) | 
					
						
						|  | gallery_output = gr.Gallery(gallery_data, label="πΌοΈ Image Results", height=400) | 
					
						
						|  |  | 
					
						
						|  | md, csv, xlsx, tab = dataframe_to_outputs(df) | 
					
						
						|  | return df, gallery_output, md, csv, xlsx, tab, code_snippet | 
					
						
						|  |  | 
					
						
						|  | except Exception as e: | 
					
						
						|  | return handle_error(e) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def create_dataset_tab(dataset_key: str): | 
					
						
						|  | """ | 
					
						
						|  | ποΈ This function builds a whole tab in our UI for a single dataset. | 
					
						
						|  | It's like a little construction worker for Gradio interfaces. | 
					
						
						|  | """ | 
					
						
						|  | config = DATASET_CONFIG[dataset_key] | 
					
						
						|  |  | 
					
						
						|  | with gr.Tab(f"{config['emoji']} {dataset_key.capitalize()}"): | 
					
						
						|  | gr.Markdown(f"## {config['emoji']} Query the `{config['name']}` Dataset") | 
					
						
						|  | if not config['is_public']: | 
					
						
						|  | gr.Markdown("**Note:** This is a gated dataset. Please log in via `huggingface-cli login` in your terminal first.") | 
					
						
						|  |  | 
					
						
						|  | with gr.Row(): | 
					
						
						|  | access_method = gr.Radio(config['methods'], label="π Access Method", value=config['methods'][0]) | 
					
						
						|  | query = gr.Textbox(label="π Search Query", placeholder="Enter a keyword to search...") | 
					
						
						|  |  | 
					
						
						|  | fetch_button = gr.Button("π Go Fetch!") | 
					
						
						|  |  | 
					
						
						|  | df_output = gr.DataFrame(label="π Results DataFrame", interactive=False, wrap=True) | 
					
						
						|  | gallery_output = gr.Gallery(visible=(dataset_key == 'inscene'), label="πΌοΈ Image Results") | 
					
						
						|  |  | 
					
						
						|  | with gr.Accordion("π View/Export Full Results", open=False): | 
					
						
						|  | markdown_output = gr.Markdown(label="π Markdown View") | 
					
						
						|  | with gr.Row(): | 
					
						
						|  | csv_output = gr.File(label="β¬οΈ Download CSV") | 
					
						
						|  | xlsx_output = gr.File(label="β¬οΈ Download XLSX") | 
					
						
						|  |  | 
					
						
						|  | copy_output = gr.Code(label="π Copy-Paste (Tab-Delimited)") | 
					
						
						|  |  | 
					
						
						|  | code_output = gr.Code(label="π» Python Code Snippet", language="python") | 
					
						
						|  |  | 
					
						
						|  | fetch_button.click( | 
					
						
						|  | fn=fetch_data, | 
					
						
						|  | inputs=[gr.State(dataset_key), access_method, query], | 
					
						
						|  | outputs=[df_output, gallery_output, markdown_output, csv_output, xlsx_output, copy_output, code_output] | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as demo: | 
					
						
						|  | gr.Markdown("# π€ Hugging Face Dataset Explorer") | 
					
						
						|  | gr.Markdown( | 
					
						
						|  | "Select a dataset, choose an access method, type a query, and see the results instantly. " | 
					
						
						|  | "The app demonstrates various ways to access and search Hugging Face datasets and generates the code for you!" | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | with gr.Tabs(): | 
					
						
						|  | for key in DATASET_CONFIG.keys(): | 
					
						
						|  | create_dataset_tab(key) | 
					
						
						|  |  | 
					
						
						|  | if __name__ == "__main__": | 
					
						
						|  | demo.launch(debug=True) | 
					
						
						|  |  |