Spaces:
Running
Running
Abhipsha Das
committed on
add files
Browse files- data/databases/README.md +32 -0
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-311.pyc +0 -0
- src/eval/__init__.py +0 -0
- src/eval/metrics.py +87 -0
- src/processing/__init__.py +0 -0
- src/processing/__pycache__/__init__.cpython-311.pyc +0 -0
- src/processing/__pycache__/extractions.cpython-311.pyc +0 -0
- src/processing/__pycache__/generate.cpython-311.pyc +0 -0
- src/processing/extractions.py +65 -0
- src/processing/generate.py +226 -0
- src/utils/__init__.py +0 -0
- src/utils/__pycache__/__init__.cpython-311.pyc +0 -0
- src/utils/__pycache__/utils.cpython-311.pyc +0 -0
- src/utils/utils.py +155 -0
data/databases/README.md
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- This folder contains all the SQL databases for the different processed data along with their raw data.
|
| 2 |
+
|
| 3 |
+
- The databases are named after the arXiv category and the format of the generated data.
|
| 4 |
+
|
| 5 |
+
Each file in this folder is a database containing 2 tables:
|
| 6 |
+
- **papers**
|
| 7 |
+
|
| 8 |
+
The papers data from the `raw` folder that was fed to the model.
|
| 9 |
+
|
| 10 |
+
SCHEMA:
|
| 11 |
+
- paper_id TEXT PRIMARY KEY,
|
| 12 |
+
- abstract TEXT,
|
| 13 |
+
- authors TEXT,
|
| 14 |
+
- primary_category TEXT,
|
| 15 |
+
- url TEXT,
|
| 16 |
+
- updated_on TEXT,
|
| 17 |
+
- sentence_count INTEGER
|
| 18 |
+
|
| 19 |
+
- **predictions**
|
| 20 |
+
|
| 21 |
+
The corresponding model generations stored in the `results` folder.
|
| 22 |
+
|
| 23 |
+
SCHEMA:
|
| 24 |
+
- id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 25 |
+
- paper_id TEXT,
|
| 26 |
+
- sentence_index INTEGER,
|
| 27 |
+
- tag_type TEXT,
|
| 28 |
+
- concept TEXT,
|
| 29 |
+
- FOREIGN KEY (paper_id) REFERENCES papers(paper_id)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
To query any database, run the `sqlite3` command-line tool in your terminal with the database file as its argument (e.g. `sqlite3 <database-file>`).
|
src/__init__.py
ADDED
|
File without changes
|
src/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (155 Bytes). View file
|
|
|
src/eval/__init__.py
ADDED
|
File without changes
|
src/eval/metrics.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def classify_predictions(gold: dict, pred: dict, union=False) -> tuple[int, int, int]:
    """Count true positives, false positives, and false negatives for one example.

    Args:
        gold: mapping of tag name -> list of gold phrases.
        pred: mapping of tag name -> list of predicted phrases.
        union: if True, disregard tag types and compare the union of all
            phrases across tags.

    Returns:
        (n_tp, n_fp, n_fn) counts as integers.  (The previous annotation
        claimed ``Dict[str, float]``, which did not match the actual return.)
    """
    n_tp = 0
    n_fp = 0
    n_fn = 0
    if union:
        # Pool phrases across all tags so a phrase counts as correct
        # regardless of which tag it was assigned to.
        gold_phrases = set(phrase for phrases in gold.values() for phrase in phrases)
        pred_phrases = set(phrase for phrases in pred.values() for phrase in phrases)
        n_tp = len(gold_phrases & pred_phrases)
        n_fp = len(pred_phrases - gold_phrases)
        n_fn = len(gold_phrases - pred_phrases)
        return n_tp, n_fp, n_fn

    # Per-tag comparison: a phrase only counts as correct under the same tag.
    for tag in set(gold.keys()).union(pred.keys()):
        gold_phrases = set(gold.get(tag, []))
        pred_phrases = set(pred.get(tag, []))

        n_tp += len(gold_phrases & pred_phrases)
        n_fp += len(pred_phrases - gold_phrases)
        n_fn += len(gold_phrases - pred_phrases)

    return n_tp, n_fp, n_fn
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _prf(n_tp: int, n_fp: int, n_fn: int) -> tuple[float, float, float]:
    """Precision, recall, and F1 from raw counts, each rounded to 4 decimals.

    F1 is computed from the already-rounded precision/recall, matching the
    original reporting behavior.  Zero denominators yield 0.
    """
    precision = round(n_tp / (n_tp + n_fp) if (n_tp + n_fp) > 0 else 0, 4)
    recall = round(n_tp / (n_tp + n_fn) if (n_tp + n_fn) > 0 else 0, 4)
    f1 = round(
        (
            2 * (precision * recall) / (precision + recall)
            if (precision + recall) > 0
            else 0
        ),
        4,
    )
    return precision, recall, f1


def compute_metrics(running_time, pred_times, runtype, eval_metrics=None):
    """Assemble timing metrics, plus classification metrics for eval runs.

    Args:
        running_time: total wall-clock time in seconds.
        pred_times: list of per-sentence prediction response times.
        runtype: when "eval" (and eval_metrics is given), precision/recall/F1
            metrics are included.
        eval_metrics: 6-tuple (n_tp, n_fp, n_fn, n_tp_union, n_fp_union,
            n_fn_union) of classification counts.

    Returns:
        dict mapping metric name -> rounded value.
    """
    metrics = {}
    metrics["avg_pred_response_time_per_sentence"] = (
        round(sum(pred_times) / len(pred_times), 4) if pred_times else 0
    )
    metrics["total_time"] = round(running_time, 4)

    if runtype == "eval" and eval_metrics is not None:
        n_tp, n_fp, n_fn, n_tp_union, n_fp_union, n_fn_union = eval_metrics

        # The per-tag and union variants share one helper instead of the
        # previous duplicated round(...) blocks.
        precision, recall, f1 = _prf(n_tp, n_fp, n_fn)
        union_precision, union_recall, union_f1 = _prf(
            n_tp_union, n_fp_union, n_fn_union
        )

        metrics.update(
            {
                "precision": precision,
                "recall": recall,
                "f1": f1,
                "union_precision": union_precision,
                "union_recall": union_recall,
                "union_f1": union_f1,
            }
        )

    return metrics
|
src/processing/__init__.py
ADDED
|
File without changes
|
src/processing/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (166 Bytes). View file
|
|
|
src/processing/__pycache__/extractions.cpython-311.pyc
ADDED
|
Binary file (4.36 kB). View file
|
|
|
src/processing/__pycache__/generate.cpython-311.pyc
ADDED
|
Binary file (9.78 kB). View file
|
|
|
src/processing/extractions.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
import re
|
| 4 |
+
from bs4 import BeautifulSoup
|
| 5 |
+
from collections import defaultdict
|
| 6 |
+
from typing import Dict, List
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# TODO: review the functions here
|
| 10 |
+
def extract_all_tagged_phrases(text: str) -> Dict[str, List[str]]:
    """Collect, per tag name, the de-duplicated phrases enclosed by HTML-style tags.

    Phrase text is whitespace-normalized and has quotes/backslashes escaped so
    it can be embedded inside JSON strings later.
    """
    collected = defaultdict(list)

    for node in BeautifulSoup(text, "html.parser").find_all(True):
        if not node.name:
            continue
        # Join the tag's text fragments, then collapse runs of whitespace.
        phrase = re.sub(r"\s+", " ", " ".join(node.stripped_strings).strip())
        # Escape stray backslashes (those not already escaping a quote or a
        # backslash), then escape double quotes, for JSON embedding.
        phrase = re.sub(r'(?<!\\)\\(?!["\\])', r"\\\\", phrase)
        phrase = phrase.replace('"', '\\"')
        if phrase:  # skip tags with no text content
            collected[node.name].append(phrase)

    # dict.fromkeys drops duplicates while keeping first-seen order.
    return {name: list(dict.fromkeys(vals)) for name, vals in collected.items()}
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def extract_prediction(schema: dict, prediction: str, kind: str = "json") -> dict:
    """Parse a model's raw text prediction into a tag -> phrases dict.

    kind="json": extracts the first {...} span from the text and JSON-decodes
    it, applying a series of regex repairs for common model formatting
    mistakes; returns {} if no JSON region is found or parsing fails twice.
    kind="readable": parses "tag: phrase, phrase" lines for tags in `schema`.

    Raises:
        ValueError: for an unrecognized `kind`.
    """
    pred = {}
    if kind == "json":
        # Greedy match: spans from the first "{" to the last "}" in the text.
        json_match = re.search(r"\{[\s\S]+\}", prediction)
        if json_match:
            json_str = json_match.group(0)
            # Collapse hyphenated terms with $/backslash markup into plain
            # "word-word".  NOTE(review): presumably targets TeX-styled model
            # output like "foo-$\bar$" — confirm intended cases.
            json_str = re.sub(r"(\w+)-\$?\\?(\w+)\$?", r"\1-\2", json_str)
            # Undo escaped double quotes emitted inside the JSON body.
            json_str = json_str.replace('\\"', '"')
            # Insert commas the model dropped between adjacent members.
            json_str = re.sub(r'}\s*"', '}, "', json_str)
            json_str = re.sub(r']\s*"', '], "', json_str)
            try:
                pred = json.loads(json_str)
            except json.JSONDecodeError as e:
                logging.warning(f"Failed to parse JSON: {json_str}")
                logging.warning(f"Error: {str(e)}")

                try:
                    # Second attempt: strip trailing commas and convert
                    # single-quoted strings to double-quoted ones.
                    json_str = re.sub(r",\s*([}\]])", r"\1", json_str)
                    json_str = re.sub(r"(?<![\w'])'|'(?![\w'])", '"', json_str)
                    pred = json.loads(json_str)
                except json.JSONDecodeError:
                    # Give up; pred stays {} so callers see an empty result.
                    logging.error(
                        f"Failed to parse JSON even after attempted fixes: {json_str}"
                    )
    elif kind == "readable":
        # Lines of the form "<tag>: v1, v2" where <tag> is a schema key.
        # NOTE(review): schema keys are interpolated unescaped into the regex.
        match = re.findall(
            rf'^({"|".join(list(schema.keys()))}): (.+)$',
            prediction,
            flags=re.MULTILINE,
        )
        pred = {tag: values.split(", ") for tag, values in match}
    else:
        raise ValueError(f"Invalid kind: {kind}")

    return pred
|
src/processing/generate.py
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import random
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
# import spacy
|
| 6 |
+
import torch
|
| 7 |
+
|
| 8 |
+
from config import (
|
| 9 |
+
DEFAULT_FEW_SHOT_NUM,
|
| 10 |
+
DEFAULT_FEW_SHOT_SELECTION,
|
| 11 |
+
DEFAULT_TEMPERATURE,
|
| 12 |
+
DEFAULT_TOP_P,
|
| 13 |
+
DEFAULT_KIND,
|
| 14 |
+
)
|
| 15 |
+
from typing import List, Dict, Tuple, Union
|
| 16 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
|
| 17 |
+
|
| 18 |
+
from .extractions import extract_all_tagged_phrases
|
| 19 |
+
|
| 20 |
+
# nlp = spacy.load("en_core_web_sm")
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# TODO: run with constituency tests
|
| 24 |
+
# TODO: review instruction and system level prompt (currently they are repetitive)
|
| 25 |
+
def get_sentences(text: str) -> List[str]:
    """Split *text* into sentences on ". " boundaries.

    Deliberately naive: the spaCy-based splitter below was abandoned because
    it produced unequal sentence counts between raw and tagged abstracts.
    """
    # TODO: spacy splitting results in unequal lengths
    # doc = nlp(text)
    # sentences = [sent.text.strip() for sent in doc.sents]
    # sentences = [s for s in sentences if s]
    # return sentences

    split_sentences = text.split(". ")
    return split_sentences
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def format_instance(sentence: str, extraction: Union[str, None]) -> str:
    """Render one few-shot instance: the sentence plus its extractions block.

    With extraction=None, the "Extractions:" header is left open-ended so the
    model completes it.
    """
    if extraction is not None:
        tail = f"Extractions:\n{extraction}\n"
    else:
        tail = "Extractions:\n"
    return f"Sentence: {sentence}\n" + tail
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def generate_instructions(schema: dict, kind: str = DEFAULT_KIND) -> str:
    """Build the instruction header that presents the tagging schema.

    kind="json" embeds the schema as pretty-printed JSON; kind="readable"
    renders "tag: description" lines.  Raises ValueError otherwise.
    """
    header = (
        "The following schema is provided to tag the title and abstract of a "
        "given scientific paper as shown in the examples:\n"
    )

    if kind == "json":
        body = f"{json.dumps(schema, indent=2)}\n\n"
    elif kind == "readable":
        lines = [f"{tag}: {description}\n" for tag, description in schema.items()]
        body = "".join(lines) + "\n"
    else:
        raise ValueError(f"Invalid kind: {kind}")

    return header + body
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def generate_demonstrations(
    examples: List[dict],
    kind: str = DEFAULT_KIND,
    num_examples: int = DEFAULT_FEW_SHOT_NUM,
    selection: str = DEFAULT_FEW_SHOT_SELECTION,
) -> str:
    """Build the few-shot demonstration string from tagged example abstracts.

    For each example, pairs raw sentences with their tagged counterparts,
    picks up to `num_examples` pairs using the `selection` strategy, and
    formats each pair as a Sentence/Extractions instance.

    Raises:
        ValueError: for an unknown `selection` or `kind` (also raised by
            zip(strict=True) if the raw and tagged abstracts split into
            different sentence counts).
    """
    demonstration_parts = []
    for example in examples:
        sentences = get_sentences(example["abstract"])
        tagged_sentences = get_sentences(example["tagged_abstract"])
        # strict=True enforces that both abstracts split into the same
        # number of sentences.
        paired_sentences = list(zip(sentences, tagged_sentences, strict=True))

        if selection == "random":
            # min() guards against asking for more pairs than exist.
            selected_pairs = random.sample(
                paired_sentences, min(num_examples, len(paired_sentences))
            )
        elif selection == "first":
            selected_pairs = paired_sentences[:num_examples]
        elif selection == "last":
            selected_pairs = paired_sentences[-num_examples:]
        elif selection == "middle":
            start = max(0, (len(paired_sentences) - num_examples) // 2)
            selected_pairs = paired_sentences[start : start + num_examples]
        elif selection == "distributed":
            # Evenly spaced picks across the whole abstract.
            step = max(1, len(paired_sentences) // num_examples)
            selected_pairs = paired_sentences[::step][:num_examples]
        elif selection == "longest":
            # Length is measured on the raw (untagged) sentence.
            selected_pairs = sorted(
                paired_sentences, key=lambda x: len(x[0]), reverse=True
            )[:num_examples]
        elif selection == "shortest":
            selected_pairs = sorted(paired_sentences, key=lambda x: len(x[0]))[
                :num_examples
            ]
        else:
            raise ValueError(f"Invalid selection method: {selection}")

        for sentence, tagged_sentence in selected_pairs:
            tag_to_phrase = extract_all_tagged_phrases(tagged_sentence)
            if kind == "json":
                extractions = f"{json.dumps(tag_to_phrase, indent=2)}\n"
            elif kind == "readable":
                extractions = "".join(
                    f"{tag}: {', '.join(phrase)}\n"
                    for tag, phrase in tag_to_phrase.items()
                )
            else:
                raise ValueError(f"Invalid kind: {kind}")

            demonstration_parts.append(format_instance(sentence, extractions))

    return "".join(demonstration_parts)
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def generate_prefix(instructions: str, demonstrations: str) -> str:
    """Concatenate the instruction header and demonstrations into the prompt prefix."""
    return instructions + demonstrations
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def generate_prediction(
    model,
    tokenizer,
    prefix: str,
    input: str,
    kind: str,
    system_prompt: str = f"You are an assistant who tags papers according to given schema and "
    "only returns the tagged phrases in the format as provided in the examples "
    "without repeating anything else.",
    temperature: float = DEFAULT_TEMPERATURE,
    top_p: float = DEFAULT_TOP_P,
) -> str:
    """Generate one tagging prediction for `input` appended to the prompt `prefix`.

    Builds a system+user chat, tokenizes it with the model's chat template,
    samples a completion, and returns only the newly generated text.

    NOTE(review): `kind` is accepted but never used in this function —
    confirm whether it was meant to influence the prompt.
    """
    prompt = prefix + input
    messages = [
        {
            "role": "system",
            "content": system_prompt,
        },
        {"role": "user", "content": prompt},
    ]

    # Tokenize via the model's chat template; tensors move to the model's device.
    input_ids = tokenizer.apply_chat_template(
        messages,
        # add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # Stop on EOS or on the "<|eot_id|>" end-of-turn token.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    outputs = model.generate(
        input_ids,
        max_new_tokens=1200,
        eos_token_id=terminators,
        # num_beams=8,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )
    # Drop the echoed prompt: keep only tokens generated after the input.
    response = outputs[0][input_ids.shape[-1] :]
    prediction_response = tokenizer.decode(response, skip_special_tokens=True)

    return prediction_response
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def batch_generate_prediction(
    model,
    tokenizer,
    prefix: str,
    input_ids: torch.Tensor,
    kind: str,
    system_prompt: str = "You are an assistant who tags papers according to given schema and "
    "only returns the tagged phrases in the format as provided in the examples "
    "without repeating anything else.",
    temperature: float = DEFAULT_TEMPERATURE,
    top_p: float = DEFAULT_TOP_P,
    max_new_tokens: int = 1200,
    batch_size: int = 1,
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
) -> List[str]:
    """Generate predictions for a batch of pre-tokenized inputs.

    Each row of `input_ids` is decoded back to text, wrapped in a chat
    message together with `prefix`, re-tokenized via the chat template, and
    generated in mini-batches of `batch_size`.

    NOTE(review): `kind` is unused here, and the decode/re-encode round trip
    discards the caller's original tokenization — confirm both are intended.
    """
    all_predictions = []

    # Prepare system message
    system_message = {"role": "system", "content": system_prompt}

    # Walk the batch dimension in steps of batch_size.
    for i in range(0, input_ids.size(0), batch_size):
        batch_input_ids = input_ids[i : i + batch_size]

        # Decode each pre-tokenized row back to text and rebuild chat messages.
        batch_messages = [
            [
                system_message,
                {
                    "role": "user",
                    "content": prefix + tokenizer.decode(ids, skip_special_tokens=True),
                },
            ]
            for ids in batch_input_ids
        ]

        # Re-tokenize the chat-formatted batch, padded to a common length.
        batch_input_ids = tokenizer.apply_chat_template(
            batch_messages, return_tensors="pt", padding=True, truncation=True
        ).to(device)

        with torch.no_grad():
            outputs = model.generate(
                batch_input_ids,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temperature,
                top_p=top_p,
                pad_token_id=tokenizer.pad_token_id,
                # NOTE(review): if pad_token_id equals eos_token_id, this mask
                # also hides genuine EOS tokens — verify the tokenizer setup.
                attention_mask=batch_input_ids.ne(tokenizer.pad_token_id),
            )

        for output in outputs:
            # Keep only tokens generated after the (padded) prompt length.
            response = output[batch_input_ids.size(1) :]
            prediction_response = tokenizer.decode(response, skip_special_tokens=True)
            all_predictions.append(prediction_response)

        # Release cached GPU memory between mini-batches.
        torch.cuda.empty_cache()

    return all_predictions
|
src/utils/__init__.py
ADDED
|
File without changes
|
src/utils/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (161 Bytes). View file
|
|
|
src/utils/__pycache__/utils.cpython-311.pyc
ADDED
|
Binary file (8.47 kB). View file
|
|
|
src/utils/utils.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
import torch
|
| 5 |
+
|
| 6 |
+
from config import DEFAULT_RES_DIR as RES_DIR
|
| 7 |
+
|
| 8 |
+
from accelerate import (
|
| 9 |
+
infer_auto_device_map,
|
| 10 |
+
init_empty_weights,
|
| 11 |
+
Accelerator,
|
| 12 |
+
load_checkpoint_and_dispatch,
|
| 13 |
+
)
|
| 14 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def save_results(
    out_dir_path,
    all_inputs,
    gold_tags,
    predicted_responses,
    predicted_tags,
    metrics,
    runtype,
    append=False,
):
    """Write prompts, raw responses, predictions, and metrics under RES_DIR/out_dir_path.

    Args:
        out_dir_path: results subdirectory, relative to RES_DIR.
        all_inputs: formatted prompts, aligned with the other sequences.
        gold_tags: ground-truth tag dicts (persisted when runtype == "eval").
        predicted_responses: raw model output strings.
        predicted_tags: parsed tag dicts extracted from the responses.
        metrics: metrics dict written to metrics.json (always overwritten).
        runtype: "eval" additionally writes ground_truth.json.
        append: if True, append to the .txt files and extend the JSON files
            in place instead of overwriting them.
    """
    mode = "a" if append else "w"

    with open(
        os.path.join(RES_DIR, out_dir_path, "prompts.txt"), mode, encoding="utf-8"
    ) as f:
        for input, gold_tag, pred_response, pred_tag in zip(
            all_inputs, gold_tags, predicted_responses, predicted_tags
        ):
            f.write(f"{input}\n")
            f.write(f"True Tag: {gold_tag}\n")
            f.write(f"Predicted Response: {pred_response}\n")
            f.write(f"Predicted Tag: {pred_tag}\n")
            f.write("#" * 50 + "\n")

    with open(
        os.path.join(RES_DIR, out_dir_path, "predicted_responses.txt"),
        mode,
        encoding="utf-8",
    ) as f:
        for response in predicted_responses:
            f.write(f"{response}\n")
            f.write("#" * 50 + "\n")

    if append:
        # Read-modify-write: extend the stored list, rewrite from the start,
        # and truncate any leftover bytes from the previous (longer) content.
        with open(os.path.join(RES_DIR, out_dir_path, "predictions.json"), "r+") as f:
            data = json.load(f)
            data["predicted_tags"].extend(predicted_tags)
            f.seek(0)
            json.dump(data, f, indent=4)
            f.truncate()
    else:
        with open(os.path.join(RES_DIR, out_dir_path, "predictions.json"), "w") as f:
            json.dump({"predicted_tags": predicted_tags}, f, indent=4)

    if runtype == "eval":
        if append:
            with open(
                os.path.join(RES_DIR, out_dir_path, "ground_truth.json"), "r+"
            ) as f:
                data = json.load(f)
                # BUG FIX: previously extended with `gold_tag`, the stale loop
                # variable left over from the prompts.txt loop above, instead
                # of the full `gold_tags` parameter.
                data["gold_tags"].extend(gold_tags)
                f.seek(0)
                json.dump(data, f, indent=4)
                f.truncate()
        else:
            with open(
                os.path.join(RES_DIR, out_dir_path, "ground_truth.json"), "w"
            ) as f:
                json.dump({"gold_tags": gold_tags}, f, indent=4)

    # Metrics are always overwritten, even in append mode.
    with open(os.path.join(RES_DIR, out_dir_path, "metrics.json"), "w") as f:
        json.dump({"metrics": metrics, "prompt_file": "prompts.txt"}, f, indent=4)

    logging.info(f"Results saved in: {os.path.join(RES_DIR, out_dir_path)}")
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def save_best_config(metrics, config):
    """Persist the run's config to best_config.json if its precision beats the stored best.

    When no best_config.json exists yet, the current run becomes the best.
    The file is rewritten on every call (keeping the previous best if it wins).
    """
    best_config_path = os.path.join(RES_DIR, "best_config.json")
    candidate = {"metrics": metrics, "config": config}

    if not os.path.exists(best_config_path):
        best_config = candidate
    else:
        with open(best_config_path, "r") as f:
            best_config = json.load(f)
        # Strictly greater precision replaces the stored best; ties keep it.
        if metrics["precision"] > best_config["metrics"]["precision"]:
            best_config = candidate

    with open(best_config_path, "w") as f:
        json.dump(best_config, f, indent=4)
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def load_sweep_config(config_path="sweep_config.json"):
    """Load and return the hyperparameter sweep configuration as a dict."""
    with open(config_path, "r") as f:
        config = json.load(f)
    return config
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# def load_model_and_tokenizer(model_id: str):
|
| 103 |
+
# accelerator = Accelerator()
|
| 104 |
+
|
| 105 |
+
# tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
|
| 106 |
+
# # device_map = infer_auto_device_map(model, max_memory=max_memory)
|
| 107 |
+
|
| 108 |
+
# if tokenizer.pad_token_id is None:
|
| 109 |
+
# tokenizer.pad_token_id = tokenizer.eos_token_id
|
| 110 |
+
|
| 111 |
+
# model = AutoModelForCausalLM.from_pretrained(
|
| 112 |
+
# model_id,
|
| 113 |
+
# torch_dtype=torch.bfloat16,
|
| 114 |
+
# device_map="auto",
|
| 115 |
+
# token=os.getenv("HF_TOKEN"),
|
| 116 |
+
# )
|
| 117 |
+
|
| 118 |
+
# model, tokenizer = accelerator.prepare(model, tokenizer)
|
| 119 |
+
|
| 120 |
+
# return model, tokenizer
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def clear_cuda_cache():
    """Release cached GPU memory and reset peak-memory statistics.

    No-op when CUDA is unavailable.  Uses the torch.cuda.memory.reset_max_*
    helpers exactly as the surrounding code does; NOTE(review): these are
    deprecated in newer torch releases — confirm the pinned torch version.
    """
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.memory.reset_max_memory_allocated()
    torch.cuda.memory.reset_max_memory_cached()
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def load_model_and_tokenizer(model_id):
    """Load a causal LM and its tokenizer from the Hugging Face hub.

    Reads the HF_TOKEN environment variable for authentication, enables TF32
    matmuls, loads weights in float16, and places the model automatically
    with device_map="auto".

    Returns:
        (model, tokenizer) tuple.
    """
    hf_token = os.getenv("HF_TOKEN")

    # Memory-saving / throughput options.
    torch.cuda.empty_cache()
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # Left padding; a pad token is required for batched generation, so fall
    # back to EOS when the tokenizer defines none.
    tokenizer = AutoTokenizer.from_pretrained(
        model_id, padding_side="left", use_auth_token=hf_token
    )
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # Fetch the model configuration explicitly, then load the weights.
    model_config = AutoConfig.from_pretrained(model_id, use_auth_token=hf_token)

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        config=model_config,
        torch_dtype=torch.float16,
        use_auth_token=hf_token,
        device_map="auto",
    )

    return model, tokenizer
|