Spaces:
Running
Running
| from dataclasses import dataclass | |
| from langchain_core.pydantic_v1 import Field, create_model | |
| import yaml, json, os, shutil | |
class PromptCatalog:
    """Build LLM prompts that refactor unstructured OCR text into a JSON dictionary.

    The prompt wording and the per-field rules are read from a YAML rules file
    that provides the keys ``instructions``, ``json_formatting_instructions``
    and ``rules``. ``rules`` is used as a mapping of field name to the
    instruction text for that field.
    """

    domain_knowledge_example: str = ""
    similarity: str = ""
    OCR: str = ""
    n_fields: int = 0

    # Opening paragraph shared by both prompt variants.
    _TASK_INTRO = (
        "Please help me complete this text parsing task given the following rules "
        "and unstructured OCR text. Your task is to refactor the OCR text into a "
        "structured JSON dictionary that matches the structure specified in the "
        "following rules. Please follow the rules strictly."
    )

    # These are for dynamically creating your own prompts with n-columns
    def prompt_SLTP(self, rules_config_path, OCR=None, is_palm=False):
        """Load the rules file and build the parsing prompt for ``OCR``.

        Args:
            rules_config_path: Path of the YAML rules file.
            OCR: Raw OCR text. Colons and double quotes are stripped before the
                text is embedded in the prompt. ``None`` is treated as empty text.
            is_palm: When true, the empty dictionary to populate is repeated
                three times at the end of the prompt instead of once.

        Returns:
            A ``(prompt, dictionary_structure)`` tuple, where
            ``dictionary_structure`` maps every rule key to an empty string.

        Raises:
            ValueError: If the rules file could not be parsed as YAML.
            KeyError: If the rules file lacks one of the required keys.
        """
        self.OCR = self.remove_colons_and_double_apostrophes(OCR)
        self.rules_config_path = rules_config_path
        self.rules_config = self.load_rules_config()
        if self.rules_config is None:
            # load_rules_config() already printed the parser error; stop here
            # with a clear message instead of failing on None['instructions'].
            raise ValueError(
                f"Could not load prompt rules from [{rules_config_path}]"
            )

        self.instructions = self.rules_config['instructions']
        self.json_formatting_instructions = self.rules_config['json_formatting_instructions']
        self.rules_list = self.rules_config['rules']
        self.n_fields = len(self.rules_list)

        # Set the rules for processing OCR into JSON format
        self.rules = self.create_rules(is_palm)
        self.structure, self.dictionary_structure = self.create_structure(is_palm)

        # The OCR text used to be repeated between the instructions and the
        # JSON formatting instructions as well. That made the prompt too long;
        # performance is better without it.
        prompt = self._assemble_prompt(3 if is_palm else 1)
        return prompt, self.dictionary_structure

    def _assemble_prompt(self, structure_repeats):
        """Join the prompt sections, ending with the dictionary ``structure_repeats`` times."""
        # NOTE(review): line-leading whitespace inside the original prompt
        # template could not be recovered from this view of the file; the
        # lines are emitted flush-left here - confirm against the original.
        sections = [
            self._TASK_INTRO,
            "The rules are:",
            str(self.instructions),
            str(self.json_formatting_instructions),
            "This is the JSON template that includes instructions for each key:",
            str(self.rules),
            "The unstructured OCR text is:",
            str(self.OCR),
            "Please populate the following JSON dictionary based on the rules and the unformatted OCR text:",
        ]
        # The dictionary is rendered with str(), exactly as the f-string
        # interpolation of the dict did before.
        sections.extend([str(self.dictionary_structure)] * structure_repeats)
        return "\n".join(sections) + "\n"

    def remove_colons_and_double_apostrophes(self, text):
        """Return ``text`` without ``:`` and ``"`` characters (``None`` becomes ``""``)."""
        if text is None:
            # prompt_SLTP's OCR parameter defaults to None.
            return ""
        return text.replace(":", "").replace("\"", "")

    def copy_prompt_template_to_new_dir(self, new_directory_path, rules_config_path):
        """Copy the rules file into ``new_directory_path``, creating the directory if needed.

        The outcome of the copy is reported with ``print``; copy errors are
        caught and printed rather than raised.
        """
        # Ensure the target directory exists. exist_ok covers the case where
        # the directory appears between the check and the call.
        if not os.path.exists(new_directory_path):
            os.makedirs(new_directory_path, exist_ok=True)

        template_name = os.path.basename(rules_config_path)
        new_file_path = os.path.join(new_directory_path, template_name)

        try:
            shutil.copy(rules_config_path, new_file_path)
            print(f"Prompt [{template_name}] copied successfully to {new_file_path}")
        except Exception as exc:
            print(f"Error copying [{template_name}] file: {exc}")

    def load_rules_config(self):
        """Parse the YAML file at ``self.rules_config_path``.

        Returns:
            The parsed document, or ``None`` (after printing the parser error)
            when the content is not valid YAML.
        """
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's locale encoding.
        with open(self.rules_config_path, 'r', encoding='utf-8') as stream:
            try:
                return yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)
                return None

    def create_rules(self, is_palm=False):
        """Return ``self.rules_list`` serialized as a single-line JSON string.

        ``is_palm`` is accepted for interface compatibility and is not used.
        """
        # Key order is kept as given (no sorting, no indentation).
        return json.dumps(dict(self.rules_list.items()), sort_keys=False)

    def create_structure(self, is_palm=False):
        """Return the empty output template for the configured rules.

        ``is_palm`` is accepted for interface compatibility and is not used.

        Returns:
            A ``(structure_json_str, dictionary_structure)`` tuple: a dict that
            maps every rule key to ``''``, preceded by the same dict serialized
            as JSON with a 4-space indent.
        """
        # An earlier Pydantic ``create_model``-based construction was replaced
        # by building the dictionary directly.
        dictionary_structure = {key: '' for key in self.rules_list.keys()}
        structure_json_str = json.dumps(dictionary_structure, sort_keys=False, indent=4)
        return structure_json_str, dictionary_structure

    def generate_xlsx_headers(self, is_palm):
        """Return the rule keys as a list of column headers.

        The result is the same for either value of ``is_palm``.
        """
        return list(self.rules_list.keys())