Automatic Schema Induction (text-to-schema) Model
This model addresses a sub-task of the text-to-JSON task: it generates a JSON template (schema) from a given text.
Usage
import json

import torch
from transformers import AutoModel, AutoTokenizer

model_name = "chnaaam/luSI-v1.0"

# Pick the best available device: CUDA GPU > Apple Silicon (MPS) > CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# trust_remote_code is required because this checkpoint ships custom model code.
# NOTE(review): only enable trust_remote_code for repositories you trust —
# it executes arbitrary Python from the model repo.
model = AutoModel.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, trust_remote_code=True
).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Example input: the opening of IU's Korean Wikipedia article.
text = """아이유(IU, 본명: 이지은, 李知恩[1], 1993년 5월 16일~)는 대한민국의 싱어송라이터, 작곡가, 배우이다. 2007년 로엔 엔터테인먼트(현 카카오 엔터테인먼트) 연습생으로 전속 계약을 맺고 15세의 나이에 2008년 첫 EP인 로스트 앤 파운드(Lost and Found)를 통해 가수로 데뷔했다."""

messages = [
    {"role": "user", "content": text}
]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

# Greedy (deterministic) decoding. The original passed temperature=0.0, which
# transformers ignores when do_sample is False (the default, with a warning)
# and which is invalid when sampling; do_sample=False is the supported way to
# request deterministic output.
with torch.inference_mode():  # no autograd state needed for generation
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=1024,
        do_sample=False,
    )

# Strip the prompt tokens so only the newly generated tokens are decoded.
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

# The model emits a JSON template (keys with empty values) describing the
# schema induced from the input text.
json_template = json.loads(response)
print(json_template)
Output
{
'Person': {
'Name': '',
'Stage name': '',
'Real name': '',
'Birth date': '',
'Nationality': '',
'Occupations': [],
'Debut': {
'Age': '',
'Year': '',
'Company': '',
'Contract type': '',
'EP': '',
'EP title': ''
}
}
}
- Downloads last month
- 1