---
# For reference on model card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
# Doc / guide: https://huggingface.co/docs/hub/model-cards
library_name: nanovlm
license: mit
pipeline_tag: image-text-to-text
tags:
- vision-language
- multimodal
- research
---

**Usage:**

Clone the nanoVLM repository: https://github.com/huggingface/nanoVLM. Follow the install instructions and run the following code:

```python
from models.vision_language_model import VisionLanguageModel

model = VisionLanguageModel.from_pretrained("lusxvr/nanoVLM-230M-8k-2")
```

A sketch of a full inference call is shown after the evaluation results below.

**Evaluation results:**

```
"results": {
    "docvqa_val_anls": 0.772357483001645,
    "docvqa_val_anls_stderr": 0.005426685320059236,
    "infovqa_val_anls": 0.27699762812164563,
    "infovqa_val_anls_stderr": 0.007008485987608615,
    "mme_mme_cognition_score": 242.85714285714286,
    "mme_mme_perception_score": 1132.1020408163265,
    "mmmu_val_mmmu_acc": 0.29889,
    "mmstar_coarse perception": 0.48384750306507346,
    "mmstar_average": 0.36924953254941295,
    "mmstar_fine-grained perception": 0.36457258658806185,
    "mmstar_instance reasoning": 0.344255653063192,
    "mmstar_logical reasoning": 0.34837483748374837,
    "mmstar_math": 0.390821304464047,
    "mmstar_science & technology": 0.283625310632355,
    "ocrbench_ocrbench_accuracy": 0.727,
    "scienceqa_exact_match": 0.5868898844612119,
    "scienceqa_exact_match_stderr": 0.0075618541822843575,
    "textvqa_val_exact_match": 0.59624,
    "textvqa_val_exact_match_stderr": 0.006634483834869976,
    "chartqa_relaxed_overall": 0.732,
    "chartqa_relaxed_overall_stderr": 0.008860124193276758,
    "chartqa_relaxed_human_split": 0.544,
    "chartqa_relaxed_human_split_stderr": 0.014092909141495816,
    "chartqa_relaxed_augmented_split": 0.92,
    "chartqa_relaxed_augmented_split_stderr": 0.007676401612303169,
    "ai2d_exact_match": 0.41547927461139894,
    "ai2d_exact_match_stderr": 0.008869646776634895,
    "mathvista_testmini_cot_gpt_eval_score": 32.1,
    "mathvista_testmini_format_gpt_eval_score": 36.7,
    "mathvista_testmini_solution_gpt_eval_score": 32.7
}
```
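
For a full inference call, the sketch below loads the checkpoint, encodes a prompt and an image, and decodes the generated answer. It is a loose adaptation of the pattern in the repository's `generate.py`: the helper names (`get_tokenizer`, `get_image_processor`), the config attributes (`model.cfg.lm_tokenizer`, `model.cfg.vit_img_size`), and the prompt format are assumptions and may differ between nanoVLM versions, so check `generate.py` in the repository for the current API.

```python
# Minimal inference sketch, assuming the nanoVLM repository layout and the
# helpers named above (get_tokenizer, get_image_processor, model.generate).
# Exact names and signatures may differ in your checkout -- see generate.py.
import torch
from PIL import Image

from models.vision_language_model import VisionLanguageModel
from data.processors import get_tokenizer, get_image_processor  # assumed helpers

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the checkpoint from the Hub and switch to eval mode
model = VisionLanguageModel.from_pretrained("lusxvr/nanoVLM-230M-8k-2").to(device).eval()

# Build the tokenizer and image processor from the model config (assumed attributes)
tokenizer = get_tokenizer(model.cfg.lm_tokenizer)
image_processor = get_image_processor(model.cfg.vit_img_size)

# Encode a question; this prompt template is illustrative, not the official one
prompt = "Question: What is shown in this image? Answer:"
input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)

# Preprocess an example image (the path is a placeholder)
image = Image.open("example.jpg").convert("RGB")
pixel_values = image_processor(image).unsqueeze(0).to(device)

# Autoregressive generation and decoding
generated = model.generate(input_ids, pixel_values, max_new_tokens=50)
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])
```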