Will it be possible to run this on a PC with 8 GeForce RTX 3060 cards, 8 GB of VRAM each?
#11 opened by ai2p
Can it correctly spread the VRAM load across multiple GPU cards, or does all the required VRAM need to be on a single card?
Yes. In fp16 the 20B parameters take roughly 20B × 2 bytes ≈ 40 GB of weights, so 8 × 8 GB = 64 GB of combined VRAM is enough as long as the model is sharded across the cards (leave some headroom for activations and the KV cache).
@ai2p Sure you can! Here is an example that loads the model across multiple devices (you need to install accelerate first):
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from accelerate.utils import get_balanced_memory, infer_auto_device_map
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
import torch

def load_model(model_name):
    # Download the checkpoint files (or reuse the local cache).
    weights_path = snapshot_download(model_name)
    config = AutoConfig.from_pretrained(model_name)

    # Initialize the model with meta tensors: the module structure is
    # built, but no real memory is allocated for the weights yet.
    with init_empty_weights():
        model = AutoModelForCausalLM.from_config(config)

    # Compute a memory budget that balances the layers evenly across
    # all visible GPUs; GPTNeoXLayer blocks must not be split apart.
    max_memory = get_balanced_memory(
        model,
        max_memory=None,
        no_split_module_classes=["GPTNeoXLayer"],
        dtype='float16',
        low_zero=False,
    )

    # Map each submodule to a device within that budget.
    device_map = infer_auto_device_map(
        model,
        max_memory=max_memory,
        no_split_module_classes=["GPTNeoXLayer"],
        dtype='float16',
    )

    # Load the real weights shard by shard onto the mapped devices.
    model = load_checkpoint_and_dispatch(
        model, weights_path, device_map=device_map,
        no_split_module_classes=["GPTNeoXLayer"]
    )
    return model

model_name = 'togethercomputer/GPT-NeoXT-Chat-Base-20B'
model = load_model(model_name)
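
Once dispatched, generation works as usual. Here is a minimal sketch continuing the session above; the inputs go to cuda:0 on the assumption that the first shard lands there under a balanced map, and the <human>:/<bot>: prompt format follows the model card (adjust as needed):

tokenizer = AutoTokenizer.from_pretrained(model_name)
prompt = "<human>: Hello, how are you?\n<bot>:"
# Place the inputs on the device holding the first model shard;
# accelerate's dispatch hooks move activations between cards automatically.
inputs = tokenizer(prompt, return_tensors='pt').to('cuda:0')
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))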

