Commit
·
c6dfdac
1
Parent(s):
255cd6e
Load 13B model with 8-bit/4-bit quantization to support more hardwares (#2)
Browse files- Load 13B model with 8-bit/4-bit quantization to support more hardwares (2043a67569994113ef5f4a8d0c58df57f6c2ec66)
- Update requirements.txt (45e69a6796b68457d9e0f2e7bf82cc5f7a38b2b1)
- Update app.py (4e058355a3b5dcf3470e3a49b891eb91455f030b)
- Update app.py (4ad10fb0867be1212b7746919900c9fd16014f69)
Co-authored-by: Haotian Liu <[email protected]>
- app.py +20 -3
- requirements.txt +2 -2
app.py
CHANGED
|
@@ -325,6 +325,14 @@ title_markdown = """
|
|
| 325 |
[[Project Page]](https://llava-vl.github.io) [[Paper]](https://arxiv.org/abs/2304.08485) [[Code]](https://github.com/haotian-liu/LLaVA) [[Model]](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)
|
| 326 |
|
| 327 |
ONLY WORKS WITH GPU!
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
"""
|
| 329 |
|
| 330 |
tos_markdown = """
|
|
@@ -522,8 +530,12 @@ def start_controller():
|
|
| 522 |
return subprocess.Popen(controller_command)
|
| 523 |
|
| 524 |
|
| 525 |
-
def start_worker(model_path: str):
|
| 526 |
logger.info(f"Starting the model worker for the model {model_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 527 |
worker_command = [
|
| 528 |
"python",
|
| 529 |
"-m",
|
|
@@ -534,7 +546,11 @@ def start_worker(model_path: str):
|
|
| 534 |
"http://localhost:10000",
|
| 535 |
"--model-path",
|
| 536 |
model_path,
|
|
|
|
|
|
|
| 537 |
]
|
|
|
|
|
|
|
| 538 |
return subprocess.Popen(worker_command)
|
| 539 |
|
| 540 |
|
|
@@ -582,12 +598,13 @@ if __name__ == "__main__":
|
|
| 582 |
args = get_args()
|
| 583 |
logger.info(f"args: {args}")
|
| 584 |
|
| 585 |
-
model_path = "liuhaotian/llava-v1.5-13b"
|
|
|
|
| 586 |
|
| 587 |
preload_models(model_path)
|
| 588 |
|
| 589 |
controller_proc = start_controller()
|
| 590 |
-
worker_proc = start_worker(model_path)
|
| 591 |
|
| 592 |
# Wait for worker and controller to start
|
| 593 |
time.sleep(10)
|
|
|
|
| 325 |
[[Project Page]](https://llava-vl.github.io) [[Paper]](https://arxiv.org/abs/2304.08485) [[Code]](https://github.com/haotian-liu/LLaVA) [[Model]](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)
|
| 326 |
|
| 327 |
ONLY WORKS WITH GPU!
|
| 328 |
+
|
| 329 |
+
You can load the model with 8-bit or 4-bit quantization to make it fit in smaller hardwares. Setting the environment variable `bits` to control the quantization.
|
| 330 |
+
|
| 331 |
+
Recommended configurations:
|
| 332 |
+
| Hardware | A10G-Large (24G) | T4-Medium (15G) | A100-Large (40G) |
|
| 333 |
+
|-------------------|------------------|-----------------|------------------|
|
| 334 |
+
| **Bits** | 8 (default) | 4 | 16 |
|
| 335 |
+
|
| 336 |
"""
|
| 337 |
|
| 338 |
tos_markdown = """
|
|
|
|
| 530 |
return subprocess.Popen(controller_command)
|
| 531 |
|
| 532 |
|
| 533 |
+
def start_worker(model_path: str, bits=16):
|
| 534 |
logger.info(f"Starting the model worker for the model {model_path}")
|
| 535 |
+
model_name = model_path.strip('/').split('/')[-1]
|
| 536 |
+
assert bits in [4, 8, 16], "It can be only loaded with 16-bit, 8-bit, and 4-bit."
|
| 537 |
+
if bits != 16:
|
| 538 |
+
model_name += f'-{bits}bit'
|
| 539 |
worker_command = [
|
| 540 |
"python",
|
| 541 |
"-m",
|
|
|
|
| 546 |
"http://localhost:10000",
|
| 547 |
"--model-path",
|
| 548 |
model_path,
|
| 549 |
+
"--model-name",
|
| 550 |
+
model_name,
|
| 551 |
]
|
| 552 |
+
if bits != 16:
|
| 553 |
+
worker_command += [f'--load-{bits}bit']
|
| 554 |
return subprocess.Popen(worker_command)
|
| 555 |
|
| 556 |
|
|
|
|
| 598 |
args = get_args()
|
| 599 |
logger.info(f"args: {args}")
|
| 600 |
|
| 601 |
+
model_path = "liuhaotian/llava-v1.5-13b"
|
| 602 |
+
bits = int(os.getenv("bits", 8))
|
| 603 |
|
| 604 |
preload_models(model_path)
|
| 605 |
|
| 606 |
controller_proc = start_controller()
|
| 607 |
+
worker_proc = start_worker(model_path, bits=bits)
|
| 608 |
|
| 609 |
# Wait for worker and controller to start
|
| 610 |
time.sleep(10)
|
requirements.txt
CHANGED
|
@@ -8,8 +8,8 @@ numpy
|
|
| 8 |
requests
|
| 9 |
sentencepiece
|
| 10 |
tokenizers>=0.12.1
|
| 11 |
-
torch
|
| 12 |
-
torchvision
|
| 13 |
uvicorn
|
| 14 |
wandb
|
| 15 |
shortuuid
|
|
|
|
| 8 |
requests
|
| 9 |
sentencepiece
|
| 10 |
tokenizers>=0.12.1
|
| 11 |
+
torch==2.0.1
|
| 12 |
+
torchvision==0.15.2
|
| 13 |
uvicorn
|
| 14 |
wandb
|
| 15 |
shortuuid
|