jbilcke-hf
/

LTX-Video-0.9.1-HFIE

@@ -3,12 +3,12 @@ import base64
 # Important: the NVIDIA L40S will only support small resolutions, short length and no post-processing.
 # If you want those features, you might need to use the NVIDIA A100.
 # Use your own Inference Endpoint URL
 API_URL = "https://<use your own Inference Endpoint here>.endpoints.huggingface.cloud"
 # Use your own API token
 API_TOKEN = "hf_<replace by your own Hugging Face token>"
 def query(payload):
     response = requests.post(API_URL, headers={
         "Accept": "application/json",
@@ -17,7 +17,16 @@ def query(payload):
     }, json=payload)
     return response.json()
-def save_video(json_response):
     video_data_uri = ""
     try:
         # Extract the video data URI from the response
@@ -35,32 +44,78 @@ def save_video(json_response):
     video_data = base64.b64decode(base64_data)
     # Write the binary data to an MP4 file
-    with open("video.mp4", "wb") as f:
         f.write(video_data)
-# Make the API call
-output = query({
     "inputs": {
-        "prompt": "Portrait photo, selfie of a beautiful young caucasian woman called Charlotte, wearing a pastel-blue hoodie. She is livestreaming from NYC streets. She looks straight into the camera, looking serious, and she talks. The camera is fixed, static, a medium-shot centered on her face. 4K webcam footage. Intricate details, super resolution, sharp image, award winning."
     },
     "parameters": {
         # ------------------- settings for LTX-Video -----------------------
-        # for a vertical video look
-        "width": 480,
-        "height": 768,
         # LTX-Video requires a frame number divisible by 8, plus one frame
         # note: glitches might appear if you use more than 168 frames
-        "num_frames": (8 * 14) + 1,
         # using 30 steps seems to be enough for most cases, otherwise use 50 for best quality
         # I think using a large number of steps (> 30) might create some overexposure and saturation
-        "num_inference_steps": 40,
         # values between 3.0 and 4.0 are nice
-        "guidance_scale": 3.5,
-        # seed: -1,
         # ------------------- settings for Varnish -----------------------
         # This will double the number of frames.
@@ -83,15 +138,24 @@ output = query({
         # and if you do, adding more than 12% will start to negatively impact file size (video codecs aren't great at compressing film grain)
         # 0% = no grain
         # 10% = a bit of grain
-        "grain_amount": 10, # value between 0-100
-        # the following parameters are a work in progress
-        "enable_audio": False,
-        #"audio_prompt": "voices, voice, talking, speaking, speech",
-        #"audio_negative_prompt": "",
     }
-})
 # Save the video
-save_video(output)

 # Important: the NVIDIA L40S will only support small resolutions, short length and no post-processing.
 # If you want those features, you might need to use the NVIDIA A100.
 # Use your own Inference Endpoint URL
 API_URL = "https://<use your own Inference Endpoint here>.endpoints.huggingface.cloud"
 # Use your own API token
 API_TOKEN = "hf_<replace by your own Hugging Face token>"
 def query(payload):
     response = requests.post(API_URL, headers={
         "Accept": "application/json",
     }, json=payload)
     return response.json()
+def save_video(json_response, filename):
+    try:
+        error = json_response["error"]
+        if error:
+            print(error)
+            return
+    except Exception as e:
+        pass
     video_data_uri = ""
     try:
         # Extract the video data URI from the response
     video_data = base64.b64decode(base64_data)
     # Write the binary data to an MP4 file
+    with open(filename, "wb") as f:
         f.write(video_data)
+def encode_image(image_path):
+    """
+    Load an image file and return it as a base64-encoded JPEG data URI.
+
+    The image is opened, converted to RGB when its mode is anything else,
+    re-encoded as JPEG in memory, and the resulting bytes are base64-encoded.
+    The source file on disk is only read, never modified.
+
+    Args:
+        image_path (str): Path to the image file
+    Returns:
+        str: A data URI of the form "data:image/jpeg;base64,<payload>"
+    """
+    # NOTE(review): `Image` and `BytesIO` are not imported in the visible part
+    # of the file; presumably `from PIL import Image` and `from io import BytesIO`
+    # -- confirm against the file's import block.
+    with Image.open(image_path) as img:
+        # Convert to RGB if necessary
+        # (presumably because the JPEG encoder used below does not accept
+        # modes such as RGBA or palette images -- TODO confirm)
+        if img.mode != "RGB":
+            img = img.convert("RGB")
+        # Save image to an in-memory buffer instead of a temporary file
+        img_byte_arr = BytesIO()
+        img.save(img_byte_arr, format="JPEG")
+        # Encode the JPEG bytes to base64 and decode to text for the data URI
+        base64_encoded = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
+        return f"data:image/jpeg;base64,{base64_encoded}"
+# Example usage with image-to-video generation
+image_filename = "input.jpg"
+video_filename = "output.mp4"
+config = {
     "inputs": {
+       #"prompt": "magnificent underwater footage, clownfishes swimming around coral inside the Caribbean sea, real gopro footage",
+       # OR
+       "image": encode_image(image_filename)
     },
     "parameters": {
         # ------------------- settings for LTX-Video -----------------------
+        #"negative_prompt": "saturated, highlight, overexposed, highlighted, overlit, shaking, too bright, worst quality, inconsistent motion, blurry, jittery, distorted, cropped, watermarked, watermark, logo, subtitle, subtitles, lowres",
+        # note about resolution:
+        # we cannot use 720 since it cannot be divided by 32
+        #
+        # for a cinematic look:
+        "width": 768,
+        "height": 480,
+        # this is a hack to fool LTX-Video into believing our input image is an actual video frame with poor encoding quality
+        #"input_image_quality": 70,
+        # for a vertical video look:
+        #"width": 480,
+        #"height": 768,
         # LTX-Video requires a frame number divisible by 8, plus one frame
         # note: glitches might appear if you use more than 168 frames
+        "num_frames": (8 * 16) + 1,
         # using 30 steps seems to be enough for most cases, otherwise use 50 for best quality
         # I think using a large number of steps (> 30) might create some overexposure and saturation
+        "num_inference_steps": 50,
         # values between 3.0 and 4.0 are nice
+        "guidance_scale": 4.0,
+        #"seed": 1209877,
+        # ----------------------------------------------------------------
         # ------------------- settings for Varnish -----------------------
         # This will double the number of frames.
         # and if you do, adding more than 12% will start to negatively impact file size (video codecs aren't great at compressing film grain)
         # 0% = no grain
         # 10% = a bit of grain
+        "grain_amount": 12, # value between 0-100
+        # The range of the CRF scale is 0–51, where:
+        # 0 is lossless (for 8 bit only, for 10 bit use -qp 0)
+        # 23 is the default
+        # 51 is worst quality possible
+        # A lower value generally leads to higher quality, and a subjectively sane range is 17–28.
+        # Consider 17 or 18 to be visually lossless or nearly so;
+        # it should look the same or nearly the same as the input but it isn't technically lossless.
+        # The range is exponential, so increasing the CRF value +6 results in roughly half the bitrate / file size, while -6 leads to roughly twice the bitrate.
+        #"quality": 18,
     }
+}
+# Make the API call
+output = query(config)
 # Save the video
+save_video(output, video_filename)