shriarul5273 committed
Commit 8ca4dce · 1 Parent(s): 8e11b47

added CREStereo and FoundationStereo code
Files changed (37)
  1. .gitattributes +1 -0
  2. .gitignore +3 -0
  3. CREStereo_demo/app.py +967 -0
  4. CREStereo_demo/app_local.py +889 -0
  5. CREStereo_demo/models/.gitkeep +0 -0
  6. CREStereo_demo/models/crestereo_eth3d.pth +3 -0
  7. CREStereo_demo/nets/__init__.py +1 -0
  8. CREStereo_demo/nets/attention/__init__.py +2 -0
  9. CREStereo_demo/nets/attention/linear_attention.py +81 -0
  10. CREStereo_demo/nets/attention/position_encoding.py +41 -0
  11. CREStereo_demo/nets/attention/transformer.py +100 -0
  12. CREStereo_demo/nets/corr.py +148 -0
  13. CREStereo_demo/nets/crestereo.py +258 -0
  14. CREStereo_demo/nets/extractor.py +123 -0
  15. CREStereo_demo/nets/update.py +91 -0
  16. CREStereo_demo/nets/utils/__init__.py +1 -0
  17. CREStereo_demo/nets/utils/utils.py +108 -0
  18. FoundationStereo_demo/Utils.py +160 -0
  19. FoundationStereo_demo/app.py +1138 -0
  20. FoundationStereo_demo/app_local.py +1292 -0
  21. FoundationStereo_demo/core/extractor.py +371 -0
  22. FoundationStereo_demo/core/foundation_stereo.py +277 -0
  23. FoundationStereo_demo/core/geometry.py +77 -0
  24. FoundationStereo_demo/core/submodule.py +588 -0
  25. FoundationStereo_demo/core/update.py +159 -0
  26. FoundationStereo_demo/core/utils/utils.py +64 -0
  27. FoundationStereo_demo/depth_anything/LICENSE.txt +201 -0
  28. FoundationStereo_demo/depth_anything/__init__.py +2 -0
  29. FoundationStereo_demo/depth_anything/blocks.py +153 -0
  30. FoundationStereo_demo/depth_anything/dpt.py +203 -0
  31. FoundationStereo_demo/depth_anything/util/transform.py +248 -0
  32. assets/example1/K.txt +2 -0
  33. assets/example1/left.png +3 -0
  34. assets/example1/right.png +3 -0
  35. assets/example2/K.txt +9 -0
  36. assets/example2/left.png +3 -0
  37. assets/example2/right.png +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
+ *.Identifier
+ __pycache__/
+ *.pyc
CREStereo_demo/app.py ADDED
@@ -0,0 +1,967 @@
1
+ """
2
+ CREStereo Gradio Demo with ZeroGPU Integration
3
+
4
+ This demo showcases the CREStereo model for stereo depth estimation.
5
+ Optimized for Hugging Face Spaces with ZeroGPU support.
6
+
7
+ Key ZeroGPU optimizations:
8
+ - @spaces.GPU decorators for GPU-intensive functions
9
+ - CUDA operations only within GPU context
10
+ - Memory-efficient inference with cleanup
11
+ - Safe CUDA initialization patterns
12
+ """
13
+
14
+ import os
15
+ import sys
16
+ import logging
17
+ import tempfile
18
+ import gc
19
+ from pathlib import Path
20
+ from typing import Optional, Tuple, Union
21
+ import numpy as np
22
+ import cv2
23
+ import gradio as gr
24
+ import imageio
25
+
26
+ # Import spaces BEFORE torch to ensure proper ZeroGPU initialization
27
+ import spaces
28
+
29
+ # Import torch after spaces - avoid any CUDA calls during import
30
+ import torch
31
+ import torch.nn as nn
32
+ import torch.nn.functional as F
33
+ from torch.cuda.amp import autocast
34
+
35
+ # Completely avoid CUDA operations during import phase
36
+ # Do not set default tensor type or modify CUDA settings outside GPU context
37
+ # torch.set_default_tensor_type('torch.FloatTensor') # Commented out - causes CUDA init
38
+
39
+ # Do not modify CUDA settings during import - this can trigger CUDA initialization
40
+ # torch.backends.cudnn.enabled = False # Commented out
41
+ # torch.backends.cudnn.benchmark = False # Commented out
42
+
43
+ # Use current directory as base
44
+ current_dir = os.path.dirname(os.path.abspath(__file__))
45
+ base_dir = current_dir
46
+
47
+ # Add current directory to path for local imports
48
+ sys.path.insert(0, current_dir)
49
+
50
+ # Import local modules
51
+ from nets import Model
52
+
53
+ # Import Open3D with error handling
54
+ OPEN3D_AVAILABLE = False
55
+ try:
56
+ # Set Open3D to CPU mode to avoid CUDA initialization
57
+ os.environ['OPEN3D_CPU_RENDERING'] = '1'
58
+ # Don't import open3d here - do it inside functions
59
+ # import open3d as o3d
60
+ OPEN3D_AVAILABLE = True # Assume available, will check later
61
+ except Exception as e:
62
+ logging.warning(f"Open3D setup failed: {e}")
63
+ OPEN3D_AVAILABLE = False
64
+
65
+ # Configure logging
66
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
67
+
68
+ # Model configuration
69
+ MODEL_VARIANTS = {
70
+ "crestereo_eth3d": {
71
+ "display_name": "CREStereo ETH3D (Pre-trained model)",
72
+ "model_file": "models/crestereo_eth3d.pth",
73
+ "max_disp": 256
74
+ }
75
+ }
76
+
77
+ # Global variables for model caching
78
+ _cached_model = None
79
+ _cached_device = None
80
+ _cached_model_selection = None
81
+
82
+
83
+ class InputPadder:
84
+ """ Pads images such that dimensions are divisible by divis_by """
85
+ def __init__(self, dims, divis_by=8, force_square=False):
86
+ self.ht, self.wd = dims[-2:]
87
+ pad_ht = (((self.ht // divis_by) + 1) * divis_by - self.ht) % divis_by
88
+ pad_wd = (((self.wd // divis_by) + 1) * divis_by - self.wd) % divis_by
89
+
90
+ if force_square:
91
+ # Make the padded dimensions square
92
+ max_dim = max(self.ht + pad_ht, self.wd + pad_wd)
93
+ pad_ht = max_dim - self.ht
94
+ pad_wd = max_dim - self.wd
95
+
96
+ self._pad = [pad_wd//2, pad_wd - pad_wd//2, pad_ht//2, pad_ht - pad_ht//2]
97
+
98
+ def pad(self, *inputs):
99
+ return [F.pad(x, self._pad, mode='replicate') for x in inputs]
100
+
101
+ def unpad(self, x):
102
+ ht, wd = x.shape[-2:]
103
+ c = [self._pad[2], ht-self._pad[3], self._pad[0], wd-self._pad[1]]
104
+ return x[..., c[0]:c[1], c[2]:c[3]]
105
+
106
+
107
+ def aggressive_cleanup():
108
+ """Perform basic cleanup - no CUDA operations outside GPU context"""
109
+ import gc
110
+ gc.collect()
111
+ logging.info("Performed basic memory cleanup")
112
+
113
+
114
+ @spaces.GPU
115
+ def initialize_gpu_context():
116
+ """Initialize GPU context safely for ZeroGPU"""
117
+ try:
118
+ # Set CUDA settings safely within GPU context
119
+ torch.set_default_tensor_type('torch.cuda.FloatTensor')
120
+ torch.backends.cudnn.enabled = True
121
+ torch.backends.cudnn.benchmark = True
122
+
123
+ # Check GPU availability and log info
124
+ if torch.cuda.is_available():
125
+ device_name = torch.cuda.get_device_name(0)
126
+ memory_total = torch.cuda.get_device_properties(0).total_memory / 1024**3
127
+ logging.info(f"GPU initialized: {device_name}, Total memory: {memory_total:.2f}GB")
128
+ return True
129
+ else:
130
+ logging.error("CUDA not available after GPU context initialization")
131
+ return False
132
+ except Exception as e:
133
+ logging.error(f"GPU context initialization failed: {e}")
134
+ return False
135
+
136
+
137
+ @spaces.GPU
138
+ def check_gpu_memory():
139
+ """Check and log current GPU memory usage - only call within GPU context"""
140
+ try:
141
+ allocated = torch.cuda.memory_allocated(0) / 1024**3
142
+ reserved = torch.cuda.memory_reserved(0) / 1024**3
143
+ max_allocated = torch.cuda.max_memory_allocated(0) / 1024**3
144
+ total = torch.cuda.get_device_properties(0).total_memory / 1024**3
145
+
146
+ logging.info(f"GPU Memory - Allocated: {allocated:.2f}GB, Reserved: {reserved:.2f}GB, Max: {max_allocated:.2f}GB, Total: {total:.2f}GB")
147
+ return allocated, reserved, max_allocated, total
148
+ except RuntimeError as e:
149
+ logging.warning(f"Failed to get GPU memory info: {e}")
150
+ return None, None, None, None
151
+
152
+
153
+ def get_available_models() -> dict:
154
+ """Get all available models with their display names"""
155
+ models = {}
156
+
157
+ # Check for local models
158
+ for variant, info in MODEL_VARIANTS.items():
159
+ model_path = os.path.join(current_dir, info["model_file"])
160
+
161
+ if os.path.exists(model_path):
162
+ display_name = info["display_name"]
163
+ models[display_name] = {
164
+ "model_path": model_path,
165
+ "variant": variant,
166
+ "max_disp": info["max_disp"],
167
+ "source": "local"
168
+ }
169
+
170
+ return models
171
+
172
+
173
+ def get_model_paths_from_selection(model_selection: str) -> Tuple[Optional[str], Optional[dict]]:
174
+ """Get model path and config from the selected model"""
175
+ models = get_available_models()
176
+
177
+ # Check if it's in our models dict
178
+ if model_selection in models:
179
+ model_info = models[model_selection]
180
+ logging.info(f"📁 Using local model: {model_selection}")
181
+ return model_info["model_path"], model_info
182
+
183
+ return None, None
184
+
185
+
186
+ @spaces.GPU
187
+ def load_model_for_inference(model_path: str, model_info: dict):
188
+ """Load CREStereo model for inference temporarily (demo-style)"""
189
+ # Set CUDA settings safely within GPU context
190
+ torch.set_default_tensor_type('torch.cuda.FloatTensor') # Now safe to use CUDA tensors
191
+ torch.backends.cudnn.enabled = True
192
+ torch.backends.cudnn.benchmark = True
193
+
194
+ # Check if CUDA is available after ZeroGPU initialization
195
+ if not torch.cuda.is_available():
196
+ raise RuntimeError("CUDA is not available. ZeroGPU initialization may have failed.")
197
+
198
+ # Use the first available CUDA device
199
+ device = torch.device("cuda")
200
+
201
+ # Set CUDA seed safely within GPU context
202
+ try:
203
+ random_seed = 0
204
+ torch.cuda.manual_seed_all(random_seed)
205
+ torch.backends.cudnn.deterministic = True
206
+ torch.backends.cudnn.benchmark = False
207
+ except Exception as e:
208
+ logging.warning(f"Could not set CUDA seed: {e}")
209
+
210
+ try:
211
+ # Create model
212
+ max_disp = model_info.get("max_disp", 256)
213
+ model = Model(max_disp=max_disp, mixed_precision=False, test_mode=True)
214
+
215
+ # Load checkpoint
216
+ ckpt = torch.load(model_path, map_location=device)
217
+ model.load_state_dict(ckpt, strict=True)
218
+ model.to(device)
219
+ model.eval()
220
+
221
+ logging.info("Loaded CREStereo model weights")
222
+
223
+ # Memory optimizations
224
+ torch.set_grad_enabled(False)
225
+ logging.info("Applied memory optimizations")
226
+
227
+ return model, device
228
+
229
+ except Exception as e:
230
+ logging.error(f"Model loading failed: {e}")
231
+ raise RuntimeError(f"Failed to load model: {e}")
232
+
233
+
234
+ def get_cached_model(model_selection: str):
235
+ """Get cached model or load new one if selection changed"""
236
+ global _cached_model, _cached_device, _cached_model_selection
237
+
238
+ # Get model paths from selection
239
+ model_path, model_info = get_model_paths_from_selection(model_selection)
240
+
241
+ if model_path is None or model_info is None:
242
+ raise ValueError(f"Selected model not found: {model_selection}")
243
+
244
+ # Check if we need to reload the model
245
+ if (_cached_model is None or
246
+ _cached_model_selection != model_selection):
247
+
248
+ # Clear previous model if exists
249
+ if _cached_model is not None:
250
+ del _cached_model
251
+ torch.cuda.empty_cache()
252
+ gc.collect()
253
+
254
+ logging.info(f"🚀 Loading model: {model_selection}")
255
+ _cached_model, _cached_device = load_model_for_inference(model_path, model_info)
256
+ _cached_model_selection = model_selection
257
+
258
+ logging.info(f"✅ Model loaded successfully: {model_selection}")
259
+ else:
260
+ logging.info(f"✅ Using cached model: {model_selection}")
261
+
262
+ return _cached_model, _cached_device
263
+
264
+
265
+ def clear_model_cache():
266
+ """Clear the cached model to free memory"""
267
+ global _cached_model, _cached_device, _cached_model_selection
268
+
269
+ if _cached_model is not None:
270
+ logging.info("Clearing model cache...")
271
+ del _cached_model
272
+ _cached_model = None
273
+ _cached_device = None
274
+ _cached_model_selection = None
275
+
276
+ # Simple cleanup
277
+ import gc
278
+ gc.collect()
279
+ torch.cuda.empty_cache()
280
+ logging.info("Model cache cleared")
281
+ else:
282
+ logging.info("No model in cache to clear")
283
+
284
+
285
+ def inference(left, right, model, device, n_iter=20):
286
+ """Run CREStereo inference on stereo pair"""
287
+ print("Model Forwarding...")
288
+ imgL = left.transpose(2, 0, 1)
289
+ imgR = right.transpose(2, 0, 1)
290
+ imgL = np.ascontiguousarray(imgL[None, :, :, :])
291
+ imgR = np.ascontiguousarray(imgR[None, :, :, :])
292
+
293
+ imgL = torch.tensor(imgL.astype("float32")).to(device)
294
+ imgR = torch.tensor(imgR.astype("float32")).to(device)
295
+
296
+ # Use InputPadder to handle any image size
297
+ padder = InputPadder(imgL.shape, divis_by=8)
298
+ imgL_padded, imgR_padded = padder.pad(imgL, imgR)
299
+
300
+ # Downsample for coarse prediction
301
+ imgL_dw2 = F.interpolate(
302
+ imgL_padded,
303
+ size=(imgL_padded.shape[2] // 2, imgL_padded.shape[3] // 2),
304
+ mode="bilinear",
305
+ align_corners=True,
306
+ )
307
+ imgR_dw2 = F.interpolate(
308
+ imgR_padded,
309
+ size=(imgL_padded.shape[2] // 2, imgL_padded.shape[3] // 2),
310
+ mode="bilinear",
311
+ align_corners=True,
312
+ )
313
+
314
+ with torch.inference_mode():
315
+ pred_flow_dw2 = model(imgL_dw2, imgR_dw2, iters=n_iter, flow_init=None)
316
+ pred_flow = model(imgL_padded, imgR_padded, iters=n_iter, flow_init=pred_flow_dw2)
317
+
318
+ # Unpad the result to original dimensions
319
+ pred_flow = padder.unpad(pred_flow)
320
+ pred_disp = torch.squeeze(pred_flow[:, 0, :, :]).cpu().detach().numpy()
321
+
322
+ return pred_disp
323
+
324
+
325
+ def vis_disparity(disparity_map, max_val=None):
326
+ """Visualize disparity map"""
327
+ if max_val is None:
328
+ disp_vis = (disparity_map - disparity_map.min()) / (disparity_map.max() - disparity_map.min()) * 255.0
329
+ else:
330
+ disp_vis = np.clip(disparity_map / max_val * 255.0, 0, 255)
331
+
332
+ disp_vis = disp_vis.astype("uint8")
333
+ disp_vis = cv2.applyColorMap(disp_vis, cv2.COLORMAP_INFERNO)
334
+ disp_vis = cv2.cvtColor(disp_vis, cv2.COLOR_BGR2RGB)
335
+ return disp_vis
336
+
337
+
338
+ # Fixed with static duration
339
+ @spaces.GPU(duration=60) # Static 60 seconds for basic processing
340
+ def process_stereo_pair(model_selection: str, left_image: str, right_image: str,
341
+ progress: gr.Progress = gr.Progress()) -> Tuple[Optional[np.ndarray], str]:
342
+ """
343
+ Main processing function for stereo pair (with model caching)
344
+ """
345
+ logging.info("Starting stereo pair processing...")
346
+
347
+ if left_image is None or right_image is None:
348
+ return None, "❌ Please upload both left and right images."
349
+
350
+ # Convert image paths to numpy arrays
351
+ logging.info(f"Loading images: left={left_image}, right={right_image}")
352
+
353
+ try:
354
+ # Load left image
355
+ if not os.path.exists(left_image):
356
+ logging.error(f"Left image file does not exist: {left_image}")
357
+ return None, f"❌ Left image file not found: {left_image}"
358
+
359
+ logging.info(f"Loading left image from: {left_image}")
360
+ left_img = cv2.imread(left_image)
361
+ if left_img is not None:
362
+ left_img = cv2.cvtColor(left_img, cv2.COLOR_BGR2RGB)
363
+ else:
364
+ # Try with imageio as fallback
365
+ left_img = imageio.imread(left_image)
366
+ if len(left_img.shape) == 3 and left_img.shape[2] == 4:
367
+ left_img = left_img[:, :, :3]
368
+
369
+ # Load right image
370
+ if not os.path.exists(right_image):
371
+ logging.error(f"Right image file does not exist: {right_image}")
372
+ return None, f"❌ Right image file not found: {right_image}"
373
+
374
+ logging.info(f"Loading right image from: {right_image}")
375
+ right_img = cv2.imread(right_image)
376
+ if right_img is not None:
377
+ right_img = cv2.cvtColor(right_img, cv2.COLOR_BGR2RGB)
378
+ else:
379
+ # Try with imageio as fallback
380
+ right_img = imageio.imread(right_image)
381
+ if len(right_img.shape) == 3 and right_img.shape[2] == 4:
382
+ right_img = right_img[:, :, :3]
383
+
384
+ logging.info(f"Images loaded successfully - Left: {left_img.shape}, Right: {right_img.shape}")
385
+
386
+ except Exception as e:
387
+ logging.error(f"Failed to load images: {e}")
388
+ return None, f"❌ Failed to load images: {str(e)}"
389
+
390
+ try:
391
+ # Get cached model
392
+ variant_name = model_selection.split('(')[0].strip() if '(' in model_selection else model_selection
393
+ progress(0.1, desc=f"Loading cached model ({variant_name})...")
394
+ logging.info("🚀 Getting cached model...")
395
+ model, device = get_cached_model(model_selection)
396
+ logging.info("✅ Cached model loaded successfully")
397
+
398
+ progress(0.2, desc="Preprocessing images...")
399
+
400
+ # Validate input images
401
+ if left_img.shape != right_img.shape:
402
+ return None, "❌ Left and right images must have the same dimensions."
403
+
404
+ H, W = left_img.shape[:2]
405
+
406
+ progress(0.5, desc="Running inference...")
407
+
408
+ # Process stereo pair
409
+ torch.cuda.empty_cache() # Clear any cached memory before inference
410
+
411
+ disp_cpu = inference(left_img, right_img, model, device, n_iter=20)
412
+
413
+ progress(0.8, desc="Creating visualization...")
414
+
415
+ # Create visualization
416
+ disparity_vis = vis_disparity(disp_cpu)
417
+ result_image = disparity_vis
418
+
419
+ progress(1.0, desc="Complete!")
420
+
421
+ # Create status message
422
+ valid_mask = ~np.isinf(disp_cpu)
423
+ min_disp = disp_cpu[valid_mask].min() if valid_mask.any() else 0
424
+ max_disp = disp_cpu[valid_mask].max() if valid_mask.any() else 0
425
+ mean_disp = disp_cpu[valid_mask].mean() if valid_mask.any() else 0
426
+
427
+ # Get model variant for status
428
+ variant = variant_name
429
+
430
+ # Check current memory usage
431
+ try:
432
+ current_memory = torch.cuda.memory_allocated(0) / 1024**3
433
+ max_memory = torch.cuda.max_memory_allocated(0) / 1024**3
434
+ memory_info = f" | GPU: {current_memory:.2f}GB/{max_memory:.2f}GB peak"
435
+ except:
436
+ memory_info = ""
437
+
438
+ status = f"""✅ Processing successful!
439
+ 🔧 Model: {variant}{memory_info}
440
+ 📊 Disparity Statistics:
441
+ • Range: {min_disp:.2f} - {max_disp:.2f}
442
+ • Mean: {mean_disp:.2f}
443
+ • Input size: {W}×{H}
444
+ • Valid pixels: {valid_mask.sum()}/{valid_mask.size}"""
445
+
446
+ return result_image, status
447
+
448
+ except Exception as e:
449
+ logging.error(f"Processing failed: {e}")
450
+ # Clean up GPU memory
451
+ torch.cuda.empty_cache()
452
+ gc.collect()
453
+ return None, f"❌ Error: {str(e)}"
454
+
455
+
456
+ # Fixed with static duration
457
+ @spaces.GPU(duration=120) # Static 120 seconds for depth processing
458
+ def process_with_depth(model_selection: str, left_image: str, right_image: str,
459
+ camera_matrix: str, baseline: float,
460
+ progress: gr.Progress = gr.Progress()) -> Tuple[Optional[np.ndarray], Optional[str], Optional[str], str]:
461
+ """
462
+ Process stereo pair and generate depth map and point cloud (with model caching)
463
+ """
464
+ # Import Open3D
465
+ global OPEN3D_AVAILABLE
466
+ try:
467
+ import open3d as o3d
468
+ OPEN3D_AVAILABLE = True
469
+ except ImportError as e:
470
+ logging.warning(f"Open3D not available: {e}")
471
+ OPEN3D_AVAILABLE = False
472
+ return None, None, None, "❌ Open3D not available. Point cloud generation disabled."
473
+
474
+ if left_image is None or right_image is None:
475
+ return None, None, None, "❌ Please upload both left and right images."
476
+
477
+ try:
478
+ progress(0.1, desc="Parsing camera parameters...")
479
+
480
+ # Parse camera matrix
481
+ try:
482
+ K_values = list(map(float, camera_matrix.strip().split()))
483
+ if len(K_values) != 9:
484
+ return None, None, None, "❌ Camera matrix must contain exactly 9 values."
485
+ K = np.array(K_values).reshape(3, 3)
486
+ except ValueError:
487
+ return None, None, None, "❌ Invalid camera matrix format. Use space-separated numbers."
488
+
489
+ if baseline <= 0:
490
+ return None, None, None, "❌ Baseline must be positive."
491
+
492
+ # First get disparity using the same process as basic function
493
+ disparity_result, status = process_stereo_pair(model_selection, left_image, right_image, progress)
494
+
495
+ if disparity_result is None:
496
+ return None, None, None, status
497
+
498
+ # Load images again for depth processing
499
+ left_img = cv2.imread(left_image)
500
+ left_img = cv2.cvtColor(left_img, cv2.COLOR_BGR2RGB)
501
+
502
+ # Get disparity from model again (we need the raw values, not the visualization)
503
+ model, device = get_cached_model(model_selection)
504
+ disp_cpu = inference(left_img, cv2.cvtColor(cv2.imread(right_image), cv2.COLOR_BGR2RGB), model, device, n_iter=20)
505
+
506
+ progress(0.6, desc="Converting to depth...")
507
+
508
+ # Remove invisible points
509
+ H, W = disp_cpu.shape
510
+ yy, xx = np.meshgrid(np.arange(H), np.arange(W), indexing='ij')
511
+ us_right = xx - disp_cpu
512
+ invalid = us_right < 0
513
+ disp_cpu[invalid] = np.inf
514
+
515
+ # Convert to depth using the formula: depth = focal_length * baseline / disparity
516
+ depth = K[0, 0] * baseline / disp_cpu
517
+
518
+ # Visualize depth
519
+ depth_vis = vis_disparity(depth, max_val=10.0)
520
+
521
+ progress(0.8, desc="Generating point cloud...")
522
+
523
+ # Generate point cloud
524
+ fx, fy = K[0, 0], K[1, 1]
525
+ cx, cy = K[0, 2], K[1, 2]
526
+
527
+ # Create coordinate meshgrids
528
+ u, v = np.meshgrid(np.arange(W), np.arange(H))
529
+
530
+ # Convert to 3D coordinates
531
+ valid_depth = ~np.isinf(depth)
532
+ z = depth[valid_depth] # Z coordinate (depth)
533
+ x = (u[valid_depth] - cx) * z / fx # X coordinate
534
+ y = (v[valid_depth] - cy) * z / fy # Y coordinate
535
+
536
+ # Stack coordinates (X, Y, Z)
537
+ points = np.stack([x, y, z], axis=-1)
538
+
539
+ # Get corresponding colors
540
+ colors = left_img[valid_depth]
541
+
542
+ # Filter points by depth range
543
+ depth_mask = (z > 0) & (z <= 10.0)
544
+ valid_points = points[depth_mask]
545
+ valid_colors = colors[depth_mask]
546
+
547
+ if len(valid_points) == 0:
548
+ return depth_vis, None, None, "⚠️ No valid points generated for point cloud."
549
+
550
+ # Subsample points for better performance
551
+ if len(valid_points) > 100000:
552
+ indices = np.random.choice(len(valid_points), 100000, replace=False)
553
+ valid_points = valid_points[indices]
554
+ valid_colors = valid_colors[indices]
555
+
556
+ # Transform coordinates for proper visualization
557
+ transformed_points = valid_points.copy()
558
+ transformed_points[:, 1] = -transformed_points[:, 1] # Flip Y axis
559
+ transformed_points[:, 2] = -transformed_points[:, 2] # Flip Z axis
560
+
561
+ # Generate point cloud
562
+ pcd = o3d.geometry.PointCloud()
563
+ pcd.points = o3d.utility.Vector3dVector(transformed_points)
564
+ pcd.colors = o3d.utility.Vector3dVector(valid_colors / 255.0)
565
+
566
+ progress(1.0, desc="Complete!")
567
+
568
+ # Check current memory usage
569
+ try:
570
+ current_memory = torch.cuda.memory_allocated(0) / 1024**3
571
+ max_memory = torch.cuda.max_memory_allocated(0) / 1024**3
572
+ memory_info = f" | GPU: {current_memory:.2f}GB/{max_memory:.2f}GB peak"
573
+ except:
574
+ memory_info = ""
575
+
576
+ variant = model_selection.split('(')[0].strip() if '(' in model_selection else model_selection
577
+
578
+ status = f"""✅ Depth processing successful!
579
+ 🔧 Model: {variant}{memory_info}
580
+ 📊 Statistics:
581
+ • Valid points: {len(valid_points):,}
582
+ • Depth range: {z.min():.2f} - {z.max():.2f} m
583
+ • Baseline: {baseline} m
584
+ • Point cloud generated with {len(valid_points)} points
585
+ • 3D visualization available"""
586
+
587
+ return depth_vis, None, None, status
588
+
589
+ except Exception as e:
590
+ logging.error(f"Depth processing failed: {e}")
591
+ torch.cuda.empty_cache()
592
+ gc.collect()
593
+ return None, None, None, f"❌ Error: {str(e)}"
594
+
595
+
596
+ def create_app() -> gr.Blocks:
597
+ """Create the Gradio application"""
598
+
599
+ # Get available models
600
+ try:
601
+ available_models = get_available_models()
602
+ logging.info(f"Successfully got available models: {len(available_models)} found")
603
+ except Exception as e:
604
+ logging.error(f"Failed to get available models: {e}")
605
+ available_models = {}
606
+
607
+ with gr.Blocks(
608
+ title="CREStereo - Stereo Depth Estimation",
609
+ theme=gr.themes.Soft(),
610
+ css="footer {visibility: hidden}",
611
+ delete_cache=(60, 60)
612
+ ) as app:
613
+
614
+ gr.Markdown("""
615
+ # 🔍 CREStereo: Practical Stereo Matching
616
+
617
+ Upload a pair of **rectified** stereo images to get disparity estimation using CREStereo.
618
+
619
+ ⚠️ **Important**: Images should be rectified (epipolar lines are horizontal) and undistorted.
620
+ ⚡ **GPU Powered**: Runs on CUDA-enabled GPUs for fast inference.
621
+ """)
622
+
623
+ # Instructions section
624
+ with gr.Accordion("📋 Instructions", open=False):
625
+ gr.Markdown("""
626
+ ## 🚀 How to Use This Demo
627
+
628
+ ### 🖼️ Input Requirements
629
+ 1. **Image Format**: Upload images in JPEG or PNG format.
630
+ 2. **Image Size**: Images should be of the same size and resolution.
631
+ 3. **Rectification**: Ensure images are rectified (epipolar lines are horizontal) and undistorted.
632
+ 4. **Camera Parameters**: For depth processing, provide camera matrix and baseline distance.
633
+
634
+ ### 📊 Using the Demo
635
+ 1. **Select Model**: Choose the CREStereo model variant
636
+ 2. **Upload Images**: Provide rectified stereo image pairs
637
+ 3. **Basic Processing**: Get disparity visualization
638
+ 4. **Advanced Processing**: Generate depth maps and 3D point clouds (requires camera parameters)
639
+
640
+ ### 📖 Original Work
641
+ This demo is based on CREStereo: Practical Stereo Matching via Cascaded Recurrent Network.
642
+ - **Paper**: [CREStereo: Practical Stereo Matching via Cascaded Recurrent Network](https://arxiv.org/abs/2203.11483)
643
+ - **Official Repository**: [https://github.com/megvii-research/CREStereo](https://github.com/megvii-research/CREStereo)
644
+ """)
645
+
646
+ # Model selection
647
+ with gr.Row():
648
+ all_choices = list(available_models.keys())
649
+
650
+ if not all_choices:
651
+ all_choices = ["No models found - Please ensure crestereo_eth3d.pth is in models/ directory"]
652
+
653
+ default_model = all_choices[0] if all_choices else None
654
+
655
+ model_selector = gr.Dropdown(
656
+ choices=all_choices,
657
+ value=default_model,
658
+ label="🎯 Select Model",
659
+ info="Choose the CREStereo model variant.",
660
+ interactive=True
661
+ )
662
+
663
+ with gr.Tabs():
664
+ # Basic stereo processing tab
665
+ with gr.TabItem("🖼️ Basic Stereo Processing"):
666
+ with gr.Row():
667
+ with gr.Column():
668
+ left_input = gr.Image(
669
+ label="📷 Left Image",
670
+ type="filepath",
671
+ height=300
672
+ )
673
+ right_input = gr.Image(
674
+ label="📷 Right Image",
675
+ type="filepath",
676
+ height=300
677
+ )
678
+
679
+ process_btn = gr.Button(
680
+ "🚀 Process Stereo Pair",
681
+ variant="primary",
682
+ size="lg"
683
+ )
684
+
685
+ with gr.Column():
686
+ output_image = gr.Image(
687
+ label="📊 Disparity Visualization",
688
+ height=400
689
+ )
690
+ status_text = gr.Textbox(
691
+ label="Status",
692
+ interactive=False,
693
+ lines=8
694
+ )
695
+
696
+ # Example images
697
+ examples_list = []
698
+
699
+ # Example 1
700
+ if os.path.exists(os.path.join(current_dir, "assets", "example1", "left.png")):
701
+ examples_list.append([
702
+ os.path.join(current_dir, "assets", "example1", "left.png"),
703
+ os.path.join(current_dir, "assets", "example1", "right.png")
704
+ ])
705
+
706
+ # Example 2
707
+ if os.path.exists(os.path.join(current_dir, "assets", "example2", "left.png")):
708
+ examples_list.append([
709
+ os.path.join(current_dir, "assets", "example2", "left.png"),
710
+ os.path.join(current_dir, "assets", "example2", "right.png")
711
+ ])
712
+
713
+ if examples_list:
714
+ gr.Examples(
715
+ examples=examples_list,
716
+ inputs=[left_input, right_input],
717
+ label="📋 Example Images"
718
+ )
719
+
720
+ # Advanced processing with depth
721
+ with gr.TabItem("📐 Advanced Processing (Depth & Point Cloud)"):
722
+ with gr.Row():
723
+ with gr.Column():
724
+ left_input_adv = gr.Image(
725
+ label="📷 Left Image",
726
+ type="filepath",
727
+ height=250
728
+ )
729
+ right_input_adv = gr.Image(
730
+ label="📷 Right Image",
731
+ type="filepath",
732
+ height=250
733
+ )
734
+
735
+ # Camera parameters
736
+ with gr.Group():
737
+ gr.Markdown("### 📹 Camera Parameters")
738
+ camera_matrix_input = gr.Textbox(
739
+ label="Camera Matrix (9 values: fx 0 cx 0 fy cy 0 0 1)",
740
+ value="",
741
+ )
742
+ baseline_input = gr.Number(
743
+ label="Baseline (meters)",
744
+ value=None,
745
+ minimum=0.001,
746
+ maximum=10.0,
747
+ step=0.001
748
+ )
749
+
750
+ process_depth_btn = gr.Button(
751
+ "🔬 Process with Depth",
752
+ variant="primary",
753
+ size="lg"
754
+ )
755
+
756
+ with gr.Column():
757
+ depth_output = gr.Image(
758
+ label="📏 Depth Visualization",
759
+ height=300
760
+ )
761
+ pointcloud_output = gr.File(
762
+ label="☁️ Point Cloud Download (.ply)",
763
+ file_types=[".ply"]
764
+ )
765
+ status_depth = gr.Textbox(
766
+ label="Status",
767
+ interactive=False,
768
+ lines=6
769
+ )
770
+
771
+ # 3D Point Cloud Visualization
772
+ with gr.Row():
773
+ pointcloud_3d = gr.Model3D(
774
+ label="🌐 3D Point Cloud Viewer",
775
+ clear_color=[0.0, 0.0, 0.0, 0.0],
776
+ height=400
777
+ )
778
+
779
+ # Example images for advanced processing
780
+ examples_advanced_list = []
781
+
782
+ # Try to read camera parameters from K.txt files
783
+ # Example 1
784
+ if os.path.exists(os.path.join(current_dir, "assets", "example1", "left.png")):
785
+ k_file = os.path.join(current_dir, "assets", "example1", "K.txt")
786
+ camera_matrix_str = ""
787
+ baseline_val = 0.063 # default
788
+
789
+ if os.path.exists(k_file):
790
+ try:
791
+ with open(k_file, 'r') as f:
792
+ lines = f.readlines()
793
+ if len(lines) >= 1:
794
+ camera_matrix_str = lines[0].strip()
795
+ if len(lines) >= 2:
796
+ baseline_val = float(lines[1].strip())
797
+ except:
798
+ camera_matrix_str = "754.6680908203125 0.0 489.3794860839844 0.0 754.6680908203125 265.16162109375 0.0 0.0 1.0"
799
+
800
+ examples_advanced_list.append([
801
+ os.path.join(current_dir, "assets", "example1", "left.png"),
802
+ os.path.join(current_dir, "assets", "example1", "right.png"),
803
+ camera_matrix_str,
804
+ baseline_val
805
+ ])
806
+
807
+ # Example 2
808
+ if os.path.exists(os.path.join(current_dir, "assets", "example2", "left.png")):
809
+ k_file = os.path.join(current_dir, "assets", "example2", "K.txt")
810
+ camera_matrix_str = ""
811
+ baseline_val = 0.537 # default
812
+
813
+ if os.path.exists(k_file):
814
+ try:
815
+ with open(k_file, 'r') as f:
816
+ lines = f.readlines()
817
+ if len(lines) >= 1:
818
+ camera_matrix_str = lines[0].strip()
819
+ if len(lines) >= 2:
820
+ baseline_val = float(lines[1].strip())
821
+ except:
822
+ camera_matrix_str = "1733.74 0.0 792.27 0.0 1733.74 541.89 0.0 0.0 1.0"
823
+
824
+ examples_advanced_list.append([
825
+ os.path.join(current_dir, "assets", "example2", "left.png"),
826
+ os.path.join(current_dir, "assets", "example2", "right.png"),
827
+ camera_matrix_str,
828
+ baseline_val
829
+ ])
830
+
831
+ if examples_advanced_list:
832
+ gr.Examples(
833
+ examples=examples_advanced_list,
834
+ inputs=[left_input_adv, right_input_adv, camera_matrix_input, baseline_input],
835
+ label="📋 Example Images with Camera Parameters"
836
+ )
837
+
838
+ # Event handlers
839
+ if available_models:
840
+ process_btn.click(
841
+ fn=process_stereo_pair,
842
+ inputs=[model_selector, left_input, right_input],
843
+ outputs=[output_image, status_text],
844
+ show_progress=True
845
+ )
846
+
847
+ if OPEN3D_AVAILABLE:
848
+ process_depth_btn.click(
849
+ fn=process_with_depth,
850
+ inputs=[model_selector, left_input_adv, right_input_adv, camera_matrix_input, baseline_input],
851
+ outputs=[depth_output, pointcloud_output, pointcloud_3d, status_depth],
852
+ show_progress=True
853
+ )
854
+ else:
855
+ process_depth_btn.click(
856
+ fn=lambda *args: (None, None, None, "❌ Open3D not available. Install with: pip install open3d"),
857
+ inputs=[model_selector, left_input_adv, right_input_adv, camera_matrix_input, baseline_input],
858
+ outputs=[depth_output, pointcloud_output, pointcloud_3d, status_depth]
859
+ )
860
+ else:
861
+ # No models available
862
+ process_btn.click(
863
+ fn=lambda *args: (None, "❌ No models available. Please ensure crestereo_eth3d.pth is in models/ directory."),
864
+ inputs=[model_selector, left_input, right_input],
865
+ outputs=[output_image, status_text]
866
+ )
867
+
868
+ process_depth_btn.click(
869
+ fn=lambda *args: (None, None, None, "❌ No models available. Please ensure crestereo_eth3d.pth is in models/ directory."),
870
+ inputs=[model_selector, left_input_adv, right_input_adv, camera_matrix_input, baseline_input],
871
+ outputs=[depth_output, pointcloud_output, pointcloud_3d, status_depth]
872
+ )
873
+
874
+ # Citation section at the bottom
875
+ with gr.Accordion("📖 Citation", open=False):
876
+ gr.Markdown("""
877
+ ### 📄 Please Cite the Original Paper
878
+
879
+ If you use this work in your research, please cite:
880
+
881
+ ```bibtex
882
+ @article{li2022practical,
883
+ title={Practical Stereo Matching via Cascaded Recurrent Network with Adaptive Correlation},
884
+ author={Li, Jiankun and Wang, Peisen and Xiong, Pengfei and Cai, Tao and Yan, Ziwei and Yang, Lei and Liu, Jiangyu and Fan, Haoqiang and Liu, Shuaicheng},
885
+ journal={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
886
+ pages={16263--16272},
887
+ year={2022}
888
+ }
889
+ ```
890
+ """)
891
+
892
+ # Footer
893
+ gr.Markdown("""
894
+ ---
895
+ ### 📝 Notes:
896
+ - **Input images must be rectified stereo pairs** (epipolar lines are horizontal)
897
+ - **⚡ GPU Acceleration**: Requires CUDA-compatible GPU
898
+ - **📦 Model Caching**: Models are cached for efficient repeated usage
899
+ - For best results, use high-quality rectified stereo pairs
900
+ - Model works on RGB images and supports various resolutions
901
+
902
+ ### 🔗 References:
903
+ - [CREStereo Paper](https://arxiv.org/abs/2203.11483)
904
+ - [Original GitHub Repository](https://github.com/megvii-research/CREStereo)
905
+ - [This PyTorch Implementation](https://github.com/ibaiGorordo/CREStereo-Pytorch)
906
+ """)
907
+
908
+ return app
909
+
910
+
911
+ def main():
912
+ """Main function to launch the app"""
913
+
914
+ # Ensure no CUDA operations during startup
915
+ if torch.cuda.is_available():
916
+ logging.warning("CUDA detected during startup - this should not happen in ZeroGPU")
917
+
918
+ logging.info("🚀 Starting CREStereo Gradio App...")
919
+
920
+ # Parse command line arguments
921
+ import argparse
922
+ parser = argparse.ArgumentParser(description="CREStereo Gradio App")
923
+ parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to bind to")
924
+ parser.add_argument("--port", type=int, default=7860, help="Port to bind to")
925
+ parser.add_argument("--share", action="store_true", help="Create shareable link")
926
+ parser.add_argument("--debug", action="store_true", help="Enable debug mode")
927
+
928
+ args = parser.parse_args()
929
+
930
+ if args.debug:
931
+ logging.getLogger().setLevel(logging.DEBUG)
932
+
933
+ try:
934
+ # Create and launch app
935
+ logging.info("Creating Gradio app...")
936
+ app = create_app()
937
+ logging.info("✅ Gradio app created successfully")
938
+
939
+ logging.info(f"Launching app on {args.host}:{args.port}")
940
+ if args.share:
941
+ logging.info("Share link will be created")
942
+
943
+ # For ZeroGPU compatibility, launch with appropriate settings
944
+ app.launch(
945
+ server_name=args.host,
946
+ server_port=args.port,
947
+ share=args.share,
948
+ show_error=True,
949
+ favicon_path=None,
950
+ ssr_mode=False, # Disable SSR for ZeroGPU compatibility
951
+ allowed_paths=["./"] # Allow access to local files
952
+ )
953
+ except Exception as e:
954
+ logging.error(f"Failed to launch app: {e}")
955
+ raise
956
+
957
+
958
+ if __name__ == "__main__":
959
+ # Additional safety check for ZeroGPU environment
960
+ if 'SPACE_ID' in os.environ:
961
+ logging.info("Running in Hugging Face Spaces environment")
962
+
963
+ # Do not check CUDA status during startup - this can trigger CUDA initialization
964
+ # The CUDA status will be checked inside the @spaces.GPU decorated functions
965
+ logging.info("✅ CUDA status will be checked within GPU-decorated functions")
966
+
967
+ main()
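
For reference, the depth and point-cloud step in `process_with_depth` above is the standard pinhole back-projection for a rectified pair: depth = fx · baseline / disparity, then X = (u − cx)·Z/fx and Y = (v − cy)·Z/fy. The following is a minimal, self-contained sketch of that math; the intrinsics, baseline, and disparity values are illustrative placeholders, not taken from the demo's assets.

```python
# Sketch of the disparity -> depth -> 3D point conversion used in
# process_with_depth(). Assumes a rectified stereo pair, a 3x3 intrinsic
# matrix K, and a baseline in meters; the numbers below are made up.
import numpy as np

def disparity_to_points(disp: np.ndarray, K: np.ndarray, baseline: float,
                        max_depth: float = 10.0) -> np.ndarray:
    """Back-project an (H, W) disparity map into an (N, 3) array of 3D points."""
    fx, fy = K[0, 0], K[1, 1]
    cx, cy = K[0, 2], K[1, 2]
    H, W = disp.shape

    # depth = fx * baseline / disparity; zero or negative disparities become inf
    with np.errstate(divide="ignore", invalid="ignore"):
        depth = fx * baseline / disp
    valid = np.isfinite(depth) & (depth > 0) & (depth <= max_depth)

    # Pixel grid -> metric camera coordinates
    u, v = np.meshgrid(np.arange(W), np.arange(H))
    z = depth[valid]
    x = (u[valid] - cx) * z / fx
    y = (v[valid] - cy) * z / fy
    return np.stack([x, y, z], axis=-1)

# Example: fx = fy = 700 px, principal point (320, 240), 6.3 cm baseline,
# constant 50 px disparity -> depth ≈ 700 * 0.063 / 50 ≈ 0.88 m everywhere.
K = np.array([[700.0, 0.0, 320.0],
              [0.0, 700.0, 240.0],
              [0.0, 0.0, 1.0]])
points = disparity_to_points(np.full((480, 640), 50.0), K, baseline=0.063)
print(points.shape, float(points[:, 2].mean()))
```

The app additionally flips the Y and Z axes of the resulting points before handing them to Open3D, which only affects display orientation, not the geometry.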
CREStereo_demo/app_local.py ADDED
@@ -0,0 +1,889 @@
1
+ import os
2
+ import sys
3
+ import logging
4
+ import tempfile
5
+ import gc
6
+ from pathlib import Path
7
+ from typing import Optional, Tuple, Union
8
+ import numpy as np
9
+ import cv2
10
+ import gradio as gr
11
+ import imageio
12
+
13
+ import torch
14
+ import torch.nn.functional as F
15
+
16
+ # Set default tensor type if needed
17
+ # torch.set_default_tensor_type('torch.FloatTensor')
18
+
19
+ # CUDA backend settings
20
+ # torch.backends.cudnn.enabled = False
21
+ # torch.backends.cudnn.benchmark = False
22
+
23
+ # Use current directory as base
24
+ current_dir = os.path.dirname(os.path.abspath(__file__))
25
+ base_dir = current_dir
26
+
27
+ # Add current directory to path for local imports
28
+ sys.path.insert(0, current_dir)
29
+
30
+ # Import local modules
31
+ from nets import Model
32
+
33
+ # Import Open3D with error handling
34
+ OPEN3D_AVAILABLE = False
35
+ try:
36
+ # Set Open3D to CPU mode to avoid CUDA initialization
37
+ os.environ['OPEN3D_CPU_RENDERING'] = '1'
38
+ # Don't import open3d here - do it inside functions
39
+ # import open3d as o3d
40
+ OPEN3D_AVAILABLE = True # Assume available, will check later
41
+ except Exception as e:
42
+ logging.warning(f"Open3D setup failed: {e}")
43
+ OPEN3D_AVAILABLE = False
44
+
45
+ # Configure logging
46
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
47
+
48
+ # Model configuration
49
+ MODEL_VARIANTS = {
50
+ "crestereo_eth3d": {
51
+ "display_name": "CREStereo ETH3D (Pre-trained model)",
52
+ "model_file": "models/crestereo_eth3d.pth",
53
+ "max_disp": 256
54
+ }
55
+ }
56
+
57
+ # Global variables for model caching
58
+ _cached_model = None
59
+ _cached_device = None
60
+ _cached_model_selection = None
61
+
62
+
63
+ class InputPadder:
64
+ """ Pads images such that dimensions are divisible by divis_by """
65
+ def __init__(self, dims, divis_by=8, force_square=False):
66
+ self.ht, self.wd = dims[-2:]
67
+ pad_ht = (((self.ht // divis_by) + 1) * divis_by - self.ht) % divis_by
68
+ pad_wd = (((self.wd // divis_by) + 1) * divis_by - self.wd) % divis_by
69
+
70
+ if force_square:
71
+ # Make the padded dimensions square
72
+ max_dim = max(self.ht + pad_ht, self.wd + pad_wd)
73
+ pad_ht = max_dim - self.ht
74
+ pad_wd = max_dim - self.wd
75
+
76
+ self._pad = [pad_wd//2, pad_wd - pad_wd//2, pad_ht//2, pad_ht - pad_ht//2]
77
+
78
+ def pad(self, *inputs):
79
+ return [F.pad(x, self._pad, mode='replicate') for x in inputs]
80
+
81
+ def unpad(self, x):
82
+ ht, wd = x.shape[-2:]
83
+ c = [self._pad[2], ht-self._pad[3], self._pad[0], wd-self._pad[1]]
84
+ return x[..., c[0]:c[1], c[2]:c[3]]
85
+
86
+
87
+ def aggressive_cleanup():
88
+ """Perform basic cleanup"""
89
+ import gc
90
+ gc.collect()
91
+ logging.info("Performed basic memory cleanup")
92
+
93
+
94
+ def check_gpu_memory():
95
+ """Check and log current GPU memory usage"""
96
+ try:
97
+ allocated = torch.cuda.memory_allocated(0) / 1024**3
98
+ reserved = torch.cuda.memory_reserved(0) / 1024**3
99
+ max_allocated = torch.cuda.max_memory_allocated(0) / 1024**3
100
+ total = torch.cuda.get_device_properties(0).total_memory / 1024**3
101
+
102
+ logging.info(f"GPU Memory - Allocated: {allocated:.2f}GB, Reserved: {reserved:.2f}GB, Max: {max_allocated:.2f}GB, Total: {total:.2f}GB")
103
+ return allocated, reserved, max_allocated, total
104
+ except RuntimeError as e:
105
+ logging.warning(f"Failed to get GPU memory info: {e}")
106
+ return None, None, None, None
107
+
108
+
109
+ def get_available_models() -> dict:
110
+ """Get all available models with their display names"""
111
+ models = {}
112
+
113
+ # Check for local models
114
+ for variant, info in MODEL_VARIANTS.items():
115
+ model_path = os.path.join(current_dir, info["model_file"])
116
+
117
+ if os.path.exists(model_path):
118
+ display_name = info["display_name"]
119
+ models[display_name] = {
120
+ "model_path": model_path,
121
+ "variant": variant,
122
+ "max_disp": info["max_disp"],
123
+ "source": "local"
124
+ }
125
+
126
+ return models
127
+
128
+
129
+ def get_model_paths_from_selection(model_selection: str) -> Tuple[Optional[str], Optional[dict]]:
130
+ """Get model path and config from the selected model"""
131
+ models = get_available_models()
132
+
133
+ # Check if it's in our models dict
134
+ if model_selection in models:
135
+ model_info = models[model_selection]
136
+ logging.info(f"📁 Using local model: {model_selection}")
137
+ return model_info["model_path"], model_info
138
+
139
+ return None, None
140
+
141
+
142
+ def load_model_for_inference(model_path: str, model_info: dict):
143
+ """Load CREStereo model for inference"""
144
+ # Check if CUDA is available
145
+ if not torch.cuda.is_available():
146
+ raise RuntimeError("CUDA is not available.")
147
+
148
+ # Use the first available CUDA device
149
+ device = torch.device("cuda")
150
+
151
+ try:
152
+ # Create model
153
+ max_disp = model_info.get("max_disp", 256)
154
+ model = Model(max_disp=max_disp, mixed_precision=False, test_mode=True)
155
+
156
+ # Load checkpoint
157
+ ckpt = torch.load(model_path, map_location=device)
158
+ model.load_state_dict(ckpt, strict=True)
159
+ model.to(device)
160
+ model.eval()
161
+
162
+ logging.info("Loaded CREStereo model weights")
163
+
164
+ # Memory optimizations
165
+ torch.set_grad_enabled(False)
166
+ logging.info("Applied memory optimizations")
167
+
168
+ return model, device
169
+
170
+ except Exception as e:
171
+ logging.error(f"Model loading failed: {e}")
172
+ raise RuntimeError(f"Failed to load model: {e}")
173
+
174
+
175
+ def get_cached_model(model_selection: str):
176
+ """Get cached model or load new one if selection changed"""
177
+ global _cached_model, _cached_device, _cached_model_selection
178
+
179
+ # Get model paths from selection
180
+ model_path, model_info = get_model_paths_from_selection(model_selection)
181
+
182
+ if model_path is None or model_info is None:
183
+ raise ValueError(f"Selected model not found: {model_selection}")
184
+
185
+ # Check if we need to reload the model
186
+ if (_cached_model is None or
187
+ _cached_model_selection != model_selection):
188
+
189
+ # Clear previous model if exists
190
+ if _cached_model is not None:
191
+ del _cached_model
192
+ torch.cuda.empty_cache()
193
+ gc.collect()
194
+
195
+ logging.info(f"🚀 Loading model: {model_selection}")
196
+ _cached_model, _cached_device = load_model_for_inference(model_path, model_info)
197
+ _cached_model_selection = model_selection
198
+
199
+ logging.info(f"✅ Model loaded successfully: {model_selection}")
200
+ else:
201
+ logging.info(f"✅ Using cached model: {model_selection}")
202
+
203
+ return _cached_model, _cached_device
204
+
205
+
206
+ def clear_model_cache():
207
+ """Clear the cached model to free memory"""
208
+ global _cached_model, _cached_device, _cached_model_selection
209
+
210
+ if _cached_model is not None:
211
+ logging.info("Clearing model cache...")
212
+ del _cached_model
213
+ _cached_model = None
214
+ _cached_device = None
215
+ _cached_model_selection = None
216
+
217
+ # Simple cleanup
218
+ import gc
219
+ gc.collect()
220
+ torch.cuda.empty_cache()
221
+ logging.info("Model cache cleared")
222
+ else:
223
+ logging.info("No model in cache to clear")
224
+
225
+
226
+ def inference(left, right, model, device, n_iter=20):
227
+ """Run CREStereo inference on stereo pair"""
228
+ print("Model Forwarding...")
229
+ imgL = left.transpose(2, 0, 1)
230
+ imgR = right.transpose(2, 0, 1)
231
+ imgL = np.ascontiguousarray(imgL[None, :, :, :])
232
+ imgR = np.ascontiguousarray(imgR[None, :, :, :])
233
+
234
+ imgL = torch.tensor(imgL.astype("float32")).to(device)
235
+ imgR = torch.tensor(imgR.astype("float32")).to(device)
236
+
237
+ # Use InputPadder to handle any image size
238
+ padder = InputPadder(imgL.shape, divis_by=8)
239
+ imgL_padded, imgR_padded = padder.pad(imgL, imgR)
240
+
241
+ # Downsample for coarse prediction
242
+ imgL_dw2 = F.interpolate(
243
+ imgL_padded,
244
+ size=(imgL_padded.shape[2] // 2, imgL_padded.shape[3] // 2),
245
+ mode="bilinear",
246
+ align_corners=True,
247
+ )
248
+ imgR_dw2 = F.interpolate(
249
+ imgR_padded,
250
+ size=(imgL_padded.shape[2] // 2, imgL_padded.shape[3] // 2),
251
+ mode="bilinear",
252
+ align_corners=True,
253
+ )
254
+
255
+ with torch.inference_mode():
256
+ pred_flow_dw2 = model(imgL_dw2, imgR_dw2, iters=n_iter, flow_init=None)
257
+ pred_flow = model(imgL_padded, imgR_padded, iters=n_iter, flow_init=pred_flow_dw2)
258
+
259
+ # Unpad the result to original dimensions
260
+ pred_flow = padder.unpad(pred_flow)
261
+ pred_disp = torch.squeeze(pred_flow[:, 0, :, :]).cpu().detach().numpy()
262
+
263
+ return pred_disp
264
+
265
+
266
+ def vis_disparity(disparity_map, max_val=None):
267
+ """Visualize disparity map"""
268
+ if max_val is None:
269
+ disp_vis = (disparity_map - disparity_map.min()) / (disparity_map.max() - disparity_map.min()) * 255.0
270
+ else:
271
+ disp_vis = np.clip(disparity_map / max_val * 255.0, 0, 255)
272
+
273
+ disp_vis = disp_vis.astype("uint8")
274
+ disp_vis = cv2.applyColorMap(disp_vis, cv2.COLORMAP_INFERNO)
275
+ disp_vis = cv2.cvtColor(disp_vis, cv2.COLOR_BGR2RGB)
276
+ return disp_vis
277
+
278
+
279
+ def process_stereo_pair(model_selection: str, left_image: str, right_image: str,
280
+ progress: gr.Progress = gr.Progress()) -> Tuple[Optional[np.ndarray], str]:
281
+ """
282
+ Main processing function for stereo pair (with model caching)
283
+ """
284
+ logging.info("Starting stereo pair processing...")
285
+
286
+ if left_image is None or right_image is None:
287
+ return None, "❌ Please upload both left and right images."
288
+
289
+ # Convert image paths to numpy arrays
290
+ logging.info(f"Loading images: left={left_image}, right={right_image}")
291
+
292
+ try:
293
+ # Load left image
294
+ if not os.path.exists(left_image):
295
+ logging.error(f"Left image file does not exist: {left_image}")
296
+ return None, f"❌ Left image file not found: {left_image}"
297
+
298
+ logging.info(f"Loading left image from: {left_image}")
299
+ left_img = cv2.imread(left_image)
300
+ if left_img is not None:
301
+ left_img = cv2.cvtColor(left_img, cv2.COLOR_BGR2RGB)
302
+ else:
303
+ # Try with imageio as fallback
304
+ left_img = imageio.imread(left_image)
305
+ if len(left_img.shape) == 3 and left_img.shape[2] == 4:
306
+ left_img = left_img[:, :, :3]
307
+
308
+ # Load right image
309
+ if not os.path.exists(right_image):
310
+ logging.error(f"Right image file does not exist: {right_image}")
311
+ return None, f"❌ Right image file not found: {right_image}"
312
+
313
+ logging.info(f"Loading right image from: {right_image}")
314
+ right_img = cv2.imread(right_image)
315
+ if right_img is not None:
316
+ right_img = cv2.cvtColor(right_img, cv2.COLOR_BGR2RGB)
317
+ else:
318
+ # Try with imageio as fallback
319
+ right_img = imageio.imread(right_image)
320
+ if len(right_img.shape) == 3 and right_img.shape[2] == 4:
321
+ right_img = right_img[:, :, :3]
322
+
323
+ logging.info(f"Images loaded successfully - Left: {left_img.shape}, Right: {right_img.shape}")
324
+
325
+ except Exception as e:
326
+ logging.error(f"Failed to load images: {e}")
327
+ return None, f"❌ Failed to load images: {str(e)}"
328
+
329
+ try:
330
+ # Get cached model
331
+ variant_name = model_selection.split('(')[0].strip() if '(' in model_selection else model_selection
332
+ progress(0.1, desc=f"Loading cached model ({variant_name})...")
333
+ logging.info("🚀 Getting cached model...")
334
+ model, device = get_cached_model(model_selection)
335
+ logging.info("✅ Cached model loaded successfully")
336
+
337
+ progress(0.2, desc="Preprocessing images...")
338
+
339
+ # Validate input images
340
+ if left_img.shape != right_img.shape:
341
+ return None, "❌ Left and right images must have the same dimensions."
342
+
343
+ H, W = left_img.shape[:2]
344
+
345
+ progress(0.5, desc="Running inference...")
346
+
347
+ # Process stereo pair
348
+ torch.cuda.empty_cache() # Clear any cached memory before inference
349
+
350
+ disp_cpu = inference(left_img, right_img, model, device, n_iter=20)
351
+
352
+ progress(0.8, desc="Creating visualization...")
353
+
354
+ # Create visualization
355
+ disparity_vis = vis_disparity(disp_cpu)
356
+ result_image = disparity_vis
357
+
358
+ progress(1.0, desc="Complete!")
359
+
360
+ # Create status message
361
+ valid_mask = ~np.isinf(disp_cpu)
362
+ min_disp = disp_cpu[valid_mask].min() if valid_mask.any() else 0
363
+ max_disp = disp_cpu[valid_mask].max() if valid_mask.any() else 0
364
+ mean_disp = disp_cpu[valid_mask].mean() if valid_mask.any() else 0
365
+
366
+ # Get model variant for status
367
+ variant = variant_name
368
+
369
+ # Check current memory usage
370
+ try:
371
+ current_memory = torch.cuda.memory_allocated(0) / 1024**3
372
+ max_memory = torch.cuda.max_memory_allocated(0) / 1024**3
373
+ memory_info = f" | GPU: {current_memory:.2f}GB/{max_memory:.2f}GB peak"
374
+ except:
375
+ memory_info = ""
376
+
377
+ status = f"""✅ Processing successful!
378
+ 🔧 Model: {variant}{memory_info}
379
+ 📊 Disparity Statistics:
380
+ • Range: {min_disp:.2f} - {max_disp:.2f}
381
+ • Mean: {mean_disp:.2f}
382
+ • Input size: {W}×{H}
383
+ • Valid pixels: {valid_mask.sum()}/{valid_mask.size}"""
384
+
385
+ return result_image, status
386
+
387
+ except Exception as e:
388
+ logging.error(f"Processing failed: {e}")
389
+ # Clean up GPU memory
390
+ torch.cuda.empty_cache()
391
+ gc.collect()
392
+ return None, f"❌ Error: {str(e)}"
393
+
394
+
395
+ def process_with_depth(model_selection: str, left_image: str, right_image: str,
396
+ camera_matrix: str, baseline: float,
397
+ progress: gr.Progress = gr.Progress()) -> Tuple[Optional[np.ndarray], Optional[str], Optional[str], str]:
398
+ """
399
+ Process stereo pair and generate depth map and point cloud (with model caching)
400
+ """
401
+ # Import Open3D
402
+ global OPEN3D_AVAILABLE
403
+ try:
404
+ import open3d as o3d
405
+ OPEN3D_AVAILABLE = True
406
+ except ImportError as e:
407
+ logging.warning(f"Open3D not available: {e}")
408
+ OPEN3D_AVAILABLE = False
409
+ return None, None, None, "❌ Open3D not available. Point cloud generation disabled."
410
+
411
+ if left_image is None or right_image is None:
412
+ return None, None, None, "❌ Please upload both left and right images."
413
+
414
+ try:
415
+ progress(0.1, desc="Parsing camera parameters...")
416
+
417
+ # Parse camera matrix
418
+ try:
419
+ K_values = list(map(float, camera_matrix.strip().split()))
420
+ if len(K_values) != 9:
421
+ return None, None, None, "❌ Camera matrix must contain exactly 9 values."
422
+ K = np.array(K_values).reshape(3, 3)
423
+ except ValueError:
424
+ return None, None, None, "❌ Invalid camera matrix format. Use space-separated numbers."
425
+
426
+ if baseline <= 0:
427
+ return None, None, None, "❌ Baseline must be positive."
428
+
429
+ # First get disparity using the same process as basic function
430
+ disparity_result, status = process_stereo_pair(model_selection, left_image, right_image, progress)
431
+
432
+ if disparity_result is None:
433
+ return None, None, None, status
434
+
435
+ # Load images again for depth processing
436
+ left_img = cv2.imread(left_image)
437
+ left_img = cv2.cvtColor(left_img, cv2.COLOR_BGR2RGB)
438
+
439
+ # Get disparity from model again (we need the raw values, not the visualization)
440
+ model, device = get_cached_model(model_selection)
441
+ disp_cpu = inference(left_img, cv2.cvtColor(cv2.imread(right_image), cv2.COLOR_BGR2RGB), model, device, n_iter=20)
442
+
443
+ progress(0.6, desc="Converting to depth...")
444
+
445
+ # Remove invisible points
446
+ H, W = disp_cpu.shape
447
+ yy, xx = np.meshgrid(np.arange(H), np.arange(W), indexing='ij')
448
+ us_right = xx - disp_cpu
449
+ invalid = us_right < 0
450
+ disp_cpu[invalid] = np.inf
451
+
452
+ # Convert to depth using the formula: depth = focal_length * baseline / disparity
453
+ depth = K[0, 0] * baseline / disp_cpu
454
+
455
+ # Visualize depth
456
+ depth_vis = vis_disparity(depth, max_val=10.0)
457
+
458
+ progress(0.8, desc="Generating point cloud...")
459
+
460
+ # Generate point cloud
461
+ fx, fy = K[0, 0], K[1, 1]
462
+ cx, cy = K[0, 2], K[1, 2]
463
+
464
+ # Create coordinate meshgrids
465
+ u, v = np.meshgrid(np.arange(W), np.arange(H))
466
+
467
+ # Convert to 3D coordinates
468
+ valid_depth = ~np.isinf(depth)
469
+ z = depth[valid_depth] # Z coordinate (depth)
470
+ x = (u[valid_depth] - cx) * z / fx # X coordinate
471
+ y = (v[valid_depth] - cy) * z / fy # Y coordinate
472
+
473
+ # Stack coordinates (X, Y, Z)
474
+ points = np.stack([x, y, z], axis=-1)
475
+
476
+ # Get corresponding colors
477
+ colors = left_img[valid_depth]
478
+
479
+ # Filter points by depth range
480
+ depth_mask = (z > 0) & (z <= 10.0)
481
+ valid_points = points[depth_mask]
482
+ valid_colors = colors[depth_mask]
483
+
484
+ if len(valid_points) == 0:
485
+ return depth_vis, None, None, "⚠️ No valid points generated for point cloud."
486
+
487
+ # Subsample points for better performance
488
+ if len(valid_points) > 100000:
489
+ indices = np.random.choice(len(valid_points), 100000, replace=False)
490
+ valid_points = valid_points[indices]
491
+ valid_colors = valid_colors[indices]
492
+
493
+ # Transform coordinates for proper visualization
494
+ transformed_points = valid_points.copy()
495
+ transformed_points[:, 1] = -transformed_points[:, 1] # Flip Y axis
496
+ transformed_points[:, 2] = -transformed_points[:, 2] # Flip Z axis
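+ # Negating Y and Z converts the OpenCV camera frame (x right, y down, z forward)
+ # into a y-up frame so the cloud renders upright in the 3D viewer.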
497
+
498
+ # Generate point cloud
499
+ pcd = o3d.geometry.PointCloud()
500
+ pcd.points = o3d.utility.Vector3dVector(transformed_points)
501
+ pcd.colors = o3d.utility.Vector3dVector(valid_colors / 255.0)
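+ # Export the cloud so the download and 3D-viewer outputs can be populated
+ # (a minimal sketch: writes a .ply into the system temp dir; assumes the
+ # installed Gradio Model3D component accepts .ply files).
+ import tempfile
+ ply_fd, ply_path = tempfile.mkstemp(suffix=".ply")
+ os.close(ply_fd)
+ o3d.io.write_point_cloud(ply_path, pcd)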
502
+
503
+ progress(1.0, desc="Complete!")
504
+
505
+ # Check current memory usage
506
+ try:
507
+ current_memory = torch.cuda.memory_allocated(0) / 1024**3
508
+ max_memory = torch.cuda.max_memory_allocated(0) / 1024**3
509
+ memory_info = f" | GPU: {current_memory:.2f}GB/{max_memory:.2f}GB peak"
510
+ except Exception:
511
+ memory_info = ""
512
+
513
+ variant = model_selection.split('(')[0].strip() if '(' in model_selection else model_selection
514
+
515
+ status = f"""✅ Depth processing successful!
516
+ 🔧 Model: {variant}{memory_info}
517
+ 📊 Statistics:
518
+ • Valid points: {len(valid_points):,}
519
+ • Depth range: {z.min():.2f} - {z.max():.2f} m
520
+ • Baseline: {baseline} m
521
+ • Point cloud generated with {len(valid_points)} points
522
+ • 3D visualization available"""
523
+
524
+ return depth_vis, ply_path, ply_path, status
525
+
526
+ except Exception as e:
527
+ logging.error(f"Depth processing failed: {e}")
528
+ torch.cuda.empty_cache()
529
+ gc.collect()
530
+ return None, None, None, f"❌ Error: {str(e)}"
531
+
532
+
533
+ def create_app() -> gr.Blocks:
534
+ """Create the Gradio application"""
535
+
536
+ # Get available models
537
+ try:
538
+ available_models = get_available_models()
539
+ logging.info(f"Successfully got available models: {len(available_models)} found")
540
+ except Exception as e:
541
+ logging.error(f"Failed to get available models: {e}")
542
+ available_models = {}
543
+
544
+ with gr.Blocks(
545
+ title="CREStereo - Stereo Depth Estimation",
546
+ theme=gr.themes.Soft(),
547
+ css="footer {visibility: hidden}",
548
+ delete_cache=(60, 60)
549
+ ) as app:
550
+
551
+ gr.Markdown("""
552
+ # 🔍 CREStereo: Practical Stereo Matching
553
+
554
+ Upload a pair of **rectified** stereo images to get disparity estimation using CREStereo.
555
+
556
+ ⚠️ **Important**: Images should be rectified (epipolar lines are horizontal) and undistorted.
557
+ ⚡ **GPU Powered**: Runs on CUDA-enabled GPUs for fast inference.
558
+ """)
559
+
560
+ # Instructions section
561
+ with gr.Accordion("📋 Instructions", open=False):
562
+ gr.Markdown("""
563
+ ## 🚀 How to Use This Demo
564
+
565
+ ### 🖼️ Input Requirements
566
+ 1. **Image Format**: Upload images in JPEG or PNG format.
567
+ 2. **Image Size**: Images should be of the same size and resolution.
568
+ 3. **Rectification**: Ensure images are rectified (epipolar lines are horizontal) and undistorted.
569
+ 4. **Camera Parameters**: For depth processing, provide camera matrix and baseline distance.
570
+
571
+ ### 📊 Using the Demo
572
+ 1. **Select Model**: Choose the CREStereo model variant
573
+ 2. **Upload Images**: Provide rectified stereo image pairs
574
+ 3. **Basic Processing**: Get disparity visualization
575
+ 4. **Advanced Processing**: Generate depth maps and 3D point clouds (requires camera parameters)
576
+
577
+ ### 📖 Original Work
578
+ This demo is based on CREStereo: Practical Stereo Matching via Cascaded Recurrent Network.
579
+ - **Paper**: [CREStereo: Practical Stereo Matching via Cascaded Recurrent Network](https://arxiv.org/abs/2203.11483)
580
+ - **Official Repository**: [https://github.com/megvii-research/CREStereo](https://github.com/megvii-research/CREStereo)
581
+ """)
582
+
583
+ # Model selection
584
+ with gr.Row():
585
+ all_choices = list(available_models.keys())
586
+
587
+ if not all_choices:
588
+ all_choices = ["No models found - Please ensure crestereo_eth3d.pth is in models/ directory"]
589
+
590
+ default_model = all_choices[0] if all_choices else None
591
+
592
+ model_selector = gr.Dropdown(
593
+ choices=all_choices,
594
+ value=default_model,
595
+ label="🎯 Select Model",
596
+ info="Choose the CREStereo model variant.",
597
+ interactive=True
598
+ )
599
+
600
+ with gr.Tabs():
601
+ # Basic stereo processing tab
602
+ with gr.TabItem("🖼️ Basic Stereo Processing"):
603
+ with gr.Row():
604
+ with gr.Column():
605
+ left_input = gr.Image(
606
+ label="📷 Left Image",
607
+ type="filepath",
608
+ height=300
609
+ )
610
+ right_input = gr.Image(
611
+ label="📷 Right Image",
612
+ type="filepath",
613
+ height=300
614
+ )
615
+
616
+ process_btn = gr.Button(
617
+ "🚀 Process Stereo Pair",
618
+ variant="primary",
619
+ size="lg"
620
+ )
621
+
622
+ with gr.Column():
623
+ output_image = gr.Image(
624
+ label="📊 Disparity Visualization",
625
+ height=400
626
+ )
627
+ status_text = gr.Textbox(
628
+ label="Status",
629
+ interactive=False,
630
+ lines=8
631
+ )
632
+
633
+ # Example images
634
+ examples_list = []
635
+
636
+ # Example 1
637
+ if os.path.exists(os.path.join(current_dir, "assets", "example1", "left.png")):
638
+ examples_list.append([
639
+ os.path.join(current_dir, "assets", "example1", "left.png"),
640
+ os.path.join(current_dir, "assets", "example1", "right.png")
641
+ ])
642
+
643
+ # Example 2
644
+ if os.path.exists(os.path.join(current_dir, "assets", "example2", "left.png")):
645
+ examples_list.append([
646
+ os.path.join(current_dir, "assets", "example2", "left.png"),
647
+ os.path.join(current_dir, "assets", "example2", "right.png")
648
+ ])
649
+
650
+ if examples_list:
651
+ gr.Examples(
652
+ examples=examples_list,
653
+ inputs=[left_input, right_input],
654
+ label="📋 Example Images"
655
+ )
656
+
657
+ # Advanced processing with depth
658
+ with gr.TabItem("📐 Advanced Processing (Depth & Point Cloud)"):
659
+ with gr.Row():
660
+ with gr.Column():
661
+ left_input_adv = gr.Image(
662
+ label="📷 Left Image",
663
+ type="filepath",
664
+ height=250
665
+ )
666
+ right_input_adv = gr.Image(
667
+ label="📷 Right Image",
668
+ type="filepath",
669
+ height=250
670
+ )
671
+
672
+ # Camera parameters
673
+ with gr.Group():
674
+ gr.Markdown("### 📹 Camera Parameters")
675
+ camera_matrix_input = gr.Textbox(
676
+ label="Camera Matrix (9 values: fx 0 cx 0 fy cy 0 0 1)",
677
+ value="",
678
+ )
679
+ baseline_input = gr.Number(
680
+ label="Baseline (meters)",
681
+ value=None,
682
+ minimum=0.001,
683
+ maximum=10.0,
684
+ step=0.001
685
+ )
686
+
687
+ process_depth_btn = gr.Button(
688
+ "🔬 Process with Depth",
689
+ variant="primary",
690
+ size="lg"
691
+ )
692
+
693
+ with gr.Column():
694
+ depth_output = gr.Image(
695
+ label="📏 Depth Visualization",
696
+ height=300
697
+ )
698
+ pointcloud_output = gr.File(
699
+ label="☁️ Point Cloud Download (.ply)",
700
+ file_types=[".ply"]
701
+ )
702
+ status_depth = gr.Textbox(
703
+ label="Status",
704
+ interactive=False,
705
+ lines=6
706
+ )
707
+
708
+ # 3D Point Cloud Visualization
709
+ with gr.Row():
710
+ pointcloud_3d = gr.Model3D(
711
+ label="🌐 3D Point Cloud Viewer",
712
+ clear_color=[0.0, 0.0, 0.0, 0.0],
713
+ height=400
714
+ )
715
+
716
+ # Example images for advanced processing
717
+ examples_advanced_list = []
718
+
719
+ # Try to read camera parameters from K.txt files
720
+ # Example 1
721
+ if os.path.exists(os.path.join(current_dir, "assets", "example1", "left.png")):
722
+ k_file = os.path.join(current_dir, "assets", "example1", "K.txt")
723
+ camera_matrix_str = ""
724
+ baseline_val = 0.063 # default
725
+
726
+ if os.path.exists(k_file):
727
+ try:
728
+ with open(k_file, 'r') as f:
729
+ lines = f.readlines()
730
+ if len(lines) >= 1:
731
+ camera_matrix_str = lines[0].strip()
732
+ if len(lines) >= 2:
733
+ baseline_val = float(lines[1].strip())
734
+ except Exception:
735
+ camera_matrix_str = "754.6680908203125 0.0 489.3794860839844 0.0 754.6680908203125 265.16162109375 0.0 0.0 1.0"
736
+
737
+ examples_advanced_list.append([
738
+ os.path.join(current_dir, "assets", "example1", "left.png"),
739
+ os.path.join(current_dir, "assets", "example1", "right.png"),
740
+ camera_matrix_str,
741
+ baseline_val
742
+ ])
743
+
744
+ # Example 2
745
+ if os.path.exists(os.path.join(current_dir, "assets", "example2", "left.png")):
746
+ k_file = os.path.join(current_dir, "assets", "example2", "K.txt")
747
+ camera_matrix_str = ""
748
+ baseline_val = 0.537 # default
749
+
750
+ if os.path.exists(k_file):
751
+ try:
752
+ with open(k_file, 'r') as f:
753
+ lines = f.readlines()
754
+ if len(lines) >= 1:
755
+ camera_matrix_str = lines[0].strip()
756
+ if len(lines) >= 2:
757
+ baseline_val = float(lines[1].strip())
758
+ except Exception:
759
+ camera_matrix_str = "1733.74 0.0 792.27 0.0 1733.74 541.89 0.0 0.0 1.0"
760
+
761
+ examples_advanced_list.append([
762
+ os.path.join(current_dir, "assets", "example2", "left.png"),
763
+ os.path.join(current_dir, "assets", "example2", "right.png"),
764
+ camera_matrix_str,
765
+ baseline_val
766
+ ])
767
+
768
+ if examples_advanced_list:
769
+ gr.Examples(
770
+ examples=examples_advanced_list,
771
+ inputs=[left_input_adv, right_input_adv, camera_matrix_input, baseline_input],
772
+ label="📋 Example Images with Camera Parameters"
773
+ )
774
+
775
+ # Event handlers
776
+ if available_models:
777
+ process_btn.click(
778
+ fn=process_stereo_pair,
779
+ inputs=[model_selector, left_input, right_input],
780
+ outputs=[output_image, status_text],
781
+ show_progress=True
782
+ )
783
+
784
+ if OPEN3D_AVAILABLE:
785
+ process_depth_btn.click(
786
+ fn=process_with_depth,
787
+ inputs=[model_selector, left_input_adv, right_input_adv, camera_matrix_input, baseline_input],
788
+ outputs=[depth_output, pointcloud_output, pointcloud_3d, status_depth],
789
+ show_progress=True
790
+ )
791
+ else:
792
+ process_depth_btn.click(
793
+ fn=lambda *args: (None, None, None, "❌ Open3D not available. Install with: pip install open3d"),
794
+ inputs=[model_selector, left_input_adv, right_input_adv, camera_matrix_input, baseline_input],
795
+ outputs=[depth_output, pointcloud_output, pointcloud_3d, status_depth]
796
+ )
797
+ else:
798
+ # No models available
799
+ process_btn.click(
800
+ fn=lambda *args: (None, "❌ No models available. Please ensure crestereo_eth3d.pth is in models/ directory."),
801
+ inputs=[model_selector, left_input, right_input],
802
+ outputs=[output_image, status_text]
803
+ )
804
+
805
+ process_depth_btn.click(
806
+ fn=lambda *args: (None, None, None, "❌ No models available. Please ensure crestereo_eth3d.pth is in models/ directory."),
807
+ inputs=[model_selector, left_input_adv, right_input_adv, camera_matrix_input, baseline_input],
808
+ outputs=[depth_output, pointcloud_output, pointcloud_3d, status_depth]
809
+ )
810
+
811
+ # Citation section at the bottom
812
+ with gr.Accordion("📖 Citation", open=False):
813
+ gr.Markdown("""
814
+ ### 📄 Please Cite the Original Paper
815
+
816
+ If you use this work in your research, please cite:
817
+
818
+ ```bibtex
819
+ @article{li2022practical,
820
+ title={Practical Stereo Matching via Cascaded Recurrent Network with Adaptive Correlation},
821
+ author={Li, Jiankun and Wang, Peisen and Xiong, Pengfei and Cai, Tao and Yan, Ziwei and Yang, Lei and Liu, Jiangyu and Fan, Haoqiang and Liu, Shuaicheng},
822
+ journal={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
823
+ pages={16263--16272},
824
+ year={2022}
825
+ }
826
+ ```
827
+ """)
828
+
829
+ # Footer
830
+ gr.Markdown("""
831
+ ---
832
+ ### 📝 Notes:
833
+ - **Input images must be rectified stereo pairs** (epipolar lines are horizontal)
834
+ - **⚡ GPU Acceleration**: Requires CUDA-compatible GPU
835
+ - **📦 Model Caching**: Models are cached for efficient repeated usage
836
+ - For best results, use high-quality rectified stereo pairs
837
+ - Model works on RGB images and supports various resolutions
838
+
839
+ ### 🔗 References:
840
+ - [CREStereo Paper](https://arxiv.org/abs/2203.11483)
841
+ - [Original GitHub Repository](https://github.com/megvii-research/CREStereo)
842
+ - [This PyTorch Implementation](https://github.com/ibaiGorordo/CREStereo-Pytorch)
843
+ """)
844
+
845
+ return app
846
+
847
+
848
+ def main():
849
+ """Main function to launch the app"""
850
+
851
+ logging.info("🚀 Starting CREStereo Gradio App...")
852
+
853
+ # Parse command line arguments
854
+ import argparse
855
+ parser = argparse.ArgumentParser(description="CREStereo Gradio App")
856
+ parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to bind to")
857
+ parser.add_argument("--port", type=int, default=7860, help="Port to bind to")
858
+ parser.add_argument("--debug", action="store_true", help="Enable debug mode")
859
+
860
+ args = parser.parse_args()
861
+
862
+ if args.debug:
863
+ logging.getLogger().setLevel(logging.DEBUG)
864
+
865
+ try:
866
+ # Create and launch app
867
+ logging.info("Creating Gradio app...")
868
+ app = create_app()
869
+ logging.info("✅ Gradio app created successfully")
870
+
871
+ logging.info(f"Launching app on {args.host}:{args.port}")
872
+
873
+ # Launch with appropriate settings
874
+ app.launch(
875
+ server_name=args.host,
876
+ server_port=args.port,
877
+ share=False,
878
+ show_error=True,
879
+ favicon_path=None,
880
+ ssr_mode=False,
881
+ allowed_paths=["./"]
882
+ )
883
+ except Exception as e:
884
+ logging.error(f"Failed to launch app: {e}")
885
+ raise
886
+
887
+
888
+ if __name__ == "__main__":
889
+ main()
CREStereo_demo/models/.gitkeep ADDED
File without changes
CREStereo_demo/models/crestereo_eth3d.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2271ab615015a73edd4759b0f7b25a4d82ffb654270b92d3811237da3d63aa6d
3
+ size 21763979
CREStereo_demo/nets/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .crestereo import CREStereo as Model
CREStereo_demo/nets/attention/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .transformer import LocalFeatureTransformer
2
+ from .position_encoding import PositionEncodingSine
CREStereo_demo/nets/attention/linear_attention.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Linear Transformer proposed in "Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention"
3
+ Modified from: https://github.com/idiap/fast-transformers/blob/master/fast_transformers/attention/linear_attention.py
4
+ """
5
+
6
+ import torch
7
+ from torch.nn import Module, Dropout
8
+
9
+
10
+ def elu_feature_map(x):
11
+ return torch.nn.functional.elu(x) + 1
12
+
13
+
14
+ class LinearAttention(Module):
15
+ def __init__(self, eps=1e-6):
16
+ super().__init__()
17
+ self.feature_map = elu_feature_map
18
+ self.eps = eps
19
+
20
+ def forward(self, queries, keys, values, q_mask=None, kv_mask=None):
21
+ """ Multi-Head linear attention proposed in "Transformers are RNNs"
22
+ Args:
23
+ queries: [N, L, H, D]
24
+ keys: [N, S, H, D]
25
+ values: [N, S, H, D]
26
+ q_mask: [N, L]
27
+ kv_mask: [N, S]
28
+ Returns:
29
+ queried_values: (N, L, H, D)
30
+ """
31
+ Q = self.feature_map(queries)
32
+ K = self.feature_map(keys)
33
+
34
+ # set padded position to zero
35
+ if q_mask is not None:
36
+ Q = Q * q_mask[:, :, None, None]
37
+ if kv_mask is not None:
38
+ K = K * kv_mask[:, :, None, None]
39
+ values = values * kv_mask[:, :, None, None]
40
+
41
+ v_length = values.size(1)
42
+ values = values / v_length # prevent fp16 overflow
43
+ KV = torch.einsum("nshd,nshv->nhdv", K, values) # (S,D)' @ S,V
44
+ Z = 1 / (torch.einsum("nlhd,nhd->nlh", Q, K.sum(dim=1)) + self.eps)
45
+ queried_values = torch.einsum("nlhd,nhdv,nlh->nlhv", Q, KV, Z) * v_length
46
+
47
+ return queried_values.contiguous()
48
+
49
+
50
+ class FullAttention(Module):
51
+ def __init__(self, use_dropout=False, attention_dropout=0.1):
52
+ super().__init__()
53
+ self.use_dropout = use_dropout
54
+ self.dropout = Dropout(attention_dropout)
55
+
56
+ def forward(self, queries, keys, values, q_mask=None, kv_mask=None):
57
+ """ Multi-head scaled dot-product attention, a.k.a full attention.
58
+ Args:
59
+ queries: [N, L, H, D]
60
+ keys: [N, S, H, D]
61
+ values: [N, S, H, D]
62
+ q_mask: [N, L]
63
+ kv_mask: [N, S]
64
+ Returns:
65
+ queried_values: (N, L, H, D)
66
+ """
67
+
68
+ # Compute the unnormalized attention and apply the masks
69
+ QK = torch.einsum("nlhd,nshd->nlsh", queries, keys)
70
+ if kv_mask is not None:
71
+ QK.masked_fill_(~(q_mask[:, :, None, None] * kv_mask[:, None, :, None]), float('-inf'))
72
+
73
+ # Compute the attention and the weighted average
74
+ softmax_temp = 1. / queries.size(3)**.5 # sqrt(D)
75
+ A = torch.softmax(softmax_temp * QK, dim=2)
76
+ if self.use_dropout:
77
+ A = self.dropout(A)
78
+
79
+ queried_values = torch.einsum("nlsh,nshd->nlhd", A, values)
80
+
81
+ return queried_values.contiguous()
CREStereo_demo/nets/attention/position_encoding.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+
5
+
6
+ class PositionEncodingSine(nn.Module):
7
+ """
8
+ This is a sinusoidal position encoding that generalized to 2-dimensional images
9
+ """
10
+
11
+ def __init__(self, d_model, max_shape=(256, 256), temp_bug_fix=False):
12
+ """
13
+ Args:
14
+ max_shape (tuple): for 1/8 featmap, the max length of 256 corresponds to 2048 pixels
15
+ temp_bug_fix (bool): As noted in this [issue](https://github.com/zju3dv/LoFTR/issues/41),
16
+ the original implementation of LoFTR includes a bug in the pos-enc impl, which has little impact
17
+ on the final performance. For now, we keep both impls for backward compatibility.
18
+ We will remove the buggy impl after re-training all variants of our released models.
19
+ """
20
+ super().__init__()
21
+ pe = torch.zeros((d_model, *max_shape))
22
+ y_position = torch.ones(max_shape).cumsum(0).float().unsqueeze(0)
23
+ x_position = torch.ones(max_shape).cumsum(1).float().unsqueeze(0)
24
+ if temp_bug_fix:
25
+ div_term = torch.exp(torch.arange(0, d_model//2, 2).float() * (-math.log(10000.0) / (d_model//2)))
26
+ else: # a buggy implementation (for backward compatibility only)
27
+ div_term = torch.exp(torch.arange(0, d_model//2, 2).float() * (-math.log(10000.0) / d_model//2))
28
+ div_term = div_term[:, None, None] # [C//4, 1, 1]
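+ # Channels are interleaved in groups of four as [sin(x), cos(x), sin(y), cos(y)],
+ # extending the 1D sinusoidal encoding to both image axes.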
29
+ pe[0::4, :, :] = torch.sin(x_position * div_term)
30
+ pe[1::4, :, :] = torch.cos(x_position * div_term)
31
+ pe[2::4, :, :] = torch.sin(y_position * div_term)
32
+ pe[3::4, :, :] = torch.cos(y_position * div_term)
33
+
34
+ self.register_buffer('pe', pe.unsqueeze(0), persistent=False) # [1, C, H, W]
35
+
36
+ def forward(self, x):
37
+ """
38
+ Args:
39
+ x: [N, C, H, W]
40
+ """
41
+ return x + self.pe[:, :, :x.size(2), :x.size(3)].to(x.device)
CREStereo_demo/nets/attention/transformer.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import torch
3
+ import torch.nn as nn
4
+ from .linear_attention import LinearAttention, FullAttention
5
+
6
+ #Ref: https://github.com/zju3dv/LoFTR/blob/master/src/loftr/loftr_module/transformer.py
7
+ class LoFTREncoderLayer(nn.Module):
8
+ def __init__(self,
9
+ d_model,
10
+ nhead,
11
+ attention='linear'):
12
+ super(LoFTREncoderLayer, self).__init__()
13
+
14
+ self.dim = d_model // nhead
15
+ self.nhead = nhead
16
+
17
+ # multi-head attention
18
+ self.q_proj = nn.Linear(d_model, d_model, bias=False)
19
+ self.k_proj = nn.Linear(d_model, d_model, bias=False)
20
+ self.v_proj = nn.Linear(d_model, d_model, bias=False)
21
+ self.attention = LinearAttention() if attention == 'linear' else FullAttention()
22
+ self.merge = nn.Linear(d_model, d_model, bias=False)
23
+
24
+ # feed-forward network
25
+ self.mlp = nn.Sequential(
26
+ nn.Linear(d_model*2, d_model*2, bias=False),
27
+ nn.ReLU(),
28
+ nn.Linear(d_model*2, d_model, bias=False),
29
+ )
30
+
31
+ # norm and dropout
32
+ self.norm1 = nn.LayerNorm(d_model)
33
+ self.norm2 = nn.LayerNorm(d_model)
34
+
35
+ def forward(self, x, source, x_mask=None, source_mask=None):
36
+ """
37
+ Args:
38
+ x (torch.Tensor): [N, L, C]
39
+ source (torch.Tensor): [N, S, C]
40
+ x_mask (torch.Tensor): [N, L] (optional)
41
+ source_mask (torch.Tensor): [N, S] (optional)
42
+ """
43
+ bs = x.size(0)
44
+ query, key, value = x, source, source
45
+
46
+ # multi-head attention
47
+ query = self.q_proj(query).view(bs, -1, self.nhead, self.dim) # [N, L, (H, D)]
48
+ key = self.k_proj(key).view(bs, -1, self.nhead, self.dim) # [N, S, (H, D)]
49
+ value = self.v_proj(value).view(bs, -1, self.nhead, self.dim)
50
+ message = self.attention(query, key, value, q_mask=x_mask, kv_mask=source_mask) # [N, L, (H, D)]
51
+ message = self.merge(message.view(bs, -1, self.nhead*self.dim)) # [N, L, C]
52
+ message = self.norm1(message)
53
+
54
+ # feed-forward network
55
+ message = self.mlp(torch.cat([x, message], dim=2))
56
+ message = self.norm2(message)
57
+
58
+ return x + message
59
+
60
+
61
+ class LocalFeatureTransformer(nn.Module):
62
+ """A Local Feature Transformer (LoFTR) module."""
63
+
64
+ def __init__(self, d_model, nhead, layer_names, attention):
65
+ super(LocalFeatureTransformer, self).__init__()
66
+
67
+ self.d_model = d_model
68
+ self.nhead = nhead
69
+ self.layer_names = layer_names
70
+ encoder_layer = LoFTREncoderLayer(d_model, nhead, attention)
71
+ self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for _ in range(len(self.layer_names))])
72
+ self._reset_parameters()
73
+
74
+ def _reset_parameters(self):
75
+ for p in self.parameters():
76
+ if p.dim() > 1:
77
+ nn.init.xavier_uniform_(p)
78
+
79
+ def forward(self, feat0, feat1, mask0=None, mask1=None):
80
+ """
81
+ Args:
82
+ feat0 (torch.Tensor): [N, L, C]
83
+ feat1 (torch.Tensor): [N, S, C]
84
+ mask0 (torch.Tensor): [N, L] (optional)
85
+ mask1 (torch.Tensor): [N, S] (optional)
86
+ """
87
+ assert self.d_model == feat0.size(2), "the feature number of src and transformer must be equal"
88
+
89
+ for layer, name in zip(self.layers, self.layer_names):
90
+
91
+ if name == 'self':
92
+ feat0 = layer(feat0, feat0, mask0, mask0)
93
+ feat1 = layer(feat1, feat1, mask1, mask1)
94
+ elif name == 'cross':
95
+ feat0 = layer(feat0, feat1, mask0, mask1)
96
+ feat1 = layer(feat1, feat0, mask1, mask0)
97
+ else:
98
+ raise KeyError
99
+
100
+ return feat0, feat1
CREStereo_demo/nets/corr.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+ from .utils import bilinear_sampler, coords_grid, manual_pad
7
+
8
+ class AGCL:
9
+ """
10
+ Implementation of Adaptive Group Correlation Layer (AGCL).
11
+ """
12
+
13
+ def __init__(self, fmap1, fmap2, att=None):
14
+ self.fmap1 = fmap1
15
+ self.fmap2 = fmap2
16
+
17
+ self.att = att
18
+
19
+ self.coords = coords_grid(fmap1.shape[0], fmap1.shape[2], fmap1.shape[3], fmap1.device)
20
+
21
+ def __call__(self, flow, extra_offset, small_patch=False, iter_mode=False):
22
+ if iter_mode:
23
+ corr = self.corr_iter(self.fmap1, self.fmap2, flow, small_patch)
24
+ else:
25
+ corr = self.corr_att_offset(
26
+ self.fmap1, self.fmap2, flow, extra_offset, small_patch
27
+ )
28
+ return corr
29
+
30
+ def get_correlation(self, left_feature, right_feature, psize=(3, 3), dilate=(1, 1)):
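+ # Local correlation: for every offset in the (dilated) psize window, shift the padded
+ # right features and take the channel-wise mean of their product with the left features.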
31
+
32
+ N, C, H, W = left_feature.shape
33
+
34
+ di_y, di_x = dilate[0], dilate[1]
35
+ pady, padx = psize[0] // 2 * di_y, psize[1] // 2 * di_x
36
+
37
+ right_pad = manual_pad(right_feature, pady, padx)
38
+
39
+ corr_list = []
40
+ for h in range(0, pady * 2 + 1, di_y):
41
+ for w in range(0, padx * 2 + 1, di_x):
42
+ right_crop = right_pad[:, :, h : h + H, w : w + W]
43
+ assert right_crop.shape == left_feature.shape
44
+ corr = torch.mean(left_feature * right_crop, dim=1, keepdims=True)
45
+ corr_list.append(corr)
46
+
47
+ corr_final = torch.cat(corr_list, dim=1)
48
+
49
+ return corr_final
50
+
51
+ def corr_iter(self, left_feature, right_feature, flow, small_patch):
52
+
53
+ coords = self.coords + flow
54
+ coords = coords.permute(0, 2, 3, 1)
55
+ right_feature = bilinear_sampler(right_feature, coords)
56
+
57
+ if small_patch:
58
+ psize_list = [(3, 3), (3, 3), (3, 3), (3, 3)]
59
+ dilate_list = [(1, 1), (1, 1), (1, 1), (1, 1)]
60
+ else:
61
+ psize_list = [(1, 9), (1, 9), (1, 9), (1, 9)]
62
+ dilate_list = [(1, 1), (1, 1), (1, 1), (1, 1)]
63
+
64
+ N, C, H, W = left_feature.shape
65
+ lefts = torch.split(left_feature, left_feature.shape[1]//4, dim=1)
66
+ rights = torch.split(right_feature, right_feature.shape[1]//4, dim=1)
67
+
68
+ corrs = []
69
+ for i in range(len(psize_list)):
70
+ corr = self.get_correlation(
71
+ lefts[i], rights[i], psize_list[i], dilate_list[i]
72
+ )
73
+ corrs.append(corr)
74
+
75
+ final_corr = torch.cat(corrs, dim=1)
76
+
77
+ return final_corr
78
+
79
+ def corr_att_offset(
80
+ self, left_feature, right_feature, flow, extra_offset, small_patch
81
+ ):
82
+
83
+ N, C, H, W = left_feature.shape
84
+
85
+ if self.att is not None:
86
+ left_feature = left_feature.permute(0, 2, 3, 1).reshape(N, H * W, C) # 'n c h w -> n (h w) c'
87
+ right_feature = right_feature.permute(0, 2, 3, 1).reshape(N, H * W, C) # 'n c h w -> n (h w) c'
88
+ # 'n (h w) c -> n c h w'
89
+ left_feature, right_feature = self.att(left_feature, right_feature)
90
+ # 'n (h w) c -> n c h w'
91
+ left_feature, right_feature = [
92
+ x.reshape(N, H, W, C).permute(0, 3, 1, 2)
93
+ for x in [left_feature, right_feature]
94
+ ]
95
+
96
+ lefts = torch.split(left_feature, left_feature.shape[1]//4, dim=1)
97
+ rights = torch.split(right_feature, right_feature.shape[1]//4, dim=1)
98
+
99
+ C = C // 4
100
+
101
+ if small_patch:
102
+ psize_list = [(3, 3), (3, 3), (3, 3), (3, 3)]
103
+ dilate_list = [(1, 1), (1, 1), (1, 1), (1, 1)]
104
+ else:
105
+ psize_list = [(1, 9), (1, 9), (1, 9), (1, 9)]
106
+ dilate_list = [(1, 1), (1, 1), (1, 1), (1, 1)]
107
+
108
+ search_num = 9
109
+ extra_offset = extra_offset.reshape(N, search_num, 2, H, W).permute(0, 1, 3, 4, 2) # [N, search_num, H, W, 2]
110
+
111
+ corrs = []
112
+ for i in range(len(psize_list)):
113
+ left_feature, right_feature = lefts[i], rights[i]
114
+ psize, dilate = psize_list[i], dilate_list[i]
115
+
116
+ psizey, psizex = psize[0], psize[1]
117
+ dilatey, dilatex = dilate[0], dilate[1]
118
+
119
+ ry = psizey // 2 * dilatey
120
+ rx = psizex // 2 * dilatex
121
+ x_grid, y_grid = torch.meshgrid(torch.arange(-rx, rx + 1, dilatex, device=self.fmap1.device),
122
+ torch.arange(-ry, ry + 1, dilatey, device=self.fmap1.device), indexing='xy')
123
+
124
+ offsets = torch.stack((x_grid, y_grid))
125
+ offsets = offsets.reshape(2, -1).permute(1, 0)
126
+ for d in sorted((0, 2, 3)):
127
+ offsets = offsets.unsqueeze(d)
128
+ offsets = offsets.repeat_interleave(N, dim=0)
129
+ offsets = offsets + extra_offset
130
+
131
+ coords = self.coords + flow # [N, 2, H, W]
132
+ coords = coords.permute(0, 2, 3, 1) # [N, H, W, 2]
133
+ coords = torch.unsqueeze(coords, 1) + offsets
134
+ coords = coords.reshape(N, -1, W, 2) # [N, search_num*H, W, 2]
135
+
136
+ right_feature = bilinear_sampler(
137
+ right_feature, coords
138
+ ) # [N, C, search_num*H, W]
139
+ right_feature = right_feature.reshape(N, C, -1, H, W) # [N, C, search_num, H, W]
140
+ left_feature = left_feature.unsqueeze(2).repeat_interleave(right_feature.shape[2], dim=2)
141
+
142
+ corr = torch.mean(left_feature * right_feature, dim=1)
143
+
144
+ corrs.append(corr)
145
+
146
+ final_corr = torch.cat(corrs, dim=1)
147
+
148
+ return final_corr
CREStereo_demo/nets/crestereo.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ from .update import BasicUpdateBlock
6
+ from .extractor import BasicEncoder
7
+ from .corr import AGCL
8
+
9
+ from .attention import PositionEncodingSine, LocalFeatureTransformer
10
+
11
+ try:
12
+ autocast = torch.cuda.amp.autocast
13
+ except:
14
+ # dummy autocast for PyTorch < 1.6
15
+ class autocast:
16
+ def __init__(self, enabled):
17
+ pass
18
+ def __enter__(self):
19
+ pass
20
+ def __exit__(self, *args):
21
+ pass
22
+
23
+ #Ref: https://github.com/princeton-vl/RAFT/blob/master/core/raft.py
24
+ class CREStereo(nn.Module):
25
+ def __init__(self, max_disp=192, mixed_precision=False, test_mode=False):
26
+ super(CREStereo, self).__init__()
27
+
28
+ self.max_flow = max_disp
29
+ self.mixed_precision = mixed_precision
30
+ self.test_mode = test_mode
31
+
32
+ self.hidden_dim = 128
33
+ self.context_dim = 128
34
+ self.dropout = 0
35
+
36
+ self.fnet = BasicEncoder(output_dim=256, norm_fn='instance', dropout=self.dropout)
37
+ self.update_block = BasicUpdateBlock(hidden_dim=self.hidden_dim, cor_planes=4 * 9, mask_size=4)
38
+
39
+ # loftr
40
+ self.self_att_fn = LocalFeatureTransformer(
41
+ d_model=256, nhead=8, layer_names=["self"] * 1, attention="linear"
42
+ )
43
+ self.cross_att_fn = LocalFeatureTransformer(
44
+ d_model=256, nhead=8, layer_names=["cross"] * 1, attention="linear"
45
+ )
46
+
47
+ # adaptive search
48
+ self.search_num = 9
49
+ self.conv_offset_16 = nn.Conv2d(
50
+ 256, self.search_num * 2, kernel_size=3, stride=1, padding=1
51
+ )
52
+ self.conv_offset_8 = nn.Conv2d(
53
+ 256, self.search_num * 2, kernel_size=3, stride=1, padding=1
54
+ )
55
+ self.range_16 = 1
56
+ self.range_8 = 1
57
+
58
+ def freeze_bn(self):
59
+ for m in self.modules():
60
+ if isinstance(m, nn.BatchNorm2d):
61
+ m.eval()
62
+
63
+ def convex_upsample(self, flow, mask, rate=4):
64
+ """ Upsample flow field [H/8, W/8, 2] -> [H, W, 2] using convex combination """
65
+ N, _, H, W = flow.shape
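+ # Each fine-resolution flow value is a convex combination of its 3x3 coarse neighbours;
+ # the 9 weights per output pixel come from the predicted mask via a softmax.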
66
+ # print(flow.shape, mask.shape, rate)
67
+ mask = mask.view(N, 1, 9, rate, rate, H, W)
68
+ mask = torch.softmax(mask, dim=2)
69
+
70
+ up_flow = F.unfold(rate * flow, [3,3], padding=1)
71
+ up_flow = up_flow.view(N, 2, 9, 1, 1, H, W)
72
+
73
+ up_flow = torch.sum(mask * up_flow, dim=2)
74
+ up_flow = up_flow.permute(0, 1, 4, 2, 5, 3)
75
+ return up_flow.reshape(N, 2, rate*H, rate*W)
76
+
77
+ def zero_init(self, fmap):
78
+ N, C, H, W = fmap.shape
79
+ _x = torch.zeros([N, 1, H, W], dtype=torch.float32)
80
+ _y = torch.zeros([N, 1, H, W], dtype=torch.float32)
81
+ zero_flow = torch.cat((_x, _y), dim=1).to(fmap.device)
82
+ return zero_flow
83
+
84
+ def forward(self, image1, image2, flow_init=None, iters=10, upsample=True, test_mode=False):
85
+ """ Estimate optical flow between pair of frames """
86
+
87
+ image1 = 2 * (image1 / 255.0) - 1.0
88
+ image2 = 2 * (image2 / 255.0) - 1.0
89
+
90
+ image1 = image1.contiguous()
91
+ image2 = image2.contiguous()
92
+
93
+ hdim = self.hidden_dim
94
+ cdim = self.context_dim
95
+
96
+ # run the feature network
97
+ with autocast(enabled=self.mixed_precision):
98
+ fmap1, fmap2 = self.fnet([image1, image2])
99
+
100
+ fmap1 = fmap1.float()
101
+ fmap2 = fmap2.float()
102
+
103
+ with autocast(enabled=self.mixed_precision):
104
+
105
+ # 1/4 -> 1/8
106
+ # feature
107
+ fmap1_dw8 = F.avg_pool2d(fmap1, 2, stride=2)
108
+ fmap2_dw8 = F.avg_pool2d(fmap2, 2, stride=2)
109
+
110
+ # offset
111
+ offset_dw8 = self.conv_offset_8(fmap1_dw8)
112
+ offset_dw8 = self.range_8 * (torch.sigmoid(offset_dw8) - 0.5) * 2.0
113
+
114
+ # context
115
+ net, inp = torch.split(fmap1, [hdim,hdim], dim=1)
116
+ net = torch.tanh(net)
117
+ inp = F.relu(inp)
118
+ net_dw8 = F.avg_pool2d(net, 2, stride=2)
119
+ inp_dw8 = F.avg_pool2d(inp, 2, stride=2)
120
+
121
+ # 1/4 -> 1/16
122
+ # feature
123
+ fmap1_dw16 = F.avg_pool2d(fmap1, 4, stride=4)
124
+ fmap2_dw16 = F.avg_pool2d(fmap2, 4, stride=4)
125
+ offset_dw16 = self.conv_offset_16(fmap1_dw16)
126
+ offset_dw16 = self.range_16 * (torch.sigmoid(offset_dw16) - 0.5) * 2.0
127
+
128
+ # context
129
+ net_dw16 = F.avg_pool2d(net, 4, stride=4)
130
+ inp_dw16 = F.avg_pool2d(inp, 4, stride=4)
131
+
132
+ # positional encoding and self-attention
133
+ pos_encoding_fn_small = PositionEncodingSine(
134
+ d_model=256, max_shape=(image1.shape[2] // 16, image1.shape[3] // 16)
135
+ )
136
+ # 'n c h w -> n (h w) c'
137
+ x_tmp = pos_encoding_fn_small(fmap1_dw16)
138
+ fmap1_dw16 = x_tmp.permute(0, 2, 3, 1).reshape(x_tmp.shape[0], x_tmp.shape[2] * x_tmp.shape[3], x_tmp.shape[1])
139
+ # 'n c h w -> n (h w) c'
140
+ x_tmp = pos_encoding_fn_small(fmap2_dw16)
141
+ fmap2_dw16 = x_tmp.permute(0, 2, 3, 1).reshape(x_tmp.shape[0], x_tmp.shape[2] * x_tmp.shape[3], x_tmp.shape[1])
142
+
143
+ fmap1_dw16, fmap2_dw16 = self.self_att_fn(fmap1_dw16, fmap2_dw16)
144
+ fmap1_dw16, fmap2_dw16 = [
145
+ x.reshape(x.shape[0], image1.shape[2] // 16, -1, x.shape[2]).permute(0, 3, 1, 2)
146
+ for x in [fmap1_dw16, fmap2_dw16]
147
+ ]
148
+
149
+ corr_fn = AGCL(fmap1, fmap2)
150
+ corr_fn_dw8 = AGCL(fmap1_dw8, fmap2_dw8)
151
+ corr_fn_att_dw16 = AGCL(fmap1_dw16, fmap2_dw16, att=self.cross_att_fn)
152
+
153
+ # Cascaded refinement (1/16 + 1/8 + 1/4)
154
+ predictions = []
155
+ flow = None
156
+ flow_up = None
157
+ if flow_init is not None:
158
+ scale = fmap1.shape[2] / flow_init.shape[2]
159
+ flow = -scale * F.interpolate(
160
+ flow_init,
161
+ size=(fmap1.shape[2], fmap1.shape[3]),
162
+ mode="bilinear",
163
+ align_corners=True,
164
+ )
165
+ else:
166
+ # zero initialization
167
+ flow_dw16 = self.zero_init(fmap1_dw16)
168
+
169
+ # Recurrent Update Module
170
+ # RUM: 1/16
171
+ for itr in range(iters // 2):
172
+ if itr % 2 == 0:
173
+ small_patch = False
174
+ else:
175
+ small_patch = True
176
+
177
+ flow_dw16 = flow_dw16.detach()
178
+ out_corrs = corr_fn_att_dw16(
179
+ flow_dw16, offset_dw16, small_patch=small_patch
180
+ )
181
+
182
+ with autocast(enabled=self.mixed_precision):
183
+ net_dw16, up_mask, delta_flow = self.update_block(
184
+ net_dw16, inp_dw16, out_corrs, flow_dw16
185
+ )
186
+
187
+ flow_dw16 = flow_dw16 + delta_flow
188
+ flow = self.convex_upsample(flow_dw16, up_mask, rate=4)
189
+ flow_up = -4 * F.interpolate(
190
+ flow,
191
+ size=(4 * flow.shape[2], 4 * flow.shape[3]),
192
+ mode="bilinear",
193
+ align_corners=True,
194
+ )
195
+ predictions.append(flow_up)
196
+
197
+ scale = fmap1_dw8.shape[2] / flow.shape[2]
198
+ flow_dw8 = -scale * F.interpolate(
199
+ flow,
200
+ size=(fmap1_dw8.shape[2], fmap1_dw8.shape[3]),
201
+ mode="bilinear",
202
+ align_corners=True,
203
+ )
204
+
205
+ # RUM: 1/8
206
+ for itr in range(iters // 2):
207
+ if itr % 2 == 0:
208
+ small_patch = False
209
+ else:
210
+ small_patch = True
211
+
212
+ flow_dw8 = flow_dw8.detach()
213
+ out_corrs = corr_fn_dw8(flow_dw8, offset_dw8, small_patch=small_patch)
214
+
215
+ with autocast(enabled=self.mixed_precision):
216
+ net_dw8, up_mask, delta_flow = self.update_block(
217
+ net_dw8, inp_dw8, out_corrs, flow_dw8
218
+ )
219
+
220
+ flow_dw8 = flow_dw8 + delta_flow
221
+ flow = self.convex_upsample(flow_dw8, up_mask, rate=4)
222
+ flow_up = -2 * F.interpolate(
223
+ flow,
224
+ size=(2 * flow.shape[2], 2 * flow.shape[3]),
225
+ mode="bilinear",
226
+ align_corners=True,
227
+ )
228
+ predictions.append(flow_up)
229
+
230
+ scale = fmap1.shape[2] / flow.shape[2]
231
+ flow = -scale * F.interpolate(
232
+ flow,
233
+ size=(fmap1.shape[2], fmap1.shape[3]),
234
+ mode="bilinear",
235
+ align_corners=True,
236
+ )
237
+
238
+ # RUM: 1/4
239
+ for itr in range(iters):
240
+ if itr % 2 == 0:
241
+ small_patch = False
242
+ else:
243
+ small_patch = True
244
+
245
+ flow = flow.detach()
246
+ out_corrs = corr_fn(flow, None, small_patch=small_patch, iter_mode=True)
247
+
248
+ with autocast(enabled=self.mixed_precision):
249
+ net, up_mask, delta_flow = self.update_block(net, inp, out_corrs, flow)
250
+
251
+ flow = flow + delta_flow
252
+ flow_up = -self.convex_upsample(flow, up_mask, rate=4)
253
+ predictions.append(flow_up)
254
+
255
+ if self.test_mode:
256
+ return flow_up
257
+
258
+ return predictions
CREStereo_demo/nets/extractor.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ # Ref: https://github.com/princeton-vl/RAFT/blob/master/core/extractor.py
6
+ class ResidualBlock(nn.Module):
7
+ def __init__(self, in_planes, planes, norm_fn='group', stride=1):
8
+ super(ResidualBlock, self).__init__()
9
+
10
+ self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, padding=1, stride=stride)
11
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1)
12
+ self.relu = nn.ReLU(inplace=True)
13
+
14
+ num_groups = planes // 8
15
+
16
+ if norm_fn == 'group':
17
+ self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
18
+ self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
19
+ self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
20
+
21
+ elif norm_fn == 'batch':
22
+ self.norm1 = nn.BatchNorm2d(planes)
23
+ self.norm2 = nn.BatchNorm2d(planes)
24
+ self.norm3 = nn.BatchNorm2d(planes)
25
+
26
+ elif norm_fn == 'instance':
27
+ self.norm1 = nn.InstanceNorm2d(planes, affine=False)
28
+ self.norm2 = nn.InstanceNorm2d(planes, affine=False)
29
+ self.norm3 = nn.InstanceNorm2d(planes, affine=False)
30
+
31
+ elif norm_fn == 'none':
32
+ self.norm1 = nn.Sequential()
33
+ self.norm2 = nn.Sequential()
34
+ self.norm3 = nn.Sequential()
35
+
36
+ self.downsample = nn.Sequential(
37
+ nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3)
38
+
39
+
40
+ def forward(self, x):
41
+ y = x
42
+ y = self.relu(self.norm1(self.conv1(y)))
43
+ y = self.relu(self.norm2(self.conv2(y)))
44
+
45
+ x = self.downsample(x)
46
+
47
+ return self.relu(x+y)
48
+
49
+
50
+ class BasicEncoder(nn.Module):
51
+ def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0):
52
+ super(BasicEncoder, self).__init__()
53
+ self.norm_fn = norm_fn
54
+
55
+ if self.norm_fn == 'group':
56
+ self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64)
57
+
58
+ elif self.norm_fn == 'batch':
59
+ self.norm1 = nn.BatchNorm2d(64)
60
+
61
+ elif self.norm_fn == 'instance':
62
+ self.norm1 = nn.InstanceNorm2d(64, affine=False)
63
+
64
+ elif self.norm_fn == 'none':
65
+ self.norm1 = nn.Sequential()
66
+
67
+ self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
68
+ self.relu1 = nn.ReLU(inplace=True)
69
+
70
+ self.in_planes = 64
71
+ self.layer1 = self._make_layer(64, stride=1)
72
+ self.layer2 = self._make_layer(96, stride=2)
73
+ self.layer3 = self._make_layer(128, stride=1)
74
+
75
+ # output convolution
76
+ self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1)
77
+
78
+ self.dropout = None
79
+ if dropout > 0:
80
+ self.dropout = nn.Dropout2d(p=dropout)
81
+
82
+ for m in self.modules():
83
+ if isinstance(m, nn.Conv2d):
84
+ nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
85
+ elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
86
+ if m.weight is not None:
87
+ nn.init.constant_(m.weight, 1)
88
+ if m.bias is not None:
89
+ nn.init.constant_(m.bias, 0)
90
+
91
+ def _make_layer(self, dim, stride=1):
92
+ layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
93
+ layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
94
+ layers = (layer1, layer2)
95
+
96
+ self.in_planes = dim
97
+ return nn.Sequential(*layers)
98
+
99
+ def forward(self, x):
100
+
101
+ # if input is list, combine batch dimension
102
+ is_list = isinstance(x, tuple) or isinstance(x, list)
103
+ if is_list:
104
+ batch_dim = x[0].shape[0]
105
+ x = torch.cat(x, dim=0)
106
+
107
+ x = self.conv1(x)
108
+ x = self.norm1(x)
109
+ x = self.relu1(x)
110
+
111
+ x = self.layer1(x)
112
+ x = self.layer2(x)
113
+ x = self.layer3(x)
114
+
115
+ x = self.conv2(x)
116
+
117
+ if self.dropout is not None:
118
+ x = self.dropout(x)
119
+
120
+ if is_list:
121
+ x = torch.split(x, x.shape[0]//2, dim=0)
122
+
123
+ return x
CREStereo_demo/nets/update.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ #Ref: https://github.com/princeton-vl/RAFT/blob/master/core/update.py
6
+ class FlowHead(nn.Module):
7
+ def __init__(self, input_dim=128, hidden_dim=256):
8
+ super(FlowHead, self).__init__()
9
+ self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1)
10
+ self.conv2 = nn.Conv2d(hidden_dim, 2, 3, padding=1)
11
+ self.relu = nn.ReLU(inplace=True)
12
+
13
+ def forward(self, x):
14
+ return self.conv2(self.relu(self.conv1(x)))
15
+
16
+
17
+ class SepConvGRU(nn.Module):
18
+ def __init__(self, hidden_dim=128, input_dim=192+128):
19
+ super(SepConvGRU, self).__init__()
20
+ self.convz1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2))
21
+ self.convr1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2))
22
+ self.convq1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2))
23
+
24
+ self.convz2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0))
25
+ self.convr2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0))
26
+ self.convq2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0))
27
+
28
+ def forward(self, h, x):
29
+ # horizontal
30
+ hx = torch.cat([h, x], dim=1)
31
+ z = torch.sigmoid(self.convz1(hx))
32
+ r = torch.sigmoid(self.convr1(hx))
33
+ q = torch.tanh(self.convq1(torch.cat([r*h, x], dim=1)))
34
+ h = (1-z) * h + z * q
35
+
36
+ # vertical
37
+ hx = torch.cat([h, x], dim=1)
38
+ z = torch.sigmoid(self.convz2(hx))
39
+ r = torch.sigmoid(self.convr2(hx))
40
+ q = torch.tanh(self.convq2(torch.cat([r*h, x], dim=1)))
41
+ h = (1-z) * h + z * q
42
+
43
+ return h
44
+
45
+
46
+ class BasicMotionEncoder(nn.Module):
47
+ def __init__(self, cor_planes):
48
+ super(BasicMotionEncoder, self).__init__()
49
+
50
+ self.convc1 = nn.Conv2d(cor_planes, 256, 1, padding=0)
51
+ self.convc2 = nn.Conv2d(256, 192, 3, padding=1)
52
+ self.convf1 = nn.Conv2d(2, 128, 7, padding=3)
53
+ self.convf2 = nn.Conv2d(128, 64, 3, padding=1)
54
+ self.conv = nn.Conv2d(64+192, 128-2, 3, padding=1)
55
+
56
+ def forward(self, flow, corr):
57
+ cor = F.relu(self.convc1(corr))
58
+ cor = F.relu(self.convc2(cor))
59
+ flo = F.relu(self.convf1(flow))
60
+ flo = F.relu(self.convf2(flo))
61
+
62
+ cor_flo = torch.cat([cor, flo], dim=1)
63
+ out = F.relu(self.conv(cor_flo))
64
+ return torch.cat([out, flow], dim=1)
65
+
66
+
67
+ class BasicUpdateBlock(nn.Module):
68
+ def __init__(self, hidden_dim, cor_planes, mask_size=8):
69
+ super(BasicUpdateBlock, self).__init__()
70
+
71
+ self.encoder = BasicMotionEncoder(cor_planes)
72
+ self.gru = SepConvGRU(hidden_dim=hidden_dim, input_dim=128+hidden_dim)
73
+ self.flow_head = FlowHead(hidden_dim, hidden_dim=256)
74
+
75
+ self.mask = nn.Sequential(
76
+ nn.Conv2d(128, 256, 3, padding=1),
77
+ nn.ReLU(inplace=True),
78
+ nn.Conv2d(256, mask_size**2 *9, 1, padding=0))
79
+
80
+ def forward(self, net, inp, corr, flow, upsample=True):
81
+ # print(inp.shape, corr.shape, flow.shape)
82
+ motion_features = self.encoder(flow, corr)
83
+ # print(motion_features.shape, inp.shape)
84
+ inp = torch.cat((inp, motion_features), dim=1)
85
+
86
+ net = self.gru(net, inp)
87
+ delta_flow = self.flow_head(net)
88
+
89
+ # scale mask to balance gradients
90
+ mask = .25 * self.mask(net)
91
+ return net, mask, delta_flow
CREStereo_demo/nets/utils/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .utils import bilinear_sampler, coords_grid, manual_pad
CREStereo_demo/nets/utils/utils.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import numpy as np
4
+
5
+ #Ref: https://github.com/princeton-vl/RAFT/blob/master/core/utils/utils.py
6
+
7
+ def bilinear_sampler(img, coords, mode='bilinear', mask=False):
8
+ """ Wrapper for grid_sample, uses pixel coordinates """
9
+ H, W = img.shape[-2:]
10
+ xgrid, ygrid = coords.split([1,1], dim=-1)
11
+ xgrid = 2*xgrid/(W-1) - 1
12
+ ygrid = 2*ygrid/(H-1) - 1
13
+
14
+ grid = torch.cat([xgrid, ygrid], dim=-1)
15
+ # img = F.grid_sample(img, grid, align_corners=True)
16
+ img = bilinear_grid_sample(img, grid, align_corners=True)
17
+
18
+ if mask:
19
+ mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1)
20
+ return img, mask.float()
21
+
22
+ return img
23
+
24
+ def coords_grid(batch, ht, wd, device):
25
+ coords = torch.meshgrid(torch.arange(ht, device=device), torch.arange(wd, device=device), indexing='ij')
26
+ coords = torch.stack(coords[::-1], dim=0).float()
27
+ return coords[None].repeat(batch, 1, 1, 1)
28
+
29
+ def manual_pad(x, pady, padx):
30
+
31
+ pad = (padx, padx, pady, pady)
32
+ return F.pad(x.clone().detach(), pad, "replicate")
33
+
34
+ # Ref: https://zenn.dev/pinto0309/scraps/7d4032067d0160
35
+ def bilinear_grid_sample(im, grid, align_corners=False):
36
+ """Given an input and a flow-field grid, computes the output using input
37
+ values and pixel locations from grid. Supported only bilinear interpolation
38
+ method to sample the input pixels.
39
+
40
+ Args:
41
+ im (torch.Tensor): Input feature map, shape (N, C, H, W)
42
+ grid (torch.Tensor): Point coordinates, shape (N, Hg, Wg, 2)
43
+ align_corners {bool}: If set to True, the extrema (-1 and 1) are
44
+ considered as referring to the center points of the input’s
45
+ corner pixels. If set to False, they are instead considered as
46
+ referring to the corner points of the input’s corner pixels,
47
+ making the sampling more resolution agnostic.
48
+
49
+ Returns:
50
+ torch.Tensor: A tensor with sampled points, shape (N, C, Hg, Wg)
51
+ """
52
+ n, c, h, w = im.shape
53
+ gn, gh, gw, _ = grid.shape
54
+ assert n == gn
55
+
56
+ x = grid[:, :, :, 0]
57
+ y = grid[:, :, :, 1]
58
+
59
+ if align_corners:
60
+ x = ((x + 1) / 2) * (w - 1)
61
+ y = ((y + 1) / 2) * (h - 1)
62
+ else:
63
+ x = ((x + 1) * w - 1) / 2
64
+ y = ((y + 1) * h - 1) / 2
65
+
66
+ x = x.view(n, -1)
67
+ y = y.view(n, -1)
68
+
69
+ x0 = torch.floor(x).long()
70
+ y0 = torch.floor(y).long()
71
+ x1 = x0 + 1
72
+ y1 = y0 + 1
73
+
74
+ wa = ((x1 - x) * (y1 - y)).unsqueeze(1)
75
+ wb = ((x1 - x) * (y - y0)).unsqueeze(1)
76
+ wc = ((x - x0) * (y1 - y)).unsqueeze(1)
77
+ wd = ((x - x0) * (y - y0)).unsqueeze(1)
78
+
79
+ # Apply default for grid_sample function zero padding
80
+ im_padded = torch.nn.functional.pad(im, pad=[1, 1, 1, 1], mode='constant', value=0)
81
+ padded_h = h + 2
82
+ padded_w = w + 2
83
+ # save points positions after padding
84
+ x0, x1, y0, y1 = x0 + 1, x1 + 1, y0 + 1, y1 + 1
85
+
86
+ # Clip coordinates to padded image size
87
+ x0 = torch.where(x0 < 0, torch.tensor(0, device=im.device), x0)
88
+ x0 = torch.where(x0 > padded_w - 1, torch.tensor(padded_w - 1, device=im.device), x0)
89
+ x1 = torch.where(x1 < 0, torch.tensor(0, device=im.device), x1)
90
+ x1 = torch.where(x1 > padded_w - 1, torch.tensor(padded_w - 1, device=im.device), x1)
91
+ y0 = torch.where(y0 < 0, torch.tensor(0, device=im.device), y0)
92
+ y0 = torch.where(y0 > padded_h - 1, torch.tensor(padded_h - 1, device=im.device), y0)
93
+ y1 = torch.where(y1 < 0, torch.tensor(0, device=im.device), y1)
94
+ y1 = torch.where(y1 > padded_h - 1, torch.tensor(padded_h - 1, device=im.device), y1)
95
+
96
+ im_padded = im_padded.view(n, c, -1)
97
+
98
+ x0_y0 = (x0 + y0 * padded_w).unsqueeze(1).expand(-1, c, -1)
99
+ x0_y1 = (x0 + y1 * padded_w).unsqueeze(1).expand(-1, c, -1)
100
+ x1_y0 = (x1 + y0 * padded_w).unsqueeze(1).expand(-1, c, -1)
101
+ x1_y1 = (x1 + y1 * padded_w).unsqueeze(1).expand(-1, c, -1)
102
+
103
+ Ia = torch.gather(im_padded, 2, x0_y0)
104
+ Ib = torch.gather(im_padded, 2, x0_y1)
105
+ Ic = torch.gather(im_padded, 2, x1_y0)
106
+ Id = torch.gather(im_padded, 2, x1_y1)
107
+
108
+ return (Ia * wa + Ib * wb + Ic * wc + Id * wd).reshape(n, c, gh, gw)
FoundationStereo_demo/Utils.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+
10
+ import os, sys, time, pickle, itertools, datetime, imageio, logging, joblib, importlib, argparse
11
+ # Import torch and related modules only when needed inside functions to avoid CUDA init
12
+ # import torch, torchvision # Moved to function-level imports
13
+ # import torch.nn.functional as F # Moved to function-level imports
14
+ # import torch.nn as nn # Moved to function-level imports
15
+ from functools import partial
16
+ import pandas as pd
17
+ # Import open3d only when needed to avoid CUDA conflicts
18
+ # import open3d as o3d # Moved to function-level imports
19
+ import cv2
20
+ import numpy as np
21
+ # Removed transformations import to avoid ModuleNotFoundError
22
+ code_dir = os.path.dirname(os.path.realpath(__file__))
23
+ sys.path.append(code_dir)
24
+
25
+
26
+
27
+ def set_logging_format(level=logging.INFO):
28
+ importlib.reload(logging)
29
+ FORMAT = '%(message)s'
30
+ logging.basicConfig(level=level, format=FORMAT, datefmt='%m-%d|%H:%M:%S')
31
+
32
+ # Only call set_logging_format when explicitly needed, not during import
33
+ # set_logging_format() # Commented out to avoid automatic execution
34
+
35
+
36
+
37
+ def set_seed(random_seed=0):
38
+ import torch # Import torch only when function is called
39
+ import random
40
+ import numpy as np
41
+
42
+ np.random.seed(random_seed)
43
+ random.seed(random_seed)
44
+ torch.manual_seed(random_seed)
45
+ # Skip CUDA seeding to avoid initialization issues in ZeroGPU
46
+ # CUDA seeding should be done within @spaces.GPU context
47
+ try:
48
+ # Only try CUDA operations if we're already in a CUDA context
49
+ if hasattr(torch.cuda, '_initialized') and torch.cuda._initialized:
50
+ if torch.cuda.is_available():
51
+ torch.cuda.manual_seed_all(random_seed)
52
+ except (RuntimeError, AttributeError):
53
+ pass # CUDA not initialized yet or not available
54
+ torch.backends.cudnn.deterministic = True
55
+ torch.backends.cudnn.benchmark = False
56
+
57
+
58
+ def toOpen3dCloud(points,colors=None,normals=None):
59
+ import open3d as o3d # Import only when function is called
60
+
61
+ cloud = o3d.geometry.PointCloud()
62
+ cloud.points = o3d.utility.Vector3dVector(points.astype(np.float64))
63
+ if colors is not None:
64
+ if colors.max()>1:
65
+ colors = colors/255.0
66
+ cloud.colors = o3d.utility.Vector3dVector(colors.astype(np.float64))
67
+ if normals is not None:
68
+ cloud.normals = o3d.utility.Vector3dVector(normals.astype(np.float64))
69
+ return cloud
70
+
71
+
72
+
73
+ def depth2xyzmap(depth:np.ndarray, K, uvs:np.ndarray=None, zmin=0.1):
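+ # Back-project a depth map into an (H, W, 3) map of camera-frame XYZ points using the
+ # pinhole intrinsics K; pixels with depth below zmin are treated as invalid and zeroed.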
74
+ invalid_mask = (depth<zmin)
75
+ H,W = depth.shape[:2]
76
+ if uvs is None:
77
+ vs,us = np.meshgrid(np.arange(0,H),np.arange(0,W), sparse=False, indexing='ij')
78
+ vs = vs.reshape(-1)
79
+ us = us.reshape(-1)
80
+ else:
81
+ uvs = uvs.round().astype(int)
82
+ us = uvs[:,0]
83
+ vs = uvs[:,1]
84
+ zs = depth[vs,us]
85
+ xs = (us-K[0,2])*zs/K[0,0]
86
+ ys = (vs-K[1,2])*zs/K[1,1]
87
+ pts = np.stack((xs.reshape(-1),ys.reshape(-1),zs.reshape(-1)), 1) #(N,3)
88
+ xyz_map = np.zeros((H,W,3), dtype=np.float32)
89
+ xyz_map[vs,us] = pts
90
+ if invalid_mask.any():
91
+ xyz_map[invalid_mask] = 0
92
+ return xyz_map
93
+
94
+
95
+
96
+ def freeze_model(model):
97
+ # This function now works with any model passed to it
98
+ # No need to import torch at module level
99
+ model = model.eval()
100
+ for p in model.parameters():
101
+ p.requires_grad = False
102
+ for p in model.buffers():
103
+ p.requires_grad = False
104
+ return model
105
+
106
+
107
+
108
+ def get_resize_keep_aspect_ratio(H, W, divider=16, max_H=1232, max_W=1232):
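+ # Round H and W up to multiples of `divider`; if the result exceeds max_H/max_W, shrink
+ # the larger side to its limit and re-round the other, approximately keeping aspect ratio.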
109
+ assert max_H%divider==0
110
+ assert max_W%divider==0
111
+
112
+ def round_by_divider(x):
113
+ return int(np.ceil(x/divider)*divider)
114
+
115
+ H_resize = round_by_divider(H) #!NOTE KITTI width=1242
116
+ W_resize = round_by_divider(W)
117
+ if H_resize>max_H or W_resize>max_W:
118
+ if H_resize>W_resize:
119
+ W_resize = round_by_divider(W_resize*max_H/H_resize)
120
+ H_resize = max_H
121
+ else:
122
+ H_resize = round_by_divider(H_resize*max_W/W_resize)
123
+ W_resize = max_W
124
+ return int(H_resize), int(W_resize)
125
+
126
+
127
+ def vis_disparity(disp, min_val=None, max_val=None, invalid_thres=np.inf, color_map=cv2.COLORMAP_TURBO, cmap=None, other_output={}):
128
+ """
129
+ @disp: np array (H,W)
130
+ @invalid_thres: > thres is invalid
131
+ """
132
+ disp = disp.copy()
133
+ H,W = disp.shape[:2]
134
+ invalid_mask = disp>=invalid_thres
135
+ if (invalid_mask==0).sum()==0:
136
+ other_output['min_val'] = None
137
+ other_output['max_val'] = None
138
+ return np.zeros((H,W,3))
139
+ if min_val is None:
140
+ min_val = disp[invalid_mask==0].min()
141
+ if max_val is None:
142
+ max_val = disp[invalid_mask==0].max()
143
+ other_output['min_val'] = min_val
144
+ other_output['max_val'] = max_val
145
+ vis = ((disp-min_val)/(max_val-min_val)).clip(0,1) * 255
146
+ if cmap is None:
147
+ vis = cv2.applyColorMap(vis.clip(0, 255).astype(np.uint8), color_map)[...,::-1]
148
+ else:
149
+ vis = cmap(vis.astype(np.uint8))[...,:3]*255
150
+ if invalid_mask.any():
151
+ vis[invalid_mask] = 0
152
+ return vis.astype(np.uint8)
153
+
154
+
155
+
156
+ def depth_uint8_decoding(depth_uint8, scale=1000):
157
+ depth_uint8 = depth_uint8.astype(float)
158
+ out = depth_uint8[...,0]*255*255 + depth_uint8[...,1]*255 + depth_uint8[...,2]
159
+ return out/float(scale)
160
+
FoundationStereo_demo/app.py ADDED
@@ -0,0 +1,1138 @@
1
+ import os
2
+ import sys
3
+ import logging
4
+ import tempfile
5
+ import zipfile
6
+ import gc
7
+ from pathlib import Path
8
+ from typing import Optional, Tuple, Union
9
+ import numpy as np
10
+ import cv2
11
+ import gradio as gr
12
+ import imageio
13
+
14
+ # Import spaces BEFORE torch to ensure proper ZeroGPU initialization
15
+ import spaces
16
+
17
+ # Import torch after spaces - avoid any CUDA calls during import
18
+ import torch
19
+
20
+ # Completely avoid CUDA operations during import phase
21
+ # Do not set default tensor type or modify CUDA settings outside GPU context
22
+ # torch.set_default_tensor_type('torch.FloatTensor') # Commented out - causes CUDA init
23
+
24
+ # Import other safe modules
25
+ from omegaconf import OmegaConf
26
+ from huggingface_hub import hf_hub_download, snapshot_download
27
+
28
+ # Do not modify CUDA settings during import - this can trigger CUDA initialization
29
+ # torch.backends.cudnn.enabled = False # Commented out
30
+ # torch.backends.cudnn.benchmark = False # Commented out
31
+
32
+ # Use current directory as base (gradio_app folder)
33
+ current_dir = os.path.dirname(os.path.abspath(__file__))
34
+ base_dir = current_dir # gradio_app folder
35
+
36
+ # Add current directory to path for local imports
37
+ sys.path.insert(0, current_dir)
38
+
39
+ # DO NOT import any local modules here that might use CUDA
40
+ # All local module imports will be done inside GPU-decorated functions
41
+
42
+ # Import Open3D with error handling - avoid any CUDA operations
43
+ OPEN3D_AVAILABLE = False # Will be set properly in GPU context
44
+ try:
45
+ # Set Open3D to CPU mode to avoid CUDA initialization
46
+ os.environ['OPEN3D_CPU_RENDERING'] = '1'
47
+ # Don't import open3d here - do it inside GPU functions
48
+ # import open3d as o3d
49
+ OPEN3D_AVAILABLE = True # Assume available, will check inside GPU context
50
+ except Exception as e:
51
+ logging.warning(f"Open3D setup failed: {e}")
52
+ OPEN3D_AVAILABLE = False
53
+
54
+ # Configure logging
55
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
56
+
57
+ # Hugging Face model repository configuration
58
+ HF_REPO_ID = "shriarul5273/FoundationStereo_models"
59
+ MODEL_VARIANTS = {
60
+ "11-33-40": {
61
+ "display_name": "FoundationStereo (Low-cost variant - 11-33-40)",
62
+ "model_file": "pretrained_models/11-33-40/model_best_bp2.pth",
63
+ "config_file": "pretrained_models/11-33-40/cfg.yaml"
64
+ },
65
+ "23-51-11": {
66
+ "display_name": "FoundationStereo (High-quality variant - 23-51-11)",
67
+ "model_file": "pretrained_models/23-51-11/model_best_bp2.pth",
68
+ "config_file": "pretrained_models/23-51-11/cfg.yaml"
69
+ }
70
+ }
71
+
72
+ # Global variables for model caching
73
+ MODEL_PATH: Optional[str] = None
+ CONFIG_PATH: Optional[str] = None
75
+
76
+ # Model cache to avoid reloading when selection doesn't change
77
+ _cached_model = None
78
+ _cached_device = None
79
+ _cached_model_selection = None
80
+
81
+
82
+ def aggressive_cleanup():
83
+ """Perform basic cleanup - no CUDA operations outside GPU context"""
84
+ import gc
85
+ gc.collect()
86
+ logging.info("Performed basic memory cleanup")
87
+
88
+
89
+ @spaces.GPU
90
+ def check_gpu_memory():
91
+ """Check and log current GPU memory usage - only call within GPU context"""
92
+ try:
93
+ allocated = torch.cuda.memory_allocated(0) / 1024**3
94
+ reserved = torch.cuda.memory_reserved(0) / 1024**3
95
+ max_allocated = torch.cuda.max_memory_allocated(0) / 1024**3
96
+ total = torch.cuda.get_device_properties(0).total_memory / 1024**3
97
+
98
+ logging.info(f"GPU Memory - Allocated: {allocated:.2f}GB, Reserved: {reserved:.2f}GB, Max: {max_allocated:.2f}GB, Total: {total:.2f}GB")
99
+ return allocated, reserved, max_allocated, total
100
+ except RuntimeError as e:
101
+ logging.warning(f"Failed to get GPU memory info: {e}")
102
+ return None, None, None, None
103
+
104
+
105
+ def download_model_from_hf(variant: str, force_download: bool = False) -> Tuple[str, str]:
106
+ """
107
+ Download model and config files from Hugging Face Hub
108
+
109
+ Args:
110
+ variant: Model variant ("11-33-40" or "23-51-11")
111
+ force_download: Force re-download even if files exist locally
112
+
113
+ Returns:
114
+ Tuple of (model_path, config_path)
115
+ """
116
+ if variant not in MODEL_VARIANTS:
117
+ raise ValueError(f"Unknown model variant: {variant}. Available: {list(MODEL_VARIANTS.keys())}")
118
+
119
+ variant_info = MODEL_VARIANTS[variant]
120
+
121
+ try:
122
+ if not force_download:
123
+ logging.info(f"📦 Checking cache for model variant: {variant}")
124
+ else:
125
+ logging.info(f"🔄 Force downloading model variant: {variant}")
126
+
127
+ # Download model file
128
+ model_path = hf_hub_download(
129
+ repo_id=HF_REPO_ID,
130
+ filename=variant_info["model_file"],
131
+ force_download=force_download,
132
+ local_dir_use_symlinks=False
133
+ )
134
+
135
+ # Download config file
136
+ config_path = hf_hub_download(
137
+ repo_id=HF_REPO_ID,
138
+ filename=variant_info["config_file"],
139
+ force_download=force_download,
140
+ local_dir_use_symlinks=False
141
+ )
142
+
143
+ if force_download:
144
+ logging.info(f"✅ Successfully downloaded {variant} model files")
145
+ else:
146
+ logging.info(f"✅ Successfully loaded {variant} model files from cache")
147
+
148
+ logging.debug(f"Model: {model_path}")
149
+ logging.debug(f"Config: {config_path}")
150
+
151
+ return model_path, config_path
152
+
153
+ except Exception as e:
154
+ logging.error(f"Failed to download model {variant}: {e}")
155
+ raise RuntimeError(f"Failed to download model {variant} from Hugging Face: {e}")
156
+
157
+
158
+ def get_available_models() -> dict:
159
+ """Get all available models with their display names and download info"""
160
+ models = {}
161
+
162
+ # First check local models (legacy support)
163
+ search_dirs = [
164
+ os.path.join(current_dir, "pretrained_models"),
165
+ os.path.join(os.path.dirname(current_dir), "pretrained_models")
166
+ ]
167
+
168
+ for search_dir in search_dirs:
169
+ if os.path.exists(search_dir):
170
+ for model_dir in os.listdir(search_dir):
171
+ model_path = os.path.join(search_dir, model_dir, "model_best_bp2.pth")
172
+ cfg_path = os.path.join(search_dir, model_dir, "cfg.yaml")
173
+
174
+ if os.path.exists(model_path) and os.path.exists(cfg_path):
175
+ # Create a descriptive name for the model
176
+ if model_dir == "11-33-40":
177
+ display_name = "FoundationStereo (Low-cost variant - 11-33-40) [Local]"
178
+ elif model_dir == "23-51-11":
179
+ display_name = "FoundationStereo (High-quality variant - 23-51-11) [Local]"
180
+ else:
181
+ display_name = f"FoundationStereo ({model_dir}) [Local]"
182
+
183
+ models[display_name] = {
184
+ "model_path": model_path,
185
+ "config_path": cfg_path,
186
+ "variant": model_dir,
187
+ "source": "local"
188
+ }
189
+
190
+ # Add Hugging Face models
191
+ for variant, info in MODEL_VARIANTS.items():
192
+ display_name = f"{info['display_name']} [Hugging Face]"
193
+ models[display_name] = {
194
+ "model_path": None, # Will be downloaded when needed
195
+ "config_path": None, # Will be downloaded when needed
196
+ "variant": variant,
197
+ "source": "huggingface"
198
+ }
199
+
200
+ return models
201
+
202
+
203
+ def find_model_path() -> Tuple[Optional[str], Optional[str]]:
204
+ """Find available model and config paths (legacy function for backward compatibility)"""
205
+ models = get_available_models()
206
+ if models:
207
+ # Prefer Hugging Face models over local ones
208
+ # First try to find HF low-cost variant
209
+ for display_name in models:
210
+ if "11-33-40" in display_name and "[Hugging Face]" in display_name:
211
+ return get_model_paths_from_selection(display_name)
212
+
213
+ # Then try local low-cost variant
214
+ for display_name in models:
215
+ if "11-33-40" in display_name:
216
+ return get_model_paths_from_selection(display_name)
217
+
218
+ # If no low-cost variant, return the first available
219
+ first_model_name = next(iter(models.keys()))
220
+ return get_model_paths_from_selection(first_model_name)
221
+ return None, None
222
+
223
+
224
+ def get_model_paths_from_selection(model_selection: str) -> Tuple[Optional[str], Optional[str]]:
225
+ """Get model and config paths from the selected model"""
226
+ models = get_available_models()
227
+
228
+ # Check if it's in our models dict
229
+ if model_selection in models:
230
+ model_info = models[model_selection]
231
+
232
+ # If it's a Hugging Face model, download it first (or get from cache)
233
+ if model_info["source"] == "huggingface":
234
+ variant = model_info["variant"]
235
+ try:
236
+ logging.info(f"📦 Retrieving {variant} model from cache...")
237
+ model_path, config_path = download_model_from_hf(variant, force_download=False)
238
+ return model_path, config_path
239
+ except Exception as e:
240
+ logging.error(f"Failed to get model {variant} from cache: {e}")
241
+ return None, None
242
+ else:
243
+ # Local model
244
+ logging.info(f"📁 Using local model: {model_selection}")
245
+ return model_info["model_path"], model_info["config_path"]
246
+
247
+ # Handle direct HF model selection (fallback)
248
+ elif "[Hugging Face]" in model_selection:
249
+ if "11-33-40" in model_selection:
250
+ variant = "11-33-40"
251
+ elif "23-51-11" in model_selection:
252
+ variant = "23-51-11"
253
+ else:
254
+ logging.error(f"Unknown HF model variant in: {model_selection}")
255
+ return None, None
256
+
257
+ try:
258
+ logging.info(f"📦 Retrieving {variant} model from cache...")
259
+ model_path, config_path = download_model_from_hf(variant, force_download=False)
260
+ return model_path, config_path
261
+ except Exception as e:
262
+ logging.error(f"Failed to get model {variant} from cache: {e}")
263
+ return None, None
264
+
265
+ return None, None
266
+
267
+
268
+ def get_cached_model(model_selection: str):
+ """Resolve the selected model's files from the pre-downloaded cache and load it (the model is reloaded on every call, as required for ZeroGPU)"""
270
+ global _cached_model, _cached_device, _cached_model_selection
271
+
272
+ # Get model paths from selection
273
+ model_path, config_path = get_model_paths_from_selection(model_selection)
274
+
275
+ if model_path is None or config_path is None:
276
+ raise ValueError(f"Selected model not found: {model_selection}")
277
+
278
+ # Load model fresh for each inference (ZeroGPU optimized)
279
+ # Since models are pre-downloaded, this should be fast
280
+ logging.info(f"🚀 Loading cached model: {model_selection}")
281
+ model, device = load_model_for_inference(model_path, config_path)
282
+
283
+ logging.info(f"✅ Model loaded successfully from cache: {model_selection}")
284
+ return model, device
285
+
286
+
287
+ def clear_model_cache():
288
+ """Clear the cached model to free memory"""
289
+ global _cached_model, _cached_device, _cached_model_selection
290
+
291
+ if _cached_model is not None:
292
+ logging.info("Clearing model cache...")
293
+ del _cached_model
294
+ _cached_model = None
295
+ _cached_device = None
296
+ _cached_model_selection = None
297
+
298
+ # Simple cleanup
299
+ import gc
300
+ gc.collect()
301
+ logging.info("Model cache cleared")
302
+ else:
303
+ logging.info("No model in cache to clear")
304
+
305
+
306
+ @spaces.GPU
307
+ def load_model_for_inference(model_path: str, cfg_path: str):
308
+ """Load model temporarily for inference (demo-style)"""
309
+ # Set CUDA settings safely within GPU context
310
+ torch.set_default_tensor_type('torch.cuda.FloatTensor') # Now safe to use CUDA tensors
311
+ torch.backends.cudnn.enabled = True
312
+ torch.backends.cudnn.benchmark = True
313
+
314
+ # Import these inside the function to avoid early CUDA initialization
315
+ try:
316
+ # Import selectively to avoid CUDA calls in Utils
317
+ from core.foundation_stereo import FoundationStereo
318
+ from omegaconf import OmegaConf
319
+ logging.info("Successfully imported required modules")
320
+
321
+ # Import set_logging_format safely
322
+ from Utils import set_logging_format
323
+ set_logging_format()
324
+
325
+ # Manual seed setting to avoid CUDA calls in Utils.set_seed
326
+ import random
327
+ random_seed = 0
328
+ np.random.seed(random_seed)
329
+ random.seed(random_seed)
330
+ torch.manual_seed(random_seed)
331
+ # CUDA seeding will be done after device is available
332
+
333
+ logging.info("Set logging format and seed")
334
+ except Exception as e:
335
+ logging.error(f"Failed to import modules: {e}")
336
+ raise RuntimeError(f"Import failed: {e}")
337
+
338
+ # Check if CUDA is available after ZeroGPU initialization
339
+ if not torch.cuda.is_available():
340
+ raise RuntimeError("CUDA is not available. ZeroGPU initialization may have failed.")
341
+
342
+ # Use the first available CUDA device
343
+ device = torch.device("cuda")
344
+
345
+ # Now set CUDA seed safely within GPU context
346
+ try:
347
+ torch.cuda.manual_seed_all(random_seed)
348
+ torch.backends.cudnn.deterministic = True
349
+ torch.backends.cudnn.benchmark = False
350
+ except Exception as e:
351
+ logging.warning(f"Could not set CUDA seed: {e}")
352
+
353
+ try:
354
+ # Load config
355
+ cfg = OmegaConf.load(cfg_path)
356
+ cfg.setdefault("vit_size", "vitl")
357
+ logging.info("Loaded config file")
358
+
359
+ # Create model
360
+ model = FoundationStereo(cfg).to(device)
361
+ model.eval()
362
+ logging.info("Created model")
363
+
364
+ # Load checkpoint
365
+ ckpt = torch.load(model_path, map_location=device)
366
+ model.load_state_dict(ckpt["model"], strict=True)
367
+ logging.info("Loaded model weights")
368
+
369
+ # Memory optimizations
370
+ torch.set_grad_enabled(False)
371
+ model.half() # Use half precision
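+ # NOTE: FP16 weights roughly halve GPU memory use; if precision-related artifacts ever show up,
+ # keeping the model in float32 is a reasonable fallback at the cost of more memory.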
372
+ logging.info("Applied memory optimizations")
373
+
374
+ return model, device
375
+
376
+ except Exception as e:
377
+ logging.error(f"Model loading failed: {e}")
378
+ raise RuntimeError(f"Failed to load model: {e}")
379
+
380
+
381
+ # Fixed with static duration
382
+ @spaces.GPU(duration=60) # Static 60 seconds for basic processing
383
+ def process_stereo_pair(model_selection: str, left_image: np.ndarray, right_image: np.ndarray,
384
+ progress: gr.Progress = gr.Progress()) -> Tuple[Optional[np.ndarray], str]:
385
+ """
386
+ Main processing function for stereo pair (with model caching)
387
+ """
388
+ logging.info("Starting stereo pair processing...")
389
+
390
+ if left_image is None or right_image is None:
391
+ return None, "❌ Please upload both left and right images."
392
+
393
+ try:
394
+ # Import these inside to avoid early CUDA calls
395
+ logging.info("Importing required modules...")
396
+ from core.utils.utils import InputPadder
397
+ # Import vis_disparity safely - it shouldn't have CUDA calls but be careful
398
+ from Utils import vis_disparity
399
+ logging.info("✅ Successfully imported processing modules")
400
+
401
+ # Get cached model (will load if not cached or selection changed)
402
+ variant_name = model_selection.split('(')[1].split(')')[0] if '(' in model_selection else model_selection
403
+ progress(0.1, desc=f"Loading cached model ({variant_name})...")
404
+ logging.info("🚀 Getting cached model...")
405
+ model, device = get_cached_model(model_selection)
406
+ logging.info("✅ Cached model loaded successfully")
407
+
408
+ progress(0.2, desc="Preprocessing images...")
409
+
410
+ # Validate input images
411
+ if left_image.shape != right_image.shape:
412
+ return None, "❌ Left and right images must have the same dimensions."
413
+
414
+ H, W = left_image.shape[:2]
415
+
416
+ # Convert to torch tensors and ensure they are contiguous
417
+ img0 = torch.as_tensor(left_image).to(device).half()[None].permute(0,3,1,2).contiguous()
418
+ img1 = torch.as_tensor(right_image).to(device).half()[None].permute(0,3,1,2).contiguous()
419
+
420
+ # Pad images and ensure contiguity
421
+ padder = InputPadder(img0.shape, divis_by=32, force_square=False)
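+ # divis_by=32 pads H and W up to multiples of 32 so the network's downsampling stages divide evenly;
+ # the padding is removed again with padder.unpad() after inference.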
422
+ img0, img1 = padder.pad(img0, img1)
423
+
424
+ # Ensure padded tensors are contiguous
425
+ img0 = img0.contiguous()
426
+ img1 = img1.contiguous()
427
+
428
+ progress(0.5, desc="Running inference...")
429
+
430
+ # Process stereo pair with autocast and ensure clean memory state
431
+ torch.cuda.empty_cache() # Clear any cached memory before inference
432
+
433
+ try:
434
+ with torch.amp.autocast("cuda", enabled=True):
435
+ # Ensure tensors are in the right format for cuDNN
436
+ if not img0.is_contiguous():
437
+ img0 = img0.contiguous()
438
+ if not img1.is_contiguous():
439
+ img1 = img1.contiguous()
440
+
441
+ disp = model.forward(img0, img1, iters=32, test_mode=True)
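+ # iters is the number of iterative refinement steps; more iterations generally yield finer disparity at the cost of runtime.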
442
+ except RuntimeError as e:
443
+ if "cuDNN" in str(e):
444
+ # Fallback: disable cuDNN optimizations and retry
445
+ logging.warning(f"cuDNN error encountered, retrying with fallback: {e}")
446
+ torch.backends.cudnn.enabled = False
447
+ try:
448
+ with torch.amp.autocast("cuda", enabled=True):
449
+ disp = model.forward(img0, img1, iters=32, test_mode=True)
450
+ finally:
451
+ torch.backends.cudnn.enabled = True # Re-enable for future use
452
+ else:
453
+ raise e
454
+
455
+ # Unpad and convert to numpy
456
+ disp = padder.unpad(disp.float())
457
+ disp_cpu = disp.data.cpu().numpy().reshape(H, W)
458
+
459
+ progress(0.8, desc="Creating visualization...")
460
+
461
+ # Create visualization - ONLY disparity
462
+ disparity_vis = vis_disparity(disp_cpu)
463
+ result_image = disparity_vis
464
+
465
+ progress(1.0, desc="Complete!")
466
+
467
+ # Clean up intermediate tensors
468
+ del img0, img1, disp
469
+
470
+ # For ZeroGPU: Clean up model after inference
471
+ del model
472
+ torch.cuda.empty_cache()
473
+ gc.collect()
474
+
475
+ # Create status message
476
+ valid_mask = disp_cpu != np.inf
477
+ min_disp = disp_cpu[valid_mask].min() if valid_mask.any() else 0
478
+ max_disp = disp_cpu[valid_mask].max() if valid_mask.any() else 0
479
+ mean_disp = disp_cpu[valid_mask].mean() if valid_mask.any() else 0
480
+
481
+ # Get model variant for status
482
+ variant = model_selection.split('(')[1].split(')')[0] if '(' in model_selection else "Unknown"
483
+
484
+ # Check current memory usage (safely within GPU context)
485
+ current_memory = torch.cuda.memory_allocated(0) / 1024**3
486
+ max_memory = torch.cuda.max_memory_allocated(0) / 1024**3
487
+ memory_info = f" | GPU: {current_memory:.2f}GB/{max_memory:.2f}GB peak"
488
+
489
+ status = f"""✅ Processing successful!
490
+ 🔧 Model: {variant} (ZeroGPU){memory_info}
491
+ 📊 Disparity Statistics:
492
+ • Range: {min_disp:.2f} - {max_disp:.2f}
493
+ • Mean: {mean_disp:.2f}
494
+ • Input size: {W}×{H}
495
+ • Valid pixels: {valid_mask.sum()}/{valid_mask.size}"""
496
+
497
+ return result_image, status
498
+
499
+ except Exception as e:
500
+ logging.error(f"Processing failed: {e}")
501
+ # Cleanup on error
502
+ if 'img0' in locals():
503
+ del img0
504
+ if 'img1' in locals():
505
+ del img1
506
+ if 'disp' in locals():
507
+ del disp
508
+ if 'model' in locals():
509
+ del model
510
+ # Clean up GPU memory
511
+ torch.cuda.empty_cache()
512
+ gc.collect()
513
+ return None, f"❌ Error: {str(e)}"
514
+
515
+
516
+ # Fixed with static duration
517
+ @spaces.GPU(duration=120) # Static 120 seconds for depth processing
518
+ def process_with_depth(model_selection: str, left_image: np.ndarray, right_image: np.ndarray,
519
+ camera_matrix: str, baseline: float,
520
+ progress: gr.Progress = gr.Progress()) -> Tuple[Optional[np.ndarray], Optional[str], Optional[str], str]:
521
+ """
522
+ Process stereo pair and generate depth map and point cloud (with model caching)
523
+ """
524
+ # Import these inside to avoid early CUDA calls
525
+ from core.utils.utils import InputPadder
526
+ # Import vis_disparity safely within GPU context
527
+ from Utils import vis_disparity
528
+
529
+ # Import Open3D inside GPU context
530
+ global OPEN3D_AVAILABLE
531
+ try:
532
+ import open3d as o3d
533
+ OPEN3D_AVAILABLE = True
534
+ except ImportError as e:
535
+ logging.warning(f"Open3D not available: {e}")
536
+ OPEN3D_AVAILABLE = False
537
+ return None, None, None, "❌ Open3D not available. Point cloud generation disabled."
538
+
539
+ if left_image is None or right_image is None:
540
+ return None, None, None, "❌ Please upload both left and right images."
541
+
542
+ try:
543
+ progress(0.1, desc="Parsing camera parameters...")
544
+
545
+ # Parse camera matrix
546
+ try:
547
+ K_values = list(map(float, camera_matrix.strip().split()))
548
+ if len(K_values) != 9:
549
+ return None, None, None, "❌ Camera matrix must contain exactly 9 values."
550
+ K = np.array(K_values).reshape(3, 3)
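+ # Row-major 3x3 intrinsics: [[fx, 0, cx], [0, fy, cy], [0, 0, 1]]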
551
+ except ValueError:
552
+ return None, None, None, "❌ Invalid camera matrix format. Use space-separated numbers."
553
+
554
+ if baseline <= 0:
555
+ return None, None, None, "❌ Baseline must be positive."
556
+
557
+ variant = model_selection.split('(')[1].split(')')[0] if '(' in model_selection else "Unknown"
558
+ progress(0.2, desc=f"Loading cached model ({variant})...")
559
+
560
+ # Get cached model (will load if not cached or selection changed)
561
+ model, device = get_cached_model(model_selection)
562
+
563
+ progress(0.4, desc="Running stereo inference...")
564
+
565
+ # Get disparity using the same process as the basic function
566
+ H, W = left_image.shape[:2]
567
+ img0 = torch.as_tensor(left_image).to(device).half()[None].permute(0,3,1,2).contiguous()
568
+ img1 = torch.as_tensor(right_image).to(device).half()[None].permute(0,3,1,2).contiguous()
569
+
570
+ padder = InputPadder(img0.shape, divis_by=32, force_square=False)
571
+ img0, img1 = padder.pad(img0, img1)
572
+
573
+ # Ensure padded tensors are contiguous
574
+ img0 = img0.contiguous()
575
+ img1 = img1.contiguous()
576
+
577
+ # Clear cache and ensure clean memory state before inference
578
+ torch.cuda.empty_cache()
579
+
580
+ try:
581
+ with torch.amp.autocast("cuda", enabled=True):
582
+ # Double-check tensor contiguity before cuDNN operations
583
+ if not img0.is_contiguous():
584
+ img0 = img0.contiguous()
585
+ if not img1.is_contiguous():
586
+ img1 = img1.contiguous()
587
+
588
+ disp = model.forward(img0, img1, iters=32, test_mode=True)
589
+ except RuntimeError as e:
590
+ if "cuDNN" in str(e):
591
+ # Fallback: disable cuDNN optimizations and retry
592
+ logging.warning(f"cuDNN error encountered in depth processing, retrying with fallback: {e}")
593
+ torch.backends.cudnn.enabled = False
594
+ try:
595
+ with torch.amp.autocast("cuda", enabled=True):
596
+ disp = model.forward(img0, img1, iters=32, test_mode=True)
597
+ finally:
598
+ torch.backends.cudnn.enabled = True # Re-enable for future use
599
+ else:
600
+ raise e
601
+
602
+ disp = padder.unpad(disp.float())
603
+ disp_cpu = disp.data.cpu().numpy().reshape(H, W)
604
+
605
+ # Clean up intermediate tensors early
606
+ del img0, img1, disp
607
+
608
+ # For ZeroGPU: Keep model reference for rest of processing
609
+ torch.cuda.empty_cache()
610
+
611
+ progress(0.6, desc="Converting to depth...")
612
+
613
+ # Remove invisible points (same as in original demo)
614
+ yy, xx = np.meshgrid(np.arange(disp_cpu.shape[0]), np.arange(disp_cpu.shape[1]), indexing='ij')
615
+ us_right = xx - disp_cpu
616
+ invalid = us_right < 0
617
+ disp_cpu[invalid] = np.inf
618
+
619
+ # Convert to depth using the formula from the original demo
620
+ depth = K[0, 0] * baseline / disp_cpu
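+ # Standard rectified-stereo relation: depth = fx * baseline / disparity (pixels masked to inf above map to depth 0 here).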
621
+
622
+ # Visualize depth (no rotation)
623
+ depth_vis = vis_disparity(depth, max_val=10.0)
624
+
625
+ progress(0.8, desc="Generating point cloud...")
626
+
627
+ # Generate point cloud with proper coordinate transformation
628
+ fx, fy = K[0, 0], K[1, 1]
629
+ cx, cy = K[0, 2], K[1, 2]
630
+
631
+ # Create coordinate meshgrids
632
+ u, v = np.meshgrid(np.arange(W), np.arange(H))
633
+
634
+ # Convert to 3D coordinates (proper camera coordinate system)
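+ # Pinhole back-projection: X = (u - cx) * Z / fx, Y = (v - cy) * Z / fy, with Z read from the depth map.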
635
+ valid_depth = depth != np.inf
636
+ z = depth[valid_depth] # Z coordinate (depth)
637
+ x = (u[valid_depth] - cx) * z / fx # X coordinate
638
+ y = (v[valid_depth] - cy) * z / fy # Y coordinate
639
+
640
+ # Stack coordinates (X, Y, Z)
641
+ points = np.stack([x, y, z], axis=-1)
642
+
643
+ # Get corresponding colors
644
+ colors = left_image[valid_depth]
645
+
646
+ # Filter points by depth range
647
+ depth_mask = (z > 0) & (z <= 10.0)
648
+ valid_points = points[depth_mask]
649
+ valid_colors = colors[depth_mask]
650
+
651
+ if len(valid_points) == 0:
652
+ return depth_vis, None, None, "⚠️ No valid points generated for point cloud."
653
+
654
+ # Subsample points for better 3D visualization performance
655
+ if len(valid_points) > 100000:
656
+ indices = np.random.choice(len(valid_points), 100000, replace=False)
657
+ valid_points = valid_points[indices]
658
+ valid_colors = valid_colors[indices]
659
+
660
+ # Transform coordinates for proper visualization orientation
661
+ # Standard computer vision: X right, Y down, Z forward
662
+ # For better 3D viewing: X right, Y up, Z backward
663
+ transformed_points = valid_points.copy()
664
+ transformed_points[:, 1] = -transformed_points[:, 1] # Flip Y axis
665
+ transformed_points[:, 2] = -transformed_points[:, 2] # Flip Z axis
666
+
667
+ # Create point cloud using transformed coordinates
668
+ pcd = o3d.geometry.PointCloud()
669
+ pcd.points = o3d.utility.Vector3dVector(transformed_points)
670
+ pcd.colors = o3d.utility.Vector3dVector(valid_colors / 255.0)
671
+
672
+ # Save point cloud for download (.ply)
673
+ temp_ply_file = tempfile.NamedTemporaryFile(delete=False, suffix='.ply')
674
+ o3d.io.write_point_cloud(temp_ply_file.name, pcd)
675
+
676
+ # Create OBJ file for 3D visualization (better Gradio compatibility)
677
+ temp_obj_file = tempfile.NamedTemporaryFile(delete=False, suffix='.obj')
678
+
679
+ # Write OBJ file with proper vertex colors
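+ # Per-vertex "v x y z r g b" colors are a widely supported OBJ extension rather than part of the core spec.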
680
+ with open(temp_obj_file.name, 'w') as f:
681
+ f.write("# Point cloud generated from stereo depth\n")
682
+ f.write(f"# Total points: {len(valid_points)}\n")
683
+
684
+ # Write vertices with RGB colors (0-1 range)
685
+ for i, (point, color) in enumerate(zip(transformed_points, valid_colors)):
686
+ # Ensure colors are in 0-1 range
687
+ r, g, b = np.clip(color / 255.0, 0, 1)
688
+ f.write(f"v {point[0]:.6f} {point[1]:.6f} {point[2]:.6f} {r:.6f} {g:.6f} {b:.6f}\n")
689
+
690
+ progress(1.0, desc="Complete!")
691
+
692
+ # For ZeroGPU: Clean up model after inference
693
+ del model
694
+ torch.cuda.empty_cache()
695
+ gc.collect()
696
+
697
+ # Check current memory usage (safely within GPU context)
698
+ current_memory = torch.cuda.memory_allocated(0) / 1024**3
699
+ max_memory = torch.cuda.max_memory_allocated(0) / 1024**3
700
+ memory_info = f" | GPU: {current_memory:.2f}GB/{max_memory:.2f}GB peak"
701
+
702
+ status = f"""✅ Depth processing successful!
703
+ 🔧 Model: {variant} (ZeroGPU){memory_info}
704
+ 📊 Statistics:
705
+ • Valid points: {len(valid_points):,}
706
+ • Depth range: {z.min():.2f} - {z.max():.2f} m
707
+ • Baseline: {baseline} m
708
+ • Point cloud saved with {len(valid_points)} points
709
+ • 3D visualization ready (corrected orientation)"""
710
+
711
+ return depth_vis, temp_ply_file.name, temp_obj_file.name, status
712
+
713
+ except Exception as e:
714
+ logging.error(f"Depth processing failed: {e}")
715
+ # Cleanup on error
716
+ if 'img0' in locals():
717
+ del img0
718
+ if 'img1' in locals():
719
+ del img1
720
+ if 'disp' in locals():
721
+ del disp
722
+ if 'model' in locals():
723
+ del model
724
+ # Clean up GPU memory
725
+ torch.cuda.empty_cache()
726
+ gc.collect()
727
+ return None, None, None, f"❌ Error: {str(e)}"
728
+
729
+
730
+ def preload_all_models():
731
+ """Pre-download all Hugging Face models to cache during startup"""
732
+ logging.info("🔄 Pre-downloading all models to cache...")
733
+
734
+ downloaded_models = {}
735
+
736
+ for variant, info in MODEL_VARIANTS.items():
737
+ try:
738
+ logging.info(f"📥 Downloading {variant} model to cache...")
739
+ model_path, config_path = download_model_from_hf(variant, force_download=False)
740
+ downloaded_models[variant] = {
741
+ "model_path": model_path,
742
+ "config_path": config_path,
743
+ "display_name": info["display_name"]
744
+ }
745
+ logging.info(f"✅ {variant} model cached successfully")
746
+ except Exception as e:
747
+ logging.warning(f"⚠️ Failed to download {variant} model: {e}")
748
+ # Continue with other models even if one fails
749
+
750
+ logging.info(f"✅ Model pre-loading complete. {len(downloaded_models)}/{len(MODEL_VARIANTS)} models cached.")
751
+ return downloaded_models
752
+
753
+
754
+ def create_app() -> gr.Blocks:
755
+ """Create the Gradio application"""
756
+
757
+ global MODEL_PATH, CONFIG_PATH
758
+
759
+ # Debug: Print current directory and check for files
760
+ print(f"Current directory: {current_dir}")
761
+ print(f"Python working directory: {os.getcwd()}")
762
+
763
+ # Pre-download all models to cache
764
+ try:
765
+ cached_models = preload_all_models()
766
+ logging.info(f"Pre-loaded {len(cached_models)} models to cache")
767
+ except Exception as e:
768
+ logging.error(f"Failed to pre-load models: {e}")
769
+ cached_models = {}
770
+
771
+ # Get available models (this should be safe as it only does file system operations)
772
+ try:
773
+ available_models = get_available_models()
774
+ logging.info(f"Successfully got available models: {len(available_models)} found")
775
+ except Exception as e:
776
+ logging.error(f"Failed to get available models: {e}")
777
+ available_models = {}
778
+
779
+ # Find model and config paths (legacy) - should be safe as well
780
+ try:
781
+ MODEL_PATH, CONFIG_PATH = find_model_path()
782
+ logging.info("Successfully found model paths")
783
+ except Exception as e:
784
+ logging.error(f"Failed to find model paths: {e}")
785
+ MODEL_PATH, CONFIG_PATH = None, None
786
+
787
+ with gr.Blocks(
788
+ title="FoundationStereo - Stereo Depth Estimation",
789
+ theme=gr.themes.Soft(),
790
+ css="footer {visibility: hidden}",
791
+ delete_cache=(60, 60) # Delete cache after 60 seconds for ZeroGPU
792
+ ) as app:
793
+
794
+ gr.Markdown("""
795
+ # 🔍 FoundationStereo: Zero-Shot Stereo Matching
796
+
797
+ Upload a pair of **rectified** stereo images to get disparity estimation.
798
+
799
+ ⚠️ **Important**: Images should be rectified (epipolar lines are horizontal) and undistorted.
800
+ ⚡ **ZeroGPU Powered**: Runs on high-performance A100 GPUs for fast inference.
801
+ 📦 **Smart Caching**: All models are pre-downloaded for instant model switching.
802
+ """)
803
+
804
+ # Instructions section
805
+ with gr.Accordion("📋 Instructions to Run This Repository", open=False):
806
+ gr.Markdown("""
807
+ ## 🚀 How to Run This Demo
808
+ This is a **demo application** showcasing the FoundationStereo model for stereo disparity estimation.
809
+
810
+ ### 🖼️ Input Requirements
811
+
812
+ 1. **Image Format**: Upload images in JPEG or PNG format.
813
+ 2. **Image Size**: Images should be of the same size and resolution.
814
+ 3. **Rectification**: Ensure images are rectified (epipolar lines are horizontal) and undistorted.
815
+ 4. **Camera Parameters**: For advanced processing, provide camera parameters (camera matrix and baseline).
816
+
817
+ ### 📊 Using the Demo
818
+
819
+ 1. **Select Model**: Choose between low-cost (11-33-40) or high-quality (23-51-11) variants
820
+ 2. **Upload Images**: Provide rectified stereo image pairs
821
+ 3. **Basic Processing**: Get disparity visualization
822
+ 4. **Advanced Processing**: Generate depth maps and 3D point clouds (requires camera parameters)
823
+
824
+ ### Original Work
825
+
826
+ This demo is based on the original FoundationStereo research. Please visit the official resources:
827
+ - **Paper**: [FoundationStereo: Zero-Shot Stereo Matching via Foundation Model](https://arxiv.org/abs/2501.09898)
828
+ - **Project Page**: [https://nvlabs.github.io/FoundationStereo/](https://nvlabs.github.io/FoundationStereo/)
829
+ - **Official Repository**: [https://github.com/NVlabs/FoundationStereo](https://github.com/NVlabs/FoundationStereo)
830
+
831
+ **⚠️ Demo Notice**: This is a demonstration interface. For research and production use, please refer to the original repository and follow the official implementation guidelines.
832
+ """)
833
+
834
+ # Model selection
835
+ with gr.Row():
836
+ # Always include Hugging Face models in the choices
837
+ all_choices = list(available_models.keys())
838
+
839
+ # If no models found, add the HF models manually
840
+ if not all_choices:
841
+ all_choices = [
842
+ "FoundationStereo (Low-cost variant - 11-33-40) [Hugging Face]",
843
+ "FoundationStereo (High-quality variant - 23-51-11) [Hugging Face]"
844
+ ]
845
+
846
+ # Get default model (prefer Hugging Face low-cost variant)
847
+ default_model = None
848
+
849
+ # First try Hugging Face low-cost variant
850
+ for name in all_choices:
851
+ if "11-33-40" in name and "[Hugging Face]" in name:
852
+ default_model = name
853
+ break
854
+
855
+ # If no HF low-cost variant, try any low-cost variant
856
+ if default_model is None:
857
+ for name in all_choices:
858
+ if "11-33-40" in name:
859
+ default_model = name
860
+ break
861
+
862
+ # If no low-cost variant, use first available
863
+ if default_model is None:
864
+ default_model = all_choices[0] if all_choices else None
865
+
866
+ model_selector = gr.Dropdown(
867
+ choices=all_choices,
868
+ value=default_model,
869
+ label="🎯 Select Model",
870
+ info="Choose the FoundationStereo model variant. Hugging Face models download automatically.",
871
+ interactive=True
872
+ )
873
+
874
+ with gr.Tabs():
875
+ # Basic stereo processing tab
876
+ with gr.TabItem("🖼️ Basic Stereo Processing"):
877
+ with gr.Row():
878
+ with gr.Column():
879
+ left_input = gr.Image(
880
+ label="📷 Left Image",
881
+ type="numpy",
882
+ height=300
883
+ )
884
+ right_input = gr.Image(
885
+ label="📷 Right Image",
886
+ type="numpy",
887
+ height=300
888
+ )
889
+
890
+ process_btn = gr.Button(
891
+ "🚀 Process Stereo Pair",
892
+ variant="primary",
893
+ size="lg"
894
+ )
895
+
896
+ with gr.Column():
897
+ output_image = gr.Image(
898
+ label="📊 Disparity Visualization",
899
+ height=400
900
+ )
901
+ status_text = gr.Textbox(
902
+ label="Status",
903
+ interactive=False,
904
+ lines=8
905
+ )
906
+
907
+ # Example images
908
+ examples_list = []
909
+
910
+ # Example 1
911
+ if os.path.exists(os.path.join(current_dir, "assets", "example1", "left.png")):
912
+ examples_list.append([
913
+ os.path.join(current_dir, "assets", "example1", "left.png"),
914
+ os.path.join(current_dir, "assets", "example1", "right.png")
915
+ ])
916
+
917
+ # Example 2
918
+ if os.path.exists(os.path.join(current_dir, "assets", "example2", "left.png")):
919
+ examples_list.append([
920
+ os.path.join(current_dir, "assets", "example2", "left.png"),
921
+ os.path.join(current_dir, "assets", "example2", "right.png")
922
+ ])
923
+
924
+
925
+
926
+ gr.Examples(
927
+ examples=examples_list,
928
+ inputs=[left_input, right_input],
929
+ label="📋 Example Images"
930
+ )
931
+
932
+ # Advanced processing with depth
933
+ with gr.TabItem("📐 Advanced Processing (Depth & Point Cloud)"):
934
+ with gr.Row():
935
+ with gr.Column():
936
+ left_input_adv = gr.Image(
937
+ label="📷 Left Image",
938
+ type="numpy",
939
+ height=250
940
+ )
941
+ right_input_adv = gr.Image(
942
+ label="📷 Right Image",
943
+ type="numpy",
944
+ height=250
945
+ )
946
+
947
+ # Camera parameters
948
+ with gr.Group():
949
+ gr.Markdown("### 📹 Camera Parameters")
950
+ camera_matrix_input = gr.Textbox(
951
+ label="Camera Matrix (9 values: fx 0 cx 0 fy cy 0 0 1)",
952
+ value="",
953
+
954
+ )
955
+ baseline_input = gr.Number(
956
+ label="Baseline (meters)",
957
+ value=None,
958
+ minimum=0.001,
959
+ maximum=10.0,
960
+ step=0.001
961
+ )
962
+
963
+ process_depth_btn = gr.Button(
964
+ "🔬 Process with Depth",
965
+ variant="primary",
966
+ size="lg"
967
+ )
968
+
969
+ with gr.Column():
970
+ depth_output = gr.Image(
971
+ label="📏 Depth Visualization",
972
+ height=300
973
+ )
974
+ pointcloud_output = gr.File(
975
+ label="☁️ Point Cloud Download (.ply)",
976
+ file_types=[".ply"]
977
+ )
978
+ status_depth = gr.Textbox(
979
+ label="Status",
980
+ interactive=False,
981
+ lines=6
982
+ )
983
+
984
+ # 3D Point Cloud Visualization
985
+ with gr.Row():
986
+ pointcloud_3d = gr.Model3D(
987
+ label="🌐 3D Point Cloud Viewer",
988
+ clear_color=[0.0, 0.0, 0.0, 0.0],
989
+ height=400
990
+ )
991
+
992
+ # Example images for advanced processing
993
+ examples_advanced_list = []
994
+
995
+ # Example 1 - Camera parameters from K.txt
996
+ if os.path.exists(os.path.join(current_dir, "assets", "example1", "left.png")):
997
+ examples_advanced_list.append([
998
+ os.path.join(current_dir, "assets", "example1", "left.png"),
999
+ os.path.join(current_dir, "assets", "example1", "right.png"),
1000
+ "754.6680908203125 0.0 489.3794860839844 0.0 754.6680908203125 265.16162109375 0.0 0.0 1.0", # Camera matrix
1001
+ 0.063 # Baseline in meters
1002
+ ])
1003
+
1004
+ # Example 2 - Camera parameters from K.txt
1005
+ if os.path.exists(os.path.join(current_dir, "assets", "example2", "left.png")):
1006
+ examples_advanced_list.append([
1007
+ os.path.join(current_dir, "assets", "example2", "left.png"),
1008
+ os.path.join(current_dir, "assets", "example2", "right.png"),
1009
+ "1733.74 0.0 792.27 0.0 1733.74 541.89 0.0 0.0 1.0", # Camera matrix
1010
+ 0.537 # Baseline in meters (converted from 536.62mm)
1011
+ ])
1012
+
1013
+
1014
+
1015
+ gr.Examples(
1016
+ examples=examples_advanced_list,
1017
+ inputs=[left_input_adv, right_input_adv, camera_matrix_input, baseline_input],
1018
+ label="📋 Example Images with Camera Parameters"
1019
+ )
1020
+
1021
+ # Event handlers - Always enable since we have HF models
1022
+ process_btn.click(
1023
+ fn=process_stereo_pair,
1024
+ inputs=[model_selector, left_input, right_input],
1025
+ outputs=[output_image, status_text],
1026
+ show_progress=True
1027
+ )
1028
+
1029
+ if OPEN3D_AVAILABLE:
1030
+ process_depth_btn.click(
1031
+ fn=process_with_depth,
1032
+ inputs=[model_selector, left_input_adv, right_input_adv, camera_matrix_input, baseline_input],
1033
+ outputs=[depth_output, pointcloud_output, pointcloud_3d, status_depth],
1034
+ show_progress=True
1035
+ )
1036
+ else:
1037
+ process_depth_btn.click(
1038
+ fn=lambda *args: (None, None, None, "❌ Open3D not available. Install with: pip install open3d"),
1039
+ inputs=[model_selector, left_input_adv, right_input_adv, camera_matrix_input, baseline_input],
1040
+ outputs=[depth_output, pointcloud_output, pointcloud_3d, status_depth]
1041
+ )
1042
+
1043
+ # Citation section at the bottom
1044
+ with gr.Accordion("📖 Citation", open=False):
1045
+ gr.Markdown("""
1046
+ ### 📄 Please Cite the Original Paper
1047
+
1048
+ If you use this work in your research, please cite:
1049
+
1050
+ ```bibtex
1051
+ @article{wen2025stereo,
1052
+ title={FoundationStereo: Zero-Shot Stereo Matching},
1053
+ author={Bowen Wen and Matthew Trepte and Joseph Aribido and Jan Kautz and Orazio Gallo and Stan Birchfield},
1054
+ journal={CVPR},
1055
+ year={2025}
1056
+ }
1057
+ ```
1058
+ """)
1059
+
1060
+ # Footer
1061
+ gr.Markdown(f"""
1062
+ ---
1063
+ ### 📝 Notes:
1064
+ - **Input images must be rectified stereo pairs** (epipolar lines are horizontal)
1065
+ - **🤗 Hugging Face Integration**: Models are automatically downloaded from `{HF_REPO_ID}`
1066
+ - **📦 Smart Caching**: All models are pre-downloaded and cached for instant switching
1067
+ - **⚡ ZeroGPU Acceleration**: Powered by high-performance A100 GPUs
1068
+ - For best results, use PNG images without lossy compression
1069
+ - Model works on RGB images but also supports monochrome/IR stereo pairs
1070
+ - **Optimized for Spaces**: Memory-efficient inference on shared infrastructure
1071
+
1072
+ ### 🔗 References:
1073
+ - [FoundationStereo Paper](https://arxiv.org/abs/2501.09898)
1074
+ - [Project Website](https://nvlabs.github.io/FoundationStereo/)
1075
+ - [GitHub Repository](https://github.com/NVlabs/FoundationStereo)
1076
+ - [Hugging Face Models](https://huggingface.co/{HF_REPO_ID})
1077
+ """)
1078
+
1079
+ return app
1080
+
1081
+
1082
+ def main():
1083
+ """Main function to launch the app"""
1084
+
1085
+ # Ensure no CUDA operations during startup
1086
+ if torch.cuda.is_available():
1087
+ logging.warning("CUDA detected during startup - this should not happen in ZeroGPU")
1088
+
1089
+ logging.info("🚀 Starting FoundationStereo Gradio App...")
1090
+
1091
+ # Parse command line arguments
1092
+ import argparse
1093
+ parser = argparse.ArgumentParser(description="FoundationStereo Gradio App")
1094
+ parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to bind to")
1095
+ parser.add_argument("--port", type=int, default=7860, help="Port to bind to")
1096
+ parser.add_argument("--share", action="store_true", help="Create shareable link")
1097
+ parser.add_argument("--debug", action="store_true", help="Enable debug mode")
1098
+
1099
+ args = parser.parse_args()
1100
+
1101
+ if args.debug:
1102
+ logging.getLogger().setLevel(logging.DEBUG)
1103
+
1104
+ try:
1105
+ # Create and launch app
1106
+ logging.info("Creating Gradio app...")
1107
+ app = create_app()
1108
+ logging.info("✅ Gradio app created successfully")
1109
+
1110
+ logging.info(f"Launching app on {args.host}:{args.port}")
1111
+ if args.share:
1112
+ logging.info("Share link will be created")
1113
+
1114
+ # For ZeroGPU compatibility, launch with appropriate settings
1115
+ app.launch(
1116
+ server_name=args.host,
1117
+ server_port=args.port,
1118
+ share=args.share,
1119
+ show_error=True,
1120
+ favicon_path=None,
1121
+ ssr_mode=False, # Disable SSR for ZeroGPU compatibility
1122
+ allowed_paths=["./"] # Allow access to local files
1123
+ )
1124
+ except Exception as e:
1125
+ logging.error(f"Failed to launch app: {e}")
1126
+ raise
1127
+
1128
+
1129
+ if __name__ == "__main__":
1130
+ # Additional safety check for ZeroGPU environment
1131
+ if 'SPACE_ID' in os.environ:
1132
+ logging.info("Running in Hugging Face Spaces environment")
1133
+
1134
+ # Do not check CUDA status during startup - this can trigger CUDA initialization
1135
+ # The CUDA status will be checked inside the @spaces.GPU decorated functions
1136
+ logging.info("✅ CUDA status will be checked within GPU-decorated functions")
1137
+
1138
+ main()
FoundationStereo_demo/app_local.py ADDED
@@ -0,0 +1,1292 @@
1
+ import os
2
+ import sys
3
+ import logging
4
+ import tempfile
5
+ import zipfile
6
+ import gc
7
+ from pathlib import Path
8
+ from typing import Optional, Tuple, Union
9
+ import numpy as np
10
+ import cv2
11
+ import gradio as gr
12
+ import imageio
13
+
14
+ import torch
15
+
16
+ # Set default tensor type if needed
17
+ # torch.set_default_tensor_type('torch.FloatTensor')
18
+
19
+ # Import other safe modules
20
+ from omegaconf import OmegaConf
21
+ from huggingface_hub import hf_hub_download, snapshot_download
22
+
23
+ # CUDA backend settings
24
+ # torch.backends.cudnn.enabled = False
25
+ # torch.backends.cudnn.benchmark = False
26
+
27
+ # Use current directory as base (gradio_app folder)
28
+ current_dir = os.path.dirname(os.path.abspath(__file__))
29
+ base_dir = current_dir # gradio_app folder
30
+
31
+ # Add current directory to path for local imports
32
+ sys.path.insert(0, current_dir)
33
+
34
+ # DO NOT import any local modules here that might use CUDA
35
+ # All local module imports will be done inside functions
36
+
37
+ # Import Open3D with error handling
38
+ OPEN3D_AVAILABLE = False
39
+ try:
40
+ # Set Open3D to CPU mode to avoid CUDA initialization
41
+ os.environ['OPEN3D_CPU_RENDERING'] = '1'
42
+ # Don't import open3d here - do it inside functions
43
+ # import open3d as o3d
44
+ OPEN3D_AVAILABLE = True # Assume available, will check later
45
+ except Exception as e:
46
+ logging.warning(f"Open3D setup failed: {e}")
47
+ OPEN3D_AVAILABLE = False
48
+
49
+ # Configure logging
50
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
51
+
52
+ # Hugging Face model repository configuration
53
+ HF_REPO_ID = "shriarul5273/FoundationStereo_models"
54
+ MODEL_VARIANTS = {
55
+ "11-33-40": {
56
+ "display_name": "FoundationStereo (Low-cost variant - 11-33-40)",
57
+ "model_file": "pretrained_models/11-33-40/model_best_bp2.pth",
58
+ "config_file": "pretrained_models/11-33-40/cfg.yaml"
59
+ },
60
+ "23-51-11": {
61
+ "display_name": "FoundationStereo (High-quality variant - 23-51-11)",
62
+ "model_file": "pretrained_models/23-51-11/model_best_bp2.pth",
63
+ "config_file": "pretrained_models/23-51-11/cfg.yaml"
64
+ }
65
+ }
66
+
67
+ # Global variables for model caching
68
+ MODEL_PATH: Optional[str] = None
+ CONFIG_PATH: Optional[str] = None
70
+
71
+ # Model cache to avoid reloading when selection doesn't change
72
+ _cached_model = None
73
+ _cached_device = None
74
+ _cached_model_selection = None
75
+
76
+
77
+ def aggressive_cleanup():
78
+ """Perform basic cleanup"""
79
+ import gc
80
+ gc.collect()
81
+ logging.info("Performed basic memory cleanup")
82
+
83
+
84
+ def check_gpu_memory():
85
+ """Check and log current GPU memory usage"""
86
+ try:
87
+ allocated = torch.cuda.memory_allocated(0) / 1024**3
88
+ reserved = torch.cuda.memory_reserved(0) / 1024**3
89
+ max_allocated = torch.cuda.max_memory_allocated(0) / 1024**3
90
+ total = torch.cuda.get_device_properties(0).total_memory / 1024**3
91
+
92
+ logging.info(f"GPU Memory - Allocated: {allocated:.2f}GB, Reserved: {reserved:.2f}GB, Max: {max_allocated:.2f}GB, Total: {total:.2f}GB")
93
+ return allocated, reserved, max_allocated, total
94
+ except RuntimeError as e:
95
+ logging.warning(f"Failed to get GPU memory info: {e}")
96
+ return None, None, None, None
97
+
98
+
99
+ def download_model_from_hf(variant: str, force_download: bool = False) -> Tuple[str, str]:
100
+ """
101
+ Download model and config files from Hugging Face Hub
102
+
103
+ Args:
104
+ variant: Model variant ("11-33-40" or "23-51-11")
105
+ force_download: Force re-download even if files exist locally
106
+
107
+ Returns:
108
+ Tuple of (model_path, config_path)
109
+ """
110
+ if variant not in MODEL_VARIANTS:
111
+ raise ValueError(f"Unknown model variant: {variant}. Available: {list(MODEL_VARIANTS.keys())}")
112
+
113
+ variant_info = MODEL_VARIANTS[variant]
114
+
115
+ try:
116
+ if not force_download:
117
+ logging.info(f"📦 Checking cache for model variant: {variant}")
118
+ else:
119
+ logging.info(f"🔄 Force downloading model variant: {variant}")
120
+
121
+ # Download model file
122
+ model_path = hf_hub_download(
123
+ repo_id=HF_REPO_ID,
124
+ filename=variant_info["model_file"],
125
+ force_download=force_download,
126
+ local_dir_use_symlinks=False
127
+ )
128
+
129
+ # Download config file
130
+ config_path = hf_hub_download(
131
+ repo_id=HF_REPO_ID,
132
+ filename=variant_info["config_file"],
133
+ force_download=force_download,
134
+ local_dir_use_symlinks=False
135
+ )
136
+
137
+ if force_download:
138
+ logging.info(f"✅ Successfully downloaded {variant} model files")
139
+ else:
140
+ logging.info(f"✅ Successfully loaded {variant} model files from cache")
141
+
142
+ logging.debug(f"Model: {model_path}")
143
+ logging.debug(f"Config: {config_path}")
144
+
145
+ return model_path, config_path
146
+
147
+ except Exception as e:
148
+ logging.error(f"Failed to download model {variant}: {e}")
149
+ raise RuntimeError(f"Failed to download model {variant} from Hugging Face: {e}")
150
+
151
+
152
+ def get_available_models() -> dict:
153
+ """Get all available models with their display names and download info"""
154
+ models = {}
155
+
156
+ # First check local models (legacy support)
157
+ search_dirs = [
158
+ os.path.join(current_dir, "pretrained_models"),
159
+ os.path.join(os.path.dirname(current_dir), "pretrained_models")
160
+ ]
161
+
162
+ for search_dir in search_dirs:
163
+ if os.path.exists(search_dir):
164
+ for model_dir in os.listdir(search_dir):
165
+ model_path = os.path.join(search_dir, model_dir, "model_best_bp2.pth")
166
+ cfg_path = os.path.join(search_dir, model_dir, "cfg.yaml")
167
+
168
+ if os.path.exists(model_path) and os.path.exists(cfg_path):
169
+ # Create a descriptive name for the model
170
+ if model_dir == "11-33-40":
171
+ display_name = "FoundationStereo (Low-cost variant - 11-33-40) [Local]"
172
+ elif model_dir == "23-51-11":
173
+ display_name = "FoundationStereo (High-quality variant - 23-51-11) [Local]"
174
+ else:
175
+ display_name = f"FoundationStereo ({model_dir}) [Local]"
176
+
177
+ models[display_name] = {
178
+ "model_path": model_path,
179
+ "config_path": cfg_path,
180
+ "variant": model_dir,
181
+ "source": "local"
182
+ }
183
+
184
+ # Add Hugging Face models
185
+ for variant, info in MODEL_VARIANTS.items():
186
+ display_name = f"{info['display_name']} [Hugging Face]"
187
+ models[display_name] = {
188
+ "model_path": None, # Will be downloaded when needed
189
+ "config_path": None, # Will be downloaded when needed
190
+ "variant": variant,
191
+ "source": "huggingface"
192
+ }
193
+
194
+ return models
195
+
196
+
197
+ def find_model_path() -> Tuple[Optional[str], Optional[str]]:
198
+ """Find available model and config paths (legacy function for backward compatibility)"""
199
+ models = get_available_models()
200
+ if models:
201
+ # Prefer Hugging Face models over local ones
202
+ # First try to find HF low-cost variant
203
+ for display_name in models:
204
+ if "11-33-40" in display_name and "[Hugging Face]" in display_name:
205
+ return get_model_paths_from_selection(display_name)
206
+
207
+ # Then try local low-cost variant
208
+ for display_name in models:
209
+ if "11-33-40" in display_name:
210
+ return get_model_paths_from_selection(display_name)
211
+
212
+ # If no low-cost variant, return the first available
213
+ first_model_name = next(iter(models.keys()))
214
+ return get_model_paths_from_selection(first_model_name)
215
+ return None, None
216
+
217
+
218
+ def get_model_paths_from_selection(model_selection: str) -> Tuple[Optional[str], Optional[str]]:
219
+ """Get model and config paths from the selected model"""
220
+ models = get_available_models()
221
+
222
+ # Check if it's in our models dict
223
+ if model_selection in models:
224
+ model_info = models[model_selection]
225
+
226
+ # If it's a Hugging Face model, download it first (or get from cache)
227
+ if model_info["source"] == "huggingface":
228
+ variant = model_info["variant"]
229
+ try:
230
+ logging.info(f"📦 Retrieving {variant} model from cache...")
231
+ model_path, config_path = download_model_from_hf(variant, force_download=False)
232
+ return model_path, config_path
233
+ except Exception as e:
234
+ logging.error(f"Failed to get model {variant} from cache: {e}")
235
+ return None, None
236
+ else:
237
+ # Local model
238
+ logging.info(f"📁 Using local model: {model_selection}")
239
+ return model_info["model_path"], model_info["config_path"]
240
+
241
+ # Handle direct HF model selection (fallback)
242
+ elif "[Hugging Face]" in model_selection:
243
+ if "11-33-40" in model_selection:
244
+ variant = "11-33-40"
245
+ elif "23-51-11" in model_selection:
246
+ variant = "23-51-11"
247
+ else:
248
+ logging.error(f"Unknown HF model variant in: {model_selection}")
249
+ return None, None
250
+
251
+ try:
252
+ logging.info(f"📦 Retrieving {variant} model from cache...")
253
+ model_path, config_path = download_model_from_hf(variant, force_download=False)
254
+ return model_path, config_path
255
+ except Exception as e:
256
+ logging.error(f"Failed to get model {variant} from cache: {e}")
257
+ return None, None
258
+
259
+ return None, None
260
+
261
+
262
+ def get_cached_model(model_selection: str):
263
+ """Get cached model or load new one if selection changed"""
264
+ global _cached_model, _cached_device, _cached_model_selection
265
+
266
+ # Get model paths from selection
267
+ model_path, config_path = get_model_paths_from_selection(model_selection)
268
+
269
+ if model_path is None or config_path is None:
270
+ raise ValueError(f"Selected model not found: {model_selection}")
271
+
272
+ # Load model fresh for each inference
273
+ # Since models are pre-downloaded, this should be fast
274
+ logging.info(f"🚀 Loading cached model: {model_selection}")
275
+ model, device = load_model_for_inference(model_path, config_path)
276
+
277
+ logging.info(f"✅ Model loaded successfully from cache: {model_selection}")
278
+ return model, device
279
+
280
+
281
+ def clear_model_cache():
282
+ """Clear the cached model to free memory"""
283
+ global _cached_model, _cached_device, _cached_model_selection
284
+
285
+ if _cached_model is not None:
286
+ logging.info("Clearing model cache...")
287
+ del _cached_model
288
+ _cached_model = None
289
+ _cached_device = None
290
+ _cached_model_selection = None
291
+
292
+ # Simple cleanup
293
+ import gc
294
+ gc.collect()
295
+ logging.info("Model cache cleared")
296
+ else:
297
+ logging.info("No model in cache to clear")
298
+
299
+
300
+ def load_model_for_inference(model_path: str, cfg_path: str):
301
+ """Load model temporarily for inference"""
302
+ # Set CUDA settings
303
+ torch.set_default_tensor_type('torch.cuda.FloatTensor')
304
+ torch.backends.cudnn.enabled = True
305
+ torch.backends.cudnn.benchmark = True
306
+
307
+ # Import required modules
308
+ try:
309
+ # Import selectively to avoid CUDA calls in Utils
310
+ from core.foundation_stereo import FoundationStereo
311
+ from omegaconf import OmegaConf
312
+ logging.info("Successfully imported required modules")
313
+
314
+ # Import set_logging_format safely
315
+ from Utils import set_logging_format
316
+ set_logging_format()
317
+
318
+ # Manual seed setting to avoid CUDA calls in Utils.set_seed
319
+ import random
320
+ random_seed = 0
321
+ np.random.seed(random_seed)
322
+ random.seed(random_seed)
323
+ torch.manual_seed(random_seed)
324
+ # CUDA seeding will be done after device is available
325
+
326
+ logging.info("Set logging format and seed")
327
+ except Exception as e:
328
+ logging.error(f"Failed to import modules: {e}")
329
+ raise RuntimeError(f"Import failed: {e}")
330
+
331
+ # Check if CUDA is available
332
+ if not torch.cuda.is_available():
333
+ raise RuntimeError("CUDA is not available.")
334
+
335
+ # Use the first available CUDA device
336
+ device = torch.device("cuda")
337
+
338
+ # Set CUDA seed
339
+ try:
340
+ torch.cuda.manual_seed_all(random_seed)
341
+ torch.backends.cudnn.deterministic = True
342
+ torch.backends.cudnn.benchmark = False
343
+ except Exception as e:
344
+ logging.warning(f"Could not set CUDA seed: {e}")
345
+
346
+ try:
347
+ # Load config
348
+ cfg = OmegaConf.load(cfg_path)
349
+ cfg.setdefault("vit_size", "vitl")
350
+ logging.info("Loaded config file")
351
+
352
+ # Create model
353
+ model = FoundationStereo(cfg).to(device)
354
+ model.eval()
355
+ logging.info("Created model")
356
+
357
+ # Load checkpoint
358
+ ckpt = torch.load(model_path, map_location=device)
359
+ model.load_state_dict(ckpt["model"], strict=True)
360
+ logging.info("Loaded model weights")
361
+
362
+ # Memory optimizations
363
+ torch.set_grad_enabled(False)
364
+ model.half() # Use half precision
365
+ logging.info("Applied memory optimizations")
366
+
367
+ return model, device
368
+
369
+ except Exception as e:
370
+ logging.error(f"Model loading failed: {e}")
371
+ raise RuntimeError(f"Failed to load model: {e}")
372
+
373
+
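
> Editor's note: a minimal usage sketch of `load_model_for_inference` follows. It assumes a checkpoint and config resolved by `get_model_paths_from_selection` (the paths below are placeholders) and mirrors the half-precision, 32-iteration call pattern used later in this file; the dummy inputs are already divisible by 32, so no padding is needed.

```python
# Sketch only: placeholder paths, dummy zero inputs sized to be divisible by 32.
import torch

model, device = load_model_for_inference("models/model_best_bp2.pth", "models/cfg.yaml")
left = torch.zeros(1, 3, 480, 640, device=device, dtype=torch.half)   # (B, C, H, W), RGB in 0-255
right = torch.zeros(1, 3, 480, 640, device=device, dtype=torch.half)
with torch.no_grad(), torch.amp.autocast("cuda", enabled=True):
    disp = model.forward(left, right, iters=32, test_mode=True)        # disparity at input resolution
print(disp.shape)
```
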
374
+ def process_stereo_pair(model_selection: str, left_image: str, right_image: str,
375
+ progress: gr.Progress = gr.Progress()) -> Tuple[Optional[np.ndarray], str]:
376
+ """
377
+ Main processing function for stereo pair (with model caching)
378
+ """
379
+ logging.info("Starting stereo pair processing...")
380
+
381
+ if left_image is None or right_image is None:
382
+ return None, "❌ Please upload both left and right images."
383
+
384
+ # Convert image paths to numpy arrays
385
+ logging.info(f"Loading images: left={left_image}, right={right_image}")
386
+
387
+ try:
388
+ # Load left image
389
+ if left_image is None:
390
+ return None, "❌ Please upload a left image."
391
+
392
+ # Check if file exists first
393
+ if not os.path.exists(left_image):
394
+ logging.error(f"Left image file does not exist: {left_image}")
395
+ return None, f"❌ Left image file not found: {left_image}"
396
+
397
+ logging.info(f"Loading left image from: {left_image}")
398
+ left_img = None
399
+
400
+ # Try multiple loading methods
401
+ try:
402
+ # Method 1: OpenCV
403
+ left_img = cv2.imread(left_image)
404
+ if left_img is not None:
405
+ left_img = cv2.cvtColor(left_img, cv2.COLOR_BGR2RGB)
406
+ logging.info("Left image loaded with OpenCV")
407
+ except Exception as e:
408
+ logging.warning(f"OpenCV failed for left image: {e}")
409
+
410
+ if left_img is None:
411
+ try:
412
+ # Method 2: PIL
413
+ from PIL import Image
414
+ with Image.open(left_image) as pil_img:
415
+ left_img = np.array(pil_img.convert('RGB'))
416
+ logging.info("Left image loaded with PIL")
417
+ except Exception as e:
418
+ logging.warning(f"PIL failed for left image: {e}")
419
+
420
+ if left_img is None:
421
+ try:
422
+ # Method 3: imageio
423
+ left_img = imageio.imread(left_image)
424
+ if len(left_img.shape) == 3 and left_img.shape[2] == 4:
425
+ # RGBA to RGB
426
+ left_img = left_img[:, :, :3]
427
+ logging.info("Left image loaded with imageio")
428
+ except Exception as e:
429
+ logging.warning(f"imageio failed for left image: {e}")
430
+
431
+ if left_img is None:
432
+ return None, f"❌ Failed to load left image with any method: {left_image}"
433
+
434
+ # Load right image
435
+ if right_image is None:
436
+ return None, "❌ Please upload a right image."
437
+
438
+ # Check if file exists first
439
+ if not os.path.exists(right_image):
440
+ logging.error(f"Right image file does not exist: {right_image}")
441
+ return None, f"❌ Right image file not found: {right_image}"
442
+
443
+ logging.info(f"Loading right image from: {right_image}")
444
+ right_img = None
445
+
446
+ # Try multiple loading methods
447
+ try:
448
+ # Method 1: OpenCV
449
+ right_img = cv2.imread(right_image)
450
+ if right_img is not None:
451
+ right_img = cv2.cvtColor(right_img, cv2.COLOR_BGR2RGB)
452
+ logging.info("Right image loaded with OpenCV")
453
+ except Exception as e:
454
+ logging.warning(f"OpenCV failed for right image: {e}")
455
+
456
+ if right_img is None:
457
+ try:
458
+ # Method 2: PIL
459
+ from PIL import Image
460
+ with Image.open(right_image) as pil_img:
461
+ right_img = np.array(pil_img.convert('RGB'))
462
+ logging.info("Right image loaded with PIL")
463
+ except Exception as e:
464
+ logging.warning(f"PIL failed for right image: {e}")
465
+
466
+ if right_img is None:
467
+ try:
468
+ # Method 3: imageio
469
+ right_img = imageio.imread(right_image)
470
+ if len(right_img.shape) == 3 and right_img.shape[2] == 4:
471
+ # RGBA to RGB
472
+ right_img = right_img[:, :, :3]
473
+ logging.info("Right image loaded with imageio")
474
+ except Exception as e:
475
+ logging.warning(f"imageio failed for right image: {e}")
476
+
477
+ if right_img is None:
478
+ return None, f"❌ Failed to load right image with any method: {right_image}"
479
+
480
+ # Update variables
481
+ left_image = left_img
482
+ right_image = right_img
483
+
484
+ logging.info(f"Images loaded successfully - Left: {left_image.shape}, Right: {right_image.shape}")
485
+
486
+ except Exception as e:
487
+ logging.error(f"Failed to load images: {e}")
488
+ return None, f"❌ Failed to load images: {str(e)}"
489
+
490
+ try:
491
+ # Import these inside to avoid early CUDA calls
492
+ logging.info("Importing required modules...")
493
+ from core.utils.utils import InputPadder
494
+ # Import vis_disparity safely - it shouldn't have CUDA calls but be careful
495
+ from Utils import vis_disparity
496
+ logging.info("✅ Successfully imported processing modules")
497
+
498
+ # Get cached model (will load if not cached or selection changed)
499
+ variant_name = model_selection.split('(')[1].split(')')[0] if '(' in model_selection else model_selection
500
+ progress(0.1, desc=f"Loading cached model ({variant_name})...")
501
+ logging.info("🚀 Getting cached model...")
502
+ model, device = get_cached_model(model_selection)
503
+ logging.info("✅ Cached model loaded successfully")
504
+
505
+ progress(0.2, desc="Preprocessing images...")
506
+
507
+ # Validate input images
508
+ if left_image.shape != right_image.shape:
509
+ return None, "❌ Left and right images must have the same dimensions."
510
+
511
+ H, W = left_image.shape[:2]
512
+
513
+ # Convert to torch tensors and ensure they are contiguous
514
+ img0 = torch.as_tensor(left_image).to(device).half()[None].permute(0,3,1,2).contiguous()
515
+ img1 = torch.as_tensor(right_image).to(device).half()[None].permute(0,3,1,2).contiguous()
516
+
517
+ # Pad images and ensure contiguity
518
+ padder = InputPadder(img0.shape, divis_by=32, force_square=False)
519
+ img0, img1 = padder.pad(img0, img1)
520
+
521
+ # Ensure padded tensors are contiguous
522
+ img0 = img0.contiguous()
523
+ img1 = img1.contiguous()
524
+
525
+ progress(0.5, desc="Running inference...")
526
+
527
+ # Process stereo pair with autocast and ensure clean memory state
528
+ torch.cuda.empty_cache() # Clear any cached memory before inference
529
+
530
+ try:
531
+ with torch.amp.autocast("cuda", enabled=True):
532
+ # Ensure tensors are in the right format for cuDNN
533
+ if not img0.is_contiguous():
534
+ img0 = img0.contiguous()
535
+ if not img1.is_contiguous():
536
+ img1 = img1.contiguous()
537
+
538
+ disp = model.forward(img0, img1, iters=32, test_mode=True)
539
+ except RuntimeError as e:
540
+ if "cuDNN" in str(e):
541
+ # Fallback: disable cuDNN optimizations and retry
542
+ logging.warning(f"cuDNN error encountered, retrying with fallback: {e}")
543
+ torch.backends.cudnn.enabled = False
544
+ try:
545
+ with torch.amp.autocast("cuda", enabled=True):
546
+ disp = model.forward(img0, img1, iters=32, test_mode=True)
547
+ finally:
548
+ torch.backends.cudnn.enabled = True # Re-enable for future use
549
+ else:
550
+ raise e
551
+
552
+ # Unpad and convert to numpy
553
+ disp = padder.unpad(disp.float())
554
+ disp_cpu = disp.data.cpu().numpy().reshape(H, W)
555
+
556
+ progress(0.8, desc="Creating visualization...")
557
+
558
+ # Create visualization - ONLY disparity
559
+ disparity_vis = vis_disparity(disp_cpu)
560
+ result_image = disparity_vis
561
+
562
+ progress(1.0, desc="Complete!")
563
+
564
+ # Clean up intermediate tensors
565
+ del img0, img1, disp
566
+
567
+ # Clean up model after inference
568
+ del model
569
+ torch.cuda.empty_cache()
570
+ gc.collect()
571
+
572
+ # Create status message
573
+ valid_mask = disp_cpu != np.inf
574
+ min_disp = disp_cpu[valid_mask].min() if valid_mask.any() else 0
575
+ max_disp = disp_cpu[valid_mask].max() if valid_mask.any() else 0
576
+ mean_disp = disp_cpu[valid_mask].mean() if valid_mask.any() else 0
577
+
578
+ # Get model variant for status
579
+ variant = model_selection.split('(')[1].split(')')[0] if '(' in model_selection else "Unknown"
580
+
581
+ # Check current memory usage
582
+ current_memory = torch.cuda.memory_allocated(0) / 1024**3
583
+ max_memory = torch.cuda.max_memory_allocated(0) / 1024**3
584
+ memory_info = f" | GPU: {current_memory:.2f}GB/{max_memory:.2f}GB peak"
585
+
586
+ status = f"""✅ Processing successful!
587
+ 🔧 Model: {variant}{memory_info}
588
+ 📊 Disparity Statistics:
589
+ • Range: {min_disp:.2f} - {max_disp:.2f}
590
+ • Mean: {mean_disp:.2f}
591
+ • Input size: {W}×{H}
592
+ • Valid pixels: {valid_mask.sum()}/{valid_mask.size}"""
593
+
594
+ return result_image, status
595
+
596
+ except Exception as e:
597
+ logging.error(f"Processing failed: {e}")
598
+ # Cleanup on error
599
+ if 'img0' in locals():
600
+ del img0
601
+ if 'img1' in locals():
602
+ del img1
603
+ if 'disp' in locals():
604
+ del disp
605
+ if 'model' in locals():
606
+ del model
607
+ # Clean up GPU memory
608
+ torch.cuda.empty_cache()
609
+ gc.collect()
610
+ return None, f"❌ Error: {str(e)}"
611
+
612
+
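
> Editor's note: both processing functions repeat the same OpenCV → PIL → imageio fallback chain once per image. A compact, factored version of that chain is sketched below for reference; it is not wired into the app.

```python
# Reference helper mirroring the image-loading fallback chain used above (not part of the app).
import numpy as np
import cv2
import imageio
from PIL import Image

def load_rgb(path: str):
    """Return an HxWx3 RGB uint8 array, or None if every backend fails."""
    img = cv2.imread(path)                     # 1) OpenCV (BGR)
    if img is not None:
        return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    try:                                       # 2) PIL
        with Image.open(path) as pil_img:
            return np.array(pil_img.convert("RGB"))
    except Exception:
        pass
    try:                                       # 3) imageio
        img = imageio.imread(path)
        if img.ndim == 2:                      # grayscale -> 3 channels
            img = np.stack([img] * 3, axis=-1)
        return img[..., :3]                    # drop alpha if present
    except Exception:
        return None
```
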
613
+ def process_with_depth(model_selection: str, left_image: str, right_image: str,
614
+ camera_matrix: str, baseline: float,
615
+ progress: gr.Progress = gr.Progress()) -> Tuple[Optional[np.ndarray], Optional[str], Optional[str], str]:
616
+ """
617
+ Process stereo pair and generate depth map and point cloud (with model caching)
618
+ """
619
+ from core.utils.utils import InputPadder
620
+ from Utils import vis_disparity
621
+
622
+ # Import Open3D
623
+ global OPEN3D_AVAILABLE
624
+ try:
625
+ import open3d as o3d
626
+ OPEN3D_AVAILABLE = True
627
+ except ImportError as e:
628
+ logging.warning(f"Open3D not available: {e}")
629
+ OPEN3D_AVAILABLE = False
630
+ return None, None, None, "❌ Open3D not available. Point cloud generation disabled."
631
+
632
+ if left_image is None or right_image is None:
633
+ return None, None, None, "❌ Please upload both left and right images."
634
+
635
+ # Convert image paths to numpy arrays
636
+ logging.info(f"Loading images: left={left_image}, right={right_image}")
637
+
638
+ try:
639
+ # Load left image
640
+ if left_image is None:
641
+ return None, None, None, "❌ Left image is None."
642
+ if not os.path.exists(left_image):
643
+ return None, None, None, f"❌ Left image file does not exist: {left_image}"
644
+ left_img = None
645
+ # Try OpenCV
646
+ try:
647
+ left_img = cv2.imread(left_image)
648
+ if left_img is not None:
649
+ left_img = cv2.cvtColor(left_img, cv2.COLOR_BGR2RGB)
650
+ except Exception as e:
651
+ logging.warning(f"OpenCV failed for left image: {e}")
652
+ # Try PIL if OpenCV fails
653
+ if left_img is None:
654
+ try:
655
+ from PIL import Image
656
+ left_img = np.array(Image.open(left_image).convert('RGB'))
657
+ except Exception as e:
658
+ logging.warning(f"PIL failed for left image: {e}")
659
+ # Try imageio if PIL fails
660
+ if left_img is None:
661
+ try:
662
+ import imageio
663
+ left_img = imageio.imread(left_image)
664
+ if left_img.ndim == 2:
665
+ left_img = np.stack([left_img]*3, axis=-1)
666
+ elif left_img.shape[2] == 4:
667
+ left_img = left_img[..., :3]
668
+ except Exception as e:
669
+ logging.warning(f"imageio failed for left image: {e}")
670
+ if left_img is None:
671
+ return None, None, None, f"❌ Could not load left image: {left_image}"
672
+
673
+ # Load right image
674
+ if right_image is None:
675
+ return None, None, None, "❌ Right image is None."
676
+ if not os.path.exists(right_image):
677
+ return None, None, None, f"❌ Right image file does not exist: {right_image}"
678
+ right_img = None
679
+ # Try OpenCV
680
+ try:
681
+ right_img = cv2.imread(right_image)
682
+ if right_img is not None:
683
+ right_img = cv2.cvtColor(right_img, cv2.COLOR_BGR2RGB)
684
+ except Exception as e:
685
+ logging.warning(f"OpenCV failed for right image: {e}")
686
+ # Try PIL if OpenCV fails
687
+ if right_img is None:
688
+ try:
689
+ from PIL import Image
690
+ right_img = np.array(Image.open(right_image).convert('RGB'))
691
+ except Exception as e:
692
+ logging.warning(f"PIL failed for right image: {e}")
693
+ # Try imageio if PIL fails
694
+ if right_img is None:
695
+ try:
696
+ import imageio
697
+ right_img = imageio.imread(right_image)
698
+ if right_img.ndim == 2:
699
+ right_img = np.stack([right_img]*3, axis=-1)
700
+ elif right_img.shape[2] == 4:
701
+ right_img = right_img[..., :3]
702
+ except Exception as e:
703
+ logging.warning(f"imageio failed for right image: {e}")
704
+ if right_img is None:
705
+ return None, None, None, f"❌ Could not load right image: {right_image}"
706
+
707
+ # Update variables
708
+ left_image = left_img
709
+ right_image = right_img
710
+
711
+ logging.info(f"Images loaded successfully - Left: {left_image.shape}, Right: {right_image.shape}")
712
+
713
+ except Exception as e:
714
+ logging.error(f"Failed to load images: {e}")
715
+ return None, None, None, f"❌ Failed to load images: {str(e)}"
716
+
717
+ try:
718
+ progress(0.1, desc="Parsing camera parameters...")
719
+
720
+ # Parse camera matrix
721
+ try:
722
+ K_values = list(map(float, camera_matrix.strip().split()))
723
+ if len(K_values) != 9:
724
+ return None, None, None, "❌ Camera matrix must contain exactly 9 values."
725
+ K = np.array(K_values).reshape(3, 3)
726
+ except ValueError:
727
+ return None, None, None, "❌ Invalid camera matrix format. Use space-separated numbers."
728
+
729
+ if baseline <= 0:
730
+ return None, None, None, "❌ Baseline must be positive."
731
+
732
+ variant = model_selection.split('(')[1].split(')')[0] if '(' in model_selection else "Unknown"
733
+ progress(0.2, desc=f"Loading cached model ({variant})...")
734
+
735
+ # Get cached model (will load if not cached or selection changed)
736
+ model, device = get_cached_model(model_selection)
737
+
738
+ progress(0.4, desc="Running stereo inference...")
739
+
740
+ # Get disparity using the same process as the basic function
741
+ H, W = left_image.shape[:2]
742
+ img0 = torch.as_tensor(left_image).to(device).half()[None].permute(0,3,1,2).contiguous()
743
+ img1 = torch.as_tensor(right_image).to(device).half()[None].permute(0,3,1,2).contiguous()
744
+
745
+ padder = InputPadder(img0.shape, divis_by=32, force_square=False)
746
+ img0, img1 = padder.pad(img0, img1)
747
+
748
+ # Ensure padded tensors are contiguous
749
+ img0 = img0.contiguous()
750
+ img1 = img1.contiguous()
751
+
752
+ # Clear cache and ensure clean memory state before inference
753
+ torch.cuda.empty_cache()
754
+
755
+ try:
756
+ with torch.amp.autocast("cuda", enabled=True):
757
+ # Double-check tensor contiguity before cuDNN operations
758
+ if not img0.is_contiguous():
759
+ img0 = img0.contiguous()
760
+ if not img1.is_contiguous():
761
+ img1 = img1.contiguous()
762
+
763
+ disp = model.forward(img0, img1, iters=32, test_mode=True)
764
+ except RuntimeError as e:
765
+ if "cuDNN" in str(e):
766
+ # Fallback: disable cuDNN optimizations and retry
767
+ logging.warning(f"cuDNN error encountered in depth processing, retrying with fallback: {e}")
768
+ torch.backends.cudnn.enabled = False
769
+ try:
770
+ with torch.amp.autocast("cuda", enabled=True):
771
+ disp = model.forward(img0, img1, iters=32, test_mode=True)
772
+ finally:
773
+ torch.backends.cudnn.enabled = True # Re-enable for future use
774
+ else:
775
+ raise e
776
+
777
+ disp = padder.unpad(disp.float())
778
+ disp_cpu = disp.data.cpu().numpy().reshape(H, W)
779
+
780
+ # Clean up intermediate tensors early
781
+ del img0, img1, disp
782
+
783
+ # Keep model reference for rest of processing
784
+ torch.cuda.empty_cache()
785
+
786
+ progress(0.6, desc="Converting to depth...")
787
+
788
+ # Remove invisible points (same as in original demo)
789
+ yy, xx = np.meshgrid(np.arange(disp_cpu.shape[0]), np.arange(disp_cpu.shape[1]), indexing='ij')
790
+ us_right = xx - disp_cpu
791
+ invalid = us_right < 0
792
+ disp_cpu[invalid] = np.inf
793
+
794
+ # Convert to depth using the formula from the original demo
795
+ depth = K[0, 0] * baseline / disp_cpu
796
+
797
+ # Visualize depth (no rotation)
798
+ depth_vis = vis_disparity(depth, max_val=10.0)
799
+
800
+ progress(0.8, desc="Generating point cloud...")
801
+
802
+ # Generate point cloud with proper coordinate transformation
803
+ fx, fy = K[0, 0], K[1, 1]
804
+ cx, cy = K[0, 2], K[1, 2]
805
+
806
+ # Create coordinate meshgrids
807
+ u, v = np.meshgrid(np.arange(W), np.arange(H))
808
+
809
+ # Convert to 3D coordinates (proper camera coordinate system)
810
+ valid_depth = depth != np.inf
811
+ z = depth[valid_depth] # Z coordinate (depth)
812
+ x = (u[valid_depth] - cx) * z / fx # X coordinate
813
+ y = (v[valid_depth] - cy) * z / fy # Y coordinate
814
+
815
+ # Stack coordinates (X, Y, Z)
816
+ points = np.stack([x, y, z], axis=-1)
817
+
818
+ # Get corresponding colors
819
+ colors = left_image[valid_depth]
820
+
821
+ # Filter points by depth range
822
+ depth_mask = (z > 0) & (z <= 10.0)
823
+ valid_points = points[depth_mask]
824
+ valid_colors = colors[depth_mask]
825
+
826
+ if len(valid_points) == 0:
827
+ return depth_vis, None, None, "⚠️ No valid points generated for point cloud."
828
+
829
+ # Subsample points for better 3D visualization performance
830
+ if len(valid_points) > 100000:
831
+ indices = np.random.choice(len(valid_points), 100000, replace=False)
832
+ valid_points = valid_points[indices]
833
+ valid_colors = valid_colors[indices]
834
+
835
+ # Transform coordinates for proper visualization orientation
836
+ # Standard computer vision: X right, Y down, Z forward
837
+ # For better 3D viewing: X right, Y up, Z backward
838
+ transformed_points = valid_points.copy()
839
+ transformed_points[:, 1] = -transformed_points[:, 1] # Flip Y axis
840
+ transformed_points[:, 2] = -transformed_points[:, 2] # Flip Z axis
841
+
842
+ # Generate point cloud using transformed coordinates
843
+ pcd = o3d.geometry.PointCloud()
844
+ pcd.points = o3d.utility.Vector3dVector(transformed_points)
845
+ pcd.colors = o3d.utility.Vector3dVector(valid_colors / 255.0)
846
+
847
+ progress(1.0, desc="Complete!")
848
+
849
+ # Clean up model after inference
850
+ del model
851
+ torch.cuda.empty_cache()
852
+ gc.collect()
853
+
854
+ # Check current memory usage
855
+ current_memory = torch.cuda.memory_allocated(0) / 1024**3
856
+ max_memory = torch.cuda.max_memory_allocated(0) / 1024**3
857
+ memory_info = f" | GPU: {current_memory:.2f}GB/{max_memory:.2f}GB peak"
858
+
859
+ status = f"""✅ Depth processing successful!
860
+ 🔧 Model: {variant}{memory_info}
861
+ 📊 Statistics:
862
+ • Valid points: {len(valid_points):,}
863
+ • Depth range: {z.min():.2f} - {z.max():.2f} m
864
+ • Baseline: {baseline} m
865
+ • Point cloud generated with {len(valid_points)} points (not saved to file)
866
+ • Point cloud computed in memory only (the 3D viewer and file outputs are not populated in this demo)"""
867
+
868
+ return depth_vis, None, None, status
869
+
870
+ except Exception as e:
871
+ logging.error(f"Depth processing failed: {e}")
872
+ # Cleanup on error
873
+ if 'img0' in locals():
874
+ del img0
875
+ if 'img1' in locals():
876
+ del img1
877
+ if 'disp' in locals():
878
+ del disp
879
+ if 'model' in locals():
880
+ del model
881
+ # Clean up GPU memory
882
+ torch.cuda.empty_cache()
883
+ gc.collect()
884
+ return None, None, None, f"❌ Error: {str(e)}"
885
+
886
+
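
> Editor's note: the geometry in `process_with_depth` above is the standard pinhole relation, depth Z = fx · baseline / disparity, with back-projection X = (u − cx)·Z/fx and Y = (v − cy)·Z/fy. A small worked example using the example1 intrinsics and baseline (the disparity and pixel coordinates below are illustrative):

```python
# Worked example of the disparity -> depth -> 3D point conversion used above.
import numpy as np

# Intrinsics in the 9-value row-major layout the UI expects (example1 values).
K = np.array([754.668, 0.0, 489.379,
              0.0, 754.668, 265.162,
              0.0, 0.0, 1.0]).reshape(3, 3)
fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]
baseline = 0.063            # meters (example1)

d = 47.0                    # predicted disparity in pixels (illustrative)
u, v = 620.0, 300.0         # pixel coordinates (illustrative)

Z = fx * baseline / d       # ~1.01 m
X = (u - cx) * Z / fx       # ~0.18 m right of the optical axis
Y = (v - cy) * Z / fy       # ~0.05 m below it (image y points down)
print(X, Y, Z)
```
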
887
+ def preload_all_models():
888
+ """Pre-download all Hugging Face models to cache during startup"""
889
+ logging.info("🔄 Pre-downloading all models to cache...")
890
+
891
+ downloaded_models = {}
892
+
893
+ for variant, info in MODEL_VARIANTS.items():
894
+ try:
895
+ logging.info(f"📥 Downloading {variant} model to cache...")
896
+ model_path, config_path = download_model_from_hf(variant, force_download=False)
897
+ downloaded_models[variant] = {
898
+ "model_path": model_path,
899
+ "config_path": config_path,
900
+ "display_name": info["display_name"]
901
+ }
902
+ logging.info(f"✅ {variant} model cached successfully")
903
+ except Exception as e:
904
+ logging.warning(f"⚠️ Failed to download {variant} model: {e}")
905
+ # Continue with other models even if one fails
906
+
907
+ logging.info(f"✅ Model pre-loading complete. {len(downloaded_models)}/{len(MODEL_VARIANTS)} models cached.")
908
+ return downloaded_models
909
+
910
+
911
+ def create_app() -> gr.Blocks:
912
+ """Create the Gradio application"""
913
+
914
+ global MODEL_PATH, CONFIG_PATH
915
+
916
+ # Debug: Print current directory and check for files
917
+ print(f"Current directory: {current_dir}")
918
+ print(f"Python working directory: {os.getcwd()}")
919
+
920
+ # Pre-download all models to cache
921
+ try:
922
+ cached_models = preload_all_models()
923
+ logging.info(f"Pre-loaded {len(cached_models)} models to cache")
924
+ except Exception as e:
925
+ logging.error(f"Failed to pre-load models: {e}")
926
+ cached_models = {}
927
+
928
+ # Get available models (this should be safe as it only does file system operations)
929
+ try:
930
+ available_models = get_available_models()
931
+ logging.info(f"Successfully got available models: {len(available_models)} found")
932
+ except Exception as e:
933
+ logging.error(f"Failed to get available models: {e}")
934
+ available_models = {}
935
+
936
+ # Find model and config paths (legacy) - should be safe as well
937
+ try:
938
+ MODEL_PATH, CONFIG_PATH = find_model_path()
939
+ logging.info("Successfully found model paths")
940
+ except Exception as e:
941
+ logging.error(f"Failed to find model paths: {e}")
942
+ MODEL_PATH, CONFIG_PATH = None, None
943
+
944
+ with gr.Blocks(
945
+ title="FoundationStereo - Stereo Depth Estimation",
946
+ theme=gr.themes.Soft(),
947
+ css="footer {visibility: hidden}",
948
+ delete_cache=(60, 60) # Delete cache after 60 seconds
949
+ ) as app:
950
+
951
+ gr.Markdown("""
952
+ # 🔍 FoundationStereo: Zero-Shot Stereo Matching
953
+
954
+ Upload a pair of **rectified** stereo images to get disparity estimation.
955
+
956
+ ⚠️ **Important**: Images should be rectified (epipolar lines are horizontal) and undistorted.
957
+ ⚡ **GPU Powered**: Runs on high-performance GPUs for fast inference.
958
+ 📦 **Smart Caching**: All models are pre-downloaded for instant model switching.
959
+ """)
960
+
961
+ # Instructions section
962
+ with gr.Accordion("📋 Instructions to Run This Repository", open=False):
963
+ gr.Markdown("""
964
+ ## 🚀 How to Run This Demo
965
+ This is a **demo application** showcasing the FoundationStereo model for stereo disparity estimation.
966
+
967
+ ### 🖼️ Input Requirements
968
+
969
+ 1. **Image Format**: Upload images in JPEG or PNG format.
970
+ 2. **Image Size**: The left and right images must have identical dimensions.
971
+ 3. **Rectification**: Ensure images are rectified (epipolar lines are horizontal) and undistorted.
972
+ 4. **Camera Parameters**: For advanced processing, provide camera parameters (camera matrix and baseline).
973
+
974
+ ### 📊 Using the Demo
975
+
976
+ 1. **Select Model**: Choose between low-cost (11-33-40) or high-quality (23-51-11) variants
977
+ 2. **Upload Images**: Provide rectified stereo image pairs
978
+ 3. **Basic Processing**: Get disparity visualization
979
+ 4. **Advanced Processing**: Generate depth maps and 3D point clouds (requires camera parameters)
980
+
981
+ ### Original Work
982
+
983
+ This demo is based on the original FoundationStereo research. Please visit the official resources:
984
+ - **Paper**: [FoundationStereo: Zero-Shot Stereo Matching via Foundation Model](https://arxiv.org/abs/2501.09898)
985
+ - **Project Page**: [https://nvlabs.github.io/FoundationStereo/](https://nvlabs.github.io/FoundationStereo/)
986
+ - **Official Repository**: [https://github.com/NVlabs/FoundationStereo](https://github.com/NVlabs/FoundationStereo)
987
+
988
+ **⚠️ Demo Notice**: This is a demonstration interface. For research and production use, please refer to the original repository and follow the official implementation guidelines.
989
+ """)
990
+
991
+ # Model selection
992
+ with gr.Row():
993
+ # Always include Hugging Face models in the choices
994
+ all_choices = list(available_models.keys())
995
+
996
+ # If no models found, add the HF models manually
997
+ if not all_choices:
998
+ all_choices = [
999
+ "FoundationStereo (Low-cost variant - 11-33-40) [Hugging Face]",
1000
+ "FoundationStereo (High-quality variant - 23-51-11) [Hugging Face]"
1001
+ ]
1002
+
1003
+ # Get default model (prefer Hugging Face low-cost variant)
1004
+ default_model = None
1005
+
1006
+ # First try Hugging Face low-cost variant
1007
+ for name in all_choices:
1008
+ if "11-33-40" in name and "[Hugging Face]" in name:
1009
+ default_model = name
1010
+ break
1011
+
1012
+ # If no HF low-cost variant, try any low-cost variant
1013
+ if default_model is None:
1014
+ for name in all_choices:
1015
+ if "11-33-40" in name:
1016
+ default_model = name
1017
+ break
1018
+
1019
+ # If no low-cost variant, use first available
1020
+ if default_model is None:
1021
+ default_model = all_choices[0] if all_choices else None
1022
+
1023
+ model_selector = gr.Dropdown(
1024
+ choices=all_choices,
1025
+ value=default_model,
1026
+ label="🎯 Select Model",
1027
+ info="Choose the FoundationStereo model variant. Hugging Face models download automatically.",
1028
+ interactive=True
1029
+ )
1030
+
1031
+ with gr.Tabs():
1032
+ # Basic stereo processing tab
1033
+ with gr.TabItem("🖼️ Basic Stereo Processing"):
1034
+ with gr.Row():
1035
+ with gr.Column():
1036
+ left_input = gr.Image(
1037
+ label="📷 Left Image",
1038
+ type="filepath",
1039
+ height=300
1040
+ )
1041
+ right_input = gr.Image(
1042
+ label="📷 Right Image",
1043
+ type="filepath",
1044
+ height=300
1045
+ )
1046
+
1047
+ process_btn = gr.Button(
1048
+ "🚀 Process Stereo Pair",
1049
+ variant="primary",
1050
+ size="lg"
1051
+ )
1052
+
1053
+ with gr.Column():
1054
+ output_image = gr.Image(
1055
+ label="📊 Disparity Visualization",
1056
+ height=400
1057
+ )
1058
+ status_text = gr.Textbox(
1059
+ label="Status",
1060
+ interactive=False,
1061
+ lines=8
1062
+ )
1063
+
1064
+ # Example images
1065
+ examples_list = []
1066
+
1067
+ # Example 1
1068
+ if os.path.exists(os.path.join(current_dir, "assets", "example1", "left.png")):
1069
+ examples_list.append([
1070
+ os.path.join(current_dir, "assets", "example1", "left.png"),
1071
+ os.path.join(current_dir, "assets", "example1", "right.png")
1072
+ ])
1073
+
1074
+ # Example 2
1075
+ if os.path.exists(os.path.join(current_dir, "assets", "example2", "left.png")):
1076
+ examples_list.append([
1077
+ os.path.join(current_dir, "assets", "example2", "left.png"),
1078
+ os.path.join(current_dir, "assets", "example2", "right.png")
1079
+ ])
1080
+
1081
+
1082
+
1083
+ gr.Examples(
1084
+ examples=examples_list,
1085
+ inputs=[left_input, right_input],
1086
+ label="📋 Example Images"
1087
+ )
1088
+
1089
+ # Advanced processing with depth
1090
+ with gr.TabItem("📐 Advanced Processing (Depth & Point Cloud)"):
1091
+ with gr.Row():
1092
+ with gr.Column():
1093
+ left_input_adv = gr.Image(
1094
+ label="📷 Left Image",
1095
+ type="filepath",
1096
+ height=250
1097
+ )
1098
+ right_input_adv = gr.Image(
1099
+ label="📷 Right Image",
1100
+ type="filepath",
1101
+ height=250
1102
+ )
1103
+
1104
+ # Camera parameters
1105
+ with gr.Group():
1106
+ gr.Markdown("### 📹 Camera Parameters")
1107
+ camera_matrix_input = gr.Textbox(
1108
+ label="Camera Matrix (9 values: fx 0 cx 0 fy cy 0 0 1)",
1109
+ value="",
1110
+
1111
+ )
1112
+ baseline_input = gr.Number(
1113
+ label="Baseline (meters)",
1114
+ value=None,
1115
+ minimum=0.001,
1116
+ maximum=10.0,
1117
+ step=0.001
1118
+ )
1119
+
1120
+ process_depth_btn = gr.Button(
1121
+ "🔬 Process with Depth",
1122
+ variant="primary",
1123
+ size="lg"
1124
+ )
1125
+
1126
+ with gr.Column():
1127
+ depth_output = gr.Image(
1128
+ label="📏 Depth Visualization",
1129
+ height=300
1130
+ )
1131
+ pointcloud_output = gr.File(
1132
+ label="☁️ Point Cloud Download (.ply)",
1133
+ file_types=[".ply"]
1134
+ )
1135
+ status_depth = gr.Textbox(
1136
+ label="Status",
1137
+ interactive=False,
1138
+ lines=6
1139
+ )
1140
+
1141
+ # 3D Point Cloud Visualization
1142
+ with gr.Row():
1143
+ pointcloud_3d = gr.Model3D(
1144
+ label="🌐 3D Point Cloud Viewer",
1145
+ clear_color=[0.0, 0.0, 0.0, 0.0],
1146
+ height=400
1147
+ )
1148
+
1149
+ # Example images for advanced processing
1150
+ examples_advanced_list = []
1151
+
1152
+ # Example 1 - Camera parameters from K.txt
1153
+ if os.path.exists(os.path.join(current_dir, "assets", "example1", "left.png")):
1154
+ examples_advanced_list.append([
1155
+ os.path.join(current_dir, "assets", "example1", "left.png"),
1156
+ os.path.join(current_dir, "assets", "example1", "right.png"),
1157
+ "754.6680908203125 0.0 489.3794860839844 0.0 754.6680908203125 265.16162109375 0.0 0.0 1.0", # Camera matrix
1158
+ 0.063 # Baseline in meters
1159
+ ])
1160
+
1161
+ # Example 2 - Camera parameters from K.txt
1162
+ if os.path.exists(os.path.join(current_dir, "assets", "example2", "left.png")):
1163
+ examples_advanced_list.append([
1164
+ os.path.join(current_dir, "assets", "example2", "left.png"),
1165
+ os.path.join(current_dir, "assets", "example2", "right.png"),
1166
+ "1733.74 0.0 792.27 0.0 1733.74 541.89 0.0 0.0 1.0", # Camera matrix
1167
+ 0.537 # Baseline in meters (converted from 536.62mm)
1168
+ ])
1169
+
1170
+
1171
+
1172
+ gr.Examples(
1173
+ examples=examples_advanced_list,
1174
+ inputs=[left_input_adv, right_input_adv, camera_matrix_input, baseline_input],
1175
+ label="📋 Example Images with Camera Parameters"
1176
+ )
1177
+
1178
+ # Event handlers - Always enable since we have HF models
1179
+ process_btn.click(
1180
+ fn=process_stereo_pair,
1181
+ inputs=[model_selector, left_input, right_input],
1182
+ outputs=[output_image, status_text],
1183
+ show_progress=True
1184
+ )
1185
+
1186
+ if OPEN3D_AVAILABLE:
1187
+ process_depth_btn.click(
1188
+ fn=process_with_depth,
1189
+ inputs=[model_selector, left_input_adv, right_input_adv, camera_matrix_input, baseline_input],
1190
+ outputs=[depth_output, pointcloud_output, pointcloud_3d, status_depth],
1191
+ show_progress=True
1192
+ )
1193
+ else:
1194
+ process_depth_btn.click(
1195
+ fn=lambda *args: (None, None, None, "❌ Open3D not available. Install with: pip install open3d"),
1196
+ inputs=[model_selector, left_input_adv, right_input_adv, camera_matrix_input, baseline_input],
1197
+ outputs=[depth_output, pointcloud_output, pointcloud_3d, status_depth]
1198
+ )
1199
+
1200
+ # Citation section at the bottom
1201
+ with gr.Accordion("📖 Citation", open=False):
1202
+ gr.Markdown("""
1203
+ ### 📄 Please Cite the Original Paper
1204
+
1205
+ If you use this work in your research, please cite:
1206
+
1207
+ ```bibtex
1208
+ @article{wen2025stereo,
1209
+ title={FoundationStereo: Zero-Shot Stereo Matching},
1210
+ author={Bowen Wen and Matthew Trepte and Joseph Aribido and Jan Kautz and Orazio Gallo and Stan Birchfield},
1211
+ journal={CVPR},
1212
+ year={2025}
1213
+ }
1214
+ ```
1215
+ """)
1216
+
1217
+ # Footer
1218
+ gr.Markdown(f"""
1219
+ ---
1220
+ ### 📝 Notes:
1221
+ - **Input images must be rectified stereo pairs** (epipolar lines are horizontal)
1222
+ - **🤗 Hugging Face Integration**: Models are automatically downloaded from `{HF_REPO_ID}`
1223
+ - **📦 Smart Caching**: All models are pre-downloaded and cached for instant switching
1224
+ - **⚡ GPU Acceleration**: Powered by high-performance GPUs
1225
+ - For best results, use PNG images without lossy compression
1226
+ - Model works on RGB images but also supports monochrome/IR stereo pairs
1227
+ - **Optimized for Performance**: Memory-efficient inference
1228
+
1229
+ ### 🔗 References:
1230
+ - [FoundationStereo Paper](https://arxiv.org/abs/2501.09898)
1231
+ - [Project Website](https://nvlabs.github.io/FoundationStereo/)
1232
+ - [GitHub Repository](https://github.com/NVlabs/FoundationStereo)
1233
+ - [Hugging Face Models](https://huggingface.co/{HF_REPO_ID})
1234
+ """)
1235
+
1236
+ return app
1237
+
1238
+
1239
+ def main():
1240
+ """Main function to launch the app"""
1241
+
1242
+ # Ensure no CUDA operations during startup
1243
+ if torch.cuda.is_available():
1244
+ logging.warning("CUDA detected during startup")
1245
+
1246
+ logging.info("🚀 Starting FoundationStereo Gradio App...")
1247
+
1248
+ # Parse command line arguments
1249
+ import argparse
1250
+ parser = argparse.ArgumentParser(description="FoundationStereo Gradio App")
1251
+ parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to bind to")
1252
+ parser.add_argument("--port", type=int, default=7860, help="Port to bind to")
1253
+ parser.add_argument("--debug", action="store_true", help="Enable debug mode")
1254
+
1255
+ args = parser.parse_args()
1256
+
1257
+ if args.debug:
1258
+ logging.getLogger().setLevel(logging.DEBUG)
1259
+
1260
+ try:
1261
+ # Create and launch app
1262
+ logging.info("Creating Gradio app...")
1263
+ app = create_app()
1264
+ logging.info("✅ Gradio app created successfully")
1265
+
1266
+ logging.info(f"Launching app on {args.host}:{args.port}")
1267
+
1268
+ # Launch with appropriate settings
1269
+ app.launch(
1270
+ server_name=args.host,
1271
+ server_port=args.port,
1272
+ share=False,
1273
+ show_error=True,
1274
+ favicon_path=None,
1275
+ ssr_mode=False, # Disable SSR for compatibility
1276
+ allowed_paths=["./"] # Allow access to local files
1277
+ )
1278
+ except Exception as e:
1279
+ logging.error(f"Failed to launch app: {e}")
1280
+ raise
1281
+
1282
+
1283
+ if __name__ == "__main__":
1284
+ # Additional safety check for Spaces environment
1285
+ if 'SPACE_ID' in os.environ:
1286
+ logging.info("Running in Hugging Face Spaces environment")
1287
+
1288
+ # Do not check CUDA status during startup - this can trigger CUDA initialization
1289
+ # The CUDA status will be checked inside the GPU decorated functions
1290
+ logging.info("✅ CUDA status will be checked within GPU functions")
1291
+
1292
+ main()
FoundationStereo_demo/core/extractor.py ADDED
@@ -0,0 +1,371 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+
10
+ import torch,logging,os,sys,urllib,warnings
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ code_dir = os.path.dirname(os.path.realpath(__file__))
14
+ sys.path.append(f'{code_dir}/../')
15
+ from core.submodule import *
16
+ from Utils import *
17
+ import timm
18
+
19
+
20
+ class ResidualBlock(nn.Module):
21
+ def __init__(self, in_planes, planes, norm_fn='group', stride=1):
22
+ super(ResidualBlock, self).__init__()
23
+
24
+ self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, padding=1, stride=stride)
25
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1)
26
+ self.relu = nn.ReLU(inplace=True)
27
+
28
+ num_groups = planes // 8
29
+
30
+ if norm_fn == 'group':
31
+ self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
32
+ self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
33
+ if not (stride == 1 and in_planes == planes):
34
+ self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
35
+
36
+ elif norm_fn == 'batch':
37
+ self.norm1 = nn.BatchNorm2d(planes)
38
+ self.norm2 = nn.BatchNorm2d(planes)
39
+ if not (stride == 1 and in_planes == planes):
40
+ self.norm3 = nn.BatchNorm2d(planes)
41
+
42
+ elif norm_fn == 'instance':
43
+ self.norm1 = nn.InstanceNorm2d(planes)
44
+ self.norm2 = nn.InstanceNorm2d(planes)
45
+ if not (stride == 1 and in_planes == planes):
46
+ self.norm3 = nn.InstanceNorm2d(planes)
47
+
48
+ elif norm_fn=='layer':
49
+ self.norm1 = LayerNorm2d(planes)
50
+ self.norm2 = LayerNorm2d(planes)
51
+ if not (stride == 1 and in_planes == planes):
52
+ self.norm3 = LayerNorm2d(planes)
53
+
54
+ elif norm_fn == 'none':
55
+ self.norm1 = nn.Sequential()
56
+ self.norm2 = nn.Sequential()
57
+ if not (stride == 1 and in_planes == planes):
58
+ self.norm3 = nn.Sequential()
59
+
60
+ if stride == 1 and in_planes == planes:
61
+ self.downsample = None
62
+
63
+ else:
64
+ self.downsample = nn.Sequential(
65
+ nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3)
66
+
67
+
68
+ def forward(self, x):
69
+ y = x
70
+ y = self.conv1(y)
71
+ y = self.norm1(y)
72
+ y = self.relu(y)
73
+ y = self.conv2(y)
74
+ y = self.norm2(y)
75
+ y = self.relu(y)
76
+
77
+ if self.downsample is not None:
78
+ x = self.downsample(x)
79
+
80
+ return self.relu(x+y)
81
+
82
+
83
+
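
> Editor's note: a quick shape check for `ResidualBlock`, assuming it is run inside this demo's environment so that `core.extractor` (and its dependencies such as `timm`) are importable:

```python
# Shape sanity check for ResidualBlock: stride 2 halves H and W, channels go 64 -> 96.
import torch
from core.extractor import ResidualBlock

block = ResidualBlock(in_planes=64, planes=96, norm_fn="group", stride=2)
x = torch.randn(2, 64, 80, 80)
y = block(x)
print(y.shape)  # torch.Size([2, 96, 40, 40])
```
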
84
+ class MultiBasicEncoder(nn.Module):
85
+ def __init__(self, output_dim=[128], norm_fn='batch', dropout=0.0, downsample=3):
86
+ super(MultiBasicEncoder, self).__init__()
87
+ self.norm_fn = norm_fn
88
+ self.downsample = downsample
89
+
90
+ if self.norm_fn == 'group':
91
+ self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64)
92
+
93
+ elif self.norm_fn == 'batch':
94
+ self.norm1 = nn.BatchNorm2d(64)
95
+
96
+ elif self.norm_fn == 'instance':
97
+ self.norm1 = nn.InstanceNorm2d(64)
98
+
99
+ elif self.norm_fn=='layer':
100
+ self.norm1 = LayerNorm2d(64)
101
+
102
+ elif self.norm_fn == 'none':
103
+ self.norm1 = nn.Sequential()
104
+
105
+ self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=1 + (downsample > 2), padding=3)
106
+ self.relu1 = nn.ReLU(inplace=True)
107
+
108
+ self.in_planes = 64
109
+ self.layer1 = self._make_layer(64, stride=1)
110
+ self.layer2 = self._make_layer(96, stride=1 + (downsample > 1))
111
+ self.layer3 = self._make_layer(128, stride=1 + (downsample > 0))
112
+ self.layer4 = self._make_layer(128, stride=2)
113
+ self.layer5 = self._make_layer(128, stride=2)
114
+
115
+ output_list = []
116
+
117
+ for dim in output_dim:
118
+ conv_out = nn.Sequential(
119
+ ResidualBlock(128, 128, self.norm_fn, stride=1),
120
+ nn.Conv2d(128, dim[2], 3, padding=1))
121
+ output_list.append(conv_out)
122
+
123
+ self.outputs04 = nn.ModuleList(output_list)
124
+
125
+ output_list = []
126
+ for dim in output_dim:
127
+ conv_out = nn.Sequential(
128
+ ResidualBlock(128, 128, self.norm_fn, stride=1),
129
+ nn.Conv2d(128, dim[1], 3, padding=1))
130
+ output_list.append(conv_out)
131
+
132
+ self.outputs08 = nn.ModuleList(output_list)
133
+
134
+ output_list = []
135
+ for dim in output_dim:
136
+ conv_out = nn.Conv2d(128, dim[0], 3, padding=1)
137
+ output_list.append(conv_out)
138
+
139
+ self.outputs16 = nn.ModuleList(output_list)
140
+
141
+ if dropout > 0:
142
+ self.dropout = nn.Dropout2d(p=dropout)
143
+ else:
144
+ self.dropout = None
145
+
146
+ for m in self.modules():
147
+ if isinstance(m, nn.Conv2d):
148
+ nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
149
+ elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
150
+ if m.weight is not None:
151
+ nn.init.constant_(m.weight, 1)
152
+ if m.bias is not None:
153
+ nn.init.constant_(m.bias, 0)
154
+
155
+ def _make_layer(self, dim, stride=1):
156
+ layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
157
+ layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
158
+ layers = (layer1, layer2)
159
+
160
+ self.in_planes = dim
161
+ return nn.Sequential(*layers)
162
+
163
+ def forward(self, x, dual_inp=False, num_layers=3):
164
+
165
+ x = self.conv1(x)
166
+ x = self.norm1(x)
167
+ x = self.relu1(x)
168
+ x = self.layer1(x)
169
+ x = self.layer2(x)
170
+ x = self.layer3(x)
171
+ if dual_inp:
172
+ v = x
173
+ x = x[:(x.shape[0]//2)]
174
+
175
+ outputs04 = [f(x) for f in self.outputs04]
176
+ if num_layers == 1:
177
+ return (outputs04, v) if dual_inp else (outputs04,)
178
+
179
+ y = self.layer4(x)
180
+ outputs08 = [f(y) for f in self.outputs08]
181
+
182
+ if num_layers == 2:
183
+ return (outputs04, outputs08, v) if dual_inp else (outputs04, outputs08)
184
+
185
+ z = self.layer5(y)
186
+ outputs16 = [f(z) for f in self.outputs16]
187
+
188
+ return (outputs04, outputs08, outputs16, v) if dual_inp else (outputs04, outputs08, outputs16)
189
+
190
+
191
+
192
+ class ContextNetDino(MultiBasicEncoder):
193
+ def __init__(self, args, output_dim=[128], norm_fn='batch', downsample=3):
194
+ nn.Module.__init__(self)
195
+ self.args = args
196
+ self.patch_size = 14
197
+ self.image_size = 518
198
+ self.vit_feat_dim = 384
199
+ code_dir = os.path.dirname(os.path.realpath(__file__))
200
+
201
+ self.out_dims = output_dim
202
+
203
+ self.norm_fn = norm_fn
204
+
205
+ if self.norm_fn == 'group':
206
+ self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64)
207
+
208
+ elif self.norm_fn == 'batch':
209
+ self.norm1 = nn.BatchNorm2d(64)
210
+
211
+ elif self.norm_fn == 'instance':
212
+ self.norm1 = nn.InstanceNorm2d(64)
213
+
214
+ elif self.norm_fn=='layer':
215
+ self.norm1 = LayerNorm2d(64)
216
+
217
+ elif self.norm_fn == 'none':
218
+ self.norm1 = nn.Sequential()
219
+
220
+ self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=1 + (downsample > 2), padding=3)
221
+ self.relu1 = nn.ReLU(inplace=True)
222
+
223
+ self.in_planes = 64
224
+ self.layer1 = self._make_layer(64, stride=1)
225
+ self.layer2 = self._make_layer(96, stride=1 + (downsample > 1))
226
+ self.layer3 = self._make_layer(128, stride=1 + (downsample > 0))
227
+ self.layer4 = self._make_layer(128, stride=2)
228
+ self.layer5 = self._make_layer(128, stride=2)
229
+ self.down = nn.Sequential(
230
+ nn.Conv2d(128, 128, kernel_size=4, stride=4, padding=0),
231
+ nn.BatchNorm2d(128),
232
+ )
233
+ vit_dim = DepthAnythingFeature.model_configs[self.args.vit_size]['features']//2
234
+ self.conv2 = BasicConv(128+vit_dim, 128, kernel_size=3, padding=1)
235
+ self.norm = nn.BatchNorm2d(256)
236
+
237
+ output_list = []
238
+ for dim in output_dim:
239
+ conv_out = nn.Sequential(
240
+ ResidualBlock(128, 128, self.norm_fn, stride=1),
241
+ nn.Conv2d(128, dim[2], 3, padding=1))
242
+ output_list.append(conv_out)
243
+
244
+ self.outputs04 = nn.ModuleList(output_list)
245
+
246
+ output_list = []
247
+ for dim in output_dim:
248
+ conv_out = nn.Sequential(
249
+ ResidualBlock(128, 128, self.norm_fn, stride=1),
250
+ nn.Conv2d(128, dim[1], 3, padding=1))
251
+ output_list.append(conv_out)
252
+
253
+ self.outputs08 = nn.ModuleList(output_list)
254
+
255
+ output_list = []
256
+ for dim in output_dim:
257
+ conv_out = nn.Conv2d(128, dim[0], 3, padding=1)
258
+ output_list.append(conv_out)
259
+
260
+ self.outputs16 = nn.ModuleList(output_list)
261
+
262
+ def forward(self, x_in, vit_feat, dual_inp=False, num_layers=3):
263
+ B,C,H,W = x_in.shape
264
+ x = self.conv1(x_in)
265
+ x = self.norm1(x)
266
+ x = self.relu1(x)
267
+ x = self.layer1(x)
268
+ x = self.layer2(x)
269
+ x = self.layer3(x)
270
+
271
+ divider = np.lcm(self.patch_size, 16)
272
+ H_resize, W_resize = get_resize_keep_aspect_ratio(H,W, divider=divider, max_H=1344, max_W=1344)
273
+ x = torch.cat([x, vit_feat], dim=1)
274
+ x = self.conv2(x)
275
+ outputs04 = [f(x) for f in self.outputs04]
276
+
277
+ y = self.layer4(x)
278
+ outputs08 = [f(y) for f in self.outputs08]
279
+
280
+ z = self.layer5(y)
281
+ outputs16 = [f(z) for f in self.outputs16]
282
+
283
+ return (outputs04, outputs08, outputs16)
284
+
285
+
286
+ class DepthAnythingFeature(nn.Module):
287
+ model_configs = {
288
+ 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
289
+ 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
290
+ 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}
291
+ }
292
+
293
+ def __init__(self, encoder='vits'):
294
+ super().__init__()
295
+ from depth_anything.dpt import DepthAnything
296
+ self.encoder = encoder
297
+ depth_anything = DepthAnything(self.model_configs[encoder])
298
+ self.depth_anything = depth_anything
299
+
300
+ self.intermediate_layer_idx = { #!NOTE For V2
301
+ 'vits': [2, 5, 8, 11],
302
+ 'vitb': [2, 5, 8, 11],
303
+ 'vitl': [4, 11, 17, 23],
304
+ 'vitg': [9, 19, 29, 39]
305
+ }
306
+
307
+
308
+ def forward(self, x):
309
+ """
310
+ @x: (B,C,H,W)
311
+ """
312
+ h, w = x.shape[-2:]
313
+ features = self.depth_anything.pretrained.get_intermediate_layers(x, self.intermediate_layer_idx[self.encoder], return_class_token=True)
314
+
315
+
316
+ patch_size = self.depth_anything.pretrained.patch_size
317
+ patch_h, patch_w = h // patch_size, w // patch_size
318
+ out, path_1, path_2, path_3, path_4, disp = self.depth_anything.depth_head.forward(features, patch_h, patch_w, return_intermediate=True)
319
+
320
+ return {'out':out, 'path_1':path_1, 'path_2':path_2, 'path_3':path_3, 'path_4':path_4, 'features':features, 'disp':disp} # path_1 is 1/2; path_2 is 1/4
321
+
322
+
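
> Editor's note: `Feature.forward` below resizes the ViT input so both sides are divisible by lcm(patch_size=14, 16) = 112 while roughly preserving the aspect ratio; the exact logic lives in `Utils.get_resize_keep_aspect_ratio`. A simplified sketch of that constraint (not the Utils implementation):

```python
# Simplified sketch of the divisible-by-112 resize constraint (not the Utils code).
import numpy as np

def resize_to_multiple(H, W, divider=int(np.lcm(14, 16)), max_side=1344):
    scale = min(max_side / max(H, W), 1.0)                 # never exceed max_side
    H2 = max(int(round(H * scale / divider)), 1) * divider
    W2 = max(int(round(W * scale / divider)), 1) * divider
    return H2, W2

print(resize_to_multiple(540, 960))  # (560, 1008)
```
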
323
+ class Feature(nn.Module):
324
+ def __init__(self, args):
325
+ super(Feature, self).__init__()
326
+ self.args = args
327
+ model = timm.create_model('edgenext_small', pretrained=True, features_only=False)
328
+ self.stem = model.stem
329
+ self.stages = model.stages
330
+ chans = [48, 96, 160, 304]
331
+ self.chans = chans
332
+ self.dino = DepthAnythingFeature(encoder=self.args.vit_size)
333
+ self.dino = freeze_model(self.dino)
334
+ vit_feat_dim = DepthAnythingFeature.model_configs[self.args.vit_size]['features']//2
335
+
336
+ self.deconv32_16 = Conv2x_IN(chans[3], chans[2], deconv=True, concat=True)
337
+ self.deconv16_8 = Conv2x_IN(chans[2]*2, chans[1], deconv=True, concat=True)
338
+ self.deconv8_4 = Conv2x_IN(chans[1]*2, chans[0], deconv=True, concat=True)
339
+ self.conv4 = nn.Sequential(
340
+ BasicConv(chans[0]*2+vit_feat_dim, chans[0]*2+vit_feat_dim, kernel_size=3, stride=1, padding=1, norm='instance'),
341
+ ResidualBlock(chans[0]*2+vit_feat_dim, chans[0]*2+vit_feat_dim, norm_fn='instance'),
342
+ ResidualBlock(chans[0]*2+vit_feat_dim, chans[0]*2+vit_feat_dim, norm_fn='instance'),
343
+ )
344
+
345
+ self.patch_size = 14
346
+ self.d_out = [chans[0]*2+vit_feat_dim, chans[1]*2, chans[2]*2, chans[3]]
347
+
348
+ def forward(self, x):
349
+ B,C,H,W = x.shape
350
+ divider = np.lcm(self.patch_size, 16)
351
+ H_resize, W_resize = get_resize_keep_aspect_ratio(H,W, divider=divider, max_H=1344, max_W=1344)
352
+ x_in_ = F.interpolate(x, size=(H_resize, W_resize), mode='bicubic', align_corners=False)
353
+ self.dino = self.dino.eval()
354
+ with torch.no_grad():
355
+ output = self.dino(x_in_)
356
+ vit_feat = output['out']
357
+ vit_feat = F.interpolate(vit_feat, size=(H//4,W//4), mode='bilinear', align_corners=True)
358
+ x = self.stem(x)
359
+ x4 = self.stages[0](x)
360
+ x8 = self.stages[1](x4)
361
+ x16 = self.stages[2](x8)
362
+ x32 = self.stages[3](x16)
363
+
364
+ x16 = self.deconv32_16(x32, x16)
365
+ x8 = self.deconv16_8(x16, x8)
366
+ x4 = self.deconv8_4(x8, x4)
367
+ x4 = torch.cat([x4, vit_feat], dim=1)
368
+ x4 = self.conv4(x4)
369
+ return [x4, x8, x16, x32], vit_feat
370
+
371
+
FoundationStereo_demo/core/foundation_stereo.py ADDED
@@ -0,0 +1,277 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+
10
+ import torch,pdb,logging,timm
11
+ import torchvision # Add missing torchvision import
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+ import sys,os
15
+ code_dir = os.path.dirname(os.path.realpath(__file__))
16
+ sys.path.append(f'{code_dir}/../')
17
+ from core.update import *
18
+ from core.extractor import *
19
+ from core.geometry import Combined_Geo_Encoding_Volume
20
+ from core.submodule import *
21
+ from core.utils.utils import *
22
+ from Utils import *
23
+ import time,huggingface_hub
24
+
25
+
26
+ try:
27
+ autocast = torch.cuda.amp.autocast
28
+ except:
29
+ class autocast:
30
+ def __init__(self, enabled):
31
+ pass
32
+ def __enter__(self):
33
+ pass
34
+ def __exit__(self, *args):
35
+ pass
36
+
37
+
38
+ def normalize_image(img):
39
+ '''
40
+ @img: (B,C,H,W) in range 0-255, RGB order
41
+ '''
42
+ tf = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], inplace=False)
43
+ normalized = tf(img/255.0)
44
+ return normalized.contiguous() # Ensure contiguous tensor
45
+
46
+
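
> Editor's note: a standalone check of `normalize_image`'s contract (0-255 RGB in, ImageNet-normalized and contiguous out); the printed values are what the standard mean/std produce for a pure-white pixel.

```python
# Standalone check of the ImageNet normalization applied by normalize_image.
import torch
import torchvision

tf = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
x = torch.full((1, 3, 4, 4), 255.0)       # pure white, 0-255 range
out = tf(x / 255.0).contiguous()
print(out[0, :, 0, 0])                    # ≈ tensor([2.2489, 2.4286, 2.6400])
```
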
47
+ class hourglass(nn.Module):
48
+ def __init__(self, cfg, in_channels, feat_dims=None):
49
+ super().__init__()
50
+ self.cfg = cfg
51
+ self.conv1 = nn.Sequential(BasicConv(in_channels, in_channels*2, is_3d=True, bn=True, relu=True, kernel_size=3,
52
+ padding=1, stride=2, dilation=1),
53
+ Conv3dNormActReduced(in_channels*2, in_channels*2, kernel_size=3, kernel_disp=17))
54
+
55
+ self.conv2 = nn.Sequential(BasicConv(in_channels*2, in_channels*4, is_3d=True, bn=True, relu=True, kernel_size=3,
56
+ padding=1, stride=2, dilation=1),
57
+ Conv3dNormActReduced(in_channels*4, in_channels*4, kernel_size=3, kernel_disp=17))
58
+
59
+ self.conv3 = nn.Sequential(BasicConv(in_channels*4, in_channels*6, is_3d=True, bn=True, relu=True, kernel_size=3,
60
+ padding=1, stride=2, dilation=1),
61
+ Conv3dNormActReduced(in_channels*6, in_channels*6, kernel_size=3, kernel_disp=17))
62
+
63
+
64
+ self.conv3_up = BasicConv(in_channels*6, in_channels*4, deconv=True, is_3d=True, bn=True,
65
+ relu=True, kernel_size=(4, 4, 4), padding=(1, 1, 1), stride=(2, 2, 2))
66
+
67
+ self.conv2_up = BasicConv(in_channels*4, in_channels*2, deconv=True, is_3d=True, bn=True,
68
+ relu=True, kernel_size=(4, 4, 4), padding=(1, 1, 1), stride=(2, 2, 2))
69
+
70
+ self.conv1_up = BasicConv(in_channels*2, in_channels, deconv=True, is_3d=True, bn=True,
71
+ relu=True, kernel_size=(4, 4, 4), padding=(1, 1, 1), stride=(2, 2, 2))
72
+ self.conv_out = nn.Sequential(
73
+ Conv3dNormActReduced(in_channels, in_channels, kernel_size=3, kernel_disp=17),
74
+ Conv3dNormActReduced(in_channels, in_channels, kernel_size=3, kernel_disp=17),
75
+ )
76
+
77
+ self.agg_0 = nn.Sequential(BasicConv(in_channels*8, in_channels*4, is_3d=True, kernel_size=1, padding=0, stride=1),
78
+ Conv3dNormActReduced(in_channels*4, in_channels*4, kernel_size=3, kernel_disp=17),
79
+ Conv3dNormActReduced(in_channels*4, in_channels*4, kernel_size=3, kernel_disp=17),)
80
+
81
+ self.agg_1 = nn.Sequential(BasicConv(in_channels*4, in_channels*2, is_3d=True, kernel_size=1, padding=0, stride=1),
82
+ Conv3dNormActReduced(in_channels*2, in_channels*2, kernel_size=3, kernel_disp=17),
83
+ Conv3dNormActReduced(in_channels*2, in_channels*2, kernel_size=3, kernel_disp=17))
84
+ self.atts = nn.ModuleDict({
85
+ "4": CostVolumeDisparityAttention(d_model=in_channels, nhead=4, dim_feedforward=in_channels, norm_first=False, num_transformer=4, max_len=self.cfg['max_disp']//16),
86
+ })
87
+ self.conv_patch = nn.Sequential(
88
+ nn.Conv3d(in_channels, in_channels, kernel_size=4, stride=4, padding=0, groups=in_channels),
89
+ nn.BatchNorm3d(in_channels),
90
+ )
91
+
92
+ self.feature_att_8 = FeatureAtt(in_channels*2, feat_dims[1])
93
+ self.feature_att_16 = FeatureAtt(in_channels*4, feat_dims[2])
94
+ self.feature_att_32 = FeatureAtt(in_channels*6, feat_dims[3])
95
+ self.feature_att_up_16 = FeatureAtt(in_channels*4, feat_dims[2])
96
+ self.feature_att_up_8 = FeatureAtt(in_channels*2, feat_dims[1])
97
+
98
+ def forward(self, x, features):
99
+ conv1 = self.conv1(x)
100
+ conv1 = self.feature_att_8(conv1, features[1])
101
+
102
+ conv2 = self.conv2(conv1)
103
+ conv2 = self.feature_att_16(conv2, features[2])
104
+
105
+ conv3 = self.conv3(conv2)
106
+ conv3 = self.feature_att_32(conv3, features[3])
107
+
108
+ conv3_up = self.conv3_up(conv3)
109
+ conv2 = torch.cat((conv3_up, conv2), dim=1)
110
+ conv2 = self.agg_0(conv2)
111
+ conv2 = self.feature_att_up_16(conv2, features[2])
112
+
113
+ conv2_up = self.conv2_up(conv2)
114
+ conv1 = torch.cat((conv2_up, conv1), dim=1)
115
+ conv1 = self.agg_1(conv1)
116
+ conv1 = self.feature_att_up_8(conv1, features[1])
117
+
118
+ conv = self.conv1_up(conv1)
119
+ x = self.conv_patch(x)
120
+ x = self.atts["4"](x)
121
+ x = F.interpolate(x, scale_factor=4, mode='trilinear', align_corners=False)
122
+ conv = conv + x
123
+ conv = self.conv_out(conv)
124
+
125
+ return conv
126
+
127
+
128
+
129
+ class FoundationStereo(nn.Module, huggingface_hub.PyTorchModelHubMixin):
130
+ def __init__(self, args):
131
+ super().__init__()
132
+ self.args = args
133
+
134
+ context_dims = args.hidden_dims
135
+ self.cv_group = 8
136
+ volume_dim = 28
137
+
138
+ self.cnet = ContextNetDino(args, output_dim=[args.hidden_dims, context_dims], downsample=args.n_downsample)
139
+ self.update_block = BasicSelectiveMultiUpdateBlock(self.args, self.args.hidden_dims[0], volume_dim=volume_dim)
140
+ self.sam = SpatialAttentionExtractor()
141
+ self.cam = ChannelAttentionEnhancement(self.args.hidden_dims[0])
142
+
143
+ self.context_zqr_convs = nn.ModuleList([nn.Conv2d(context_dims[i], args.hidden_dims[i]*3, kernel_size=3, padding=3//2) for i in range(self.args.n_gru_layers)])
144
+
145
+ self.feature = Feature(args)
146
+ self.proj_cmb = nn.Conv2d(self.feature.d_out[0], 12, kernel_size=1, padding=0)
147
+
148
+ self.stem_2 = nn.Sequential(
149
+ BasicConv_IN(3, 32, kernel_size=3, stride=2, padding=1),
150
+ nn.Conv2d(32, 32, 3, 1, 1, bias=False),
151
+ nn.InstanceNorm2d(32), nn.ReLU()
152
+ )
153
+ self.stem_4 = nn.Sequential(
154
+ BasicConv_IN(32, 48, kernel_size=3, stride=2, padding=1),
155
+ nn.Conv2d(48, 48, 3, 1, 1, bias=False),
156
+ nn.InstanceNorm2d(48), nn.ReLU()
157
+ )
158
+
159
+
160
+ self.spx_2_gru = Conv2x(32, 32, True, bn=False)
161
+ self.spx_gru = nn.Sequential(
162
+ nn.ConvTranspose2d(2*32, 9, kernel_size=4, stride=2, padding=1),
163
+ )
164
+
165
+
166
+ self.corr_stem = nn.Sequential(
167
+ nn.Conv3d(32, volume_dim, kernel_size=1),
168
+ BasicConv(volume_dim, volume_dim, kernel_size=3, padding=1, is_3d=True),
169
+ ResnetBasicBlock3D(volume_dim, volume_dim, kernel_size=3, stride=1, padding=1),
170
+ ResnetBasicBlock3D(volume_dim, volume_dim, kernel_size=3, stride=1, padding=1),
171
+ )
172
+ self.corr_feature_att = FeatureAtt(volume_dim, self.feature.d_out[0])
173
+ self.cost_agg = hourglass(cfg=self.args, in_channels=volume_dim, feat_dims=self.feature.d_out)
174
+ self.classifier = nn.Sequential(
175
+ BasicConv(volume_dim, volume_dim//2, kernel_size=3, padding=1, is_3d=True),
176
+ ResnetBasicBlock3D(volume_dim//2, volume_dim//2, kernel_size=3, stride=1, padding=1),
177
+ nn.Conv3d(volume_dim//2, 1, kernel_size=7, padding=3),
178
+ )
179
+
180
+ r = self.args.corr_radius
181
+ dx = torch.linspace(-r, r, 2*r+1, requires_grad=False).reshape(1, 1, 2*r+1, 1)
182
+ self.dx = dx
183
+
184
+
185
+ def upsample_disp(self, disp, mask_feat_4, stem_2x):
186
+
187
+ with autocast(enabled=self.args.mixed_precision):
188
+ xspx = self.spx_2_gru(mask_feat_4, stem_2x) # 1/2 resolution
189
+ spx_pred = self.spx_gru(xspx)
190
+ spx_pred = F.softmax(spx_pred, 1)
191
+ up_disp = context_upsample(disp*4., spx_pred).unsqueeze(1)
192
+
193
+ return up_disp.float()
194
+
195
+
196
+ def forward(self, image1, image2, iters=12, flow_init=None, test_mode=False, low_memory=False, init_disp=None):
197
+ """ Estimate disparity between pair of frames """
198
+ B = len(image1)
199
+ low_memory = low_memory or (self.args.get('low_memory', False))
200
+ image1 = normalize_image(image1)
201
+ image2 = normalize_image(image2)
202
+ with autocast(enabled=self.args.mixed_precision):
203
+ out, vit_feat = self.feature(torch.cat([image1, image2], dim=0))
204
+ vit_feat = vit_feat[:B]
205
+ features_left = [o[:B] for o in out]
206
+ features_right = [o[B:] for o in out]
207
+ stem_2x = self.stem_2(image1)
208
+
209
+ gwc_volume = build_gwc_volume(features_left[0], features_right[0], self.args.max_disp//4, self.cv_group) # Group-wise correlation volume (B, N_group, max_disp, H, W)
210
+ left_tmp = self.proj_cmb(features_left[0])
211
+ right_tmp = self.proj_cmb(features_right[0])
212
+ concat_volume = build_concat_volume(left_tmp, right_tmp, maxdisp=self.args.max_disp//4)
213
+ del left_tmp, right_tmp
214
+ comb_volume = torch.cat([gwc_volume, concat_volume], dim=1)
215
+ comb_volume = self.corr_stem(comb_volume)
216
+ comb_volume = self.corr_feature_att(comb_volume, features_left[0])
217
+ comb_volume = self.cost_agg(comb_volume, features_left)
218
+
219
+ # Init disp from geometry encoding volume
220
+ prob = F.softmax(self.classifier(comb_volume).squeeze(1), dim=1) #(B, max_disp, H, W)
221
+ if init_disp is None:
222
+ init_disp = disparity_regression(prob, self.args.max_disp//4) # Weighted sum of disparity
223
+
224
+ cnet_list = self.cnet(image1, vit_feat=vit_feat, num_layers=self.args.n_gru_layers) #(1/4, 1/8, 1/16)
225
+ cnet_list = list(cnet_list)
226
+ net_list = [torch.tanh(x[0]) for x in cnet_list] # Hidden information
227
+ inp_list = [torch.relu(x[1]) for x in cnet_list] # Context information list of pyramid levels
228
+ inp_list = [self.cam(x) * x for x in inp_list]
229
+ att = [self.sam(x) for x in inp_list]
230
+
231
+ geo_fn = Combined_Geo_Encoding_Volume(features_left[0].float(), features_right[0].float(), comb_volume.float(), num_levels=self.args.corr_levels, dx=self.dx)
232
+ b, c, h, w = features_left[0].shape
233
+ coords = torch.arange(w, dtype=torch.float, device=init_disp.device).reshape(1,1,w,1).repeat(b, h, 1, 1) # (B,H,W,1) Horizontal only
234
+ disp = init_disp.float()
235
+ disp_preds = []
236
+
237
+ # GRUs iterations to update disparity (1/4 resolution)
238
+ for itr in range(iters):
239
+ disp = disp.detach()
240
+ geo_feat = geo_fn(disp, coords, low_memory=low_memory)
241
+ with autocast(enabled=self.args.mixed_precision):
242
+ net_list, mask_feat_4, delta_disp = self.update_block(net_list, inp_list, geo_feat, disp, att)
243
+
244
+ disp = disp + delta_disp.float()
245
+ if test_mode and itr < iters-1:
246
+ continue
247
+
248
+ # upsample predictions
249
+ disp_up = self.upsample_disp(disp.float(), mask_feat_4.float(), stem_2x.float())
250
+ disp_preds.append(disp_up)
251
+
252
+
253
+ if test_mode:
254
+ return disp_up
255
+
256
+ return init_disp, disp_preds
257
+
258
+
259
+ def run_hierachical(self, image1, image2, iters=12, test_mode=False, low_memory=False, small_ratio=0.5):
260
+ B,_,H,W = image1.shape
261
+ img1_small = F.interpolate(image1, scale_factor=small_ratio, align_corners=False, mode='bilinear')
262
+ img2_small = F.interpolate(image2, scale_factor=small_ratio, align_corners=False, mode='bilinear')
263
+ padder = InputPadder(img1_small.shape[-2:], divis_by=32, force_square=False)
264
+ img1_small, img2_small = padder.pad(img1_small, img2_small)
265
+ disp_small = self.forward(img1_small, img2_small, test_mode=True, iters=iters, low_memory=low_memory)
266
+ disp_small = padder.unpad(disp_small.float())
267
+ disp_small_up = F.interpolate(disp_small, size=(H,W), mode='bilinear', align_corners=True) * 1/small_ratio
268
+ disp_small_up = disp_small_up.clip(0, None)
269
+
270
+ padder = InputPadder(image1.shape[-2:], divis_by=32, force_square=False)
271
+ image1, image2, disp_small_up = padder.pad(image1, image2, disp_small_up)
272
+ disp_small_up += padder._pad[0]
273
+ init_disp = F.interpolate(disp_small_up, scale_factor=0.25, mode='bilinear', align_corners=True) * 0.25 # Init disp will be 1/4
274
+ disp = self.forward(image1, image2, iters=iters, test_mode=test_mode, low_memory=low_memory, init_disp=init_disp)
275
+ disp = padder.unpad(disp.float())
276
+ return disp
277
+
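The hierarchical path above leans on the fact that disparity scales linearly with image width: a map estimated at small_ratio resolution has to be multiplied by 1/small_ratio once upsampled to full size, and scaled by 0.25 again when downsampled to the 1/4-resolution initialization. A minimal, self-contained sketch of that scaling rule on a toy disparity ramp (torch only; sizes and values are illustrative, not the model's):

import torch
import torch.nn.functional as F

# Toy full-resolution disparity: a horizontal ramp of shape (B,1,H,W).
H, W = 64, 128
disp_full = torch.arange(W, dtype=torch.float32).view(1, 1, 1, W).expand(1, 1, H, W).contiguous()

# Pretend it was estimated at half resolution: both the grid and the values shrink.
small_ratio = 0.5
disp_half = F.interpolate(disp_full, scale_factor=small_ratio, mode='bilinear',
                          align_corners=False) * small_ratio

# Upsample back to full size and rescale the values, mirroring
# disp_small_up = F.interpolate(...) * 1/small_ratio in run_hierachical.
disp_up = F.interpolate(disp_half, size=(H, W), mode='bilinear',
                        align_corners=True) / small_ratio

print((disp_up - disp_full).abs().mean())  # small relative to the 0..W-1 disparity range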
FoundationStereo_demo/core/geometry.py ADDED
@@ -0,0 +1,77 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+
10
+ import torch,pdb,os,sys
11
+ import torch.nn.functional as F
12
+ from core.utils.utils import bilinear_sampler
13
+ code_dir = os.path.dirname(os.path.realpath(__file__))
14
+ sys.path.append(f'{code_dir}/../')
15
+ from Utils import *
16
+
17
+ class Combined_Geo_Encoding_Volume:
18
+ def __init__(self, init_fmap1, init_fmap2, geo_volume, num_levels=2, dx=None):
19
+ self.num_levels = num_levels
20
+ self.geo_volume_pyramid = []
21
+ self.init_corr_pyramid = []
22
+ self.dx = dx
23
+
24
+ # all pairs correlation
25
+ init_corr = Combined_Geo_Encoding_Volume.corr(init_fmap1, init_fmap2)
26
+
27
+ b, h, w, _, w2 = init_corr.shape
28
+ b, c, d, h, w = geo_volume.shape
29
+ geo_volume = geo_volume.permute(0, 3, 4, 1, 2).reshape(b*h*w, c, 1, d).contiguous()
30
+
31
+ init_corr = init_corr.reshape(b*h*w, 1, 1, w2)
32
+ self.geo_volume_pyramid.append(geo_volume)
33
+ self.init_corr_pyramid.append(init_corr)
34
+ for i in range(self.num_levels-1):
35
+ geo_volume = F.avg_pool2d(geo_volume, [1,2], stride=[1,2])
36
+ self.geo_volume_pyramid.append(geo_volume)
37
+
38
+ for i in range(self.num_levels-1):
39
+ init_corr = F.avg_pool2d(init_corr, [1,2], stride=[1,2])
40
+ self.init_corr_pyramid.append(init_corr)
41
+
42
+
43
+ def __call__(self, disp, coords, low_memory=False):
44
+ b, _, h, w = disp.shape
45
+ self.dx = self.dx.to(disp.device)
46
+ out_pyramid = []
47
+ for i in range(self.num_levels):
48
+ geo_volume = self.geo_volume_pyramid[i]
49
+ x0 = self.dx + disp.reshape(b*h*w, 1, 1, 1) / 2**i
50
+ y0 = torch.zeros_like(x0)
51
+
52
+ disp_lvl = torch.cat([x0,y0], dim=-1)
53
+ geo_volume = bilinear_sampler(geo_volume, disp_lvl, low_memory=low_memory)
54
+ geo_volume = geo_volume.reshape(b, h, w, -1)
55
+
56
+ init_corr = self.init_corr_pyramid[i]
57
+ init_x0 = coords.reshape(b*h*w, 1, 1, 1)/2**i - disp.reshape(b*h*w, 1, 1, 1) / 2**i + self.dx # X on right image
58
+ init_coords_lvl = torch.cat([init_x0,y0], dim=-1)
59
+ init_corr = bilinear_sampler(init_corr, init_coords_lvl, low_memory=low_memory)
60
+ init_corr = init_corr.reshape(b, h, w, -1)
61
+
62
+ out_pyramid.append(geo_volume)
63
+ out_pyramid.append(init_corr)
64
+ out_pyramid = torch.cat(out_pyramid, dim=-1)
65
+ return out_pyramid.permute(0, 3, 1, 2).contiguous() #(B,C,H,W)
66
+
67
+
68
+ @staticmethod
69
+ def corr(fmap1, fmap2):
70
+ B, D, H, W1 = fmap1.shape
71
+ _, _, _, W2 = fmap2.shape
72
+ fmap1 = fmap1.reshape(B, D, H, W1)
73
+ fmap2 = fmap2.reshape(B, D, H, W2)
74
+ with torch.cuda.amp.autocast(enabled=False):
75
+ corr = torch.einsum('aijk,aijh->ajkh', F.normalize(fmap1.float(), dim=1), F.normalize(fmap2.float(), dim=1))
76
+ corr = corr.reshape(B, H, W1, 1, W2)
77
+ return corr
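The pyramid lookup above samples two things per pixel: the geometry volume along its disparity axis at disp + dx, and the all-pairs correlation at the right-image column x_left - disp + dx, with dx spanning [-corr_radius, corr_radius]. A small sketch of how those 1-D sampling positions are formed (pure torch, toy numbers; r stands in for args.corr_radius):

import torch

r = 4                                  # lookup radius (corr_radius in the config)
dx = torch.linspace(-r, r, 2 * r + 1)  # window offsets, same role as self.dx

x_left = torch.tensor(100.0)           # column of the pixel in the left image
disp = torch.tensor(23.5)              # current disparity estimate at that pixel

x_geo = disp + dx                      # positions along the disparity axis of the volume
x_right = x_left - disp + dx           # columns sampled in the right image (x - d + dx)

print(x_geo)    # 19.5 ... 27.5
print(x_right)  # 72.5 ... 80.5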
FoundationStereo_demo/core/submodule.py ADDED
@@ -0,0 +1,588 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+
10
+ import torch,pdb,os,sys
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ import numpy as np
14
+ code_dir = os.path.dirname(os.path.realpath(__file__))
15
+ sys.path.append(f'{code_dir}/../')
16
+ from Utils import *
17
+
18
+
19
+ def _is_contiguous(tensor: torch.Tensor) -> bool:
20
+ if torch.jit.is_scripting():
21
+ return tensor.is_contiguous()
22
+ else:
23
+ return tensor.is_contiguous(memory_format=torch.contiguous_format)
24
+
25
+
26
+ class LayerNorm2d(nn.LayerNorm):
27
+ r""" https://huggingface.co/spaces/Roll20/pet_score/blob/b258ef28152ab0d5b377d9142a23346f863c1526/lib/timm/models/convnext.py#L85
28
+ LayerNorm for channels_first tensors with 2d spatial dimensions (ie N, C, H, W).
29
+ """
30
+
31
+ def __init__(self, normalized_shape, eps=1e-6):
32
+ super().__init__(normalized_shape, eps=eps)
33
+
34
+ def forward(self, x) -> torch.Tensor:
35
+ """
36
+ @x: (B,C,H,W)
37
+ """
38
+ if _is_contiguous(x):
39
+ return F.layer_norm(x.permute(0, 2, 3, 1), self.normalized_shape, self.weight, self.bias, self.eps).permute(0, 3, 1, 2).contiguous()
40
+ else:
41
+ s, u = torch.var_mean(x, dim=1, keepdim=True)
42
+ x = (x - u) * torch.rsqrt(s + self.eps)
43
+ x = x * self.weight[:, None, None] + self.bias[:, None, None]
44
+ return x
45
+
46
+
47
+
48
+ class BasicConv(nn.Module):
49
+
50
+ def __init__(self, in_channels, out_channels, deconv=False, is_3d=False, bn=True, relu=True, norm='batch', **kwargs):
51
+ super(BasicConv, self).__init__()
52
+
53
+ self.relu = relu
54
+ self.use_bn = bn
55
+ self.bn = nn.Identity()
56
+ if is_3d:
57
+ if deconv:
58
+ self.conv = nn.ConvTranspose3d(in_channels, out_channels, bias=False, **kwargs)
59
+ else:
60
+ self.conv = nn.Conv3d(in_channels, out_channels, bias=False, **kwargs)
61
+ if self.use_bn:
62
+ if norm=='batch':
63
+ self.bn = nn.BatchNorm3d(out_channels)
64
+ elif norm=='instance':
65
+ self.bn = nn.InstanceNorm3d(out_channels)
66
+ else:
67
+ if deconv:
68
+ self.conv = nn.ConvTranspose2d(in_channels, out_channels, bias=False, **kwargs)
69
+ else:
70
+ self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
71
+ if self.use_bn:
72
+ if norm=='batch':
73
+ self.bn = nn.BatchNorm2d(out_channels)
74
+ elif norm=='instance':
75
+ self.bn = nn.InstanceNorm2d(out_channels)
76
+
77
+ def forward(self, x):
78
+ x = self.conv(x)
79
+ if self.use_bn:
80
+ x = self.bn(x)
81
+ if self.relu:
82
+ x = nn.LeakyReLU()(x)#, inplace=True)
83
+ return x
84
+
85
+
86
+ class Conv3dNormActReduced(nn.Module):
87
+ def __init__(self, C_in, C_out, hidden=None, kernel_size=3, kernel_disp=None, stride=1, norm=nn.BatchNorm3d):
88
+ super().__init__()
89
+ if kernel_disp is None:
90
+ kernel_disp = kernel_size
91
+ if hidden is None:
92
+ hidden = C_out
93
+ self.conv1 = nn.Sequential(
94
+ nn.Conv3d(C_in, hidden, kernel_size=(1,kernel_size,kernel_size), padding=(0, kernel_size//2, kernel_size//2), stride=(1, stride, stride)),
95
+ norm(hidden),
96
+ nn.ReLU(),
97
+ )
98
+ self.conv2 = nn.Sequential(
99
+ nn.Conv3d(hidden, C_out, kernel_size=(kernel_disp, 1, 1), padding=(kernel_disp//2, 0, 0), stride=(stride, 1, 1)),
100
+ norm(C_out),
101
+ nn.ReLU(),
102
+ )
103
+
104
+
105
+ def forward(self, x):
106
+ """
107
+ @x: (B,C,D,H,W)
108
+ """
109
+ x = self.conv1(x)
110
+ x = self.conv2(x)
111
+ return x
112
+
113
+
114
+
115
+
116
+ class ResnetBasicBlock(nn.Module):
117
+ def __init__(self, inplanes, planes, kernel_size=3, stride=1, padding=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=nn.BatchNorm2d, bias=False):
118
+ super().__init__()
119
+ self.norm_layer = norm_layer
120
+ if groups != 1 or base_width != 64:
121
+ raise ValueError('BasicBlock only supports groups=1 and base_width=64')
122
+ if dilation > 1:
123
+ raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
124
+ # Both self.conv1 and self.downsample layers downsample the input when stride != 1
125
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=kernel_size, stride=stride, bias=bias, padding=padding)
126
+ if self.norm_layer is not None:
127
+ self.bn1 = norm_layer(planes)
128
+ self.relu = nn.ReLU(inplace=True)
129
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=kernel_size, stride=stride, bias=bias, padding=padding)
130
+ if self.norm_layer is not None:
131
+ self.bn2 = norm_layer(planes)
132
+ self.downsample = downsample
133
+ self.stride = stride
134
+
135
+
136
+ def forward(self, x):
137
+ identity = x
138
+
139
+ out = self.conv1(x)
140
+ if self.norm_layer is not None:
141
+ out = self.bn1(out)
142
+ out = self.relu(out)
143
+
144
+ out = self.conv2(out)
145
+ if self.norm_layer is not None:
146
+ out = self.bn2(out)
147
+
148
+ if self.downsample is not None:
149
+ identity = self.downsample(x)
150
+ out += identity
151
+ out = self.relu(out)
152
+
153
+ return out
154
+
155
+
156
+ class ResnetBasicBlock3D(nn.Module):
157
+ def __init__(self, inplanes, planes, kernel_size=3, stride=1, padding=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=nn.BatchNorm3d, bias=False):
158
+ super().__init__()
159
+ self.norm_layer = norm_layer
160
+ if groups != 1 or base_width != 64:
161
+ raise ValueError('BasicBlock only supports groups=1 and base_width=64')
162
+ if dilation > 1:
163
+ raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
164
+ self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=kernel_size, stride=stride, bias=bias, padding=padding)
165
+ if self.norm_layer is not None:
166
+ self.bn1 = norm_layer(planes)
167
+ self.relu = nn.ReLU(inplace=True)
168
+ self.conv2 = nn.Conv3d(planes, planes, kernel_size=kernel_size, stride=stride, bias=bias, padding=padding)
169
+ if self.norm_layer is not None:
170
+ self.bn2 = norm_layer(planes)
171
+ self.downsample = downsample
172
+ self.stride = stride
173
+
174
+
175
+ def forward(self, x):
176
+ identity = x
177
+
178
+ out = self.conv1(x)
179
+ if self.norm_layer is not None:
180
+ out = self.bn1(out)
181
+ out = self.relu(out)
182
+
183
+ out = self.conv2(out)
184
+ if self.norm_layer is not None:
185
+ out = self.bn2(out)
186
+
187
+ if self.downsample is not None:
188
+ identity = self.downsample(x)
189
+ out += identity
190
+ out = self.relu(out)
191
+
192
+ return out
193
+
194
+
195
+ class FlashMultiheadAttention(nn.Module):
196
+ def __init__(self, embed_dim, num_heads):
197
+ super().__init__()
198
+ self.num_heads = num_heads
199
+ self.embed_dim = embed_dim
200
+ self.head_dim = embed_dim // num_heads
201
+ assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
202
+
203
+ self.q_proj = nn.Linear(embed_dim, embed_dim)
204
+ self.k_proj = nn.Linear(embed_dim, embed_dim)
205
+ self.v_proj = nn.Linear(embed_dim, embed_dim)
206
+ self.out_proj = nn.Linear(embed_dim, embed_dim)
207
+
208
+ def forward(self, query, key, value, attn_mask=None, window_size=(-1,-1)):
209
+ """
210
+ @query: (B,L,C)
211
+ """
212
+ B,L,C = query.shape
213
+ Q = self.q_proj(query)
214
+ K = self.k_proj(key)
215
+ V = self.v_proj(value)
216
+
217
+ Q = Q.view(Q.size(0), Q.size(1), self.num_heads, self.head_dim)
218
+ K = K.view(K.size(0), K.size(1), self.num_heads, self.head_dim)
219
+ V = V.view(V.size(0), V.size(1), self.num_heads, self.head_dim)
220
+
221
+ attn_output = F.scaled_dot_product_attention(Q, K, V)
222
+
223
+ attn_output = attn_output.reshape(B,L,-1)
224
+ output = self.out_proj(attn_output)
225
+
226
+ return output
227
+
228
+
229
+
230
+ class FlashAttentionTransformerEncoderLayer(nn.Module):
231
+ def __init__(self, embed_dim, num_heads, dim_feedforward, dropout=0.1, act=nn.GELU, norm=nn.LayerNorm):
232
+ super().__init__()
233
+ self.self_attn = FlashMultiheadAttention(embed_dim, num_heads)
234
+ self.act = act()
235
+
236
+ self.linear1 = nn.Linear(embed_dim, dim_feedforward)
237
+ self.dropout = nn.Dropout(dropout)
238
+ self.linear2 = nn.Linear(dim_feedforward, embed_dim)
239
+
240
+ self.norm1 = norm(embed_dim)
241
+ self.norm2 = norm(embed_dim)
242
+ self.dropout1 = nn.Dropout(dropout)
243
+ self.dropout2 = nn.Dropout(dropout)
244
+
245
+ def forward(self, src, src_mask=None, window_size=(-1, -1)):
246
+ src2 = self.self_attn(src, src, src, src_mask, window_size=window_size)
247
+ src = src + self.dropout1(src2)
248
+ src = self.norm1(src)
249
+
250
+ src2 = self.linear2(self.dropout(self.act(self.linear1(src))))
251
+ src = src + self.dropout2(src2)
252
+ src = self.norm2(src)
253
+
254
+ return src
255
+
256
+
257
+
258
+ class UpsampleConv(nn.Module):
259
+ def __init__(self, C_in, C_out, is_3d=False, kernel_size=3, bias=True, stride=1, padding=1):
260
+ super().__init__()
261
+ self.is_3d = is_3d
262
+ if is_3d:
263
+ self.conv = nn.Conv3d(C_in, C_out, kernel_size=kernel_size, stride=1, padding=kernel_size//2, bias=bias)
264
+ else:
265
+ self.conv = nn.Conv2d(C_in, C_out, kernel_size=kernel_size, stride=1, padding=kernel_size//2, bias=bias)
266
+
267
+ def forward(self, x):
268
+ if self.is_3d:
269
+ mode = 'trilinear'
270
+ else:
271
+ mode = 'bilinear'
272
+ x = F.interpolate(x, size=None, scale_factor=2, align_corners=False, mode=mode)
273
+ x = self.conv(x)
274
+ return x
275
+
276
+
277
+
278
+ class Conv2x(nn.Module):
279
+
280
+ def __init__(self, in_channels, out_channels, deconv=False, is_3d=False, concat=True, keep_concat=True, bn=True, relu=True, keep_dispc=False):
281
+ super(Conv2x, self).__init__()
282
+ self.concat = concat
283
+ self.is_3d = is_3d
284
+ if deconv and is_3d:
285
+ kernel = (4, 4, 4)
286
+ elif deconv:
287
+ kernel = 4
288
+ else:
289
+ kernel = 3
290
+
291
+ if deconv and is_3d and keep_dispc:
292
+ kernel = (1, 4, 4)
293
+ stride = (1, 2, 2)
294
+ padding = (0, 1, 1)
295
+ self.conv1 = BasicConv(in_channels, out_channels, deconv, is_3d, bn=bn, relu=True, kernel_size=kernel, stride=stride, padding=padding)
296
+ else:
297
+ self.conv1 = BasicConv(in_channels, out_channels, deconv, is_3d, bn=bn, relu=True, kernel_size=kernel, stride=2, padding=1)
298
+
299
+ if self.concat:
300
+ mul = 2 if keep_concat else 1
301
+ self.conv2 = BasicConv(out_channels*2, out_channels*mul, False, is_3d, bn, relu, kernel_size=3, stride=1, padding=1)
302
+ else:
303
+ self.conv2 = BasicConv(out_channels, out_channels, False, is_3d, bn, relu, kernel_size=3, stride=1, padding=1)
304
+
305
+ def forward(self, x, rem):
306
+ x = self.conv1(x)
307
+ if x.shape != rem.shape:
308
+ x = F.interpolate(x, size=(rem.shape[-2], rem.shape[-1]), mode='bilinear')
309
+ if self.concat:
310
+ x = torch.cat((x, rem), 1)
311
+ else:
312
+ x = x + rem
313
+ x = self.conv2(x)
314
+ return x
315
+
316
+
317
+ class BasicConv_IN(nn.Module):
318
+
319
+ def __init__(self, in_channels, out_channels, deconv=False, is_3d=False, IN=True, relu=True, **kwargs):
320
+ super(BasicConv_IN, self).__init__()
321
+
322
+ self.relu = relu
323
+ self.use_in = IN
324
+ if is_3d:
325
+ if deconv:
326
+ self.conv = nn.ConvTranspose3d(in_channels, out_channels, bias=False, **kwargs)
327
+ else:
328
+ self.conv = nn.Conv3d(in_channels, out_channels, bias=False, **kwargs)
329
+ self.IN = nn.InstanceNorm3d(out_channels)
330
+ else:
331
+ if deconv:
332
+ self.conv = nn.ConvTranspose2d(in_channels, out_channels, bias=False, **kwargs)
333
+ else:
334
+ self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
335
+ self.IN = nn.InstanceNorm2d(out_channels)
336
+
337
+ def forward(self, x):
338
+ x = self.conv(x)
339
+ if self.use_in:
340
+ x = self.IN(x)
341
+ if self.relu:
342
+ x = nn.LeakyReLU()(x)#, inplace=True)
343
+ return x
344
+
345
+
346
+ class Conv2x_IN(nn.Module):
347
+
348
+ def __init__(self, in_channels, out_channels, deconv=False, is_3d=False, concat=True, keep_concat=True, IN=True, relu=True, keep_dispc=False):
349
+ super(Conv2x_IN, self).__init__()
350
+ self.concat = concat
351
+ self.is_3d = is_3d
352
+ if deconv and is_3d:
353
+ kernel = (4, 4, 4)
354
+ elif deconv:
355
+ kernel = 4
356
+ else:
357
+ kernel = 3
358
+
359
+ if deconv and is_3d and keep_dispc:
360
+ kernel = (1, 4, 4)
361
+ stride = (1, 2, 2)
362
+ padding = (0, 1, 1)
363
+ self.conv1 = BasicConv_IN(in_channels, out_channels, deconv, is_3d, IN=True, relu=True, kernel_size=kernel, stride=stride, padding=padding)
364
+ else:
365
+ self.conv1 = BasicConv_IN(in_channels, out_channels, deconv, is_3d, IN=True, relu=True, kernel_size=kernel, stride=2, padding=1)
366
+
367
+ if self.concat:
368
+ mul = 2 if keep_concat else 1
369
+ self.conv2 = ResnetBasicBlock(out_channels*2, out_channels*mul, kernel_size=3, stride=1, padding=1, norm_layer=nn.InstanceNorm2d)
370
+ else:
371
+ self.conv2 = BasicConv_IN(out_channels, out_channels, False, is_3d, IN, relu, kernel_size=3, stride=1, padding=1)
372
+
373
+ def forward(self, x, rem):
374
+ x = self.conv1(x)
375
+ if x.shape != rem.shape:
376
+ x = F.interpolate(x, size=(rem.shape[-2], rem.shape[-1]), mode='bilinear')
377
+ if self.concat:
378
+ x = torch.cat((x, rem), 1)
379
+ else:
380
+ x = x + rem
381
+ x = self.conv2(x)
382
+ return x
383
+
384
+
385
+ def groupwise_correlation(fea1, fea2, num_groups):
386
+ B, C, H, W = fea1.shape
387
+ assert C % num_groups == 0, f"C:{C}, num_groups:{num_groups}"
388
+ channels_per_group = C // num_groups
389
+ fea1 = fea1.reshape(B, num_groups, channels_per_group, H, W)
390
+ fea2 = fea2.reshape(B, num_groups, channels_per_group, H, W)
391
+ with torch.cuda.amp.autocast(enabled=False):
392
+ cost = (F.normalize(fea1.float(), dim=2) * F.normalize(fea2.float(), dim=2)).sum(dim=2) #!NOTE Divide first for numerical stability
393
+ assert cost.shape == (B, num_groups, H, W)
394
+ return cost
395
+
396
+ def build_gwc_volume(refimg_fea, targetimg_fea, maxdisp, num_groups, stride=1):
397
+ """
398
+ @refimg_fea: left image feature
399
+ @targetimg_fea: right image feature
400
+ """
401
+ B, C, H, W = refimg_fea.shape
402
+ volume = refimg_fea.new_zeros([B, num_groups, maxdisp, H, W])
403
+ for i in range(maxdisp):
404
+ if i > 0:
405
+ volume[:, :, i, :, i:] = groupwise_correlation(refimg_fea[:, :, :, i:], targetimg_fea[:, :, :, :-i], num_groups)
406
+ else:
407
+ volume[:, :, i, :, :] = groupwise_correlation(refimg_fea, targetimg_fea, num_groups)
408
+ volume = volume.contiguous()
409
+ return volume
410
+
411
+
412
+
413
+ def build_concat_volume(refimg_fea, targetimg_fea, maxdisp):
414
+ B, C, H, W = refimg_fea.shape
415
+ volume = refimg_fea.new_zeros([B, 2 * C, maxdisp, H, W])
416
+ for i in range(maxdisp):
417
+ if i > 0:
418
+ volume[:, :C, i, :, :] = refimg_fea[:, :, :, :]
419
+ volume[:, C:, i, :, i:] = targetimg_fea[:, :, :, :-i]
420
+ else:
421
+ volume[:, :C, i, :, :] = refimg_fea
422
+ volume[:, C:, i, :, :] = targetimg_fea
423
+ volume = volume.contiguous()
424
+ return volume
425
+
426
+
427
+
428
+ def disparity_regression(x, maxdisp):
429
+ assert len(x.shape) == 4
430
+ disp_values = torch.arange(0, maxdisp, dtype=x.dtype, device=x.device)
431
+ disp_values = disp_values.reshape(1, maxdisp, 1, 1)
432
+ return torch.sum(x * disp_values, 1, keepdim=True)
433
+
434
+
435
+ class FeatureAtt(nn.Module):
436
+ def __init__(self, cv_chan, feat_chan):
437
+ super(FeatureAtt, self).__init__()
438
+
439
+ self.feat_att = nn.Sequential(
440
+ BasicConv(feat_chan, feat_chan//2, kernel_size=1, stride=1, padding=0),
441
+ nn.Conv2d(feat_chan//2, cv_chan, 1)
442
+ )
443
+
444
+ def forward(self, cv, feat):
445
+ '''
446
+ @cv: cost volume (B,C,D,H,W)
447
+ @feat: (B,C,H,W)
448
+ '''
449
+ feat_att = self.feat_att(feat).unsqueeze(2) #(B,C,1,H,W)
450
+ cv = torch.sigmoid(feat_att)*cv
451
+ return cv
452
+
453
+ def context_upsample(disp_low, up_weights):
454
+ """
455
+ @disp_low: (b,1,h,w) 1/4 resolution
456
+ @up_weights: (b,9,4*h,4*w) Image resolution
457
+ """
458
+ b, c, h, w = disp_low.shape
459
+
460
+ disp_unfold = F.unfold(disp_low.reshape(b,c,h,w),3,1,1).reshape(b,-1,h,w)
461
+ disp_unfold = F.interpolate(disp_unfold,(h*4,w*4),mode='nearest').reshape(b,9,h*4,w*4)
462
+
463
+ disp = (disp_unfold*up_weights).sum(1)
464
+
465
+ return disp
466
+
467
+
468
+
469
+ class PositionalEmbedding(nn.Module):
470
+ def __init__(self, d_model, max_len=512):
471
+ super().__init__()
472
+
473
+ # Compute the positional encodings once in log space.
474
+ pe = torch.zeros(max_len, d_model).float()
475
+ pe.requires_grad = False
476
+
477
+ position = torch.arange(0, max_len).float().unsqueeze(1) #(N,1)
478
+ div_term = (torch.arange(0, d_model, 2).float() * -(np.log(10000.0) / d_model)).exp()[None]
479
+
480
+ pe[:, 0::2] = torch.sin(position * div_term) #(N, d_model/2)
481
+ pe[:, 1::2] = torch.cos(position * div_term)
482
+
483
+ pe = pe.unsqueeze(0)
484
+ self.pe = pe
485
+ # self.register_buffer('pe', pe) #(1, max_len, D)
486
+
487
+
488
+ def forward(self, x, resize_embed=False):
489
+ '''
490
+ @x: (B,N,D)
491
+ '''
492
+ self.pe = self.pe.to(x.device).to(x.dtype)
493
+ pe = self.pe
494
+ if pe.shape[1]<x.shape[1]:
495
+ if resize_embed:
496
+ pe = F.interpolate(pe.permute(0,2,1), size=x.shape[1], mode='linear', align_corners=False).permute(0,2,1)
497
+ else:
498
+ raise RuntimeError(f'x:{x.shape}, pe:{pe.shape}')
499
+ return x + pe[:, :x.size(1)]
500
+
501
+
502
+
503
+ class CostVolumeDisparityAttention(nn.Module):
504
+ def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1, act=nn.GELU, norm_first=False, num_transformer=6, max_len=512, resize_embed=False):
505
+ super().__init__()
506
+ self.resize_embed = resize_embed
507
+ self.sa = nn.ModuleList([])
508
+ for _ in range(num_transformer):
509
+ self.sa.append(FlashAttentionTransformerEncoderLayer(embed_dim=d_model, num_heads=nhead, dim_feedforward=dim_feedforward, act=act, dropout=dropout))
510
+ self.pos_embed0 = PositionalEmbedding(d_model, max_len=max_len)
511
+
512
+
513
+ def forward(self, cv, window_size=(-1,-1)):
514
+ """
515
+ @cv: (B,C,D,H,W) where D is max disparity
516
+ """
517
+ x = cv
518
+ B,C,D,H,W = x.shape
519
+ x = x.permute(0,3,4,2,1).reshape(B*H*W, D, C)
520
+ x = self.pos_embed0(x, resize_embed=self.resize_embed) #!NOTE No resize since disparity is pre-determined
521
+ for i in range(len(self.sa)):
522
+ x = self.sa[i](x, window_size=window_size)
523
+ x = x.reshape(B,H,W,D,C).permute(0,4,3,1,2)
524
+
525
+ return x
526
+
527
+
528
+
529
+ class ChannelAttentionEnhancement(nn.Module):
530
+ def __init__(self, in_planes, ratio=16):
531
+ super(ChannelAttentionEnhancement, self).__init__()
532
+ self.avg_pool = nn.AdaptiveAvgPool2d(1)
533
+ self.max_pool = nn.AdaptiveMaxPool2d(1)
534
+
535
+ self.fc = nn.Sequential(nn.Conv2d(in_planes, in_planes // 16, 1, bias=False),
536
+ nn.ReLU(),
537
+ nn.Conv2d(in_planes // 16, in_planes, 1, bias=False))
538
+ self.sigmoid = nn.Sigmoid()
539
+
540
+ def forward(self, x):
541
+ avg_out = self.fc(self.avg_pool(x))
542
+ max_out = self.fc(self.max_pool(x))
543
+ out = avg_out + max_out
544
+ return self.sigmoid(out)
545
+
546
+ class SpatialAttentionExtractor(nn.Module):
547
+ def __init__(self, kernel_size=7):
548
+ super(SpatialAttentionExtractor, self).__init__()
549
+
550
+ self.samconv = nn.Conv2d(2, 1, kernel_size, padding=kernel_size//2, bias=False)
551
+ self.sigmoid = nn.Sigmoid()
552
+
553
+ def forward(self, x):
554
+ avg_out = torch.mean(x, dim=1, keepdim=True)
555
+ max_out, _ = torch.max(x, dim=1, keepdim=True)
556
+ x = torch.cat([avg_out, max_out], dim=1)
557
+ x = self.samconv(x)
558
+ return self.sigmoid(x)
559
+
560
+
561
+
562
+ class EdgeNextConvEncoder(nn.Module):
563
+ def __init__(self, dim, layer_scale_init_value=1e-6, expan_ratio=4, kernel_size=7, norm='layer'):
564
+ super().__init__()
565
+ self.dwconv = nn.Conv2d(dim, dim, kernel_size=kernel_size, padding=kernel_size // 2, groups=dim)
566
+ if norm=='layer':
567
+ self.norm = LayerNorm2d(dim, eps=1e-6)
568
+ else:
569
+ self.norm = nn.Identity()
570
+ self.pwconv1 = nn.Linear(dim, expan_ratio * dim)
571
+ self.act = nn.GELU()
572
+ self.pwconv2 = nn.Linear(expan_ratio * dim, dim)
573
+ self.gamma = nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) if layer_scale_init_value > 0 else None
574
+
575
+ def forward(self, x):
576
+ input = x
577
+ x = self.dwconv(x)
578
+ x = self.norm(x)
579
+ x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C)
580
+ x = self.pwconv1(x)
581
+ x = self.act(x)
582
+ x = self.pwconv2(x)
583
+ if self.gamma is not None:
584
+ x = self.gamma * x
585
+ x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W)
586
+
587
+ x = input + x
588
+ return x
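The classifier -> softmax -> disparity_regression chain in foundation_stereo.py turns the aggregated volume into an initial disparity as an expectation over the disparity axis (a soft argmin). A self-contained sketch of that step with a random stand-in cost volume (torch only; shapes are illustrative):

import torch
import torch.nn.functional as F

B, D, H, W = 1, 48, 8, 8                  # D plays the role of max_disp // 4
cost = torch.randn(B, D, H, W)            # stand-in for classifier(comb_volume).squeeze(1)

prob = F.softmax(cost, dim=1)             # per-pixel distribution over disparities
disp_values = torch.arange(D, dtype=prob.dtype).view(1, D, 1, 1)
init_disp = (prob * disp_values).sum(dim=1, keepdim=True)   # (B,1,H,W), as in disparity_regression

print(init_disp.shape)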
FoundationStereo_demo/core/update.py ADDED
@@ -0,0 +1,159 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+
10
+ import torch,pdb,os,sys
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ from einops import rearrange
14
+ from torch import einsum
15
+ code_dir = os.path.dirname(os.path.realpath(__file__))
16
+ sys.path.append(f'{code_dir}/../')
17
+ from core.submodule import *
18
+ from core.extractor import *
19
+
20
+ class DispHead(nn.Module):
21
+ def __init__(self, input_dim=128, hidden_dim=256, output_dim=1):
22
+ super(DispHead, self).__init__()
23
+ self.conv = nn.Sequential(
24
+ nn.Conv2d(input_dim, input_dim, kernel_size=3, padding=1),
25
+ nn.ReLU(),
26
+ EdgeNextConvEncoder(input_dim, expan_ratio=4, kernel_size=7, norm=None),
27
+ EdgeNextConvEncoder(input_dim, expan_ratio=4, kernel_size=7, norm=None),
28
+ nn.Conv2d(input_dim, output_dim, 3, padding=1),
29
+ )
30
+
31
+ def forward(self, x):
32
+ return self.conv(x)
33
+
34
+ class ConvGRU(nn.Module):
35
+ def __init__(self, hidden_dim, input_dim, kernel_size=3):
36
+ super(ConvGRU, self).__init__()
37
+ self.convz = nn.Conv2d(hidden_dim+input_dim, hidden_dim, kernel_size, padding=kernel_size//2)
38
+ self.convr = nn.Conv2d(hidden_dim+input_dim, hidden_dim, kernel_size, padding=kernel_size//2)
39
+ self.convq = nn.Conv2d(hidden_dim+input_dim, hidden_dim, kernel_size, padding=kernel_size//2)
40
+
41
+ def forward(self, h, cz, cr, cq, *x_list):
42
+ x = torch.cat(x_list, dim=1)
43
+ hx = torch.cat([h, x], dim=1)
44
+ z = torch.sigmoid(self.convz(hx) + cz)
45
+ r = torch.sigmoid(self.convr(hx) + cr)
46
+ q = torch.tanh(self.convq(torch.cat([r*h, x], dim=1)) + cq)
47
+ h = (1-z) * h + z * q
48
+ return h
49
+
50
+
51
+ class BasicMotionEncoder(nn.Module):
52
+ def __init__(self, args, ngroup=8):
53
+ super(BasicMotionEncoder, self).__init__()
54
+ self.args = args
55
+ cor_planes = args.corr_levels * (2*args.corr_radius + 1) * (ngroup+1)
56
+ self.convc1 = nn.Conv2d(cor_planes, 256, 1, padding=0)
57
+ self.convc2 = nn.Conv2d(256, 256, 3, padding=1)
58
+ self.convd1 = nn.Conv2d(1, 64, 7, padding=3)
59
+ self.convd2 = nn.Conv2d(64, 64, 3, padding=1)
60
+ self.conv = nn.Conv2d(64+256, 128-1, 3, padding=1)
61
+
62
+ def forward(self, disp, corr):
63
+ cor = F.relu(self.convc1(corr))
64
+ cor = F.relu(self.convc2(cor))
65
+ disp_ = F.relu(self.convd1(disp))
66
+ disp_ = F.relu(self.convd2(disp_))
67
+
68
+ cor_disp = torch.cat([cor, disp_], dim=1)
69
+ out = F.relu(self.conv(cor_disp))
70
+ return torch.cat([out, disp], dim=1)
71
+
72
+ def pool2x(x):
73
+ return F.avg_pool2d(x, 3, stride=2, padding=1)
74
+
75
+ def pool4x(x):
76
+ return F.avg_pool2d(x, 5, stride=4, padding=1)
77
+
78
+ def interp(x, dest):
79
+ interp_args = {'mode': 'bilinear', 'align_corners': True}
80
+ return F.interpolate(x, dest.shape[2:], **interp_args)
81
+
82
+
83
+ class RaftConvGRU(nn.Module):
84
+ def __init__(self, hidden_dim=128, input_dim=256, kernel_size=3):
85
+ super().__init__()
86
+ self.convz = nn.Conv2d(hidden_dim+input_dim, hidden_dim, kernel_size, padding=kernel_size // 2)
87
+ self.convr = nn.Conv2d(hidden_dim+input_dim, hidden_dim, kernel_size, padding=kernel_size // 2)
88
+ self.convq = nn.Conv2d(hidden_dim+input_dim, hidden_dim, kernel_size, padding=kernel_size // 2)
89
+
90
+ def forward(self, h, x, hx):
91
+ z = torch.sigmoid(self.convz(hx))
92
+ r = torch.sigmoid(self.convr(hx))
93
+ q = torch.tanh(self.convq(torch.cat([r*h, x], dim=1)))
94
+ h = (1-z) * h + z * q
95
+ return h
96
+
97
+
98
+ class SelectiveConvGRU(nn.Module):
99
+ def __init__(self, hidden_dim=128, input_dim=256, small_kernel_size=1, large_kernel_size=3, patch_size=None):
100
+ super(SelectiveConvGRU, self).__init__()
101
+ self.conv0 = nn.Sequential(
102
+ nn.Conv2d(input_dim, input_dim, kernel_size=3, padding=1),
103
+ nn.ReLU(),
104
+ )
105
+ self.conv1 = nn.Sequential(
106
+ nn.Conv2d(input_dim+hidden_dim, input_dim+hidden_dim, kernel_size=3, padding=1),
107
+ nn.ReLU(),
108
+ )
109
+ self.small_gru = RaftConvGRU(hidden_dim, input_dim, small_kernel_size)
110
+ self.large_gru = RaftConvGRU(hidden_dim, input_dim, large_kernel_size)
111
+
112
+ def forward(self, att, h, *x):
113
+ x = torch.cat(x, dim=1)
114
+ x = self.conv0(x)
115
+ hx = torch.cat([x, h], dim=1)
116
+ hx = self.conv1(hx)
117
+ h = self.small_gru(h, x, hx) * att + self.large_gru(h, x, hx) * (1 - att)
118
+
119
+ return h
120
+
121
+
122
+ class BasicSelectiveMultiUpdateBlock(nn.Module):
123
+ def __init__(self, args, hidden_dim=128, volume_dim=8):
124
+ super().__init__()
125
+ self.args = args
126
+ self.encoder = BasicMotionEncoder(args, volume_dim)
127
+
128
+ if args.n_gru_layers == 3:
129
+ self.gru16 = SelectiveConvGRU(hidden_dim, hidden_dim * 2)
130
+ if args.n_gru_layers >= 2:
131
+ self.gru08 = SelectiveConvGRU(hidden_dim, hidden_dim * (args.n_gru_layers == 3) + hidden_dim * 2)
132
+ self.gru04 = SelectiveConvGRU(hidden_dim, hidden_dim * (args.n_gru_layers > 1) + hidden_dim * 2)
133
+ self.disp_head = DispHead(hidden_dim, 256)
134
+ self.mask = nn.Sequential(
135
+ nn.Conv2d(128, 64, 3, padding=1),
136
+ nn.ReLU(inplace=True),
137
+ nn.Conv2d(64, 32, 3, padding=1),
138
+ nn.ReLU(inplace=True),
139
+ )
140
+
141
+ def forward(self, net, inp, corr, disp, att):
142
+ if self.args.n_gru_layers == 3:
143
+ net[2] = self.gru16(att[2], net[2], inp[2], pool2x(net[1]))
144
+ if self.args.n_gru_layers >= 2:
145
+ if self.args.n_gru_layers > 2:
146
+ net[1] = self.gru08(att[1], net[1], inp[1], pool2x(net[0]), interp(net[2], net[1]))
147
+ else:
148
+ net[1] = self.gru08(att[1], net[1], inp[1], pool2x(net[0]))
149
+
150
+ motion_features = self.encoder(disp, corr)
151
+ motion_features = torch.cat([inp[0], motion_features], dim=1)
152
+ if self.args.n_gru_layers > 1:
153
+ net[0] = self.gru04(att[0], net[0], motion_features, interp(net[1], net[0]))
154
+
155
+ delta_disp = self.disp_head(net[0])
156
+
157
+ # scale mask to balance gradients
158
+ mask = .25 * self.mask(net[0])
159
+ return net, mask, delta_disp
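The GRU variants above all share the same gating arithmetic: an update gate z, a reset gate r, and a candidate state q, blended into the new hidden state. A minimal stand-alone sketch of one such update with small ad-hoc convolutions (dimensions are illustrative, not the model's real hidden sizes):

import torch
import torch.nn as nn

hidden_dim, input_dim = 8, 8
convz = nn.Conv2d(hidden_dim + input_dim, hidden_dim, 3, padding=1)
convr = nn.Conv2d(hidden_dim + input_dim, hidden_dim, 3, padding=1)
convq = nn.Conv2d(hidden_dim + input_dim, hidden_dim, 3, padding=1)

h = torch.zeros(1, hidden_dim, 4, 4)     # hidden state
x = torch.randn(1, input_dim, 4, 4)      # motion features / context input

hx = torch.cat([h, x], dim=1)
z = torch.sigmoid(convz(hx))             # update gate: how much of the state to overwrite
r = torch.sigmoid(convr(hx))             # reset gate: how much old state feeds the candidate
q = torch.tanh(convq(torch.cat([r * h, x], dim=1)))  # candidate state
h = (1 - z) * h + z * q                  # same blend used by ConvGRU / RaftConvGRU above

print(h.shape)   # torch.Size([1, 8, 4, 4])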
FoundationStereo_demo/core/utils/utils.py ADDED
@@ -0,0 +1,64 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+
10
+
11
+ import torch,pdb,logging
12
+ import torch.nn.functional as F
13
+ import numpy as np
14
+ from scipy import interpolate
15
+
16
+
17
+ class InputPadder:
18
+ """ Pads images such that dimensions are divisible by divis_by (8 by default) """
19
+ def __init__(self, dims, mode='sintel', divis_by=8, force_square=False):
20
+ self.ht, self.wd = dims[-2:]
21
+ if force_square:
22
+ max_side = max(self.ht, self.wd)
23
+ pad_ht = ((max_side // divis_by) + 1) * divis_by - self.ht
24
+ pad_wd = ((max_side // divis_by) + 1) * divis_by - self.wd
25
+ else:
26
+ pad_ht = (((self.ht // divis_by) + 1) * divis_by - self.ht) % divis_by
27
+ pad_wd = (((self.wd // divis_by) + 1) * divis_by - self.wd) % divis_by
28
+ if mode == 'sintel':
29
+ self._pad = [pad_wd//2, pad_wd - pad_wd//2, pad_ht//2, pad_ht - pad_ht//2]
30
+ else:
31
+ self._pad = [pad_wd//2, pad_wd - pad_wd//2, 0, pad_ht]
32
+
33
+ def pad(self, *inputs):
34
+ assert all((x.ndim == 4) for x in inputs)
35
+ # Ensure padded tensors are contiguous to avoid cuDNN issues
36
+ return [F.pad(x, self._pad, mode='replicate').contiguous() for x in inputs]
37
+
38
+ def unpad(self, x):
39
+ assert x.ndim == 4
40
+ ht, wd = x.shape[-2:]
41
+ c = [self._pad[2], ht-self._pad[3], self._pad[0], wd-self._pad[1]]
42
+ # Ensure unpadded tensor is contiguous
43
+ return x[..., c[0]:c[1], c[2]:c[3]].contiguous()
44
+
45
+
46
+ def bilinear_sampler(img, coords, mode='bilinear', mask=False, low_memory=False):
47
+ """ Wrapper for grid_sample, uses pixel coordinates """
48
+ H, W = img.shape[-2:]
49
+ xgrid, ygrid = coords.split([1,1], dim=-1)
50
+ xgrid = 2*xgrid/(W-1) - 1 # Normalize to [-1,1]
51
+ assert torch.unique(ygrid).numel() == 1 and H == 1 # This is a stereo problem
52
+ grid = torch.cat([xgrid, ygrid], dim=-1).to(img.dtype).contiguous()
53
+ img = F.grid_sample(img, grid, align_corners=True).contiguous()
54
+ if mask:
55
+ mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1)
56
+ return img, mask.float().contiguous()
57
+ return img
58
+
59
+
60
+ def coords_grid(batch, ht, wd):
61
+ coords = torch.meshgrid(torch.arange(ht), torch.arange(wd))
62
+ coords = torch.stack(coords[::-1], dim=0).float()
63
+ return coords[None].repeat(batch, 1, 1, 1)
64
+
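A quick round-trip check of the padding logic above, used the same way run_hierachical does with divis_by=32. The import path assumes FoundationStereo_demo/ is on sys.path (and scipy installed for the module-level import); otherwise the class can be pasted in directly:

import torch
from core.utils.utils import InputPadder   # assumes FoundationStereo_demo/ is on sys.path

left = torch.randn(1, 3, 423, 747)          # arbitrary size, not divisible by 32
right = torch.randn(1, 3, 423, 747)

padder = InputPadder(left.shape[-2:], divis_by=32, force_square=False)
left_p, right_p = padder.pad(left, right)
print(left_p.shape)                         # both spatial dims are now multiples of 32

print(torch.equal(padder.unpad(left_p), left))   # True: pad followed by unpad is lossless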
FoundationStereo_demo/depth_anything/LICENSE.txt ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
FoundationStereo_demo/depth_anything/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ # depth_anything package
2
+ # This file allows depth_anything to be imported as a package
FoundationStereo_demo/depth_anything/blocks.py ADDED
@@ -0,0 +1,153 @@
1
+ import torch.nn as nn
2
+
3
+
4
+ def _make_scratch(in_shape, out_shape, groups=1, expand=False):
5
+ scratch = nn.Module()
6
+
7
+ out_shape1 = out_shape
8
+ out_shape2 = out_shape
9
+ out_shape3 = out_shape
10
+ if len(in_shape) >= 4:
11
+ out_shape4 = out_shape
12
+
13
+ if expand:
14
+ out_shape1 = out_shape
15
+ out_shape2 = out_shape*2
16
+ out_shape3 = out_shape*4
17
+ if len(in_shape) >= 4:
18
+ out_shape4 = out_shape*8
19
+
20
+ scratch.layer1_rn = nn.Conv2d(
21
+ in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
22
+ )
23
+ scratch.layer2_rn = nn.Conv2d(
24
+ in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
25
+ )
26
+ scratch.layer3_rn = nn.Conv2d(
27
+ in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
28
+ )
29
+ if len(in_shape) >= 4:
30
+ scratch.layer4_rn = nn.Conv2d(
31
+ in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
32
+ )
33
+
34
+ return scratch
35
+
36
+
37
+ class ResidualConvUnit(nn.Module):
38
+ """Residual convolution module.
39
+ """
40
+
41
+ def __init__(self, features, activation, bn):
42
+ """Init.
43
+
44
+ Args:
45
+ features (int): number of features
46
+ """
47
+ super().__init__()
48
+
49
+ self.bn = bn
50
+
51
+ self.groups=1
52
+
53
+ self.conv1 = nn.Conv2d(
54
+ features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
55
+ )
56
+
57
+ self.conv2 = nn.Conv2d(
58
+ features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
59
+ )
60
+
61
+ if self.bn==True:
62
+ self.bn1 = nn.BatchNorm2d(features)
63
+ self.bn2 = nn.BatchNorm2d(features)
64
+
65
+ self.activation = activation
66
+
67
+ self.skip_add = nn.quantized.FloatFunctional()
68
+
69
+ def forward(self, x):
70
+ """Forward pass.
71
+
72
+ Args:
73
+ x (tensor): input
74
+
75
+ Returns:
76
+ tensor: output
77
+ """
78
+
79
+ out = self.activation(x)
80
+ out = self.conv1(out)
81
+ if self.bn==True:
82
+ out = self.bn1(out)
83
+
84
+ out = self.activation(out)
85
+ out = self.conv2(out)
86
+ if self.bn==True:
87
+ out = self.bn2(out)
88
+
89
+ if self.groups > 1:
90
+ out = self.conv_merge(out)
91
+
92
+ return self.skip_add.add(out, x)
93
+
94
+
95
+ class FeatureFusionBlock(nn.Module):
96
+ """Feature fusion block.
97
+ """
98
+
99
+ def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True, size=None):
100
+ """Init.
101
+
102
+ Args:
103
+ features (int): number of features
104
+ """
105
+ super(FeatureFusionBlock, self).__init__()
106
+
107
+ self.deconv = deconv
108
+ self.align_corners = align_corners
109
+
110
+ self.groups=1
111
+
112
+ self.expand = expand
113
+ out_features = features
114
+ if self.expand==True:
115
+ out_features = features//2
116
+
117
+ self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
118
+
119
+ self.resConfUnit1 = ResidualConvUnit(features, activation, bn)
120
+ self.resConfUnit2 = ResidualConvUnit(features, activation, bn)
121
+
122
+ self.skip_add = nn.quantized.FloatFunctional()
123
+
124
+ self.size=size
125
+
126
+ def forward(self, *xs, size=None):
127
+ """Forward pass.
128
+
129
+ Returns:
130
+ tensor: output
131
+ """
132
+ output = xs[0]
133
+
134
+ if len(xs) == 2:
135
+ res = self.resConfUnit1(xs[1])
136
+ output = self.skip_add.add(output, res)
137
+
138
+ output = self.resConfUnit2(output)
139
+
140
+ if (size is None) and (self.size is None):
141
+ modifier = {"scale_factor": 2}
142
+ elif size is None:
143
+ modifier = {"size": self.size}
144
+ else:
145
+ modifier = {"size": size}
146
+
147
+ output = nn.functional.interpolate(
148
+ output, **modifier, mode="bilinear", align_corners=self.align_corners
149
+ )
150
+
151
+ output = self.out_conv(output)
152
+
153
+ return output
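The fusion decoder above follows the DPT design: each backbone level is first projected by a scratch layer, then merged top-down by FeatureFusionBlock, which refines the skip input, adds it to the incoming path, refines the sum, and upsamples. A minimal smoke-test sketch, assuming FoundationStereo_demo is on sys.path; the channel counts and tensor sizes below are made up for illustration:

import torch
import torch.nn as nn
from depth_anything.blocks import FeatureFusionBlock, _make_scratch  # import path is an assumption

# Project hypothetical backbone features to a common width of 64 channels.
scratch = _make_scratch(in_shape=[48, 96, 192, 384], out_shape=64)
fusion = FeatureFusionBlock(64, nn.ReLU(False), bn=False, align_corners=True)

decoder_path = scratch.layer4_rn(torch.randn(1, 384, 16, 16))  # incoming top-down path
skip = scratch.layer3_rn(torch.randn(1, 192, 16, 16))          # skip feature at the same spatial size

# resConfUnit1 refines the skip, the sum is refined by resConfUnit2, then upsampled to `size`.
out = fusion(decoder_path, skip, size=(32, 32))
print(out.shape)  # torch.Size([1, 64, 32, 32])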
FoundationStereo_demo/depth_anything/dpt.py ADDED
@@ -0,0 +1,203 @@
1
+ import argparse
2
+ import torch,os,sys,pdb
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ code_dir = os.path.dirname(os.path.realpath(__file__))
6
+ sys.path.append(f'{code_dir}/../')
7
+ from depth_anything.blocks import FeatureFusionBlock, _make_scratch
8
+
9
+
10
+ def _make_fusion_block(features, use_bn, size = None):
11
+ return FeatureFusionBlock(
12
+ features,
13
+ nn.ReLU(False),
14
+ deconv=False,
15
+ bn=use_bn,
16
+ expand=False,
17
+ align_corners=True,
18
+ size=size,
19
+ )
20
+
21
+
22
+ class DPTHead(nn.Module):
23
+ def __init__(self, nclass, in_channels, features=256, use_bn=False, out_channels=[256, 512, 1024, 1024], use_clstoken=False):
24
+ super(DPTHead, self).__init__()
25
+
26
+ self.nclass = nclass
27
+ self.use_clstoken = use_clstoken
28
+
29
+ self.projects = nn.ModuleList([
30
+ nn.Conv2d(
31
+ in_channels=in_channels,
32
+ out_channels=out_channel,
33
+ kernel_size=1,
34
+ stride=1,
35
+ padding=0,
36
+ ) for out_channel in out_channels
37
+ ])
38
+
39
+ self.resize_layers = nn.ModuleList([
40
+ nn.ConvTranspose2d(
41
+ in_channels=out_channels[0],
42
+ out_channels=out_channels[0],
43
+ kernel_size=4,
44
+ stride=4,
45
+ padding=0),
46
+ nn.ConvTranspose2d(
47
+ in_channels=out_channels[1],
48
+ out_channels=out_channels[1],
49
+ kernel_size=2,
50
+ stride=2,
51
+ padding=0),
52
+ nn.Identity(),
53
+ nn.Conv2d(
54
+ in_channels=out_channels[3],
55
+ out_channels=out_channels[3],
56
+ kernel_size=3,
57
+ stride=2,
58
+ padding=1)
59
+ ])
60
+
61
+ if use_clstoken:
62
+ self.readout_projects = nn.ModuleList()
63
+ for _ in range(len(self.projects)):
64
+ self.readout_projects.append(
65
+ nn.Sequential(
66
+ nn.Linear(2 * in_channels, in_channels),
67
+ nn.GELU()))
68
+
69
+ self.scratch = _make_scratch(
70
+ out_channels,
71
+ features,
72
+ groups=1,
73
+ expand=False,
74
+ )
75
+
76
+ self.scratch.stem_transpose = None
77
+
78
+ self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
79
+ self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
80
+ self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
81
+ self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
82
+
83
+ head_features_1 = features
84
+ head_features_2 = 32
85
+
86
+ if nclass > 1:
87
+ self.scratch.output_conv = nn.Sequential(
88
+ nn.Conv2d(head_features_1, head_features_1, kernel_size=3, stride=1, padding=1),
89
+ nn.ReLU(True),
90
+ nn.Conv2d(head_features_1, nclass, kernel_size=1, stride=1, padding=0),
91
+ )
92
+ else:
93
+ self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1)
94
+
95
+ self.scratch.output_conv2 = nn.Sequential(
96
+ nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1),
97
+ nn.ReLU(True),
98
+ nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0),
99
+ nn.ReLU(True),
100
+ nn.Identity(),
101
+ )
102
+
103
+ def forward(self, out_features, patch_h, patch_w, return_intermediate=False, patch_size=14):
104
+ out = []
105
+ for i, x in enumerate(out_features):
106
+ if self.use_clstoken:
107
+ x, cls_token = x[0], x[1]
108
+ readout = cls_token.unsqueeze(1).expand_as(x)
109
+ x = self.readout_projects[i](torch.cat((x, readout), -1))
110
+ else:
111
+ x = x[0]
112
+
113
+ x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w))
114
+
115
+ x = self.projects[i](x)
116
+ x = self.resize_layers[i](x)
117
+
118
+ out.append(x)
119
+
120
+ layer_1, layer_2, layer_3, layer_4 = out
121
+
122
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
123
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
124
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
125
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
126
+
127
+ path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
128
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:])
129
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:])
130
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
131
+
132
+ out = self.scratch.output_conv1(path_1)
133
+ out = F.interpolate(out, (int(patch_h * patch_size), int(patch_w * patch_size)), mode="bilinear", align_corners=True)
134
+ if return_intermediate:
135
+ depth = self.scratch.output_conv2(out)
136
+ depth = F.relu(depth)
137
+ disp = 1/depth
138
+ disp[depth==0] = 0
139
+ disp = disp/disp.max()
140
+ return out, path_1, path_2, path_3, path_4, disp
141
+
142
+ else:
143
+ out = self.scratch.output_conv2(out)
144
+ return out
145
+
146
+
147
+ class DPT_DINOv2(nn.Module):
148
+ def __init__(self, encoder='vitl', features=256, out_channels=[256, 512, 1024, 1024], use_bn=False, use_clstoken=False, pretrained_dino=False):
149
+ super(DPT_DINOv2, self).__init__()
150
+
151
+ assert encoder in ['vits', 'vitb', 'vitl']
152
+
153
+ # in case the Internet connection is not stable, please load the DINOv2 locally
154
+ # if localhub:
155
+ # self.pretrained = torch.hub.load('torchhub/facebookresearch_dinov2_main', 'dinov2_{:}14'.format(encoder), source='local', pretrained=False)
156
+ # else:
157
+ self.pretrained = torch.hub.load('facebookresearch/dinov2', 'dinov2_{:}14'.format(encoder), pretrained=pretrained_dino)
158
+
159
+
160
+ dim = self.pretrained.blocks[0].attn.qkv.in_features
161
+
162
+ self.depth_head = DPTHead(1, dim, features, use_bn, out_channels=out_channels, use_clstoken=use_clstoken)
163
+
164
+ def forward(self, x):
165
+ h, w = x.shape[-2:]
166
+
167
+ features = self.pretrained.get_intermediate_layers(x, 4, return_class_token=True)
168
+ patch_size = self.pretrained.patch_size
169
+ patch_h, patch_w = h // patch_size, w // patch_size
170
+ output = self.depth_head(features, patch_h, patch_w, patch_size=patch_size, return_intermediate=True)
171
+ return output
172
+
173
+
174
+ class DepthAnything(DPT_DINOv2):
175
+ def __init__(self, config):
176
+ super().__init__(**config)
177
+
178
+ def forward(self, x):
179
+ h, w = x.shape[-2:]
180
+
181
+ features = self.pretrained.get_intermediate_layers(x, 4, return_class_token=True)
182
+ patch_size = self.pretrained.patch_size
183
+ patch_h, patch_w = h // patch_size, w // patch_size
184
+ depth = self.depth_head(features, patch_h, patch_w, patch_size=patch_size)
185
+ depth = F.interpolate(depth, size=(h, w), mode="bilinear", align_corners=True)
186
+ depth = F.relu(depth)
187
+
188
+ return depth.squeeze(1)
189
+
190
+
191
+ if __name__ == '__main__':
192
+ parser = argparse.ArgumentParser()
193
+ parser.add_argument(
194
+ "--encoder",
195
+ default="vits",
196
+ type=str,
197
+ choices=["vits", "vitb", "vitl"],
198
+ )
199
+ args = parser.parse_args()
200
+
201
+ model = DepthAnything.from_pretrained("LiheYoung/depth_anything_{:}14".format(args.encoder))
202
+
203
+ print(model)
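A hedged usage sketch for the model above. The encoder, features, and out_channels values are illustrative choices, not read from the demo, and constructing DPT_DINOv2 pulls DINOv2 through torch.hub, so network access (or a local hub cache) is needed:

import torch
from depth_anything.dpt import DPT_DINOv2  # import path is an assumption

model = DPT_DINOv2(encoder='vits', features=64, out_channels=[48, 96, 192, 384],
                   pretrained_dino=False).eval()

# Input height and width must be multiples of the DINOv2 patch size (14).
x = torch.randn(1, 3, 224, 280)
with torch.no_grad():
    feat, path_1, path_2, path_3, path_4, disp = model(x)

# `feat` is a dense feature map at the input resolution, `path_1..4` are the intermediate
# fusion outputs, and `disp` is a max-normalized relative inverse-depth map.
print(feat.shape, disp.shape)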
FoundationStereo_demo/depth_anything/util/transform.py ADDED
@@ -0,0 +1,248 @@
1
+ import random
2
+ from PIL import Image, ImageOps, ImageFilter
3
+ import torch
4
+ from torchvision import transforms
5
+ import torch.nn.functional as F
6
+
7
+ import numpy as np
8
+ import cv2
9
+ import math
10
+
11
+
12
+ def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
13
+ """Rezise the sample to ensure the given size. Keeps aspect ratio.
14
+
15
+ Args:
16
+ sample (dict): sample
17
+ size (tuple): image size
18
+
19
+ Returns:
20
+ tuple: new size
21
+ """
22
+ shape = list(sample["disparity"].shape)
23
+
24
+ if shape[0] >= size[0] and shape[1] >= size[1]:
25
+ return sample
26
+
27
+ scale = [0, 0]
28
+ scale[0] = size[0] / shape[0]
29
+ scale[1] = size[1] / shape[1]
30
+
31
+ scale = max(scale)
32
+
33
+ shape[0] = math.ceil(scale * shape[0])
34
+ shape[1] = math.ceil(scale * shape[1])
35
+
36
+ # resize
37
+ sample["image"] = cv2.resize(
38
+ sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method
39
+ )
40
+
41
+ sample["disparity"] = cv2.resize(
42
+ sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST
43
+ )
44
+ sample["mask"] = cv2.resize(
45
+ sample["mask"].astype(np.float32),
46
+ tuple(shape[::-1]),
47
+ interpolation=cv2.INTER_NEAREST,
48
+ )
49
+ sample["mask"] = sample["mask"].astype(bool)
50
+
51
+ return tuple(shape)
52
+
53
+
54
+ class Resize(object):
55
+ """Resize sample to given size (width, height).
56
+ """
57
+
58
+ def __init__(
59
+ self,
60
+ width,
61
+ height,
62
+ resize_target=True,
63
+ keep_aspect_ratio=False,
64
+ ensure_multiple_of=1,
65
+ resize_method="lower_bound",
66
+ image_interpolation_method=cv2.INTER_AREA,
67
+ ):
68
+ """Init.
69
+
70
+ Args:
71
+ width (int): desired output width
72
+ height (int): desired output height
73
+ resize_target (bool, optional):
74
+ True: Resize the full sample (image, mask, target).
75
+ False: Resize image only.
76
+ Defaults to True.
77
+ keep_aspect_ratio (bool, optional):
78
+ True: Keep the aspect ratio of the input sample.
79
+ Output sample might not have the given width and height, and
80
+ resize behaviour depends on the parameter 'resize_method'.
81
+ Defaults to False.
82
+ ensure_multiple_of (int, optional):
83
+ Output width and height is constrained to be multiple of this parameter.
84
+ Defaults to 1.
85
+ resize_method (str, optional):
86
+ "lower_bound": Output will be at least as large as the given size.
87
+ "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
88
+ "minimal": Scale as least as possible. (Output size might be smaller than given size.)
89
+ Defaults to "lower_bound".
90
+ """
91
+ self.__width = width
92
+ self.__height = height
93
+
94
+ self.__resize_target = resize_target
95
+ self.__keep_aspect_ratio = keep_aspect_ratio
96
+ self.__multiple_of = ensure_multiple_of
97
+ self.__resize_method = resize_method
98
+ self.__image_interpolation_method = image_interpolation_method
99
+
100
+ def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
101
+ y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
102
+
103
+ if max_val is not None and y > max_val:
104
+ y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
105
+
106
+ if y < min_val:
107
+ y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
108
+
109
+ return y
110
+
111
+ def get_size(self, width, height):
112
+ # determine new height and width
113
+ scale_height = self.__height / height
114
+ scale_width = self.__width / width
115
+
116
+ if self.__keep_aspect_ratio:
117
+ if self.__resize_method == "lower_bound":
118
+ # scale such that output size is lower bound
119
+ if scale_width > scale_height:
120
+ # fit width
121
+ scale_height = scale_width
122
+ else:
123
+ # fit height
124
+ scale_width = scale_height
125
+ elif self.__resize_method == "upper_bound":
126
+ # scale such that output size is upper bound
127
+ if scale_width < scale_height:
128
+ # fit width
129
+ scale_height = scale_width
130
+ else:
131
+ # fit height
132
+ scale_width = scale_height
133
+ elif self.__resize_method == "minimal":
134
+ # scale as little as possible
135
+ if abs(1 - scale_width) < abs(1 - scale_height):
136
+ # fit width
137
+ scale_height = scale_width
138
+ else:
139
+ # fit height
140
+ scale_width = scale_height
141
+ else:
142
+ raise ValueError(
143
+ f"resize_method {self.__resize_method} not implemented"
144
+ )
145
+
146
+ if self.__resize_method == "lower_bound":
147
+ new_height = self.constrain_to_multiple_of(
148
+ scale_height * height, min_val=self.__height
149
+ )
150
+ new_width = self.constrain_to_multiple_of(
151
+ scale_width * width, min_val=self.__width
152
+ )
153
+ elif self.__resize_method == "upper_bound":
154
+ new_height = self.constrain_to_multiple_of(
155
+ scale_height * height, max_val=self.__height
156
+ )
157
+ new_width = self.constrain_to_multiple_of(
158
+ scale_width * width, max_val=self.__width
159
+ )
160
+ elif self.__resize_method == "minimal":
161
+ new_height = self.constrain_to_multiple_of(scale_height * height)
162
+ new_width = self.constrain_to_multiple_of(scale_width * width)
163
+ else:
164
+ raise ValueError(f"resize_method {self.__resize_method} not implemented")
165
+
166
+ return (new_width, new_height)
167
+
168
+ def __call__(self, sample):
169
+ width, height = self.get_size(
170
+ sample["image"].shape[1], sample["image"].shape[0]
171
+ )
172
+
173
+ # resize sample
174
+ sample["image"] = cv2.resize(
175
+ sample["image"],
176
+ (width, height),
177
+ interpolation=self.__image_interpolation_method,
178
+ )
179
+
180
+ if self.__resize_target:
181
+ if "disparity" in sample:
182
+ sample["disparity"] = cv2.resize(
183
+ sample["disparity"],
184
+ (width, height),
185
+ interpolation=cv2.INTER_NEAREST,
186
+ )
187
+
188
+ if "depth" in sample:
189
+ sample["depth"] = cv2.resize(
190
+ sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST
191
+ )
192
+
193
+ if "semseg_mask" in sample:
194
+ # sample["semseg_mask"] = cv2.resize(
195
+ # sample["semseg_mask"], (width, height), interpolation=cv2.INTER_NEAREST
196
+ # )
197
+ sample["semseg_mask"] = F.interpolate(torch.from_numpy(sample["semseg_mask"]).float()[None, None, ...], (height, width), mode='nearest').numpy()[0, 0]
198
+
199
+ if "mask" in sample:
200
+ sample["mask"] = cv2.resize(
201
+ sample["mask"].astype(np.float32),
202
+ (width, height),
203
+ interpolation=cv2.INTER_NEAREST,
204
+ )
205
+ # sample["mask"] = sample["mask"].astype(bool)
206
+
207
+ # print(sample['image'].shape, sample['depth'].shape)
208
+ return sample
209
+
210
+
211
+ class NormalizeImage(object):
212
+ """Normlize image by given mean and std.
213
+ """
214
+
215
+ def __init__(self, mean, std):
216
+ self.__mean = mean
217
+ self.__std = std
218
+
219
+ def __call__(self, sample):
220
+ sample["image"] = (sample["image"] - self.__mean) / self.__std
221
+
222
+ return sample
223
+
224
+
225
+ class PrepareForNet(object):
226
+ """Prepare sample for usage as network input.
227
+ """
228
+
229
+ def __init__(self):
230
+ pass
231
+
232
+ def __call__(self, sample):
233
+ image = np.transpose(sample["image"], (2, 0, 1))
234
+ sample["image"] = np.ascontiguousarray(image).astype(np.float32)
235
+
236
+ if "mask" in sample:
237
+ sample["mask"] = sample["mask"].astype(np.float32)
238
+ sample["mask"] = np.ascontiguousarray(sample["mask"])
239
+
240
+ if "depth" in sample:
241
+ depth = sample["depth"].astype(np.float32)
242
+ sample["depth"] = np.ascontiguousarray(depth)
243
+
244
+ if "semseg_mask" in sample:
245
+ sample["semseg_mask"] = sample["semseg_mask"].astype(np.float32)
246
+ sample["semseg_mask"] = np.ascontiguousarray(sample["semseg_mask"])
247
+
248
+ return sample
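The three transforms above are designed to be chained into a single preprocessing pipeline for the monocular branch. A hedged sketch, assuming ImageNet normalization statistics, a 518-pixel lower bound rounded to multiples of 14, and the example image shipped in assets/ (all of these values are illustrative choices, not read from the demo apps):

import cv2
import torch
from torchvision.transforms import Compose
from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet  # import path is an assumption

transform = Compose([
    Resize(width=518, height=518, resize_target=False, keep_aspect_ratio=True,
           ensure_multiple_of=14, resize_method='lower_bound',
           image_interpolation_method=cv2.INTER_CUBIC),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])

# Each transform operates on a dict with an "image" key (H x W x 3, float in [0, 1]).
image = cv2.cvtColor(cv2.imread('assets/example1/left.png'), cv2.COLOR_BGR2RGB) / 255.0
sample = transform({'image': image})
tensor = torch.from_numpy(sample['image']).unsqueeze(0)  # (1, 3, H, W), H and W multiples of 14
print(tensor.shape)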
assets/example1/K.txt ADDED
@@ -0,0 +1,2 @@
1
+ 754.6680908203125 0.0 489.3794860839844 0.0 754.6680908203125 265.16162109375 0.0 0.0 1.0
2
+ 0.063
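The two values above appear to be a row-major 3x3 intrinsic matrix (fx 0 cx; 0 fy cy; 0 0 1) followed by the stereo baseline, presumably in meters. A hypothetical parsing sketch that also turns an example disparity into metric depth via the pinhole relation Z = fx * B / d:

import numpy as np

# Hypothetical parser for this two-line K.txt layout (format inferred from the file above).
with open('assets/example1/K.txt') as f:
    lines = f.read().splitlines()

K = np.array([float(v) for v in lines[0].split()]).reshape(3, 3)  # row-major intrinsics
baseline = float(lines[1])                                        # baseline, assumed to be in meters

fx = K[0, 0]
disparity_px = 40.0                     # an arbitrary example disparity in pixels
depth_m = fx * baseline / disparity_px  # pinhole stereo: Z = f * B / d
print(f"fx={fx:.1f}, baseline={baseline} m, depth at d=40 px: {depth_m:.2f} m")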
assets/example1/left.png ADDED

Git LFS Details

  • SHA256: f080f84d7b2e28ba110eea80f0504cad9390f2399d6ffde6cd1e668200c3ef48
  • Pointer size: 131 Bytes
  • Size of remote file: 719 kB
assets/example1/right.png ADDED

Git LFS Details

  • SHA256: 337991aa0b35417ae64d6f66819522c233a4455eacfd34c9dc114ad569ec50f4
  • Pointer size: 131 Bytes
  • Size of remote file: 720 kB
assets/example2/K.txt ADDED
@@ -0,0 +1,9 @@
1
+ cam0=[1733.74 0 792.27; 0 1733.74 541.89; 0 0 1]
2
+ cam1=[1733.74 0 792.27; 0 1733.74 541.89; 0 0 1]
3
+ doffs=0
4
+ baseline=536.62
5
+ width=1920
6
+ height=1080
7
+ ndisp=170
8
+ vmin=55
9
+ vmax=142
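This second calibration file follows the Middlebury calib.txt convention: bracketed row-major camera matrices, a disparity offset doffs, a baseline (in millimeters here), the image size, and disparity bounds. A hypothetical parser sketch; with a disparity offset the depth relation becomes Z = baseline * f / (d + doffs):

import numpy as np

def parse_middlebury_calib(path):
    # Hypothetical helper: reads key=value lines, turning "[a b; c d]" blocks into arrays.
    calib = {}
    for line in open(path):
        line = line.strip()
        if not line:
            continue
        key, value = line.split('=', 1)
        if value.startswith('['):
            rows = value.strip('[]').split(';')
            calib[key] = np.array([[float(v) for v in row.split()] for row in rows])
        else:
            calib[key] = float(value)
    return calib

calib = parse_middlebury_calib('assets/example2/K.txt')
fx = calib['cam0'][0, 0]
depth_mm = calib['baseline'] * fx / (60.0 + calib['doffs'])  # 60 px is an arbitrary example disparity
print(fx, depth_mm)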
assets/example2/left.png ADDED

Git LFS Details

  • SHA256: 280be6eac4b525eee6d49f0afd32c11ef0b83d2cad3e77e946fe525fda16a355
  • Pointer size: 132 Bytes
  • Size of remote file: 2.76 MB
assets/example2/right.png ADDED

Git LFS Details

  • SHA256: 97be2568394bae63e26bf62343bfd04adfb372c9c96710784745bd7130a0c7d8
  • Pointer size: 132 Bytes
  • Size of remote file: 2.76 MB