Gamahea committed on
Commit
aad9d66
·
1 Parent(s): 7d5476d

Deploy Music Generation Studio - 2025-12-12 16:01
.gitignore ADDED
@@ -0,0 +1,9 @@
+__pycache__/
+*.pyc
+*.pyo
+.Python
+*.log
+models/
+outputs/
+logs/
+.env
README.md CHANGED
@@ -1,14 +1,74 @@
 ---
-title: Lemm Test 100
-emoji: 👍
-colorFrom: blue
-colorTo: green
+title: Music Generation Studio
+emoji: 🎵
+colorFrom: purple
+colorTo: pink
 sdk: gradio
-sdk_version: 6.1.0
+sdk_version: 4.44.0
 app_file: app.py
 pinned: false
-license: apache-2.0
-short_description: Testing new LEMM version with new pipeline and models
+license: mit
+python_version: 3.11
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# 🎵 Music Generation Studio
+
+Create AI-powered music with intelligent prompt analysis and context-aware generation using DiffRhythm2 and LyricMind AI.
+
+## Features
+
+- **Intelligent Music Generation**: DiffRhythm2 model for high-quality music with vocals
+- **Smart Lyrics Generation**: LyricMind AI for context-aware lyric creation
+- **Prompt Analysis**: Automatically detects genre, BPM, and mood from your description
+- **Flexible Vocal Modes**:
+  - Instrumental: Pure music without vocals
+  - User Lyrics: Provide your own lyrics
+  - Auto Lyrics: AI-generated lyrics based on the prompt
+- **Timeline Management**: Build complete songs clip-by-clip
+- **Export**: Download your creations in WAV, MP3, or FLAC formats
+
+## How to Use
+
+1. **Generate Music**:
+   - Enter a descriptive prompt (e.g., "energetic rock song with electric guitar at 140 BPM")
+   - Choose a vocal mode (Instrumental, User Lyrics, or Auto Lyrics)
+   - Set the duration (10-60 seconds)
+   - Click "Generate Music Clip"
+
+2. **Manage the Timeline**:
+   - View all generated clips in the timeline
+   - Remove specific clips or clear them all
+   - Clips are arranged sequentially
+
+3. **Export**:
+   - Enter a filename
+   - Choose a format (WAV recommended for best quality)
+   - Download your complete song
+
+## Models
+
+- **DiffRhythm2**: Music generation with integrated vocals ([ASLP-lab/DiffRhythm2](https://huggingface.co/ASLP-lab/DiffRhythm2))
+- **MuQ-MuLan**: Music style encoding ([OpenMuQ/MuQ-MuLan-large](https://huggingface.co/OpenMuQ/MuQ-MuLan-large))
+
+## Performance
+
+⏱️ Generation time: ~2-4 minutes per 30-second clip on CPU (HuggingFace Spaces free tier)
+
+💡 Tip: Start with shorter durations (10-20 seconds) for faster results
+
+## Technical Details
+
+- Built with Gradio and PyTorch
+- Uses DiffRhythm2 for music generation with vocals
+- Employs flow-matching techniques for high-quality audio synthesis
+- Supports multiple languages for lyrics (English, Chinese, Japanese)
+
+## Credits
+
+- DiffRhythm2 by ASLP-lab
+- MuQ-MuLan by OpenMuQ
+- Application interface and integration by the Music Generation App Team
+
+## License
+
+MIT License - See LICENSE file for details
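
The prompt analysis described above boils down to a plain dict that the UI code reads. A minimal sketch of how it is consumed, using only the keys and fallbacks that `app.py` in this commit actually uses (`genres`, `bpm`, `mood`); it assumes `backend/` is on `sys.path`, which `app.py` arranges:

```python
# Sketch only: mirrors how app.py below consumes PromptAnalyzer output.
from utils.prompt_analyzer import PromptAnalyzer

analysis = PromptAnalyzer.analyze("energetic rock song with electric guitar at 140 BPM")

# Fallback values are the ones app.py uses when a key is missing.
genre = analysis.get('genres', ['general'])[0] if analysis.get('genres') else 'general'
bpm = analysis.get('bpm', 120)
mood = analysis.get('mood', 'neutral')
print(f"Genre: {genre} | BPM: {bpm} | Mood: {mood}")
```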
app.py ADDED
@@ -0,0 +1,487 @@
+"""
+Music Generation Studio - HuggingFace Spaces Deployment
+Main application file for the Gradio interface
+"""
+import os
+import sys
+import gradio as gr
+import logging
+from pathlib import Path
+import shutil
+import subprocess
+
+# Run DiffRhythm2 source setup if needed
+setup_script = Path(__file__).parent / "setup_diffrhythm2_src.sh"
+if setup_script.exists():
+    try:
+        subprocess.run(["bash", str(setup_script)], check=True)
+    except Exception as e:
+        print(f"Warning: Failed to run setup script: {e}")
+
+# Configure environment for HuggingFace Spaces (espeak-ng paths, etc.)
+import hf_config
+
+# Setup paths for HuggingFace Spaces
+SPACE_DIR = Path(__file__).parent
+sys.path.insert(0, str(SPACE_DIR / 'backend'))
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+# Import services
+try:
+    from services.diffrhythm_service import DiffRhythmService
+    from services.lyricmind_service import LyricMindService
+    from services.timeline_service import TimelineService
+    from services.export_service import ExportService
+    from config.settings import Config
+    from utils.prompt_analyzer import PromptAnalyzer
+except ImportError as e:
+    logger.error(f"Import error: {e}")
+    raise
+
+# Initialize configuration
+config = Config()
+
+# Create necessary directories
+os.makedirs("outputs", exist_ok=True)
+os.makedirs("outputs/music", exist_ok=True)
+os.makedirs("outputs/mixed", exist_ok=True)
+os.makedirs("models", exist_ok=True)
+os.makedirs("logs", exist_ok=True)
+
+# Initialize services
+timeline_service = TimelineService()
+export_service = ExportService()
+
+# Lazy-load AI services (heavy models)
+diffrhythm_service = None
+lyricmind_service = None
+
+def get_diffrhythm_service():
+    """Lazy load DiffRhythm service"""
+    global diffrhythm_service
+    if diffrhythm_service is None:
+        logger.info("Loading DiffRhythm2 model...")
+        diffrhythm_service = DiffRhythmService(model_path=config.DIFFRHYTHM_MODEL_PATH)
+        logger.info("DiffRhythm2 model loaded")
+    return diffrhythm_service
+
+def get_lyricmind_service():
+    """Lazy load LyricMind service"""
+    global lyricmind_service
+    if lyricmind_service is None:
+        logger.info("Loading LyricMind model...")
+        lyricmind_service = LyricMindService(model_path=config.LYRICMIND_MODEL_PATH)
+        logger.info("LyricMind model loaded")
+    return lyricmind_service
+
+def generate_lyrics(prompt: str, duration: int, progress=gr.Progress()):
+    """Generate lyrics from prompt using analysis"""
+    try:
+        if not prompt or not prompt.strip():
+            return "❌ Please enter a prompt"
+
+        progress(0, desc="🔍 Analyzing prompt...")
+        logger.info(f"Generating lyrics for: {prompt}")
+
+        # Analyze prompt
+        analysis = PromptAnalyzer.analyze(prompt)
+        genre = analysis.get('genres', ['general'])[0] if analysis.get('genres') else 'general'
+        mood = analysis.get('mood', 'unknown')
+
+        logger.info(f"Analysis - Genre: {genre}, Mood: {mood}")
+
+        progress(0.3, desc=f"✍️ Generating {genre} lyrics...")
+
+        service = get_lyricmind_service()
+        lyrics = service.generate(
+            prompt=prompt,
+            duration=duration,
+            prompt_analysis=analysis
+        )
+
+        progress(1.0, desc="✅ Lyrics generated!")
+        return lyrics
+
+    except Exception as e:
+        logger.error(f"Error generating lyrics: {e}", exc_info=True)
+        return f"❌ Error: {str(e)}"
+
+def generate_music(prompt: str, lyrics: str, lyrics_mode: str, duration: int, position: str, progress=gr.Progress()):
+    """Generate music clip and add to timeline"""
+    try:
+        if not prompt or not prompt.strip():
+            return "❌ Please enter a music prompt", get_timeline_display(), None
+
+        # Estimate time (CPU on HF Spaces)
+        est_time = int(duration * 4)  # Conservative estimate for CPU
+
+        progress(0, desc=f"🔍 Analyzing prompt... (Est. {est_time}s)")
+        logger.info(f"Generating music: {prompt}, mode={lyrics_mode}, duration={duration}s")
+
+        # Analyze prompt
+        analysis = PromptAnalyzer.analyze(prompt)
+        genre = analysis.get('genres', ['general'])[0] if analysis.get('genres') else 'general'
+        bpm = analysis.get('bpm', 120)
+        mood = analysis.get('mood', 'neutral')
+
+        logger.info(f"Analysis - Genre: {genre}, BPM: {bpm}, Mood: {mood}")
+
+        # Determine lyrics based on mode
+        lyrics_to_use = None
+
+        if lyrics_mode == "Instrumental":
+            logger.info("Generating instrumental (no vocals)")
+            progress(0.1, desc=f"🎹 Preparing instrumental generation... ({est_time}s)")
+
+        elif lyrics_mode == "User Lyrics":
+            if not lyrics or not lyrics.strip():
+                return "❌ Please enter lyrics or switch mode", get_timeline_display(), None
+            lyrics_to_use = lyrics.strip()
+            logger.info("Using user-provided lyrics")
+            progress(0.1, desc=f"🎤 Preparing vocal generation... ({est_time}s)")
+
+        elif lyrics_mode == "Auto Lyrics":
+            if lyrics and lyrics.strip():
+                lyrics_to_use = lyrics.strip()
+                logger.info("Using existing lyrics from textbox")
+                progress(0.1, desc=f"🎤 Using provided lyrics... ({est_time}s)")
+            else:
+                progress(0.1, desc="✍️ Generating lyrics...")
+                logger.info("Auto-generating lyrics...")
+                lyric_service = get_lyricmind_service()
+                lyrics_to_use = lyric_service.generate(
+                    prompt=prompt,
+                    duration=duration,
+                    prompt_analysis=analysis
+                )
+                logger.info(f"Generated {len(lyrics_to_use)} characters of lyrics")
+                progress(0.25, desc=f"🎵 Lyrics ready, generating music... ({est_time}s)")
+
+        # Generate music
+        progress(0.3, desc=f"🎼 Generating {genre} at {bpm} BPM... ({est_time}s)")
+        service = get_diffrhythm_service()
+
+        final_path = service.generate(
+            prompt=prompt,
+            duration=duration,
+            lyrics=lyrics_to_use
+        )
+
+        # Add to timeline
+        progress(0.9, desc="📊 Adding to timeline...")
+        clip_id = os.path.basename(final_path).split('.')[0]
+
+        from models.schemas import ClipPosition
+        clip_info = timeline_service.add_clip(
+            clip_id=clip_id,
+            file_path=final_path,
+            duration=float(duration),
+            position=ClipPosition(position)
+        )
+
+        logger.info(f"Music added to timeline at position {clip_info['timeline_position']}")
+
+        # Build status message
+        progress(1.0, desc="✅ Complete!")
+        status_msg = "✅ Music generated successfully!\n"
+        status_msg += f"🎸 Genre: {genre} | 🥁 BPM: {bpm} | 🎭 Mood: {mood}\n"
+        status_msg += f"🎤 Mode: {lyrics_mode} | 📍 Position: {position}\n"
+
+        if lyrics_mode == "Auto Lyrics" and lyrics_to_use and not lyrics:
+            status_msg += "✍️ (Lyrics auto-generated)"
+
+        return status_msg, get_timeline_display(), final_path
+
+    except Exception as e:
+        logger.error(f"Error generating music: {e}", exc_info=True)
+        return f"❌ Error: {str(e)}", get_timeline_display(), None
+
+def get_timeline_display():
+    """Get timeline clips as formatted text"""
+    clips = timeline_service.get_all_clips()
+
+    if not clips:
+        return "📭 Timeline is empty. Generate clips to get started!"
+
+    total_duration = timeline_service.get_total_duration()
+
+    display = f"**📊 Timeline ({len(clips)} clips, {format_duration(total_duration)} total)**\n\n"
+
+    for i, clip in enumerate(clips, 1):
+        display += f"**{i}.** `{clip['clip_id'][:12]}...` | "
+        display += f"⏱️ {format_duration(clip['duration'])} | "
+        display += f"▶️ {format_duration(clip['start_time'])}\n"
+
+    return display
+
+def remove_clip(clip_number: int):
+    """Remove a clip from timeline"""
+    try:
+        clips = timeline_service.get_all_clips()
+
+        if not clips:
+            return "📭 Timeline is empty", get_timeline_display()
+
+        if clip_number < 1 or clip_number > len(clips):
+            return f"❌ Invalid clip number. Choose 1-{len(clips)}", get_timeline_display()
+
+        clip_id = clips[clip_number - 1]['clip_id']
+        timeline_service.remove_clip(clip_id)
+
+        return f"✅ Clip {clip_number} removed", get_timeline_display()
+
+    except Exception as e:
+        logger.error(f"Error removing clip: {e}", exc_info=True)
+        return f"❌ Error: {str(e)}", get_timeline_display()
+
+def clear_timeline():
+    """Clear all clips from timeline"""
+    try:
+        timeline_service.clear()
+        return "✅ Timeline cleared", get_timeline_display()
+    except Exception as e:
+        logger.error(f"Error clearing timeline: {e}", exc_info=True)
+        return f"❌ Error: {str(e)}", get_timeline_display()
+
+def export_timeline(filename: str, export_format: str, progress=gr.Progress()):
+    """Export timeline to audio file"""
+    try:
+        clips = timeline_service.get_all_clips()
+
+        if not clips:
+            return "❌ No clips to export", None
+
+        if not filename or not filename.strip():
+            filename = "output"
+
+        progress(0, desc="🔄 Merging clips...")
+        logger.info(f"Exporting timeline: {filename}.{export_format}")
+
+        export_service.timeline_service = timeline_service
+
+        progress(0.5, desc="💾 Encoding audio...")
+        output_path = export_service.merge_clips(
+            filename=filename,
+            export_format=export_format
+        )
+
+        if output_path:
+            progress(1.0, desc="✅ Export complete!")
+            return f"✅ Exported: {os.path.basename(output_path)}", output_path
+        else:
+            return "❌ Export failed", None
+
+    except Exception as e:
+        logger.error(f"Error exporting: {e}", exc_info=True)
+        return f"❌ Error: {str(e)}", None
+
+def format_duration(seconds: float) -> str:
+    """Format duration as M:SS"""
+    mins = int(seconds // 60)
+    secs = int(seconds % 60)
+    return f"{mins}:{secs:02d}"
+
+# Create Gradio interface
+with gr.Blocks(
+    title="🎵 Music Generation Studio",
+    theme=gr.themes.Soft(primary_hue="purple", secondary_hue="pink")
+) as app:
+
+    gr.Markdown(
+        """
+        # 🎵 Music Generation Studio
+
+        Create AI-powered music with DiffRhythm2 and LyricMind AI
+
+        💡 **Tip**: Start with 10-20 second clips for faster generation on HuggingFace Spaces
+        """
+    )
+
+    with gr.Row():
+        # Left Column - Generation
+        with gr.Column(scale=2):
+            gr.Markdown("### 🎼 Music Generation")
+
+            prompt_input = gr.Textbox(
+                label="🎯 Music Prompt",
+                placeholder="energetic rock song with electric guitar at 140 BPM",
+                lines=3,
+                info="Describe the music style, instruments, tempo, and mood"
+            )
+
+            lyrics_mode = gr.Radio(
+                choices=["Instrumental", "User Lyrics", "Auto Lyrics"],
+                value="Instrumental",
+                label="🎤 Vocal Mode",
+                info="Instrumental: no vocals | User: provide lyrics | Auto: AI-generated"
+            )
+
+            with gr.Row():
+                auto_gen_btn = gr.Button("✍️ Generate Lyrics", size="sm")
+
+            lyrics_input = gr.Textbox(
+                label="📝 Lyrics",
+                placeholder="Enter lyrics or click 'Generate Lyrics'...",
+                lines=6
+            )
+
+            with gr.Row():
+                duration_input = gr.Slider(
+                    minimum=10,
+                    maximum=60,
+                    value=20,
+                    step=5,
+                    label="⏱️ Duration (seconds)",
+                    info="Shorter = faster generation"
+                )
+                position_input = gr.Radio(
+                    choices=["intro", "previous", "next", "outro"],
+                    value="next",
+                    label="📍 Position"
+                )
+
+            generate_btn = gr.Button(
+                "✨ Generate Music Clip",
+                variant="primary",
+                size="lg"
+            )
+
+            gen_status = gr.Textbox(label="📊 Status", lines=3, interactive=False)
+            audio_output = gr.Audio(label="🎧 Preview", type="filepath")
+
+        # Right Column - Timeline
+        with gr.Column(scale=1):
+            gr.Markdown("### 📊 Timeline")
+
+            timeline_display = gr.Textbox(
+                label="Clips",
+                value=get_timeline_display(),
+                lines=12,
+                interactive=False
+            )
+
+            with gr.Row():
+                clip_number_input = gr.Number(
+                    label="Clip #",
+                    precision=0,
+                    minimum=1,
+                    scale=1
+                )
+                remove_btn = gr.Button("🗑️ Remove", size="sm", scale=1)
+
+            clear_btn = gr.Button("🗑️ Clear All", variant="stop")
+            timeline_status = gr.Textbox(label="Status", lines=1, interactive=False)
+
+    # Export Section
+    gr.Markdown("---")
+    gr.Markdown("### 💾 Export")
+
+    with gr.Row():
+        export_filename = gr.Textbox(
+            label="Filename",
+            value="my_song",
+            scale=2
+        )
+        export_format = gr.Dropdown(
+            choices=["wav", "mp3"],
+            value="wav",
+            label="Format",
+            scale=1
+        )
+        export_btn = gr.Button("💾 Export", variant="primary", scale=1)
+
+    export_status = gr.Textbox(label="Status", lines=1, interactive=False)
+    export_audio = gr.Audio(label="📥 Download", type="filepath")
+
+    # Event handlers
+    auto_gen_btn.click(
+        fn=generate_lyrics,
+        inputs=[prompt_input, duration_input],
+        outputs=lyrics_input
+    )
+
+    generate_btn.click(
+        fn=generate_music,
+        inputs=[prompt_input, lyrics_input, lyrics_mode, duration_input, position_input],
+        outputs=[gen_status, timeline_display, audio_output]
+    )
+
+    remove_btn.click(
+        fn=remove_clip,
+        inputs=clip_number_input,
+        outputs=[timeline_status, timeline_display]
+    )
+
+    clear_btn.click(
+        fn=clear_timeline,
+        outputs=[timeline_status, timeline_display]
+    )
+
+    export_btn.click(
+        fn=export_timeline,
+        inputs=[export_filename, export_format],
+        outputs=[export_status, export_audio]
+    )
+
+    # Help section
+    with gr.Accordion("ℹ️ Help & Tips", open=False):
+        gr.Markdown(
+            """
+            ## 🚀 Quick Start
+
+            1. **Enter a prompt**: "upbeat pop song with synth at 128 BPM"
+            2. **Choose a mode**: Instrumental (fastest) or with vocals
+            3. **Set the duration**: Start with 10-20s for quick results
+            4. **Generate**: Click the button and wait ~2-4 minutes
+            5. **Export**: Download your complete song
+
+            ## ⚡ Performance Tips
+
+            - **Shorter clips = faster**: 10-20s clips generate in ~1-2 minutes
+            - **Instrumental mode**: ~30% faster than with vocals
+            - **HF Spaces uses CPU**: Expect 2-4 minutes per 30s clip
+            - **Build incrementally**: Generate short clips, then combine them
+
+            ## 🎯 Prompt Tips
+
+            - **Be specific**: "energetic rock with distorted guitar" > "rock song"
+            - **Include BPM**: "at 140 BPM" helps set the tempo
+            - **Mention instruments**: "with piano and drums"
+            - **Describe mood**: "melancholic", "upbeat", "aggressive"
+
+            ## 🎤 Vocal Modes
+
+            - **Instrumental**: Pure music, no vocals (fastest)
+            - **User Lyrics**: Provide your own lyrics
+            - **Auto Lyrics**: AI generates lyrics based on the prompt
+
+            ## 📊 Timeline
+
+            - Clips are arranged sequentially
+            - Remove or clear clips as needed
+            - Export combines all clips into one file
+
+            ---
+
+            ⏱️ **Average Generation Time**: 2-4 minutes per 30-second clip on CPU
+
+            🎵 **Models**: DiffRhythm2 + MuQ-MuLan + LyricMind AI
+            """
+        )
+
+# Configure and launch
+if __name__ == "__main__":
+    logger.info("🎵 Starting Music Generation Studio on HuggingFace Spaces...")
+
+    app.queue(
+        default_concurrency_limit=1,
+        max_size=5
+    )
+
+    app.launch()
backend/__init__.py ADDED
@@ -0,0 +1 @@
+"""Backend package"""
backend/app.py ADDED
@@ -0,0 +1,80 @@
+"""
+Main Flask application for the Music Generation App
+"""
+import os
+import logging
+from flask import Flask, jsonify, send_from_directory
+from flask_cors import CORS
+from dotenv import load_dotenv
+
+from config.settings import Config
+from routes.generation import generation_bp
+from routes.timeline import timeline_bp
+from routes.export import export_bp
+from routes.mastering import mastering_bp
+from utils.logger import setup_logger
+
+# Load environment variables
+load_dotenv()
+
+def create_app(config_class=Config):
+    """Application factory pattern"""
+    app = Flask(__name__)
+    app.config.from_object(config_class)
+
+    # Enable CORS
+    CORS(app, resources={r"/api/*": {"origins": "*"}})
+
+    # Setup logging
+    setup_logger(app)
+
+    # Create necessary directories
+    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+    os.makedirs(app.config['OUTPUT_FOLDER'], exist_ok=True)
+    os.makedirs(app.config['MODELS_DIR'], exist_ok=True)
+    os.makedirs('logs', exist_ok=True)
+
+    # Register blueprints
+    app.register_blueprint(generation_bp, url_prefix='/api/generation')
+    app.register_blueprint(timeline_bp, url_prefix='/api/timeline')
+    app.register_blueprint(export_bp, url_prefix='/api/export')
+    app.register_blueprint(mastering_bp, url_prefix='/api/mastering')
+
+    # Serve static files from outputs directory with proper MIME types
+    @app.route('/outputs/<path:filename>')
+    def serve_output(filename):
+        response = send_from_directory(app.config['OUTPUT_FOLDER'], filename)
+        # Ensure WAV files have correct MIME type
+        if filename.lower().endswith('.wav'):
+            response.headers['Content-Type'] = 'audio/wav'
+        elif filename.lower().endswith('.mp3'):
+            response.headers['Content-Type'] = 'audio/mpeg'
+        return response
+
+    # Health check endpoint
+    @app.route('/api/health')
+    def health_check():
+        return jsonify({
+            'status': 'healthy',
+            'version': '1.0.0'
+        })
+
+    # Error handlers
+    @app.errorhandler(404)
+    def not_found(error):
+        return jsonify({'error': 'Not found'}), 404
+
+    @app.errorhandler(500)
+    def internal_error(error):
+        app.logger.error(f'Internal server error: {str(error)}')
+        return jsonify({'error': 'Internal server error'}), 500
+
+    return app
+
+if __name__ == '__main__':
+    app = create_app()
+    port = int(os.getenv('PORT', 5000))
+    host = os.getenv('HOST', '0.0.0.0')
+
+    app.logger.info(f'Starting server on {host}:{port}')
+    app.run(host=host, port=port, debug=os.getenv('FLASK_DEBUG', 'False') == 'True')
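
For context on the factory above: something like the following exercises `create_app` without starting a server. A minimal sketch, assuming it is run from the `backend/` directory so the `app` and `config` imports resolve the same way they do in `backend/run.py`:

```python
# Minimal sketch; run from backend/ so the package imports resolve.
from app import create_app
from config.settings import ProductionConfig

app = create_app(ProductionConfig)  # any Config subclass works

# Flask's built-in test client: a smoke test without binding a port.
with app.test_client() as client:
    resp = client.get('/api/health')
    print(resp.get_json())  # {'status': 'healthy', 'version': '1.0.0'}
```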
backend/config/__init__.py ADDED
@@ -0,0 +1,4 @@
+"""Configuration package"""
+from .settings import Config, config
+
+__all__ = ['Config', 'config']
backend/config/settings.py ADDED
@@ -0,0 +1,50 @@
+"""
+Application configuration settings
+"""
+import os
+from pathlib import Path
+
+class Config:
+    """Base configuration"""
+
+    # Base directory
+    BASE_DIR = Path(__file__).parent.parent.parent
+
+    # Flask settings
+    SECRET_KEY = os.getenv('SECRET_KEY', 'dev-secret-key-change-in-production')
+    DEBUG = os.getenv('FLASK_DEBUG', 'False') == 'True'
+
+    # File upload settings
+    UPLOAD_FOLDER = os.getenv('UPLOAD_FOLDER', str(BASE_DIR / 'uploads'))
+    OUTPUT_FOLDER = os.getenv('OUTPUT_FOLDER', str(BASE_DIR / 'outputs'))
+    MAX_CONTENT_LENGTH = int(os.getenv('MAX_CONTENT_LENGTH', 16 * 1024 * 1024))  # 16MB
+
+    # Model paths
+    MODELS_DIR = BASE_DIR / 'models'
+    DIFFRHYTHM_MODEL_PATH = os.getenv('DIFFRHYTHM_MODEL_PATH', str(MODELS_DIR / 'diffrhythm2'))
+    FISH_SPEECH_MODEL_PATH = os.getenv('FISH_SPEECH_MODEL_PATH', str(MODELS_DIR / 'fish_speech'))
+    LYRICMIND_MODEL_PATH = os.getenv('LYRICMIND_MODEL_PATH', str(MODELS_DIR / 'lyricmind'))
+
+    # Generation settings
+    DEFAULT_CLIP_DURATION = int(os.getenv('DEFAULT_CLIP_DURATION', 30))
+    SAMPLE_RATE = int(os.getenv('SAMPLE_RATE', 44100))
+    BIT_DEPTH = int(os.getenv('BIT_DEPTH', 16))
+
+    # Logging
+    LOG_LEVEL = os.getenv('LOG_LEVEL', 'INFO')
+    LOG_FILE = os.getenv('LOG_FILE', str(BASE_DIR / 'logs' / 'app.log'))
+
+class DevelopmentConfig(Config):
+    """Development configuration"""
+    DEBUG = True
+
+class ProductionConfig(Config):
+    """Production configuration"""
+    DEBUG = False
+
+# Configuration dictionary
+config = {
+    'development': DevelopmentConfig,
+    'production': ProductionConfig,
+    'default': DevelopmentConfig
+}
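
The `config` dictionary at the bottom enables name-based lookup of a configuration class. A hedged sketch of how it could be wired to an environment variable; the `APP_ENV` name is hypothetical, since nothing in this commit reads the dict yet:

```python
# Hypothetical usage of the `config` dict above; APP_ENV is an assumed name.
import os
from config.settings import config

env_name = os.getenv('APP_ENV', 'default')  # 'development' | 'production' | 'default'
config_class = config.get(env_name, config['default'])
print(config_class.__name__, 'DEBUG =', config_class.DEBUG)
```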
backend/routes/__init__.py ADDED
@@ -0,0 +1,6 @@
+"""Routes package"""
+from .generation import generation_bp
+from .timeline import timeline_bp
+from .export import export_bp
+
+__all__ = ['generation_bp', 'timeline_bp', 'export_bp']
backend/routes/export.py ADDED
@@ -0,0 +1,124 @@
+"""
+Routes for exporting/downloading music
+"""
+import logging
+import os
+from flask import Blueprint, request, jsonify, send_file, current_app
+from services.export_service import ExportService
+from models.schemas import ExportFormat
+
+logger = logging.getLogger(__name__)
+
+export_bp = Blueprint('export', __name__)
+export_service = ExportService()
+
+@export_bp.route('/merge', methods=['POST'])
+def merge_timeline():
+    """
+    Merge all clips in the timeline into a single file
+
+    Request body:
+    {
+        "format": "wav",      // wav, mp3, flac
+        "filename": "my_song" // optional
+    }
+    """
+    try:
+        data = request.get_json() or {}
+        logger.info("Merging timeline clips")
+
+        # Validate format
+        export_format = data.get('format', 'wav')
+        try:
+            ExportFormat(export_format)
+        except ValueError:
+            return jsonify({
+                'error': f"Invalid format. Must be one of: {', '.join([f.value for f in ExportFormat])}"
+            }), 400
+
+        filename = data.get('filename', 'merged_output')
+
+        # Merge clips
+        output_path = export_service.merge_clips(
+            filename=filename,
+            export_format=export_format
+        )
+
+        if not output_path:
+            return jsonify({
+                'error': 'No clips to merge. Add clips to timeline first.'
+            }), 400
+
+        logger.info(f"Timeline merged successfully: {output_path}")
+
+        return jsonify({
+            'success': True,
+            'file_path': output_path,
+            'filename': os.path.basename(output_path)
+        })
+
+    except Exception as e:
+        logger.error(f"Error merging timeline: {str(e)}", exc_info=True)
+        return jsonify({
+            'error': 'Failed to merge timeline',
+            'details': str(e)
+        }), 500
+
+@export_bp.route('/download/<filename>', methods=['GET'])
+def download_file(filename):
+    """Download an exported file"""
+    try:
+        output_folder = current_app.config['OUTPUT_FOLDER']
+        file_path = os.path.join(output_folder, filename)
+
+        if not os.path.exists(file_path):
+            return jsonify({'error': 'File not found'}), 404
+
+        # Security check: ensure file is in output folder
+        if not os.path.abspath(file_path).startswith(os.path.abspath(output_folder)):
+            return jsonify({'error': 'Invalid file path'}), 403
+
+        logger.info(f"Downloading file: {filename}")
+
+        return send_file(
+            file_path,
+            as_attachment=True,
+            download_name=filename
+        )
+
+    except Exception as e:
+        logger.error(f"Error downloading file: {str(e)}", exc_info=True)
+        return jsonify({'error': str(e)}), 500
+
+@export_bp.route('/export-clip/<clip_id>', methods=['GET'])
+def export_single_clip(clip_id):
+    """Export a single clip"""
+    try:
+        export_format = request.args.get('format', 'wav')
+
+        try:
+            ExportFormat(export_format)
+        except ValueError:
+            return jsonify({
+                'error': f"Invalid format. Must be one of: {', '.join([f.value for f in ExportFormat])}"
+            }), 400
+
+        logger.info(f"Exporting single clip: {clip_id}")
+
+        output_path = export_service.export_clip(
+            clip_id=clip_id,
+            export_format=export_format
+        )
+
+        if not output_path:
+            return jsonify({'error': 'Clip not found'}), 404
+
+        return jsonify({
+            'success': True,
+            'file_path': output_path,
+            'filename': os.path.basename(output_path)
+        })
+
+    except Exception as e:
+        logger.error(f"Error exporting clip: {str(e)}", exc_info=True)
+        return jsonify({'error': str(e)}), 500
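
For reference, the merge and download routes above can be exercised together roughly like this. A client-side sketch, not part of the commit; it assumes the default host/port from `backend/run.py` (7860) and the `/api/export` blueprint prefix registered in `backend/app.py`:

```python
import requests

BASE = "http://localhost:7860"  # default host/port from backend/run.py

resp = requests.post(f"{BASE}/api/export/merge",
                     json={"format": "wav", "filename": "my_song"})
resp.raise_for_status()
merged = resp.json()

# The merged file can then be fetched via the download route.
audio = requests.get(f"{BASE}/api/export/download/{merged['filename']}")
with open(merged['filename'], 'wb') as f:
    f.write(audio.content)
```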
backend/routes/generation.py ADDED
@@ -0,0 +1,191 @@
+"""
+Routes for music generation
+"""
+import os
+import logging
+from flask import Blueprint, request, jsonify, current_app
+from services.diffrhythm_service import DiffRhythmService
+from services.lyricmind_service import LyricMindService
+from services.style_consistency_service import StyleConsistencyService
+from services.timeline_service import TimelineService
+from models.schemas import GenerationRequest, LyricsRequest
+from utils.validators import validate_generation_params
+from utils.prompt_analyzer import PromptAnalyzer
+
+logger = logging.getLogger(__name__)
+
+generation_bp = Blueprint('generation', __name__)
+
+# Initialize services (lazy loading)
+diffrhythm_service = None
+lyricmind_service = None
+style_service = None
+timeline_service = None
+
+def get_diffrhythm_service():
+    """Get or create DiffRhythm service instance"""
+    global diffrhythm_service
+    if diffrhythm_service is None:
+        diffrhythm_service = DiffRhythmService(
+            model_path=current_app.config['DIFFRHYTHM_MODEL_PATH']
+        )
+    return diffrhythm_service
+
+def get_lyricmind_service():
+    """Get or create LyricMind service instance"""
+    global lyricmind_service
+    if lyricmind_service is None:
+        lyricmind_service = LyricMindService(
+            model_path=current_app.config['LYRICMIND_MODEL_PATH']
+        )
+    return lyricmind_service
+
+def get_style_service():
+    """Get or create Style Consistency service instance"""
+    global style_service
+    if style_service is None:
+        style_service = StyleConsistencyService()
+    return style_service
+
+def get_timeline_service():
+    """Get or create Timeline service instance"""
+    global timeline_service
+    if timeline_service is None:
+        timeline_service = TimelineService()
+    return timeline_service
+
+@generation_bp.route('/generate-lyrics', methods=['POST'])
+def generate_lyrics():
+    """Generate lyrics from prompt using LyricMind AI with prompt analysis"""
+    try:
+        data = LyricsRequest(**request.json)
+
+        # Analyze prompt for better context
+        logger.info(f"Analyzing prompt for lyrics generation: {data.prompt}")
+        prompt_analysis = PromptAnalyzer.analyze(data.prompt)
+        logger.info(f"Prompt analysis: {prompt_analysis}")
+
+        # Get lyrics service
+        lyrics_service = get_lyricmind_service()
+
+        # Generate lyrics with analysis context
+        style = data.style or (prompt_analysis.get('genres', [''])[0] if prompt_analysis.get('genres') else None)
+        logger.info(f"Generating lyrics with style: {style}")
+
+        lyrics = lyrics_service.generate(
+            prompt=data.prompt,
+            style=style,
+            duration=data.duration,
+            prompt_analysis=prompt_analysis
+        )
+
+        return jsonify({
+            'lyrics': lyrics,
+            'analysis': prompt_analysis
+        })
+
+    except ValueError as e:
+        logger.error(f"Validation error: {str(e)}")
+        return jsonify({'error': str(e)}), 400
+    except Exception as e:
+        logger.error(f"Error generating lyrics: {str(e)}", exc_info=True)
+        return jsonify({'error': f'Failed to generate lyrics: {str(e)}'}), 500
+
+@generation_bp.route('/generate-music', methods=['POST'])
+def generate_music():
+    """
+    Generate music clip from prompt with optional vocals
+
+    Request body:
+    {
+        "prompt": "upbeat pop song with drums",
+        "lyrics": "optional lyrics text",
+        "duration": 30
+    }
+    """
+    try:
+        data = request.get_json()
+        logger.info(f"Received music generation request: {data.get('prompt', 'No prompt')}")
+
+        # Validate request
+        validation_error = validate_generation_params(data)
+        if validation_error:
+            return jsonify({'error': validation_error}), 400
+
+        # Parse request
+        gen_request = GenerationRequest(**data)
+
+        # Analyze prompt for musical attributes
+        prompt_analysis = PromptAnalyzer.analyze(gen_request.prompt)
+        logger.info(f"Prompt analysis: {prompt_analysis['analysis_text']}")
+
+        # Get timeline clips for style consistency
+        timeline_svc = get_timeline_service()
+        existing_clips = timeline_svc.get_all_clips()
+
+        # Prepare style guidance if clips exist
+        reference_audio = None
+        style_profile = {}
+        enhanced_prompt = gen_request.prompt
+
+        if existing_clips:
+            logger.info(f"Found {len(existing_clips)} existing clips - applying style consistency")
+            style_svc = get_style_service()
+            reference_audio, style_profile = style_svc.get_style_guidance_for_generation(existing_clips)
+
+            # Enhance prompt with style characteristics
+            enhanced_prompt = style_svc.enhance_prompt_with_style(gen_request.prompt, style_profile)
+            logger.info(f"Enhanced prompt for style consistency: {enhanced_prompt}")
+        else:
+            logger.info("No existing clips - generating without style guidance")
+
+        # Generate music with DiffRhythm2 (includes vocals if lyrics provided)
+        service = get_diffrhythm_service()
+        lyrics_to_use = gen_request.lyrics if gen_request.lyrics else None
+
+        final_path = service.generate(
+            prompt=enhanced_prompt,
+            duration=gen_request.duration,
+            lyrics=lyrics_to_use,
+            reference_audio=reference_audio
+        )
+
+        logger.info(f"Music generation successful: {final_path}")
+
+        # Convert filesystem path to URL path (forward slashes, relative to outputs)
+        relative_path = os.path.relpath(final_path, 'outputs')
+        url_path = f"/outputs/{relative_path.replace(os.sep, '/')}"
+
+        return jsonify({
+            'success': True,
+            'clip_id': os.path.basename(final_path).split('.')[0],
+            'file_path': url_path,
+            'duration': gen_request.duration,
+            'analysis': prompt_analysis,
+            'style_consistent': len(existing_clips) > 0,
+            'num_reference_clips': len(existing_clips)
+        })
+
+    except Exception as e:
+        logger.error(f"Error generating music: {str(e)}", exc_info=True)
+        return jsonify({
+            'error': 'Failed to generate music',
+            'details': str(e)
+        }), 500
+
+@generation_bp.route('/status', methods=['GET'])
+def get_status():
+    """Check if generation services are available"""
+    try:
+        status = {
+            'diffrhythm': diffrhythm_service is not None
+        }
+
+        return jsonify({
+            'services': status,
+            'ready': status['diffrhythm']
+        })
+
+    except Exception as e:
+        logger.error(f"Error checking status: {str(e)}")
+        return jsonify({'error': str(e)}), 500
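
The `/generate-music` route above documents its own request body; a matching client-side sketch (not part of the commit, assuming the default port from `backend/run.py`), with the printed keys taken from the route's JSON response:

```python
import requests

resp = requests.post(
    "http://localhost:7860/api/generation/generate-music",
    json={"prompt": "upbeat pop song with drums", "duration": 30},  # "lyrics" is optional
)
resp.raise_for_status()
data = resp.json()
print(data["clip_id"], data["file_path"], data["style_consistent"])
```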
backend/routes/mastering.py ADDED
@@ -0,0 +1,185 @@
+"""
+Routes for audio mastering and EQ
+"""
+import os
+import logging
+from flask import Blueprint, request, jsonify, current_app, send_file
+from services.mastering_service import MasteringService
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+mastering_bp = Blueprint('mastering', __name__)
+
+# Initialize service
+mastering_service = None
+
+def get_mastering_service():
+    """Get or create mastering service instance"""
+    global mastering_service
+    if mastering_service is None:
+        mastering_service = MasteringService()
+    return mastering_service
+
+@mastering_bp.route('/presets', methods=['GET'])
+def get_presets():
+    """Get list of all available mastering presets"""
+    try:
+        service = get_mastering_service()
+        presets = service.get_preset_list()
+        return jsonify({'presets': presets})
+    except Exception as e:
+        logger.error(f"Error getting presets: {str(e)}", exc_info=True)
+        return jsonify({'error': 'Failed to get presets'}), 500
+
+@mastering_bp.route('/apply-preset', methods=['POST'])
+def apply_preset():
+    """Apply mastering preset to audio clip"""
+    try:
+        data = request.json
+        clip_id = data.get('clip_id')
+        preset_name = data.get('preset')
+        audio_path = data.get('audio_path')
+
+        if not all([clip_id, preset_name, audio_path]):
+            return jsonify({'error': 'Missing required parameters'}), 400
+
+        # Verify audio file exists
+        if not os.path.exists(audio_path):
+            return jsonify({'error': 'Audio file not found'}), 404
+
+        # Generate output path
+        output_dir = Path(current_app.config['OUTPUT_FOLDER']) / 'mastered'
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        filename = Path(audio_path).stem
+        output_path = output_dir / f"{filename}_mastered_{preset_name}.wav"
+
+        # Apply preset
+        service = get_mastering_service()
+        processed_path = service.apply_preset(audio_path, preset_name, str(output_path))
+
+        # Return URL to processed file
+        relative_path = os.path.relpath(processed_path, current_app.config['OUTPUT_FOLDER'])
+        file_url = f"/outputs/{relative_path.replace(os.sep, '/')}"
+
+        return jsonify({
+            'success': True,
+            'processed_path': file_url,
+            'clip_id': clip_id,
+            'preset': preset_name
+        })
+
+    except ValueError as e:
+        logger.error(f"Validation error: {str(e)}")
+        return jsonify({'error': str(e)}), 400
+    except Exception as e:
+        logger.error(f"Error applying preset: {str(e)}", exc_info=True)
+        return jsonify({'error': f'Failed to apply preset: {str(e)}'}), 500
+
+@mastering_bp.route('/apply-custom-eq', methods=['POST'])
+def apply_custom_eq():
+    """Apply custom EQ settings to audio clip"""
+    try:
+        data = request.json
+        clip_id = data.get('clip_id')
+        audio_path = data.get('audio_path')
+        eq_bands = data.get('eq_bands', [])
+        compression = data.get('compression')
+        limiting = data.get('limiting')
+
+        if not all([clip_id, audio_path]):
+            return jsonify({'error': 'Missing required parameters'}), 400
+
+        # Verify audio file exists
+        if not os.path.exists(audio_path):
+            return jsonify({'error': 'Audio file not found'}), 404
+
+        # Generate output path
+        output_dir = Path(current_app.config['OUTPUT_FOLDER']) / 'mastered'
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        filename = Path(audio_path).stem
+        output_path = output_dir / f"{filename}_custom_eq.wav"
+
+        # Apply custom EQ
+        service = get_mastering_service()
+        processed_path = service.apply_custom_eq(
+            audio_path,
+            str(output_path),
+            eq_bands,
+            compression,
+            limiting
+        )
+
+        # Return URL to processed file
+        relative_path = os.path.relpath(processed_path, current_app.config['OUTPUT_FOLDER'])
+        file_url = f"/outputs/{relative_path.replace(os.sep, '/')}"
+
+        return jsonify({
+            'success': True,
+            'processed_path': file_url,
+            'clip_id': clip_id
+        })
+
+    except Exception as e:
+        logger.error(f"Error applying custom EQ: {str(e)}", exc_info=True)
+        return jsonify({'error': f'Failed to apply custom EQ: {str(e)}'}), 500
+
+@mastering_bp.route('/preview', methods=['POST'])
+def preview_mastering():
+    """Preview mastering effect (non-destructive)"""
+    try:
+        data = request.json
+        clip_id = data.get('clip_id')
+        audio_path = data.get('audio_path')
+        preset_name = data.get('preset')
+        eq_bands = data.get('eq_bands')
+
+        if not all([clip_id, audio_path]):
+            return jsonify({'error': 'Missing required parameters'}), 400
+
+        # Verify audio file exists
+        if not os.path.exists(audio_path):
+            return jsonify({'error': 'Audio file not found'}), 404
+
+        # Generate temp output path for preview
+        output_dir = Path(current_app.config['OUTPUT_FOLDER']) / 'preview'
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        filename = Path(audio_path).stem
+        output_path = output_dir / f"{filename}_preview.wav"
+
+        service = get_mastering_service()
+
+        if preset_name:
+            # Apply preset for preview
+            processed_path = service.apply_preset(audio_path, preset_name, str(output_path))
+        elif eq_bands:
+            # Apply custom EQ for preview
+            compression = data.get('compression')
+            limiting = data.get('limiting')
+            processed_path = service.apply_custom_eq(
+                audio_path,
+                str(output_path),
+                eq_bands,
+                compression,
+                limiting
+            )
+        else:
+            return jsonify({'error': 'No preset or EQ settings provided'}), 400
+
+        # Return URL to preview file
+        # Use absolute path from project root for frontend to access
+        relative_path = os.path.relpath(processed_path, 'outputs')
+        file_url = f"http://localhost:7860/outputs/{relative_path.replace(os.sep, '/')}"
+
+        return jsonify({
+            'success': True,
+            'preview_path': file_url,
+            'clip_id': clip_id
+        })
+
+    except Exception as e:
+        logger.error(f"Error generating preview: {str(e)}", exc_info=True)
+        return jsonify({'error': f'Failed to generate preview: {str(e)}'}), 500
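
A hedged client-side sketch of the preset workflow above. The response key `presets` is taken from the route code; the preset name, clip id, and audio path below are purely hypothetical examples, since the actual preset names live in `MasteringService`, which is not part of this diff:

```python
import requests

BASE = "http://localhost:7860"

# Discover the available presets first; names are defined by MasteringService.
presets = requests.get(f"{BASE}/api/mastering/presets").json()["presets"]
print(presets)

resp = requests.post(f"{BASE}/api/mastering/apply-preset", json={
    "clip_id": "abc123",                       # hypothetical clip id
    "preset": "warm",                          # hypothetical preset name
    "audio_path": "outputs/music/abc123.wav",  # hypothetical path
})
print(resp.json())
```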
backend/routes/timeline.py ADDED
@@ -0,0 +1,137 @@
+"""
+Routes for timeline management
+"""
+import logging
+from flask import Blueprint, request, jsonify
+from services.timeline_service import TimelineService
+from models.schemas import ClipPosition
+
+logger = logging.getLogger(__name__)
+
+timeline_bp = Blueprint('timeline', __name__)
+timeline_service = TimelineService()
+
+@timeline_bp.route('/clips', methods=['GET'])
+def get_clips():
+    """Get all clips in the timeline"""
+    try:
+        clips = timeline_service.get_all_clips()
+        return jsonify({
+            'success': True,
+            'clips': clips,
+            'total_duration': timeline_service.get_total_duration()
+        })
+    except Exception as e:
+        logger.error(f"Error fetching clips: {str(e)}", exc_info=True)
+        return jsonify({'error': str(e)}), 500
+
+@timeline_bp.route('/clips', methods=['POST'])
+def add_clip():
+    """
+    Add a clip to the timeline
+
+    Request body:
+    {
+        "clip_id": "unique_id",
+        "file_path": "/path/to/clip.wav",
+        "duration": 30,
+        "position": "next" // intro, previous, next, outro
+    }
+    """
+    try:
+        data = request.get_json()
+        logger.info(f"Adding clip to timeline: {data.get('clip_id')}")
+
+        # Validate required fields
+        required_fields = ['clip_id', 'file_path', 'duration', 'position']
+        for field in required_fields:
+            if field not in data:
+                return jsonify({'error': f'Missing required field: {field}'}), 400
+
+        # Validate position
+        try:
+            position = ClipPosition(data['position'])
+        except ValueError:
+            return jsonify({
+                'error': f"Invalid position. Must be one of: {', '.join([p.value for p in ClipPosition])}"
+            }), 400
+
+        # Add clip to timeline
+        result = timeline_service.add_clip(
+            clip_id=data['clip_id'],
+            file_path=data['file_path'],
+            duration=data['duration'],
+            position=position
+        )
+
+        logger.info(f"Clip added successfully at position: {result['timeline_position']}")
+
+        return jsonify({
+            'success': True,
+            **result
+        })
+
+    except Exception as e:
+        logger.error(f"Error adding clip: {str(e)}", exc_info=True)
+        return jsonify({'error': str(e)}), 500
+
+@timeline_bp.route('/clips/<clip_id>', methods=['DELETE'])
+def remove_clip(clip_id):
+    """Remove a clip from the timeline"""
+    try:
+        logger.info(f"Removing clip: {clip_id}")
+        timeline_service.remove_clip(clip_id)
+
+        return jsonify({
+            'success': True,
+            'message': f'Clip {clip_id} removed'
+        })
+
+    except Exception as e:
+        logger.error(f"Error removing clip: {str(e)}", exc_info=True)
+        return jsonify({'error': str(e)}), 500
+
+@timeline_bp.route('/clips/reorder', methods=['POST'])
+def reorder_clips():
+    """
+    Reorder clips in the timeline
+
+    Request body:
+    {
+        "clip_ids": ["id1", "id2", "id3"]
+    }
+    """
+    try:
+        data = request.get_json()
+        clip_ids = data.get('clip_ids', [])
+
+        if not clip_ids:
+            return jsonify({'error': 'clip_ids array is required'}), 400
+
+        logger.info(f"Reordering clips: {clip_ids}")
+        timeline_service.reorder_clips(clip_ids)
+
+        return jsonify({
+            'success': True,
+            'message': 'Clips reordered successfully'
+        })
+
+    except Exception as e:
+        logger.error(f"Error reordering clips: {str(e)}", exc_info=True)
+        return jsonify({'error': str(e)}), 500
+
+@timeline_bp.route('/clear', methods=['POST'])
+def clear_timeline():
+    """Clear all clips from the timeline"""
+    try:
+        logger.info("Clearing timeline")
+        timeline_service.clear()
+
+        return jsonify({
+            'success': True,
+            'message': 'Timeline cleared'
+        })
+
+    except Exception as e:
+        logger.error(f"Error clearing timeline: {str(e)}", exc_info=True)
+        return jsonify({'error': str(e)}), 500
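
The add/list routes above fit together as follows. A client-side sketch (not part of the commit; clip id and path are hypothetical, the position values and response keys come from the route code):

```python
import requests

BASE = "http://localhost:7860"

# Add a previously generated clip to the timeline.
requests.post(f"{BASE}/api/timeline/clips", json={
    "clip_id": "abc123",                     # hypothetical id
    "file_path": "outputs/music/abc123.wav", # hypothetical path
    "duration": 30,
    "position": "next",  # intro | previous | next | outro
})

# List the timeline and its total duration.
state = requests.get(f"{BASE}/api/timeline/clips").json()
print(len(state["clips"]), "clips,", state["total_duration"], "seconds")
```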
backend/run.py ADDED
@@ -0,0 +1,50 @@
+"""
+Startup script for the Music Generation App
+"""
+import sys
+import os
+import signal
+
+# Add backend directory to Python path
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from app import create_app
+
+def signal_handler(sig, frame):
+    """Handle shutdown signals gracefully"""
+    print('\n\n[INFO] Shutting down server...')
+    sys.exit(0)
+
+if __name__ == '__main__':
+    try:
+        app = create_app()
+        port = int(os.getenv('PORT', 7860))  # Default to 7860 to match frontend expectations
+        host = os.getenv('HOST', '0.0.0.0')
+
+        print(f"""
+================================================================
+  Music Generation App Server Starting...
+================================================================
+
+  Server running at: http://{host}:{port}
+  API endpoints:     http://{host}:{port}/api
+  Health check:      http://{host}:{port}/api/health
+
+  Press Ctrl+C to stop the server
+================================================================
+""")
+
+        # Register signal handlers
+        signal.signal(signal.SIGINT, signal_handler)
+        signal.signal(signal.SIGTERM, signal_handler)
+
+        # Use waitress for a production-ready server
+        from waitress import serve
+        print('[INFO] Server is ready!')
+        serve(app, host=host, port=port, threads=4)
+
+    except Exception as e:
+        print(f"\n[ERROR] Failed to start server: {e}", file=sys.stderr)
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
backend/services/__init__.py ADDED
@@ -0,0 +1,12 @@
+"""Services package"""
+from .diffrhythm_service import DiffRhythmService
+from .timeline_service import TimelineService
+from .export_service import ExportService
+from .fish_speech_service import FishSpeechService
+
+__all__ = [
+    'DiffRhythmService',
+    'TimelineService',
+    'ExportService',
+    'FishSpeechService'
+]
backend/services/diffrhythm_service.py ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DiffRhythm 2 music generation service
3
+ Integrates with the DiffRhythm 2 model for music generation with vocals
4
+ """
5
+ import os
6
+ import sys
7
+ import logging
8
+ import uuid
9
+ from pathlib import Path
10
+ from typing import Optional
11
+ import numpy as np
12
+ import soundfile as sf
13
+ import torch
14
+ import torchaudio
15
+ import json
16
+
17
+ # Configure espeak-ng path for phonemizer (required by g2p module)
18
+ # Note: Environment configuration handled by hf_config.py for HuggingFace Spaces
19
+ # or by launch scripts for local development
20
+ if "PHONEMIZER_ESPEAK_PATH" not in os.environ:
21
+ # Fallback for local development without launcher
22
+ espeak_path = Path(__file__).parent.parent.parent / "external" / "espeak-ng"
23
+ if espeak_path.exists():
24
+ os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = str(espeak_path / "libespeak-ng.dll")
25
+ os.environ["PHONEMIZER_ESPEAK_PATH"] = str(espeak_path)
26
+
27
+ # Add DiffRhythm2 source code to path (cloned repo, not pip package)
28
+ diffrhythm2_src = Path(__file__).parent.parent.parent / "models" / "diffrhythm2_source"
29
+ sys.path.insert(0, str(diffrhythm2_src))
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+ class DiffRhythmService:
34
+ """Service for DiffRhythm 2 music generation"""
35
+
36
+ def __init__(self, model_path: str):
37
+ """
38
+ Initialize DiffRhythm 2 service
39
+
40
+ Args:
41
+ model_path: Path to DiffRhythm 2 model files
42
+ """
43
+ self.model_path = model_path
44
+ self.model = None
45
+ self.mulan = None
46
+ self.lrc_tokenizer = None
47
+ self.decoder = None
48
+ self.is_initialized = False
49
+ self.device = self._get_device()
50
+ logger.info(f"DiffRhythm 2 service created with model path: {model_path}")
51
+ logger.info(f"Using device: {self.device}")
52
+
53
+ def _get_device(self):
54
+ """Get compute device (CUDA or CPU)"""
55
+ # Try CUDA first (NVIDIA)
56
+ if torch.cuda.is_available():
57
+ logger.info("Using CUDA (NVIDIA GPU)")
58
+ return torch.device("cuda")
59
+
60
+ # Note: DirectML support disabled due to version conflicts with DiffRhythm2
61
+ # DiffRhythm2 requires torch>=2.4, but torch-directml requires torch==2.4.1
62
+ # For AMD GPU acceleration, consider using ROCm with compatible PyTorch build
63
+
64
+ # Fallback to CPU
65
+ logger.info("Using CPU (no GPU acceleration)")
66
+ return torch.device("cpu")
67
+
68
+ def _initialize_model(self):
69
+ """Lazy load the DiffRhythm 2 model when first needed"""
70
+ if self.is_initialized:
71
+ return
72
+
73
+ try:
74
+ logger.info("Initializing DiffRhythm 2 model...")
75
+
76
+ from diffrhythm2.cfm import CFM
77
+ from diffrhythm2.backbones.dit import DiT
78
+ from bigvgan.model import Generator
79
+ from muq import MuQMuLan
80
+ from huggingface_hub import hf_hub_download
81
+ from safetensors.torch import load_file
82
+
83
+ # Load DiffRhythm 2 model
84
+ repo_id = "ASLP-lab/DiffRhythm2"
85
+
86
+ # Download model files
87
+ model_ckpt = hf_hub_download(
88
+ repo_id=repo_id,
89
+ filename="model.safetensors",
90
+ local_dir=self.model_path,
91
+ local_files_only=False,
92
+ )
93
+ model_config_path = hf_hub_download(
94
+ repo_id=repo_id,
95
+ filename="config.json",
96
+ local_dir=self.model_path,
97
+ local_files_only=False,
98
+ )
99
+
100
+ # Load config
101
+ with open(model_config_path) as f:
102
+ model_config = json.load(f)
103
+
104
+ model_config['use_flex_attn'] = False
105
+
106
+ # Create model
107
+ self.model = CFM(
108
+ transformer=DiT(**model_config),
109
+ num_channels=model_config['mel_dim'],
110
+ block_size=model_config['block_size'],
111
+ )
112
+
113
+ # Load weights
114
+ ckpt = load_file(model_ckpt)
115
+ self.model.load_state_dict(ckpt)
116
+ self.model = self.model.to(self.device)
117
+
118
+ # Load MuLan for style encoding
119
+ self.mulan = MuQMuLan.from_pretrained(
120
+ "OpenMuQ/MuQ-MuLan-large",
121
+ cache_dir=os.path.join(self.model_path, "mulan")
122
+ ).to(self.device)
123
+
124
+ # Load tokenizer
125
+ from g2p.g2p_generation import chn_eng_g2p
126
+ vocab_path = os.path.join(self.model_path, "vocab.json")
127
+ if not os.path.exists(vocab_path):
128
+ # Download vocab
129
+ vocab_path = hf_hub_download(
130
+ repo_id=repo_id,
131
+ filename="g2p/g2p/vocab.json",
132
+ local_dir=self.model_path,
133
+ local_files_only=False,
134
+ )
135
+
136
+ with open(vocab_path, 'r') as f:
137
+ phone2id = json.load(f)['vocab']
138
+
139
+ self.lrc_tokenizer = {
140
+ 'phone2id': phone2id,
141
+ 'g2p': chn_eng_g2p
142
+ }
143
+
144
+ # Load decoder (BigVGAN vocoder)
145
+ decoder_ckpt = hf_hub_download(
146
+ repo_id=repo_id,
147
+ filename="decoder.bin",
148
+ local_dir=self.model_path,
149
+ local_files_only=False,
150
+ )
151
+ decoder_config = hf_hub_download(
152
+ repo_id=repo_id,
153
+ filename="decoder.json",
154
+ local_dir=self.model_path,
155
+ local_files_only=False,
156
+ )
157
+
158
+ self.decoder = Generator(decoder_config, decoder_ckpt)
159
+ self.decoder = self.decoder.to(self.device)
160
+
161
+ logger.info("βœ… DiffRhythm 2 model loaded successfully")
162
+
163
+ self.is_initialized = True
164
+ logger.info("DiffRhythm 2 service initialized")
165
+
166
+ except Exception as e:
167
+ logger.error(f"Failed to initialize DiffRhythm 2: {str(e)}", exc_info=True)
168
+ raise RuntimeError(f"Could not load DiffRhythm 2 model: {str(e)}")
169
+
170
+ def generate(
171
+ self,
172
+ prompt: str,
173
+ duration: int = 30,
174
+ sample_rate: int = 44100,
175
+ lyrics: Optional[str] = None,
176
+ reference_audio: Optional[str] = None
177
+ ) -> str:
178
+ """
179
+ Generate music from text prompt with optional vocals/lyrics and style reference
180
+
181
+ Args:
182
+ prompt: Text description of desired music
183
+ duration: Length in seconds
184
+ sample_rate: Audio sample rate
185
+ lyrics: Optional lyrics for vocals
186
+ reference_audio: Optional path to reference audio for style consistency
187
+
188
+ Returns:
189
+ Path to generated audio file
190
+ """
191
+ try:
192
+ self._initialize_model()
193
+
194
+ if lyrics:
195
+ logger.info(f"Generating music with vocals: prompt='{prompt}', lyrics_length={len(lyrics)}")
196
+ else:
197
+ logger.info(f"Generating instrumental music: prompt='{prompt}'")
198
+
199
+ if reference_audio and os.path.exists(reference_audio):
200
+ logger.info(f"Using style reference: {reference_audio}")
201
+
202
+ logger.info(f"Duration={duration}s")
203
+
204
+ # Try to generate with DiffRhythm 2
205
+ if self.model is not None:
206
+ audio = self._generate_with_diffrhythm2(prompt, lyrics, duration, sample_rate, reference_audio)
207
+ else:
208
+ # Fallback: Generate placeholder
209
+ logger.warning("Using placeholder audio generation")
210
+ audio = self._generate_placeholder(duration, sample_rate)
211
+
212
+ # Save to file
213
+ output_dir = os.path.join('outputs', 'music')
214
+ os.makedirs(output_dir, exist_ok=True)
215
+
216
+ clip_id = str(uuid.uuid4())
217
+ output_path = os.path.join(output_dir, f"{clip_id}.wav")
218
+
219
+ # Ensure audio is in correct format (channels, samples) for soundfile
220
+ # If audio is 1D (mono), keep it as is. If 2D, ensure it's (samples, channels)
221
+ if audio.ndim == 1:
222
+ # Mono audio - soundfile expects (samples,) shape
223
+ sf.write(output_path, audio, sample_rate)
224
+ else:
225
+ # Stereo/multi-channel - soundfile expects (samples, channels)
226
+ sf.write(output_path, audio, sample_rate)
227
+
228
+ logger.info(f"Music generated successfully: {output_path}")
229
+
230
+ return output_path
231
+
232
+ except Exception as e:
233
+ logger.error(f"Music generation failed: {str(e)}", exc_info=True)
234
+ raise RuntimeError(f"Failed to generate music: {str(e)}")
235
+
236
+ def _generate_with_diffrhythm2(
237
+ self,
238
+ prompt: str,
239
+ lyrics: Optional[str],
240
+ duration: int,
241
+ sample_rate: int,
242
+ reference_audio: Optional[str] = None
243
+ ) -> np.ndarray:
244
+ """
245
+ Generate music using DiffRhythm 2 model with optional style reference
246
+
247
+ Args:
248
+ prompt: Music description (used as style prompt)
249
+ lyrics: Lyrics for vocals (required for vocal generation)
250
+ duration: Duration in seconds
251
+ sample_rate: Sample rate
252
+ reference_audio: Optional path to reference audio for style guidance
253
+
254
+ Returns:
255
+ Audio array
256
+ """
257
+ try:
258
+ logger.info("Generating with DiffRhythm 2 model...")
259
+
260
+ # Prepare lyrics tokens
261
+ if lyrics:
262
+ lyrics_token = self._tokenize_lyrics(lyrics)
263
+ else:
264
+ # For instrumental, use empty structure
265
+ lyrics_token = torch.tensor([500, 511], dtype=torch.long, device=self.device) # [start][stop]
266
+
267
+ # Encode style prompt with optional reference audio blending
268
+ with torch.no_grad():
269
+ if reference_audio and os.path.exists(reference_audio):
270
+ try:
271
+ import torchaudio
272
+ # Load reference audio
273
+ ref_waveform, ref_sr = torchaudio.load(reference_audio)
274
+ if ref_sr != 24000: # MuLan expects 24kHz
275
+ ref_waveform = torchaudio.functional.resample(ref_waveform, ref_sr, 24000)
276
+
277
+ # Encode reference audio with MuLan
278
+ ref_waveform = ref_waveform.to(self.device)
279
+ audio_style_embed = self.mulan(audios=ref_waveform.unsqueeze(0))
280
+ text_style_embed = self.mulan(texts=[prompt])
281
+
282
+ # Blend reference audio style with text prompt (70% audio, 30% text)
283
+ style_prompt_embed = 0.7 * audio_style_embed + 0.3 * text_style_embed
284
+ logger.info("Using blended style: 70% reference audio + 30% text prompt")
285
+ except Exception as e:
286
+ logger.warning(f"Failed to use reference audio, using text prompt only: {e}")
287
+ style_prompt_embed = self.mulan(texts=[prompt])
288
+ else:
289
+ style_prompt_embed = self.mulan(texts=[prompt])
290
+
291
+ style_prompt_embed = style_prompt_embed.to(self.device).squeeze(0)
292
+
293
+ # Use FP16 if on GPU
294
+ if self.device.type != 'cpu':
295
+ self.model = self.model.half()
296
+ self.decoder = self.decoder.half()
297
+ style_prompt_embed = style_prompt_embed.half()
298
+
299
+ # Generate latent representation
300
+ with torch.inference_mode():
301
+ latent = self.model.sample_block_cache(
302
+ text=lyrics_token.unsqueeze(0),
303
+ duration=int(duration * 5), # DiffRhythm uses 5 frames per second
304
+ style_prompt=style_prompt_embed.unsqueeze(0),
305
+ steps=16, # Sampling steps
306
+ cfg_strength=2.0, # Classifier-free guidance
307
+ process_bar=False
308
+ )
309
+
310
+ # Decode to audio
311
+ latent = latent.transpose(1, 2)
312
+ audio = self.decoder.decode_audio(latent, overlap=5, chunk_size=20)
313
+
314
+ # Convert to numpy
315
+ audio = audio.float().cpu().numpy().squeeze()
316
+
317
+ # Ensure correct length
318
+ target_length = int(duration * sample_rate)
319
+ if len(audio) > target_length:
320
+ audio = audio[:target_length]
321
+ elif len(audio) < target_length:
322
+ audio = np.pad(audio, (0, target_length - len(audio)))
323
+
324
+ # Resample if needed
325
+ if sample_rate != 24000: # DiffRhythm 2 native sample rate
326
+ import scipy.signal as signal
327
+ audio = signal.resample(audio, target_length)
328
+
329
+ logger.info("βœ… DiffRhythm 2 generation successful")
330
+ return audio.astype(np.float32)
331
+
332
+ except Exception as e:
333
+ logger.error(f"DiffRhythm 2 generation failed: {str(e)}")
334
+ return self._generate_placeholder(duration, sample_rate)
335
+
336
+ def _tokenize_lyrics(self, lyrics: str) -> torch.Tensor:
337
+ """
338
+ Tokenize lyrics for DiffRhythm 2
339
+
340
+ Args:
341
+ lyrics: Lyrics text
342
+
343
+ Returns:
344
+ Tokenized lyrics tensor
345
+ """
346
+ try:
347
+ # Structure tags
348
+ STRUCT_INFO = {
349
+ "[start]": 500,
350
+ "[end]": 501,
351
+ "[intro]": 502,
352
+ "[verse]": 503,
353
+ "[chorus]": 504,
354
+ "[outro]": 505,
355
+ "[inst]": 506,
356
+ "[solo]": 507,
357
+ "[bridge]": 508,
358
+ "[hook]": 509,
359
+ "[break]": 510,
360
+ "[stop]": 511,
361
+ "[space]": 512
362
+ }
363
+
364
+ # Convert lyrics to phonemes and tokens
365
+ phone, tokens = self.lrc_tokenizer['g2p'](lyrics)
366
+ tokens = [x + 1 for x in tokens] # Offset by 1
367
+
368
+ # Add structure: [start] + lyrics + [stop]
369
+ lyrics_tokens = [STRUCT_INFO['[start]']] + tokens + [STRUCT_INFO['[stop]']]
370
+
371
+ return torch.tensor(lyrics_tokens, dtype=torch.long, device=self.device)
372
+
373
+ except Exception as e:
374
+ logger.error(f"Lyrics tokenization failed: {str(e)}")
375
+ # Return minimal structure
376
+ return torch.tensor([500, 511], dtype=torch.long, device=self.device)
377
+
378
+ def _generate_placeholder(self, duration: int, sample_rate: int) -> np.ndarray:
379
+ """
380
+ Generate placeholder audio (for testing without actual model)
381
+
382
+ Args:
383
+ duration: Length in seconds
384
+ sample_rate: Sample rate
385
+
386
+ Returns:
387
+ Audio array
388
+ """
389
+ logger.warning("Using placeholder audio - DiffRhythm 2 model not loaded")
390
+
391
+ # Generate simple sine wave as placeholder
392
+ t = np.linspace(0, duration, int(duration * sample_rate))
393
+ frequency = 440 # A4 note
394
+ audio = 0.3 * np.sin(2 * np.pi * frequency * t)
395
+
396
+ return audio.astype(np.float32)
397
+
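For orientation, a minimal usage sketch of the generation service above. This is a hedged sketch, not the app's actual wiring: the class name `MusicService`, the module path, and the constructor argument are assumptions (only the method bodies appear in this diff), and the first call downloads the model weights, which can take several minutes.

```python
# Hypothetical usage -- MusicService and its constructor are assumed;
# generate()'s signature matches the method defined above.
from services.music_service import MusicService

service = MusicService(model_path="models/diffrhythm2")

# Instrumental clip from a text prompt alone
instrumental = service.generate(
    prompt="mellow lo-fi hip hop with warm keys",
    duration=20,           # seconds; shorter clips generate faster
    sample_rate=44100,
)

# Passing lyrics switches generate() onto the vocal path
vocal_clip = service.generate(
    prompt="upbeat pop song with bright synths",
    duration=30,
    lyrics="Sunrise on the highway\nWe are never looking back",
)
print(instrumental, vocal_clip)
```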
backend/services/export_service.py ADDED
@@ -0,0 +1,123 @@
+"""
+Export and merge service
+"""
+import os
+import logging
+from typing import Optional, List
+import numpy as np
+import soundfile as sf
+from services.timeline_service import TimelineService
+
+logger = logging.getLogger(__name__)
+
+class ExportService:
+    """Service for exporting and merging audio"""
+
+    def __init__(self):
+        """Initialize export service"""
+        self.timeline_service = TimelineService()
+        logger.info("Export service initialized")
+
+    def merge_clips(
+        self,
+        filename: str = "output",
+        export_format: str = "wav"
+    ) -> Optional[str]:
+        """
+        Merge all timeline clips into a single file
+
+        Args:
+            filename: Output filename (without extension)
+            export_format: Output format (wav, mp3, flac)
+
+        Returns:
+            Path to merged file, or None if no clips
+        """
+        try:
+            clips = self.timeline_service.get_all_clips()
+
+            if not clips:
+                logger.warning("No clips to merge")
+                return None
+
+            logger.info(f"Merging {len(clips)} clips")
+
+            # Load all clips
+            audio_data = []
+            sample_rate = None
+
+            for clip in clips:
+                audio, sr = sf.read(clip['file_path'])
+
+                if sample_rate is None:
+                    sample_rate = sr
+                elif sr != sample_rate:
+                    logger.warning(f"Sample rate mismatch: {sr} vs {sample_rate}")
+                    # Could resample here if needed
+
+                audio_data.append(audio)
+
+            # Concatenate all clips
+            merged_audio = np.concatenate(audio_data)
+
+            # Normalize
+            max_val = np.abs(merged_audio).max()
+            if max_val > 0:
+                merged_audio = merged_audio / max_val * 0.95
+
+            # Save merged file
+            output_dir = 'outputs'
+            os.makedirs(output_dir, exist_ok=True)
+
+            output_path = os.path.join(output_dir, f"{filename}.{export_format}")
+
+            sf.write(output_path, merged_audio, sample_rate)
+
+            logger.info(f"Clips merged successfully: {output_path}")
+            return output_path
+
+        except Exception as e:
+            logger.error(f"Failed to merge clips: {str(e)}", exc_info=True)
+            raise
+
+    def export_clip(
+        self,
+        clip_id: str,
+        export_format: str = "wav"
+    ) -> Optional[str]:
+        """
+        Export a single clip
+
+        Args:
+            clip_id: ID of clip to export
+            export_format: Output format
+
+        Returns:
+            Path to exported file, or None if clip not found
+        """
+        try:
+            clip = self.timeline_service.get_clip(clip_id)
+
+            if not clip:
+                logger.warning(f"Clip not found: {clip_id}")
+                return None
+
+            logger.info(f"Exporting clip: {clip_id}")
+
+            # Load clip
+            audio, sr = sf.read(clip.file_path)
+
+            # Export with requested format
+            output_dir = 'outputs'
+            os.makedirs(output_dir, exist_ok=True)
+
+            output_path = os.path.join(output_dir, f"{clip_id}.{export_format}")
+
+            sf.write(output_path, audio, sr)
+
+            logger.info(f"Clip exported: {output_path}")
+            return output_path
+
+        except Exception as e:
+            logger.error(f"Failed to export clip: {str(e)}", exc_info=True)
+            raise
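A short sketch of driving the export flow, assuming clips have already been added to the shared timeline elsewhere in the app (the module path is inferred from the import at the top of this file, and the clip id is a placeholder):

```python
from services.export_service import ExportService

exporter = ExportService()

# Merge everything on the timeline into one peak-normalized file
merged = exporter.merge_clips(filename="my_song", export_format="wav")
if merged is None:
    print("Timeline is empty - nothing to export")
else:
    print(f"Exported: {merged}")

# Or re-export a single clip in another format
single = exporter.export_clip("some-clip-uuid", export_format="flac")
```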
backend/services/fish_speech_service.py ADDED
@@ -0,0 +1,148 @@
+"""
+Fish Speech TTS/vocals service
+"""
+import os
+import logging
+import uuid
+import torch
+from typing import Optional
+import numpy as np
+import soundfile as sf
+
+logger = logging.getLogger(__name__)
+
+class FishSpeechService:
+    """Service for Fish Speech TTS and vocal synthesis"""
+
+    def __init__(self, model_path: str):
+        """
+        Initialize Fish Speech service
+
+        Args:
+            model_path: Path to Fish Speech model files
+        """
+        self.model_path = model_path
+        self.model = None
+        self.vocoder = None
+        self.is_initialized = False
+        self.device = self._get_device()
+        logger.info(f"Fish Speech service created with model path: {model_path}")
+        logger.info(f"Using device: {self.device}")
+
+    def _get_device(self):
+        """Get compute device (AMD GPU via DirectML or CPU)"""
+        try:
+            from utils.amd_gpu import DEFAULT_DEVICE
+            return DEFAULT_DEVICE
+        except Exception:
+            return torch.device("cpu")
+
+    def _initialize_model(self):
+        """Lazy-load the model when first needed"""
+        if self.is_initialized:
+            return
+
+        try:
+            logger.info("Initializing Fish Speech model...")
+            # TODO: Load actual Fish Speech model
+            # from fish_speech import FishSpeechModel
+            # self.model = FishSpeechModel.load(self.model_path)
+
+            self.is_initialized = True
+            logger.info("Fish Speech model initialized successfully")
+
+        except Exception as e:
+            logger.error(f"Failed to initialize Fish Speech model: {str(e)}", exc_info=True)
+            raise RuntimeError(f"Could not load Fish Speech model: {str(e)}")
+
+    def synthesize_vocals(
+        self,
+        lyrics: str,
+        duration: int = 30,
+        sample_rate: int = 44100
+    ) -> str:
+        """
+        Synthesize vocals from lyrics
+
+        Args:
+            lyrics: Lyrics text to sing
+            duration: Target duration in seconds
+            sample_rate: Audio sample rate
+
+        Returns:
+            Path to the generated vocals file
+        """
+        try:
+            self._initialize_model()
+
+            logger.info(f"Synthesizing vocals: {len(lyrics)} characters")
+
+            # TODO: Replace with actual Fish Speech synthesis
+            # vocals = self.model.synthesize(lyrics, duration=duration, sample_rate=sample_rate)
+
+            # Placeholder: generate silence
+            vocals = np.zeros(int(duration * sample_rate), dtype=np.float32)
+
+            # Save to file
+            output_dir = os.path.join('outputs', 'vocals')
+            os.makedirs(output_dir, exist_ok=True)
+
+            vocals_id = str(uuid.uuid4())
+            output_path = os.path.join(output_dir, f"{vocals_id}.wav")
+
+            sf.write(output_path, vocals, sample_rate)
+            logger.info(f"Vocals synthesized: {output_path}")
+
+            return output_path
+
+        except Exception as e:
+            logger.error(f"Vocal synthesis failed: {str(e)}", exc_info=True)
+            raise RuntimeError(f"Failed to synthesize vocals: {str(e)}")
+
+    def add_vocals(
+        self,
+        music_path: str,
+        lyrics: str,
+        duration: int = 30
+    ) -> str:
+        """
+        Add synthesized vocals to a music track
+
+        Args:
+            music_path: Path to the music audio file
+            lyrics: Lyrics to sing
+            duration: Duration in seconds
+
+        Returns:
+            Path to the mixed audio file
+        """
+        try:
+            logger.info(f"Adding vocals to music: {music_path}")
+
+            # Load music
+            music_audio, sr = sf.read(music_path)
+
+            # Synthesize vocals
+            vocals_path = self.synthesize_vocals(lyrics, duration, sr)
+            vocals_audio, _ = sf.read(vocals_path)
+
+            # Mix vocals with music, truncating to the shorter track
+            min_len = min(len(music_audio), len(vocals_audio))
+            mixed = music_audio[:min_len] * 0.7 + vocals_audio[:min_len] * 0.3
+
+            # Save mixed audio
+            output_dir = os.path.join('outputs', 'mixed')
+            os.makedirs(output_dir, exist_ok=True)
+
+            mixed_id = str(uuid.uuid4())
+            output_path = os.path.join(output_dir, f"{mixed_id}.wav")
+
+            sf.write(output_path, mixed, sr)
+            logger.info(f"Vocals added successfully: {output_path}")
+
+            return output_path
+
+        except Exception as e:
+            logger.error(f"Adding vocals failed: {str(e)}", exc_info=True)
+            raise RuntimeError(f"Failed to add vocals: {str(e)}")
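The 70/30 blend in `add_vocals` assumes both arrays have the same channel layout; if the music is stereo and the vocals mono, the NumPy addition will fail to broadcast. A small, hedged sketch of a shape-safe version of the same blend:

```python
import numpy as np

def mix_tracks(music: np.ndarray, vocals: np.ndarray,
               music_gain: float = 0.7, vocal_gain: float = 0.3) -> np.ndarray:
    """Blend two tracks at fixed gains, tolerating mono/stereo mismatches."""
    # Downmix any stereo input to mono so the arrays always broadcast
    if music.ndim == 2:
        music = music.mean(axis=1)
    if vocals.ndim == 2:
        vocals = vocals.mean(axis=1)
    # Truncate to the shorter track, as add_vocals does above
    n = min(len(music), len(vocals))
    return (music[:n] * music_gain + vocals[:n] * vocal_gain).astype(np.float32)
```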
backend/services/lyricmind_service.py ADDED
@@ -0,0 +1,220 @@
+"""
+LyricMind AI lyrics generation service
+"""
+import os
+import logging
+import torch
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+class LyricMindService:
+    """Service for LyricMind AI lyrics generation"""
+
+    def __init__(self, model_path: str):
+        """
+        Initialize LyricMind service
+
+        Args:
+            model_path: Path to LyricMind model files
+        """
+        self.model_path = model_path
+        self.model = None
+        self.tokenizer = None
+        self.is_initialized = False
+        self.device = self._get_device()
+        logger.info(f"LyricMind service created with model path: {model_path}")
+        logger.info(f"Using device: {self.device}")
+
+    def _get_device(self):
+        """Get compute device (AMD GPU via DirectML or CPU)"""
+        try:
+            from utils.amd_gpu import DEFAULT_DEVICE
+            return DEFAULT_DEVICE
+        except Exception:
+            return torch.device("cpu")
+
+    def _initialize_model(self):
+        """Lazy-load the model when first needed"""
+        if self.is_initialized:
+            return
+
+        try:
+            logger.info("Initializing LyricMind model...")
+
+            # Try to load a text generation model as fallback
+            try:
+                from transformers import AutoTokenizer, AutoModelForCausalLM
+
+                fallback_path = os.path.join(os.path.dirname(self.model_path), "text_generator")
+
+                if os.path.exists(fallback_path):
+                    logger.info(f"Loading text generation model from {fallback_path}")
+                    self.tokenizer = AutoTokenizer.from_pretrained(fallback_path, trust_remote_code=True)
+                    self.model = AutoModelForCausalLM.from_pretrained(
+                        fallback_path,
+                        trust_remote_code=True,
+                        torch_dtype=torch.float32  # Use FP32 for AMD GPU compatibility
+                    )
+                    self.model.to(self.device)
+                    logger.info("✅ Text generation model loaded successfully")
+                else:
+                    logger.warning("Text generation model not found, using placeholder")
+
+            except Exception as e:
+                logger.warning(f"Could not load text model: {str(e)}")
+
+            self.is_initialized = True
+            logger.info("LyricMind service initialized")
+
+        except Exception as e:
+            logger.error(f"Failed to initialize LyricMind model: {str(e)}", exc_info=True)
+            raise RuntimeError(f"Could not load LyricMind model: {str(e)}")
+
+    def generate(
+        self,
+        prompt: str,
+        style: Optional[str] = None,
+        duration: int = 30,
+        prompt_analysis: Optional[dict] = None
+    ) -> str:
+        """
+        Generate lyrics from a prompt using analysis context
+
+        Args:
+            prompt: Description of the desired lyrics theme
+            style: Music style (optional; detected from the prompt if not provided)
+            duration: Target song duration (affects lyrics length)
+            prompt_analysis: Pre-computed prompt analysis (optional)
+
+        Returns:
+            Generated lyrics text
+        """
+        try:
+            self._initialize_model()
+
+            # Use prompt analysis for better context
+            from utils.prompt_analyzer import PromptAnalyzer
+
+            if prompt_analysis is None:
+                analysis = PromptAnalyzer.analyze(prompt)
+            else:
+                analysis = prompt_analysis
+
+            # Use the detected genre/style if not explicitly provided
+            effective_style = style or analysis.get('genre', 'pop')
+            mood = analysis.get('mood', 'neutral')
+
+            logger.info(f"Generating lyrics: prompt='{prompt}', style={effective_style}, mood={mood}")
+
+            # Try to generate with the text model
+            if self.model is not None and self.tokenizer is not None:
+                lyrics = self._generate_with_model(prompt, effective_style, duration, analysis)
+            else:
+                # Fallback: placeholder lyrics
+                lyrics = self._generate_placeholder(prompt, effective_style, duration)
+
+            logger.info("Lyrics generated successfully")
+            return lyrics
+
+        except Exception as e:
+            logger.error(f"Lyrics generation failed: {str(e)}", exc_info=True)
+            raise RuntimeError(f"Failed to generate lyrics: {str(e)}")
+
+    def _generate_with_model(self, prompt: str, style: str, duration: int, analysis: dict) -> str:
+        """
+        Generate lyrics using the text generation model with analysis context
+
+        Args:
+            prompt: Theme prompt
+            style: Music style
+            duration: Duration in seconds
+            analysis: Prompt analysis with genre, mood, etc.
+
+        Returns:
+            Generated lyrics
+        """
+        try:
+            logger.info("Generating lyrics with AI model...")
+
+            # Create a structured prompt with analysis context
+            mood = analysis.get('mood', 'neutral')
+            bpm = analysis.get('bpm', 120)
+
+            full_prompt = f"""Write song lyrics in {style} style about: {prompt}
+Mood: {mood}
+Tempo: {bpm} BPM
+
+Lyrics:
+"""
+
+            # Tokenize
+            inputs = self.tokenizer(full_prompt, return_tensors="pt")
+            inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+            # Cap total length: the prompt plus up to 200 new tokens, 512 max
+            max_length = min(200 + inputs["input_ids"].shape[1], 512)
+
+            # Generate
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_length=max_length,
+                    temperature=0.9,
+                    top_p=0.95,
+                    do_sample=True,
+                    pad_token_id=self.tokenizer.eos_token_id
+                )
+
+            # Decode
+            generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+            # Extract lyrics (remove the prompt)
+            lyrics = generated_text.split("Lyrics:")[-1].strip()
+
+            logger.info("✅ AI lyrics generation successful")
+            return lyrics if lyrics else self._generate_placeholder(prompt, style, duration)
+
+        except Exception as e:
+            logger.error(f"Model generation failed: {str(e)}")
+            return self._generate_placeholder(prompt, style, duration)
+
+    def _generate_placeholder(
+        self,
+        prompt: str,
+        style: str,
+        duration: int
+    ) -> str:
+        """
+        Generate placeholder lyrics for testing
+
+        Args:
+            prompt: Theme prompt
+            style: Music style
+            duration: Duration in seconds
+
+        Returns:
+            Placeholder lyrics
+        """
+        logger.warning("Using placeholder lyrics - LyricMind model not loaded")
+
+        # Estimate the number of lines from the duration
+        lines_per_30s = 8
+        num_lines = int((duration / 30) * lines_per_30s)
+
+        lyrics_lines = [
+            "[Verse 1]",
+            f"Theme: {prompt}",
+            f"Style: {style}",
+            "",
+            "[Chorus]",
+            "This is a placeholder",
+            "Generated by LyricMind AI",
+            "Replace with actual model output",
+        ]
+
+        # Pad to the desired length
+        while len(lyrics_lines) < num_lines:
+            lyrics_lines.append("La la la...")
+
+        return "\n".join(lyrics_lines[:num_lines])
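A usage sketch for the lyrics service; the module path mirrors the other services and is an assumption. Note that `generate()` imports `utils.prompt_analyzer` internally, so this must run inside the app's package layout:

```python
from services.lyricmind_service import LyricMindService

lyricist = LyricMindService(model_path="models/lyricmind")

# Style and mood are detected from the prompt when not supplied
lyrics = lyricist.generate(
    prompt="driving through the city at night",
    duration=30,
)
print(lyrics)

# An explicit style overrides the detected genre
rock_lyrics = lyricist.generate(prompt="leaving home", style="rock", duration=60)
```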
backend/services/mastering_service.py ADDED
@@ -0,0 +1,641 @@
+"""
+Audio mastering service with industry-standard presets using Pedalboard
+"""
+import os
+import logging
+import numpy as np
+from pathlib import Path
+from typing import Dict, List, Optional
+import soundfile as sf
+from pedalboard import (
+    Pedalboard,
+    Compressor,
+    Limiter,
+    Gain,
+    HighpassFilter,
+    LowpassFilter,
+    PeakFilter,
+    LowShelfFilter,
+    HighShelfFilter,
+    Reverb,
+    Chorus,
+    Delay
+)
+
+logger = logging.getLogger(__name__)
+
+class MasteringPreset:
+    """Mastering preset configuration"""
+
+    def __init__(self, name: str, description: str, chain: List):
+        self.name = name
+        self.description = description
+        self.chain = chain
+
+class MasteringService:
+    """Audio mastering and EQ service"""
+
+    # Industry-standard mastering presets
+    PRESETS = {
+        # Clean/Transparent Presets
+        "clean_master": MasteringPreset(
+            "Clean Master",
+            "Transparent mastering with gentle compression",
+            [
+                HighpassFilter(cutoff_frequency_hz=30),
+                PeakFilter(cutoff_frequency_hz=100, gain_db=-1, q=0.7),
+                PeakFilter(cutoff_frequency_hz=3000, gain_db=0.5, q=1.0),
+                PeakFilter(cutoff_frequency_hz=10000, gain_db=1.0, q=0.7),
+                Compressor(threshold_db=-12, ratio=2.0, attack_ms=5, release_ms=100),
+                Limiter(threshold_db=-1.0, release_ms=100)
+            ]
+        ),
+
+        "subtle_warmth": MasteringPreset(
+            "Subtle Warmth",
+            "Gentle low-end enhancement with smooth highs",
+            [
+                HighpassFilter(cutoff_frequency_hz=25),
+                LowShelfFilter(cutoff_frequency_hz=100, gain_db=1.5, q=0.7),
+                PeakFilter(cutoff_frequency_hz=200, gain_db=0.8, q=0.5),
+                PeakFilter(cutoff_frequency_hz=8000, gain_db=-0.5, q=1.0),
+                HighShelfFilter(cutoff_frequency_hz=12000, gain_db=1.0, q=0.7),
+                Compressor(threshold_db=-15, ratio=2.5, attack_ms=10, release_ms=150),
+                Limiter(threshold_db=-0.5, release_ms=100)
+            ]
+        ),
+
+        # Pop/Commercial Presets
+        "modern_pop": MasteringPreset(
+            "Modern Pop",
+            "Radio-ready pop sound with punchy compression",
+            [
+                HighpassFilter(cutoff_frequency_hz=35),
+                PeakFilter(cutoff_frequency_hz=80, gain_db=-1.5, q=0.8),
+                LowShelfFilter(cutoff_frequency_hz=120, gain_db=2.0, q=0.7),
+                PeakFilter(cutoff_frequency_hz=2500, gain_db=1.5, q=1.2),
+                HighShelfFilter(cutoff_frequency_hz=10000, gain_db=2.5, q=0.7),
+                Compressor(threshold_db=-10, ratio=4.0, attack_ms=3, release_ms=80),
+                Limiter(threshold_db=-0.3, release_ms=50)
+            ]
+        ),
+
+        "radio_ready": MasteringPreset(
+            "Radio Ready",
+            "Maximum loudness for commercial radio",
+            [
+                HighpassFilter(cutoff_frequency_hz=40),
+                PeakFilter(cutoff_frequency_hz=60, gain_db=-2.0, q=1.0),
+                LowShelfFilter(cutoff_frequency_hz=150, gain_db=1.5, q=0.8),
+                PeakFilter(cutoff_frequency_hz=3000, gain_db=2.0, q=1.5),
+                PeakFilter(cutoff_frequency_hz=8000, gain_db=1.5, q=1.0),
+                HighShelfFilter(cutoff_frequency_hz=12000, gain_db=3.0, q=0.7),
+                Compressor(threshold_db=-8, ratio=6.0, attack_ms=2, release_ms=60),
+                Limiter(threshold_db=-0.1, release_ms=30)
+            ]
+        ),
+
+        "punchy_commercial": MasteringPreset(
+            "Punchy Commercial",
+            "Aggressive punch for mainstream appeal",
+            [
+                HighpassFilter(cutoff_frequency_hz=30),
+                PeakFilter(cutoff_frequency_hz=100, gain_db=-2.0, q=1.2),
+                LowShelfFilter(cutoff_frequency_hz=200, gain_db=2.5, q=0.7),
+                PeakFilter(cutoff_frequency_hz=1000, gain_db=-1.0, q=0.8),
+                PeakFilter(cutoff_frequency_hz=4000, gain_db=2.5, q=1.5),
+                HighShelfFilter(cutoff_frequency_hz=10000, gain_db=2.0, q=0.8),
+                Compressor(threshold_db=-9, ratio=5.0, attack_ms=1, release_ms=50),
+                Limiter(threshold_db=-0.2, release_ms=40)
+            ]
+        ),
+
+        # Rock/Alternative Presets
+        "rock_master": MasteringPreset(
+            "Rock Master",
+            "Powerful rock sound with emphasis on mids",
+            [
+                HighpassFilter(cutoff_frequency_hz=35),
+                LowShelfFilter(cutoff_frequency_hz=100, gain_db=1.0, q=0.7),
+                PeakFilter(cutoff_frequency_hz=400, gain_db=1.5, q=1.0),
+                PeakFilter(cutoff_frequency_hz=2000, gain_db=2.0, q=1.2),
+                PeakFilter(cutoff_frequency_hz=5000, gain_db=1.5, q=1.0),
+                HighShelfFilter(cutoff_frequency_hz=8000, gain_db=1.0, q=0.8),
+                Compressor(threshold_db=-12, ratio=3.5, attack_ms=5, release_ms=120),
+                Limiter(threshold_db=-0.5, release_ms=80)
+            ]
+        ),
+
+        "metal_aggressive": MasteringPreset(
+            "Metal Aggressive",
+            "Heavy, aggressive metal mastering",
+            [
+                HighpassFilter(cutoff_frequency_hz=40),
+                PeakFilter(cutoff_frequency_hz=80, gain_db=-1.5, q=1.0),
+                LowShelfFilter(cutoff_frequency_hz=150, gain_db=2.0, q=0.8),
+                PeakFilter(cutoff_frequency_hz=800, gain_db=-1.5, q=1.2),
+                PeakFilter(cutoff_frequency_hz=3000, gain_db=3.0, q=1.5),
+                HighShelfFilter(cutoff_frequency_hz=10000, gain_db=2.5, q=0.7),
+                Compressor(threshold_db=-8, ratio=6.0, attack_ms=1, release_ms=50),
+                Limiter(threshold_db=-0.1, release_ms=30)
+            ]
+        ),
+
+        "indie_rock": MasteringPreset(
+            "Indie Rock",
+            "Lo-fi character with mid presence",
+            [
+                HighpassFilter(cutoff_frequency_hz=30),
+                LowShelfFilter(cutoff_frequency_hz=120, gain_db=0.5, q=0.7),
+                PeakFilter(cutoff_frequency_hz=500, gain_db=1.5, q=1.0),
+                PeakFilter(cutoff_frequency_hz=2500, gain_db=2.0, q=1.2),
+                PeakFilter(cutoff_frequency_hz=7000, gain_db=-0.5, q=1.0),
+                HighShelfFilter(cutoff_frequency_hz=10000, gain_db=0.5, q=0.8),
+                Compressor(threshold_db=-14, ratio=3.0, attack_ms=8, release_ms=150),
+                Limiter(threshold_db=-0.8, release_ms=100)
+            ]
+        ),
+
+        # Electronic/EDM Presets
+        "edm_club": MasteringPreset(
+            "EDM Club",
+            "Powerful club sound with deep bass",
+            [
+                HighpassFilter(cutoff_frequency_hz=25),
+                LowShelfFilter(cutoff_frequency_hz=80, gain_db=3.0, q=0.7),
+                PeakFilter(cutoff_frequency_hz=150, gain_db=2.0, q=0.8),
+                PeakFilter(cutoff_frequency_hz=1000, gain_db=-1.5, q=1.0),
+                PeakFilter(cutoff_frequency_hz=5000, gain_db=2.0, q=1.2),
+                HighShelfFilter(cutoff_frequency_hz=12000, gain_db=3.0, q=0.7),
+                Compressor(threshold_db=-6, ratio=8.0, attack_ms=0.5, release_ms=40),
+                Limiter(threshold_db=0.0, release_ms=20)
+            ]
+        ),
+
+        "house_groovy": MasteringPreset(
+            "House Groovy",
+            "Smooth house music with rolling bass",
+            [
+                HighpassFilter(cutoff_frequency_hz=30),
+                LowShelfFilter(cutoff_frequency_hz=100, gain_db=2.5, q=0.7),
+                PeakFilter(cutoff_frequency_hz=250, gain_db=1.0, q=0.8),
+                PeakFilter(cutoff_frequency_hz=2000, gain_db=0.5, q=1.0),
+                PeakFilter(cutoff_frequency_hz=8000, gain_db=1.5, q=1.0),
+                HighShelfFilter(cutoff_frequency_hz=10000, gain_db=2.0, q=0.7),
+                Compressor(threshold_db=-10, ratio=4.0, attack_ms=2, release_ms=60),
+                Limiter(threshold_db=-0.2, release_ms=40)
+            ]
+        ),
+
+        "techno_dark": MasteringPreset(
+            "Techno Dark",
+            "Dark, pounding techno master",
+            [
+                HighpassFilter(cutoff_frequency_hz=35),
+                PeakFilter(cutoff_frequency_hz=60, gain_db=2.0, q=1.0),
+                LowShelfFilter(cutoff_frequency_hz=120, gain_db=1.5, q=0.8),
+                PeakFilter(cutoff_frequency_hz=800, gain_db=-2.0, q=1.5),
+                PeakFilter(cutoff_frequency_hz=4000, gain_db=1.0, q=1.0),
+                HighShelfFilter(cutoff_frequency_hz=10000, gain_db=-0.5, q=0.8),
+                Compressor(threshold_db=-8, ratio=6.0, attack_ms=1, release_ms=50),
+                Limiter(threshold_db=-0.1, release_ms=30)
+            ]
+        ),
+
+        "dubstep_heavy": MasteringPreset(
+            "Dubstep Heavy",
+            "Sub-bass focused with crispy highs",
+            [
+                HighpassFilter(cutoff_frequency_hz=20),
+                PeakFilter(cutoff_frequency_hz=50, gain_db=3.5, q=1.2),
+                LowShelfFilter(cutoff_frequency_hz=100, gain_db=2.5, q=0.8),
+                PeakFilter(cutoff_frequency_hz=500, gain_db=-2.0, q=1.5),
+                PeakFilter(cutoff_frequency_hz=6000, gain_db=2.5, q=1.2),
+                HighShelfFilter(cutoff_frequency_hz=12000, gain_db=3.5, q=0.7),
+                Compressor(threshold_db=-6, ratio=10.0, attack_ms=0.3, release_ms=30),
+                Limiter(threshold_db=0.0, release_ms=20)
+            ]
+        ),
+
+        # Hip-Hop/R&B Presets
+        "hiphop_modern": MasteringPreset(
+            "Hip-Hop Modern",
+            "Contemporary hip-hop with deep bass",
+            [
+                HighpassFilter(cutoff_frequency_hz=25),
+                LowShelfFilter(cutoff_frequency_hz=80, gain_db=2.5, q=0.7),
+                PeakFilter(cutoff_frequency_hz=150, gain_db=1.5, q=0.8),
+                PeakFilter(cutoff_frequency_hz=1000, gain_db=-1.0, q=1.0),
+                PeakFilter(cutoff_frequency_hz=3500, gain_db=2.0, q=1.2),
+                HighShelfFilter(cutoff_frequency_hz=10000, gain_db=1.5, q=0.7),
+                Compressor(threshold_db=-10, ratio=4.0, attack_ms=5, release_ms=80),
+                Limiter(threshold_db=-0.3, release_ms=60)
+            ]
+        ),
+
+        "trap_808": MasteringPreset(
+            "Trap 808",
+            "808-focused trap mastering",
+            [
+                HighpassFilter(cutoff_frequency_hz=20),
+                PeakFilter(cutoff_frequency_hz=50, gain_db=3.0, q=1.0),
+                LowShelfFilter(cutoff_frequency_hz=100, gain_db=2.0, q=0.7),
+                PeakFilter(cutoff_frequency_hz=800, gain_db=-1.5, q=1.2),
+                PeakFilter(cutoff_frequency_hz=5000, gain_db=2.5, q=1.2),
+                HighShelfFilter(cutoff_frequency_hz=12000, gain_db=2.0, q=0.7),
+                Compressor(threshold_db=-8, ratio=5.0, attack_ms=3, release_ms=60),
+                Limiter(threshold_db=-0.2, release_ms=40)
+            ]
+        ),
+
+        "rnb_smooth": MasteringPreset(
+            "R&B Smooth",
+            "Silky smooth R&B sound",
+            [
+                HighpassFilter(cutoff_frequency_hz=30),
+                LowShelfFilter(cutoff_frequency_hz=100, gain_db=1.5, q=0.7),
+                PeakFilter(cutoff_frequency_hz=300, gain_db=1.0, q=0.8),
+                PeakFilter(cutoff_frequency_hz=2000, gain_db=0.5, q=1.0),
+                PeakFilter(cutoff_frequency_hz=6000, gain_db=1.5, q=1.0),
+                HighShelfFilter(cutoff_frequency_hz=10000, gain_db=2.0, q=0.7),
+                Compressor(threshold_db=-12, ratio=3.0, attack_ms=8, release_ms=120),
+                Limiter(threshold_db=-0.5, release_ms=80)
+            ]
+        ),
+
+        # Acoustic/Organic Presets
+        "acoustic_natural": MasteringPreset(
+            "Acoustic Natural",
+            "Natural, transparent acoustic sound",
+            [
+                HighpassFilter(cutoff_frequency_hz=25),
+                LowShelfFilter(cutoff_frequency_hz=100, gain_db=0.5, q=0.7),
+                PeakFilter(cutoff_frequency_hz=500, gain_db=0.8, q=0.8),
+                PeakFilter(cutoff_frequency_hz=3000, gain_db=1.0, q=1.0),
+                HighShelfFilter(cutoff_frequency_hz=8000, gain_db=1.5, q=0.7),
+                Compressor(threshold_db=-16, ratio=2.0, attack_ms=15, release_ms=200),
+                Limiter(threshold_db=-1.0, release_ms=120)
+            ]
+        ),
+
+        "folk_warm": MasteringPreset(
+            "Folk Warm",
+            "Warm, intimate folk sound",
+            [
+                HighpassFilter(cutoff_frequency_hz=30),
+                LowShelfFilter(cutoff_frequency_hz=150, gain_db=1.0, q=0.7),
+                PeakFilter(cutoff_frequency_hz=400, gain_db=1.5, q=0.8),
+                PeakFilter(cutoff_frequency_hz=2500, gain_db=1.0, q=1.0),
+                PeakFilter(cutoff_frequency_hz=7000, gain_db=-0.5, q=1.0),
+                HighShelfFilter(cutoff_frequency_hz=10000, gain_db=1.0, q=0.8),
+                Compressor(threshold_db=-18, ratio=2.5, attack_ms=20, release_ms=250),
+                Limiter(threshold_db=-1.5, release_ms=150)
+            ]
+        ),
+
+        "jazz_vintage": MasteringPreset(
+            "Jazz Vintage",
+            "Classic jazz warmth and space",
+            [
+                HighpassFilter(cutoff_frequency_hz=35),
+                LowShelfFilter(cutoff_frequency_hz=120, gain_db=1.0, q=0.7),
+                PeakFilter(cutoff_frequency_hz=500, gain_db=1.0, q=0.8),
+                PeakFilter(cutoff_frequency_hz=2000, gain_db=0.5, q=0.8),
+                PeakFilter(cutoff_frequency_hz=8000, gain_db=-1.0, q=1.0),
+                HighShelfFilter(cutoff_frequency_hz=12000, gain_db=0.5, q=0.8),
+                Compressor(threshold_db=-20, ratio=2.0, attack_ms=25, release_ms=300),
+                Limiter(threshold_db=-2.0, release_ms=180)
+            ]
+        ),
+
+        # Classical/Orchestral Presets
+        "orchestral_wide": MasteringPreset(
+            "Orchestral Wide",
+            "Wide, natural orchestral sound",
+            [
+                HighpassFilter(cutoff_frequency_hz=20),
+                LowShelfFilter(cutoff_frequency_hz=80, gain_db=0.5, q=0.7),
+                PeakFilter(cutoff_frequency_hz=300, gain_db=0.5, q=0.7),
+                PeakFilter(cutoff_frequency_hz=4000, gain_db=0.8, q=0.8),
+                HighShelfFilter(cutoff_frequency_hz=10000, gain_db=1.0, q=0.7),
+                Compressor(threshold_db=-24, ratio=1.5, attack_ms=30, release_ms=400),
+                Limiter(threshold_db=-3.0, release_ms=250)
+            ]
+        ),
+
+        "classical_concert": MasteringPreset(
+            "Classical Concert",
+            "Concert hall ambience and dynamics",
+            [
+                HighpassFilter(cutoff_frequency_hz=25),
+                PeakFilter(cutoff_frequency_hz=200, gain_db=0.5, q=0.7),
+                PeakFilter(cutoff_frequency_hz=1000, gain_db=0.3, q=0.8),
+                PeakFilter(cutoff_frequency_hz=6000, gain_db=0.8, q=0.8),
+                HighShelfFilter(cutoff_frequency_hz=12000, gain_db=0.5, q=0.7),
+                Compressor(threshold_db=-30, ratio=1.2, attack_ms=50, release_ms=500),
+                Limiter(threshold_db=-4.0, release_ms=300)
+            ]
+        ),
+
+        # Ambient/Atmospheric Presets
+        "ambient_spacious": MasteringPreset(
+            "Ambient Spacious",
+            "Wide, spacious ambient master",
+            [
+                HighpassFilter(cutoff_frequency_hz=25),
+                LowShelfFilter(cutoff_frequency_hz=100, gain_db=0.5, q=0.7),
+                PeakFilter(cutoff_frequency_hz=500, gain_db=-0.5, q=0.8),
+                PeakFilter(cutoff_frequency_hz=3000, gain_db=0.5, q=1.0),
+                HighShelfFilter(cutoff_frequency_hz=8000, gain_db=1.5, q=0.7),
+                Compressor(threshold_db=-20, ratio=2.0, attack_ms=50, release_ms=400),
+                Limiter(threshold_db=-2.0, release_ms=200)
+            ]
+        ),
+
+        "cinematic_epic": MasteringPreset(
+            "Cinematic Epic",
+            "Big, powerful cinematic sound",
+            [
+                HighpassFilter(cutoff_frequency_hz=30),
+                LowShelfFilter(cutoff_frequency_hz=100, gain_db=2.0, q=0.7),
+                PeakFilter(cutoff_frequency_hz=250, gain_db=1.0, q=0.8),
+                PeakFilter(cutoff_frequency_hz=2000, gain_db=1.5, q=1.0),
+                PeakFilter(cutoff_frequency_hz=6000, gain_db=2.0, q=1.0),
+                HighShelfFilter(cutoff_frequency_hz=10000, gain_db=2.5, q=0.7),
+                Compressor(threshold_db=-14, ratio=3.0, attack_ms=10, release_ms=150),
+                Limiter(threshold_db=-0.5, release_ms=100)
+            ]
+        ),
+
+        # Vintage/Lo-Fi Presets
+        "lofi_chill": MasteringPreset(
+            "Lo-Fi Chill",
+            "Vintage lo-fi character",
+            [
+                HighpassFilter(cutoff_frequency_hz=50),
+                LowpassFilter(cutoff_frequency_hz=10000),
+                LowShelfFilter(cutoff_frequency_hz=150, gain_db=1.5, q=0.7),
+                PeakFilter(cutoff_frequency_hz=800, gain_db=-1.0, q=1.2),
+                PeakFilter(cutoff_frequency_hz=4000, gain_db=-1.5, q=1.0),
+                Compressor(threshold_db=-12, ratio=3.0, attack_ms=15, release_ms=180),
+                Limiter(threshold_db=-1.0, release_ms=120)
+            ]
+        ),
+
+        "vintage_vinyl": MasteringPreset(
+            "Vintage Vinyl",
+            "Classic vinyl record warmth",
+            [
+                HighpassFilter(cutoff_frequency_hz=40),
+                LowpassFilter(cutoff_frequency_hz=12000),
+                LowShelfFilter(cutoff_frequency_hz=120, gain_db=2.0, q=0.7),
+                PeakFilter(cutoff_frequency_hz=1000, gain_db=-0.5, q=0.8),
+                PeakFilter(cutoff_frequency_hz=5000, gain_db=-1.0, q=1.0),
+                HighShelfFilter(cutoff_frequency_hz=8000, gain_db=-1.5, q=0.8),
+                Compressor(threshold_db=-16, ratio=2.5, attack_ms=20, release_ms=200),
+                Limiter(threshold_db=-1.5, release_ms=150)
+            ]
+        ),
+
+        "retro_80s": MasteringPreset(
+            "Retro 80s",
+            "80s digital warmth and punch",
+            [
+                HighpassFilter(cutoff_frequency_hz=35),
+                LowShelfFilter(cutoff_frequency_hz=100, gain_db=1.5, q=0.7),
+                PeakFilter(cutoff_frequency_hz=800, gain_db=1.0, q=1.0),
+                PeakFilter(cutoff_frequency_hz=3000, gain_db=2.0, q=1.2),
+                PeakFilter(cutoff_frequency_hz=8000, gain_db=1.5, q=1.0),
+                HighShelfFilter(cutoff_frequency_hz=10000, gain_db=1.0, q=0.8),
+                Compressor(threshold_db=-10, ratio=4.0, attack_ms=5, release_ms=100),
+                Limiter(threshold_db=-0.5, release_ms=80)
+            ]
+        ),
+
+        # Specialized Presets
+        "vocal_focused": MasteringPreset(
+            "Vocal Focused",
+            "Emphasizes vocal clarity and presence",
+            [
+                HighpassFilter(cutoff_frequency_hz=30),
+                PeakFilter(cutoff_frequency_hz=200, gain_db=-1.0, q=0.8),
+                PeakFilter(cutoff_frequency_hz=1000, gain_db=1.0, q=1.0),
+                PeakFilter(cutoff_frequency_hz=3000, gain_db=2.5, q=1.2),
+                PeakFilter(cutoff_frequency_hz=5000, gain_db=1.5, q=1.0),
+                HighShelfFilter(cutoff_frequency_hz=10000, gain_db=1.0, q=0.7),
+                Compressor(threshold_db=-12, ratio=3.0, attack_ms=5, release_ms=100),
+                Limiter(threshold_db=-0.5, release_ms=80)
+            ]
+        ),
+
+        "bass_heavy": MasteringPreset(
+            "Bass Heavy",
+            "Maximum low-end power",
+            [
+                HighpassFilter(cutoff_frequency_hz=20),
+                LowShelfFilter(cutoff_frequency_hz=60, gain_db=4.0, q=0.7),
+                PeakFilter(cutoff_frequency_hz=100, gain_db=2.5, q=0.8),
+                PeakFilter(cutoff_frequency_hz=500, gain_db=-1.5, q=1.0),
+                PeakFilter(cutoff_frequency_hz=4000, gain_db=1.0, q=1.0),
+                HighShelfFilter(cutoff_frequency_hz=10000, gain_db=1.5, q=0.7),
+                Compressor(threshold_db=-10, ratio=4.0, attack_ms=10, release_ms=100),
+                Limiter(threshold_db=-0.3, release_ms=60)
+            ]
+        ),
+
+        "bright_airy": MasteringPreset(
+            "Bright & Airy",
+            "Crystal clear highs with airiness",
+            [
+                HighpassFilter(cutoff_frequency_hz=30),
+                LowShelfFilter(cutoff_frequency_hz=100, gain_db=-0.5, q=0.7),
+                PeakFilter(cutoff_frequency_hz=500, gain_db=-1.0, q=0.8),
+                PeakFilter(cutoff_frequency_hz=5000, gain_db=2.0, q=1.0),
+                PeakFilter(cutoff_frequency_hz=10000, gain_db=2.5, q=1.0),
+                HighShelfFilter(cutoff_frequency_hz=12000, gain_db=3.0, q=0.7),
+                Compressor(threshold_db=-14, ratio=2.5, attack_ms=8, release_ms=120),
+                Limiter(threshold_db=-0.8, release_ms=100)
+            ]
+        ),
+
+        "midrange_punch": MasteringPreset(
+            "Midrange Punch",
+            "Powerful mids for presence",
+            [
+                HighpassFilter(cutoff_frequency_hz=30),
+                LowShelfFilter(cutoff_frequency_hz=100, gain_db=0.5, q=0.7),
+                PeakFilter(cutoff_frequency_hz=500, gain_db=2.0, q=1.0),
+                PeakFilter(cutoff_frequency_hz=1500, gain_db=2.5, q=1.2),
+                PeakFilter(cutoff_frequency_hz=3000, gain_db=2.0, q=1.0),
+                HighShelfFilter(cutoff_frequency_hz=8000, gain_db=0.5, q=0.7),
+                Compressor(threshold_db=-11, ratio=3.5, attack_ms=5, release_ms=90),
+                Limiter(threshold_db=-0.5, release_ms=70)
+            ]
+        ),
+
+        "dynamic_range": MasteringPreset(
+            "Dynamic Range",
+            "Preserves maximum dynamics",
+            [
+                HighpassFilter(cutoff_frequency_hz=25),
+                PeakFilter(cutoff_frequency_hz=100, gain_db=-0.5, q=0.7),
+                PeakFilter(cutoff_frequency_hz=3000, gain_db=0.5, q=0.8),
+                HighShelfFilter(cutoff_frequency_hz=10000, gain_db=1.0, q=0.7),
+                Compressor(threshold_db=-20, ratio=1.5, attack_ms=20, release_ms=250),
+                Limiter(threshold_db=-2.0, release_ms=200)
+            ]
+        ),
+
+        "streaming_optimized": MasteringPreset(
+            "Streaming Optimized",
+            "Optimized for streaming platforms (Spotify, Apple Music)",
+            [
+                HighpassFilter(cutoff_frequency_hz=30),
+                LowShelfFilter(cutoff_frequency_hz=100, gain_db=1.0, q=0.7),
+                PeakFilter(cutoff_frequency_hz=500, gain_db=0.5, q=0.8),
+                PeakFilter(cutoff_frequency_hz=3000, gain_db=1.5, q=1.0),
+                HighShelfFilter(cutoff_frequency_hz=10000, gain_db=1.5, q=0.7),
+                Compressor(threshold_db=-14, ratio=3.0, attack_ms=5, release_ms=100),
+                Limiter(threshold_db=-1.0, release_ms=100)
+            ]
+        )
+    }
+
+    def __init__(self):
+        """Initialize mastering service"""
+        logger.info(f"Mastering service initialized with {len(self.PRESETS)} presets")
+
+    def apply_preset(self, audio_path: str, preset_name: str, output_path: str) -> str:
+        """
+        Apply a mastering preset to an audio file
+
+        Args:
+            audio_path: Path to input audio file
+            preset_name: Name of preset to apply
+            output_path: Path to save processed audio
+
+        Returns:
+            Path to processed audio file
+        """
+        try:
+            if preset_name not in self.PRESETS:
+                raise ValueError(f"Unknown preset: {preset_name}")
+
+            preset = self.PRESETS[preset_name]
+            logger.info(f"Applying preset '{preset.name}' to {audio_path}")
+
+            # Load audio
+            audio, sr = sf.read(audio_path)
+
+            # Ensure stereo
+            if len(audio.shape) == 1:
+                audio = np.stack([audio, audio], axis=1)
+
+            # Create pedalboard with preset chain
+            board = Pedalboard(preset.chain)
+
+            # Process audio (transposed to channels-first)
+            processed = board(audio.T, sr)
+
+            # Save processed audio
+            sf.write(output_path, processed.T, sr)
+            logger.info(f"Saved mastered audio to {output_path}")
+
+            return output_path
+
+        except Exception as e:
+            logger.error(f"Error applying preset: {str(e)}", exc_info=True)
+            raise
+
+    def apply_custom_eq(
+        self,
+        audio_path: str,
+        output_path: str,
+        eq_bands: List[Dict],
+        compression: Optional[Dict] = None,
+        limiting: Optional[Dict] = None
+    ) -> str:
+        """
+        Apply custom EQ settings to an audio file
+
+        Args:
+            audio_path: Path to input audio file
+            output_path: Path to save processed audio
+            eq_bands: List of EQ band settings
+            compression: Compression settings (optional)
+            limiting: Limiter settings (optional)
+
+        Returns:
+            Path to processed audio file
+        """
+        try:
+            logger.info(f"Applying custom EQ to {audio_path}")
+
+            # Load audio
+            audio, sr = sf.read(audio_path)
+
+            # Ensure stereo
+            if len(audio.shape) == 1:
+                audio = np.stack([audio, audio], axis=1)
+
+            # Build processing chain
+            chain = []
+
+            # Add EQ bands
+            for band in eq_bands:
+                band_type = band.get('type', 'peak')
+                freq = band.get('frequency', 1000)
+                gain = band.get('gain', 0)
+                q = band.get('q', 1.0)
+
+                if band_type == 'highpass':
+                    chain.append(HighpassFilter(cutoff_frequency_hz=freq))
+                elif band_type == 'lowpass':
+                    chain.append(LowpassFilter(cutoff_frequency_hz=freq))
+                elif band_type == 'lowshelf':
+                    chain.append(LowShelfFilter(cutoff_frequency_hz=freq, gain_db=gain, q=q))
+                elif band_type == 'highshelf':
+                    chain.append(HighShelfFilter(cutoff_frequency_hz=freq, gain_db=gain, q=q))
+                else:  # peak
+                    chain.append(PeakFilter(cutoff_frequency_hz=freq, gain_db=gain, q=q))
+
+            # Add compression if specified
+            if compression:
+                chain.append(Compressor(
+                    threshold_db=compression.get('threshold', -12),
+                    ratio=compression.get('ratio', 2.0),
+                    attack_ms=compression.get('attack', 5),
+                    release_ms=compression.get('release', 100)
+                ))
+
+            # Add limiting if specified
+            if limiting:
+                chain.append(Limiter(
+                    threshold_db=limiting.get('threshold', -1.0),
+                    release_ms=limiting.get('release', 100)
+                ))
+
+            # Create and apply pedalboard
+            board = Pedalboard(chain)
+            processed = board(audio.T, sr)
+
+            # Save processed audio
+            sf.write(output_path, processed.T, sr)
+            logger.info(f"Saved custom EQ audio to {output_path}")
+
+            return output_path
+
+        except Exception as e:
+            logger.error(f"Error applying custom EQ: {str(e)}", exc_info=True)
+            raise
+
+    def get_preset_list(self) -> List[Dict]:
+        """Get list of available presets with descriptions"""
+        return [
+            {
+                'id': key,
+                'name': preset.name,
+                'description': preset.description
+            }
+            for key, preset in self.PRESETS.items()
+        ]
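A sketch of both mastering entry points. The preset ids are the `PRESETS` dict keys above; the file paths are placeholders:

```python
from services.mastering_service import MasteringService

mastering = MasteringService()

# Apply a built-in preset
mastering.apply_preset("clip.wav", "streaming_optimized", "clip_mastered.wav")

# Or build a custom chain: EQ bands plus optional compression and limiting
mastering.apply_custom_eq(
    "clip.wav",
    "clip_custom.wav",
    eq_bands=[
        {"type": "highpass", "frequency": 30},
        {"type": "peak", "frequency": 3000, "gain": 1.5, "q": 1.0},
        {"type": "highshelf", "frequency": 10000, "gain": 2.0, "q": 0.7},
    ],
    compression={"threshold": -14, "ratio": 3.0, "attack": 5, "release": 100},
    limiting={"threshold": -1.0, "release": 100},
)

# List presets, e.g. for a UI dropdown
for p in mastering.get_preset_list():
    print(p["id"], "-", p["description"])
```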
backend/services/style_consistency_service.py ADDED
@@ -0,0 +1,340 @@
1
+ """
2
+ Style Consistency Service
3
+ Uses audio feature extraction and style embeddings to ensure consistent generation
4
+ """
5
+ import os
6
+ import logging
7
+ import numpy as np
8
+ import librosa
9
+ import soundfile as sf
10
+ from pathlib import Path
11
+ from typing import List, Optional, Dict, Tuple
12
+ import torch
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ class StyleConsistencyService:
17
+ """
18
+ Ensures style consistency across generated clips by analyzing existing audio
19
+ and providing style guidance for new generations
20
+ """
21
+
22
+ def __init__(self):
23
+ self.sample_rate = 44100
24
+ logger.info("Style Consistency Service initialized")
25
+
26
+ def extract_audio_features(self, audio_path: str) -> Dict[str, np.ndarray]:
27
+ """
28
+ Extract comprehensive audio features for style analysis
29
+
30
+ Args:
31
+ audio_path: Path to audio file
32
+
33
+ Returns:
34
+ Dictionary of extracted features
35
+ """
36
+ try:
37
+ # Load audio
38
+ audio, sr = librosa.load(audio_path, sr=self.sample_rate)
39
+
40
+ # Extract features
41
+ features = {}
42
+
43
+ # Spectral features
44
+ features['mel_spectrogram'] = librosa.feature.melspectrogram(
45
+ y=audio, sr=sr, n_mels=128, n_fft=2048, hop_length=512
46
+ )
47
+ features['spectral_centroid'] = librosa.feature.spectral_centroid(
48
+ y=audio, sr=sr, n_fft=2048, hop_length=512
49
+ )
50
+ features['spectral_bandwidth'] = librosa.feature.spectral_bandwidth(
51
+ y=audio, sr=sr, n_fft=2048, hop_length=512
52
+ )
53
+ features['spectral_contrast'] = librosa.feature.spectral_contrast(
54
+ y=audio, sr=sr, n_fft=2048, hop_length=512, n_bands=6
55
+ )
56
+ features['spectral_rolloff'] = librosa.feature.spectral_rolloff(
57
+ y=audio, sr=sr, n_fft=2048, hop_length=512
58
+ )
59
+
60
+ # Temporal features
61
+ features['zero_crossing_rate'] = librosa.feature.zero_crossing_rate(
62
+ audio, frame_length=2048, hop_length=512
63
+ )
64
+ features['rms'] = librosa.feature.rms(
65
+ y=audio, frame_length=2048, hop_length=512
66
+ )
67
+
68
+ # Harmonic/percussive
69
+ harmonic, percussive = librosa.effects.hpss(audio)
70
+ features['harmonic_ratio'] = np.mean(np.abs(harmonic)) / (np.mean(np.abs(audio)) + 1e-10)
71
+ features['percussive_ratio'] = np.mean(np.abs(percussive)) / (np.mean(np.abs(audio)) + 1e-10)
72
+
73
+ # Chroma features
74
+ features['chroma'] = librosa.feature.chroma_stft(
75
+ y=audio, sr=sr, n_chroma=12, n_fft=2048, hop_length=512
76
+ )
77
+
78
+ # MFCC
79
+ features['mfcc'] = librosa.feature.mfcc(
80
+ y=audio, sr=sr, n_mfcc=20
81
+ )
82
+
83
+ # Tempo and rhythm
84
+ tempo, beats = librosa.beat.beat_track(y=audio, sr=sr)
85
+ features['tempo'] = tempo
86
+ features['beat_frames'] = beats
87
+
88
+ logger.info(f"Extracted features from {audio_path}")
89
+ return features
90
+
91
+ except Exception as e:
92
+ logger.error(f"Failed to extract features from {audio_path}: {e}")
93
+ return {}
94
+
95
+ def compute_style_statistics(self, features: Dict[str, np.ndarray]) -> Dict[str, float]:
96
+ """
97
+ Compute statistical summaries of audio features for style matching
98
+
99
+ Args:
100
+ features: Dictionary of extracted features
101
+
102
+ Returns:
103
+ Dictionary of style statistics
104
+ """
105
+ stats = {}
106
+
107
+ # Compute mean/std for spectral features
108
+ for key in ['spectral_centroid', 'spectral_bandwidth', 'spectral_rolloff',
109
+ 'zero_crossing_rate', 'rms']:
110
+ if key in features:
111
+ stats[f'{key}_mean'] = float(np.mean(features[key]))
112
+ stats[f'{key}_std'] = float(np.std(features[key]))
113
+
114
+ # Spectral contrast summary
115
+ if 'spectral_contrast' in features:
116
+ stats['spectral_contrast_mean'] = float(np.mean(features['spectral_contrast']))
117
+ stats['spectral_contrast_std'] = float(np.std(features['spectral_contrast']))
118
+
119
+ # Harmonic/percussive balance
120
+ stats['harmonic_ratio'] = float(features.get('harmonic_ratio', 0.5))
121
+ stats['percussive_ratio'] = float(features.get('percussive_ratio', 0.5))
122
+
123
+ # Tempo
124
+ stats['tempo'] = float(features.get('tempo', 120.0))
125
+
126
+ # Chroma energy distribution
127
+ if 'chroma' in features:
128
+ chroma_mean = np.mean(features['chroma'], axis=1)
129
+ stats['chroma_energy'] = chroma_mean.tolist()
130
+
131
+ # MFCC summary (timbre)
132
+ if 'mfcc' in features:
133
+ mfcc_mean = np.mean(features['mfcc'], axis=1)
134
+ stats['timbre_signature'] = mfcc_mean[:13].tolist() # First 13 MFCCs
135
+
136
+ return stats
137
+
138
+ def analyze_timeline_style(self, clip_paths: List[str]) -> Dict[str, any]:
139
+ """
140
+ Analyze style characteristics of all clips on timeline
141
+
142
+ Args:
143
+             clip_paths: List of audio file paths from timeline
+
+         Returns:
+             Aggregate style profile
+         """
+         if not clip_paths:
+             return {}
+
+         all_features = []
+         all_stats = []
+
+         for path in clip_paths:
+             if os.path.exists(path):
+                 features = self.extract_audio_features(path)
+                 if features:
+                     stats = self.compute_style_statistics(features)
+                     all_features.append(features)
+                     all_stats.append(stats)
+
+         if not all_stats:
+             return {}
+
+         # Aggregate statistics across all clips
+         aggregate_style = {}
+
+         # Average numerical features
+         numeric_keys = [k for k in all_stats[0].keys() if isinstance(all_stats[0][k], (int, float))]
+         for key in numeric_keys:
+             values = [stats[key] for stats in all_stats if key in stats]
+             aggregate_style[key] = float(np.mean(values))
+
+         # Average chroma and timbre vectors element-wise
+         if 'chroma_energy' in all_stats[0]:
+             chroma_arrays = [np.array(stats['chroma_energy']) for stats in all_stats if 'chroma_energy' in stats]
+             if chroma_arrays:
+                 aggregate_style['chroma_energy'] = np.mean(chroma_arrays, axis=0).tolist()
+
+         if 'timbre_signature' in all_stats[0]:
+             timbre_arrays = [np.array(stats['timbre_signature']) for stats in all_stats if 'timbre_signature' in stats]
+             if timbre_arrays:
+                 aggregate_style['timbre_signature'] = np.mean(timbre_arrays, axis=0).tolist()
+
+         logger.info(f"Analyzed style from {len(clip_paths)} clips")
+         return aggregate_style
+
+     def create_style_reference_audio(self, clip_paths: List[str], output_path: str) -> str:
+         """
+         Mix all timeline clips into a single reference audio for style guidance
+
+         Args:
+             clip_paths: List of audio file paths
+             output_path: Where to save the reference audio
+
+         Returns:
+             Path to the created reference audio
+         """
+         if not clip_paths:
+             raise ValueError("No clips provided for style reference")
+
+         try:
+             # Load all clips and find the longest one
+             clips_audio = []
+             max_length = 0
+
+             for path in clip_paths:
+                 if os.path.exists(path):
+                     audio, _ = librosa.load(path, sr=self.sample_rate)
+                     clips_audio.append(audio)
+                     max_length = max(max_length, len(audio))
+
+             if not clips_audio:
+                 raise ValueError("No valid audio files found")
+
+             # Pad all clips to the same length
+             padded_clips = []
+             for audio in clips_audio:
+                 if len(audio) < max_length:
+                     audio = np.pad(audio, (0, max_length - len(audio)))
+                 padded_clips.append(audio)
+
+             # Mix clips by averaging them
+             mixed_audio = np.mean(padded_clips, axis=0)
+
+             # Normalize to avoid clipping
+             mixed_audio = librosa.util.normalize(mixed_audio)
+
+             # Save reference audio
+             os.makedirs(os.path.dirname(output_path), exist_ok=True)
+             sf.write(output_path, mixed_audio, self.sample_rate)
+
+             logger.info(f"Created style reference audio: {output_path}")
+             return output_path
+
+         except Exception as e:
+             logger.error(f"Failed to create style reference: {e}")
+             raise
+
+     def enhance_prompt_with_style(
+         self,
+         base_prompt: str,
+         style_profile: Dict[str, Any]
+     ) -> str:
+         """
+         Enhance the generation prompt with style characteristics
+
+         Args:
+             base_prompt: User's original prompt
+             style_profile: Style analysis from the timeline
+
+         Returns:
+             Enhanced prompt
+         """
+         if not style_profile:
+             return base_prompt
+
+         style_descriptors = []
+
+         # Tempo descriptor
+         tempo = style_profile.get('tempo', 120)
+         if tempo < 90:
+             style_descriptors.append("slow tempo")
+         elif tempo > 140:
+             style_descriptors.append("fast tempo")
+
+         # Energy/dynamics descriptor
+         rms_mean = style_profile.get('rms_mean', 0.1)
+         if rms_mean > 0.15:
+             style_descriptors.append("energetic")
+         elif rms_mean < 0.08:
+             style_descriptors.append("gentle")
+
+         # Harmonic/percussive balance
+         harmonic_ratio = style_profile.get('harmonic_ratio', 0.5)
+         percussive_ratio = style_profile.get('percussive_ratio', 0.5)
+
+         if harmonic_ratio > percussive_ratio * 1.3:
+             style_descriptors.append("melodic")
+         elif percussive_ratio > harmonic_ratio * 1.3:
+             style_descriptors.append("rhythmic")
+
+         # Spectral brightness
+         centroid_mean = style_profile.get('spectral_centroid_mean', 2000)
+         if centroid_mean > 3000:
+             style_descriptors.append("bright")
+         elif centroid_mean < 1500:
+             style_descriptors.append("warm")
+
+         # Combine with the base prompt
+         if style_descriptors:
+             enhanced = f"{base_prompt}, consistent with existing style: {', '.join(style_descriptors)}"
+             logger.info(f"Enhanced prompt: {enhanced}")
+             return enhanced
+
+         return base_prompt
+
+     def get_style_guidance_for_generation(
+         self,
+         timeline_clips: List[Dict]
+     ) -> Tuple[Optional[str], Dict[str, Any]]:
+         """
+         Prepare style guidance for a new generation
+
+         Args:
+             timeline_clips: List of clip dictionaries from the timeline
+
+         Returns:
+             Tuple of (reference_audio_path, style_profile)
+         """
+         if not timeline_clips:
+             logger.info("No existing clips - no style guidance available")
+             return None, {}
+
+         # Get audio paths from clips
+         clip_paths = []
+         for clip in timeline_clips:
+             audio_path = clip.get('music_path') or clip.get('mixed_path') or clip.get('file_path')
+             if audio_path and os.path.exists(audio_path):
+                 clip_paths.append(audio_path)
+
+         if not clip_paths:
+             return None, {}
+
+         # Analyze timeline style
+         style_profile = self.analyze_timeline_style(clip_paths)
+
+         # Create reference audio (mix of all clips)
+         try:
+             ref_dir = os.path.join('outputs', 'style_reference')
+             os.makedirs(ref_dir, exist_ok=True)
+             ref_path = os.path.join(ref_dir, 'timeline_reference.wav')
+
+             reference_audio = self.create_style_reference_audio(clip_paths, ref_path)
+             logger.info(f"Style guidance ready: {len(clip_paths)} clips analyzed")
+             return reference_audio, style_profile
+
+         except Exception as e:
+             logger.error(f"Failed to create reference audio: {e}")
+             return None, style_profile
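Usage sketch (illustrative, not part of this commit): how the style-guidance flow above fits together. The class and module names below are assumptions, since the class definition sits above this hunk.

# Hypothetical usage; 'StyleAnalysisService' and its module path are assumed names.
from services.style_service import StyleAnalysisService

service = StyleAnalysisService()
timeline_clips = [
    {'clip_id': 'clip_1', 'music_path': 'outputs/music/clip_1.wav'},
    {'clip_id': 'clip_2', 'music_path': 'outputs/music/clip_2.wav'},
]

# Mix the existing clips into a reference track and compute an aggregate profile
ref_path, profile = service.get_style_guidance_for_generation(timeline_clips)

# Fold the profile into the next prompt, producing something like
# "dreamy synth outro, consistent with existing style: slow tempo, warm"
next_prompt = service.enhance_prompt_with_style("dreamy synth outro", profile)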
backend/services/timeline_service.py ADDED
@@ -0,0 +1,186 @@
+ """
+ Timeline management service
+ """
+ import logging
+ from typing import List, Dict, Optional
+ from models.schemas import ClipPosition, TimelineClip
+
+ logger = logging.getLogger(__name__)
+
+ class TimelineService:
+     """Service for managing timeline clips"""
+
+     def __init__(self):
+         """Initialize timeline service"""
+         self.clips: List[TimelineClip] = []
+         logger.info("Timeline service initialized")
+
+     def add_clip(
+         self,
+         clip_id: str,
+         file_path: str,
+         duration: float,
+         position: ClipPosition
+     ) -> Dict:
+         """
+         Add a clip to the timeline
+
+         Args:
+             clip_id: Unique clip identifier
+             file_path: Path to audio file
+             duration: Clip duration in seconds
+             position: Where to place the clip
+
+         Returns:
+             Clip information with timeline position
+         """
+         try:
+             # Calculate timeline position based on the requested position
+             if position == ClipPosition.INTRO:
+                 timeline_position = 0
+                 start_time = 0.0
+                 # Shift all existing clips to the right
+                 for clip in self.clips:
+                     clip.timeline_position += 1
+                     clip.start_time += duration
+
+             elif position == ClipPosition.PREVIOUS:
+                 if len(self.clips) == 0:
+                     timeline_position = 0
+                     start_time = 0.0
+                 else:
+                     timeline_position = len(self.clips) - 1
+                     start_time = self.clips[-1].start_time
+                     # Shift the last clip to make room
+                     self.clips[-1].timeline_position += 1
+                     self.clips[-1].start_time += duration
+
+             elif position == ClipPosition.NEXT:
+                 timeline_position = len(self.clips)
+                 start_time = self.get_total_duration()
+
+             else:  # OUTRO
+                 timeline_position = len(self.clips)
+                 start_time = self.get_total_duration()
+
+             # Create clip
+             clip = TimelineClip(
+                 clip_id=clip_id,
+                 file_path=file_path,
+                 duration=duration,
+                 timeline_position=timeline_position,
+                 start_time=start_time,
+                 music_path=file_path  # Store as music_path for consistent access
+             )
+
+             # Insert clip at the correct position
+             self.clips.insert(timeline_position, clip)
+
+             logger.info(f"Clip added: {clip_id} at position {timeline_position}")
+
+             return {
+                 'clip_id': clip_id,
+                 'timeline_position': timeline_position,
+                 'start_time': start_time,
+                 'duration': duration
+             }
+
+         except Exception as e:
+             logger.error(f"Failed to add clip: {str(e)}", exc_info=True)
+             raise
+
+     def remove_clip(self, clip_id: str):
+         """
+         Remove a clip from the timeline
+
+         Args:
+             clip_id: Clip to remove
+         """
+         try:
+             # Find and remove clip
+             clip_index = None
+             for i, clip in enumerate(self.clips):
+                 if clip.clip_id == clip_id:
+                     clip_index = i
+                     break
+
+             if clip_index is None:
+                 raise ValueError(f"Clip not found: {clip_id}")
+
+             self.clips.pop(clip_index)
+
+             # Recalculate positions
+             self._recalculate_positions()
+
+             logger.info(f"Clip removed: {clip_id}")
+
+         except Exception as e:
+             logger.error(f"Failed to remove clip: {str(e)}", exc_info=True)
+             raise
+
+     def reorder_clips(self, clip_ids: List[str]):
+         """
+         Reorder clips on the timeline
+
+         Args:
+             clip_ids: New order of clip IDs
+         """
+         try:
+             # Validate that all clip IDs exist
+             existing_ids = {clip.clip_id for clip in self.clips}
+             requested_ids = set(clip_ids)
+
+             if existing_ids != requested_ids:
+                 raise ValueError("Clip IDs don't match existing clips")
+
+             # Create new order
+             clip_dict = {clip.clip_id: clip for clip in self.clips}
+             self.clips = [clip_dict[cid] for cid in clip_ids]
+
+             # Recalculate positions
+             self._recalculate_positions()
+
+             logger.info("Clips reordered")
+
+         except Exception as e:
+             logger.error(f"Failed to reorder clips: {str(e)}", exc_info=True)
+             raise
+
+     def get_all_clips(self) -> List[Dict]:
+         """Get all clips with their information"""
+         return [
+             {
+                 'clip_id': clip.clip_id,
+                 'file_path': clip.file_path,
+                 'duration': clip.duration,
+                 'timeline_position': clip.timeline_position,
+                 'start_time': clip.start_time
+             }
+             for clip in self.clips
+         ]
+
+     def get_clip(self, clip_id: str) -> Optional[TimelineClip]:
+         """Get a specific clip"""
+         for clip in self.clips:
+             if clip.clip_id == clip_id:
+                 return clip
+         return None
+
+     def get_total_duration(self) -> float:
+         """Get total duration of all clips"""
+         if not self.clips:
+             return 0.0
+         return sum(clip.duration for clip in self.clips)
+
+     def clear(self):
+         """Clear all clips from the timeline"""
+         self.clips = []
+         logger.info("Timeline cleared")
+
+     def _recalculate_positions(self):
+         """Recalculate all clip positions and start times"""
+         current_time = 0.0
+         for i, clip in enumerate(self.clips):
+             clip.timeline_position = i
+             clip.start_time = current_time
+             current_time += clip.duration
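Usage sketch (illustrative, not part of this commit) of the positioning rules above: INTRO shifts every existing clip to the right, NEXT and OUTRO append at the end, and reordering recalculates start times.

from services.timeline_service import TimelineService
from models.schemas import ClipPosition

timeline = TimelineService()
timeline.add_clip('verse', 'outputs/music/verse.wav', 30.0, ClipPosition.NEXT)    # starts at 0.0
timeline.add_clip('chorus', 'outputs/music/chorus.wav', 20.0, ClipPosition.NEXT)  # starts at 30.0
timeline.add_clip('intro', 'outputs/music/intro.wav', 10.0, ClipPosition.INTRO)   # shifts the others by 10 s

assert timeline.get_total_duration() == 60.0
timeline.reorder_clips(['intro', 'chorus', 'verse'])  # start times are recalculated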
backend/start_with_env.py ADDED
@@ -0,0 +1,21 @@
+ """
+ Wrapper script to start the backend with required environment variables.
+ This is used by the PowerShell launcher to ensure environment variables are set.
+ """
+ import os
+ import runpy
+ from pathlib import Path
+
+ # Get project root (parent of the backend directory)
+ project_root = Path(__file__).parent.parent
+
+ # Point phonemizer at the bundled espeak-ng build
+ os.environ['PHONEMIZER_ESPEAK_LIBRARY'] = str(project_root / 'external' / 'espeak-ng' / 'libespeak-ng.dll')
+ os.environ['PHONEMIZER_ESPEAK_PATH'] = str(project_root / 'external' / 'espeak-ng')
+
+ # Locate the backend entry point
+ backend_script = project_root / 'backend' / 'run.py'
+
+ # Execute run.py in the same interpreter; run_name='__main__' ensures its
+ # `if __name__ == '__main__'` block still fires
+ runpy.run_path(str(backend_script), run_name='__main__')
backend/utils/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """Utilities package"""
+ from .logger import setup_logger
+ from .validators import validate_generation_params, validate_clip_data
+
+ __all__ = ['setup_logger', 'validate_generation_params', 'validate_clip_data']
backend/utils/amd_gpu.py ADDED
@@ -0,0 +1,96 @@
+ """
+ AMD GPU Detection and Configuration
+ Enables DirectML support for AMD GPUs (including Vega 8)
+ Note: DirectML may not be compatible with Python 3.13+ - CPU fallback will be used
+ """
+ import os
+ import logging
+ import torch
+
+ logger = logging.getLogger(__name__)
+
+ def setup_amd_gpu():
+     """
+     Configure DirectML for AMD GPU support
+     Returns the device to use for model inference
+     """
+     try:
+         # Check if torch-directml is available
+         try:
+             import torch_directml
+
+             # Get DirectML device
+             if torch_directml.is_available():
+                 device = torch_directml.device()
+                 logger.info("βœ… AMD GPU detected via DirectML")
+                 logger.info(f"Device: {device}")
+
+                 # Set default device
+                 torch.set_default_device(device)
+
+                 return device
+             else:
+                 logger.warning("DirectML available but no compatible GPU found")
+                 return torch.device("cpu")
+         except ImportError:
+             logger.warning("torch-directml not available (may not support Python 3.13+)")
+             logger.info("Using CPU mode - consider Python 3.11 for DirectML support")
+             return torch.device("cpu")
+
+     except Exception as e:
+         logger.error(f"Error setting up AMD GPU: {str(e)}")
+         return torch.device("cpu")
+
+ def get_device_info():
+     """Get detailed information about available compute devices"""
+     info = {
+         "device": "cpu",
+         "device_name": "CPU",
+         "directml_available": False,
+         "cuda_available": torch.cuda.is_available()
+     }
+
+     try:
+         try:
+             import torch_directml
+
+             if torch_directml.is_available():
+                 info["directml_available"] = True
+                 info["device"] = "directml"
+                 info["device_name"] = "AMD GPU (DirectML)"
+
+                 # Get device handle
+                 device = torch_directml.device()
+                 info["device_object"] = device
+         except ImportError:
+             logger.info("DirectML not available - Python 3.13+ may not support torch-directml")
+             logger.info("For AMD GPU support, consider using Python 3.11 with torch-directml")
+
+     except Exception as e:
+         logger.error(f"Error getting device info: {str(e)}")
+
+     return info
+
+ def optimize_for_amd():
+     """Apply optimizations for AMD GPU inference"""
+     try:
+         # Disable CUDA if present (prefer DirectML for AMD)
+         os.environ["CUDA_VISIBLE_DEVICES"] = ""
+
+         # Set DirectML memory management
+         os.environ["PYTORCH_DIRECTML_FORCE_FP32_OPS"] = "0"  # Allow FP16
+
+         # Enable TensorFloat-32 (a CUDA/cuDNN feature; a harmless no-op when CUDA is disabled)
+         torch.backends.cudnn.allow_tf32 = True
+         torch.backends.cuda.matmul.allow_tf32 = True
+
+         logger.info("βœ… AMD GPU optimizations applied")
+
+     except Exception as e:
+         logger.error(f"Error applying AMD optimizations: {str(e)}")
+
+ # Auto-configure on module import
+ DEFAULT_DEVICE = setup_amd_gpu()
+ optimize_for_amd()
+
+ logger.info(f"Default compute device: {DEFAULT_DEVICE}")
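Because the module configures a device as a side effect of being imported, downstream code only needs DEFAULT_DEVICE; a hedged sketch (assumes backend/ is on sys.path):

import torch
from utils.amd_gpu import DEFAULT_DEVICE, get_device_info  # import runs setup_amd_gpu()

info = get_device_info()
print(info['device_name'])  # "AMD GPU (DirectML)" or "CPU"

# New tensors (and models moved with .to()) follow the detected device
x = torch.randn(1, 16, device=DEFAULT_DEVICE)
print(x.device)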
backend/utils/logger.py ADDED
@@ -0,0 +1,57 @@
+ """
+ Logging configuration
+ """
+ import logging
+ import os
+ from logging.handlers import RotatingFileHandler
+
+ def setup_logger(app):
+     """
+     Configure application logging
+
+     Args:
+         app: Flask application instance
+     """
+     # Create logs directory
+     log_dir = 'logs'
+     os.makedirs(log_dir, exist_ok=True)
+
+     # Set log level
+     log_level = getattr(logging, app.config.get('LOG_LEVEL', 'INFO'))
+
+     # Configure root logger
+     logging.basicConfig(
+         level=log_level,
+         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+         datefmt='%Y-%m-%d %H:%M:%S'
+     )
+
+     # File handler with rotation
+     log_file = app.config.get('LOG_FILE', os.path.join(log_dir, 'app.log'))
+     file_handler = RotatingFileHandler(
+         log_file,
+         maxBytes=10 * 1024 * 1024,  # 10 MB
+         backupCount=5
+     )
+     file_handler.setLevel(log_level)
+     file_handler.setFormatter(logging.Formatter(
+         '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+     ))
+
+     # Console handler
+     console_handler = logging.StreamHandler()
+     console_handler.setLevel(log_level)
+     console_handler.setFormatter(logging.Formatter(
+         '%(asctime)s - %(levelname)s - %(message)s'
+     ))
+
+     # Add handlers to the app logger
+     app.logger.addHandler(file_handler)
+     app.logger.addHandler(console_handler)
+     app.logger.setLevel(log_level)
+
+     # Set library log levels
+     logging.getLogger('werkzeug').setLevel(logging.WARNING)
+     logging.getLogger('urllib3').setLevel(logging.WARNING)
+
+     app.logger.info("Logging configured successfully")
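setup_logger expects a Flask app and reads the optional LOG_LEVEL and LOG_FILE config keys; a minimal wiring sketch (the app factory itself is not part of this hunk):

from flask import Flask
from utils.logger import setup_logger

app = Flask(__name__)
app.config['LOG_LEVEL'] = 'DEBUG'        # falls back to INFO when unset
app.config['LOG_FILE'] = 'logs/app.log'  # rotated at 10 MB, 5 backups kept

setup_logger(app)
app.logger.info("Backend starting")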
backend/utils/prompt_analyzer.py ADDED
@@ -0,0 +1,291 @@
+ """
+ Prompt analysis utility for extracting music attributes
+ Analyzes user prompts to extract genre, style, BPM, mood, and other musical attributes
+ """
+ import re
+ import logging
+ from typing import Dict, Optional, List, Any
+
+ logger = logging.getLogger(__name__)
+
+ class PromptAnalyzer:
+     """Analyzes music prompts to extract musical attributes"""
+
+     # Genre/style keywords
+     GENRES = {
+         'pop': ['pop', 'mainstream', 'catchy', 'radio-friendly'],
+         'rock': ['rock', 'guitar', 'electric', 'distortion', 'power chords'],
+         'hip-hop': ['hip-hop', 'rap', 'trap', 'beats', 'rhymes', 'flow'],
+         'electronic': ['edm', 'electronic', 'synth', 'techno', 'house', 'trance'],
+         'jazz': ['jazz', 'swing', 'bebop', 'saxophone', 'improvisation'],
+         'classical': ['classical', 'orchestra', 'symphony', 'piano', 'strings'],
+         'country': ['country', 'folk', 'acoustic', 'banjo', 'bluegrass'],
+         'r&b': ['r&b', 'soul', 'rnb', 'rhythm and blues', 'groove'],
+         'metal': ['metal', 'heavy', 'headbanging', 'aggressive', 'brutal'],
+         'indie': ['indie', 'alternative', 'underground', 'experimental'],
+         'reggae': ['reggae', 'ska', 'dub', 'jamaican', 'offbeat'],
+         'blues': ['blues', 'twelve bar', 'soulful', 'melancholic']
+     }
+
+     # BPM keywords and ranges
+     BPM_KEYWORDS = {
+         'slow': (60, 80),
+         'ballad': (60, 80),
+         'moderate': (80, 120),
+         'medium': (90, 110),
+         'upbeat': (120, 140),
+         'fast': (140, 180),
+         'energetic': (130, 150),
+         'intense': (150, 180)
+     }
+
+     # Mood/emotion keywords
+     MOODS = {
+         'happy': ['happy', 'joyful', 'cheerful', 'uplifting', 'bright'],
+         'sad': ['sad', 'melancholic', 'sorrowful', 'emotional', 'tearful'],
+         'energetic': ['energetic', 'powerful', 'dynamic', 'intense', 'vigorous'],
+         'calm': ['calm', 'peaceful', 'relaxing', 'soothing', 'tranquil'],
+         'dark': ['dark', 'ominous', 'mysterious', 'sinister', 'haunting'],
+         'romantic': ['romantic', 'love', 'passionate', 'tender', 'intimate'],
+         'angry': ['angry', 'aggressive', 'fierce', 'furious', 'rage'],
+         'nostalgic': ['nostalgic', 'reminiscent', 'wistful', 'longing']
+     }
+
+     # Instrument keywords
+     INSTRUMENTS = [
+         'guitar', 'piano', 'drums', 'bass', 'synth', 'violin', 'saxophone',
+         'trumpet', 'flute', 'organ', 'keyboard', 'strings', 'brass', 'percussion'
+     ]
+
+     @classmethod
+     def analyze(cls, prompt: str) -> Dict[str, Any]:
+         """
+         Analyze a music prompt to extract attributes
+
+         Args:
+             prompt: User's music description
+
+         Returns:
+             Dictionary containing:
+             - genre: Detected genre(s)
+             - bpm: Estimated BPM or range
+             - mood: Detected mood(s)
+             - instruments: Mentioned instruments
+             - style_tags: Additional style descriptors
+             - analysis_text: Formatted analysis for AI models
+         """
+         if not prompt:
+             return cls._get_default_analysis()
+
+         prompt_lower = prompt.lower()
+
+         # Detect genre
+         detected_genres = cls._detect_genres(prompt_lower)
+
+         # Detect BPM
+         bpm_info = cls._detect_bpm(prompt_lower)
+
+         # Detect mood
+         detected_moods = cls._detect_moods(prompt_lower)
+
+         # Detect instruments
+         detected_instruments = cls._detect_instruments(prompt_lower)
+
+         # Extract additional style tags
+         style_tags = cls._extract_style_tags(prompt_lower)
+
+         # Build structured analysis
+         analysis = {
+             'genre': detected_genres[0] if detected_genres else 'pop',
+             'genres': detected_genres,
+             'bpm': bpm_info['bpm'],
+             'bpm_range': bpm_info['range'],
+             'mood': detected_moods[0] if detected_moods else 'neutral',
+             'moods': detected_moods,
+             'instruments': detected_instruments,
+             'style_tags': style_tags,
+             'has_vocals': cls._should_have_vocals(prompt_lower),
+             'analysis_text': cls._format_analysis_text(
+                 detected_genres, bpm_info, detected_moods, detected_instruments
+             )
+         }
+
+         logger.info(f"Prompt analysis: genre={analysis['genre']}, bpm={analysis['bpm']}, mood={analysis['mood']}")
+
+         return analysis
+
+     @classmethod
+     def _detect_genres(cls, prompt: str) -> List[str]:
+         """Detect genres from prompt"""
+         detected = []
+         for genre, keywords in cls.GENRES.items():
+             if any(keyword in prompt for keyword in keywords):
+                 detected.append(genre)
+         return detected[:3]  # At most 3 genres
+
+     @classmethod
+     def _detect_bpm(cls, prompt: str) -> Dict[str, Any]:
+         """Detect BPM or BPM range from prompt"""
+         # Check for an explicit BPM number, e.g. "140 bpm"
+         bpm_match = re.search(r'\b(\d{2,3})\s*bpm\b', prompt)
+         if bpm_match:
+             bpm_value = int(bpm_match.group(1))
+             return {
+                 'bpm': bpm_value,
+                 'range': (bpm_value - 5, bpm_value + 5)
+             }
+
+         # Check for BPM keywords
+         for keyword, (min_bpm, max_bpm) in cls.BPM_KEYWORDS.items():
+             if keyword in prompt:
+                 return {
+                     'bpm': (min_bpm + max_bpm) // 2,
+                     'range': (min_bpm, max_bpm)
+                 }
+
+         # Default: moderate tempo
+         return {'bpm': 120, 'range': (100, 140)}
+
+     @classmethod
+     def _detect_moods(cls, prompt: str) -> List[str]:
+         """Detect moods from prompt"""
+         detected = []
+         for mood, keywords in cls.MOODS.items():
+             if any(keyword in prompt for keyword in keywords):
+                 detected.append(mood)
+         return detected[:2]  # At most 2 moods
+
+     @classmethod
+     def _detect_instruments(cls, prompt: str) -> List[str]:
+         """Detect mentioned instruments"""
+         detected = []
+         for instrument in cls.INSTRUMENTS:
+             if instrument in prompt:
+                 detected.append(instrument)
+         return detected
+
+     @classmethod
+     def _extract_style_tags(cls, prompt: str) -> List[str]:
+         """Extract additional style descriptors"""
+         tags = []
+         style_keywords = [
+             'vintage', 'modern', 'retro', 'futuristic', 'minimal', 'complex',
+             'acoustic', 'electric', 'orchestral', 'ambient', 'rhythmic',
+             'melodic', 'harmonic', 'atmospheric', 'driving', 'groovy'
+         ]
+
+         for tag in style_keywords:
+             if tag in prompt:
+                 tags.append(tag)
+
+         return tags
+
+     @classmethod
+     def _should_have_vocals(cls, prompt: str) -> bool:
+         """Determine if music should have vocals"""
+         vocal_keywords = ['vocal', 'singing', 'voice', 'lyrics', 'song', 'sung']
+         instrumental_keywords = ['instrumental', 'no vocals', 'no voice', 'without vocals']
+
+         has_vocal_mention = any(keyword in prompt for keyword in vocal_keywords)
+         has_instrumental_mention = any(keyword in prompt for keyword in instrumental_keywords)
+
+         # Default to vocals unless explicitly instrumental
+         if has_instrumental_mention:
+             return False
+
+         return True  # Default to vocals
+
+     @classmethod
+     def _format_analysis_text(
+         cls,
+         genres: List[str],
+         bpm_info: Dict,
+         moods: List[str],
+         instruments: List[str]
+     ) -> str:
+         """Format analysis into text for AI model context"""
+         parts = []
+
+         if genres:
+             parts.append(f"Genre: {', '.join(genres)}")
+
+         if bpm_info.get('bpm'):
+             parts.append(f"BPM: {bpm_info['bpm']}")
+
+         if moods:
+             parts.append(f"Mood: {', '.join(moods)}")
+
+         if instruments:
+             parts.append(f"Instruments: {', '.join(instruments)}")
+
+         return '; '.join(parts) if parts else "General music"
+
+     @classmethod
+     def _get_default_analysis(cls) -> Dict[str, Any]:
+         """Return default analysis when the prompt is empty"""
+         return {
+             'genre': 'pop',
+             'genres': ['pop'],
+             'bpm': 120,
+             'bpm_range': (100, 140),
+             'mood': 'neutral',
+             'moods': [],
+             'instruments': [],
+             'style_tags': [],
+             'has_vocals': True,
+             'analysis_text': 'General pop music at moderate tempo'
+         }
+
+     @classmethod
+     def format_for_diffrhythm(cls, prompt: str, lyrics: Optional[str] = None, analysis: Optional[Dict] = None) -> str:
+         """
+         Format prompt for the DiffRhythm model
+
+         Args:
+             prompt: Original user prompt
+             lyrics: Optional lyrics
+             analysis: Optional pre-computed analysis
+
+         Returns:
+             Formatted prompt for DiffRhythm
+         """
+         if analysis is None:
+             analysis = cls.analyze(prompt)
+
+         parts = [prompt]
+
+         # Add analysis context
+         if analysis.get('analysis_text'):
+             parts.append(f"[{analysis['analysis_text']}]")
+
+         # Add lyrics if provided
+         if lyrics:
+             parts.append(f"Lyrics: {lyrics}")
+
+         return ' '.join(parts)
+
+     @classmethod
+     def format_for_lyrics_generation(cls, prompt: str, analysis: Optional[Dict] = None) -> str:
+         """
+         Format prompt for lyrics generation
+
+         Args:
+             prompt: Original user prompt
+             analysis: Optional pre-computed analysis
+
+         Returns:
+             Formatted prompt for LyricMind
+         """
+         if analysis is None:
+             analysis = cls.analyze(prompt)
+
+         genre = analysis.get('genre', 'pop')
+         mood = analysis.get('mood', 'neutral')
+
+         formatted = f"Write {genre} song lyrics with a {mood} mood about: {prompt}"
+
+         # Add additional context
+         if analysis.get('style_tags'):
+             formatted += f" (Style: {', '.join(analysis['style_tags'][:2])})"
+
+         return formatted
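Example of what analyze() extracts from a typical prompt; the expected values follow directly from the keyword tables above:

from utils.prompt_analyzer import PromptAnalyzer

analysis = PromptAnalyzer.analyze("energetic rock song with electric guitar at 140 BPM")
print(analysis['genre'])       # 'rock' (matched via 'rock', 'guitar', 'electric')
print(analysis['bpm'])         # 140 (an explicit "140 BPM" overrides keyword ranges)
print(analysis['mood'])        # 'energetic'
print(analysis['has_vocals'])  # True (no instrumental keywords present)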
backend/utils/validators.py ADDED
@@ -0,0 +1,64 @@
+ """
+ Request validation utilities
+ """
+ from typing import Dict, Optional
+
+ def validate_generation_params(data: Dict) -> Optional[str]:
+     """
+     Validate music generation parameters
+
+     Args:
+         data: Request data dictionary
+
+     Returns:
+         Error message if validation fails, None otherwise
+     """
+     if not data:
+         return "Request body is required"
+
+     if 'prompt' not in data:
+         return "Missing required field: prompt"
+
+     if not data['prompt'] or not data['prompt'].strip():
+         return "Prompt cannot be empty"
+
+     if 'duration' in data:
+         duration = data['duration']
+         if not isinstance(duration, (int, float)):
+             return "Duration must be a number"
+         if duration < 10 or duration > 120:
+             return "Duration must be between 10 and 120 seconds"
+
+     if 'use_vocals' in data:
+         if not isinstance(data['use_vocals'], bool):
+             return "use_vocals must be a boolean"
+
+         if data['use_vocals'] and not data.get('lyrics'):
+             return "Lyrics are required when use_vocals is true"
+
+     return None
+
+ def validate_clip_data(data: Dict) -> Optional[str]:
+     """
+     Validate timeline clip data
+
+     Args:
+         data: Clip data dictionary
+
+     Returns:
+         Error message if validation fails, None otherwise
+     """
+     required_fields = ['clip_id', 'file_path', 'duration', 'position']
+
+     for field in required_fields:
+         if field not in data:
+             return f"Missing required field: {field}"
+
+     if not isinstance(data['duration'], (int, float)) or data['duration'] <= 0:
+         return "Duration must be a positive number"
+
+     valid_positions = ['intro', 'previous', 'next', 'outro']
+     if data['position'] not in valid_positions:
+         return f"Invalid position. Must be one of: {', '.join(valid_positions)}"
+
+     return None
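The validators return an error string or None, so route handlers can short-circuit before doing any work; a hedged Flask sketch (the endpoint path is illustrative):

from flask import Flask, request, jsonify
from utils.validators import validate_generation_params

app = Flask(__name__)

@app.route('/api/generate', methods=['POST'])
def generate():
    error = validate_generation_params(request.get_json(silent=True))
    if error:
        return jsonify({'error': error}), 400
    return jsonify({'status': 'queued'})  # hand off to the generation service here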
hf_config.py ADDED
@@ -0,0 +1,30 @@
+ """
+ Configuration for HuggingFace Spaces deployment
+ Handles espeak-ng and model paths for the cloud environment
+ """
+ import os
+ from pathlib import Path
+
+ # Detect if running on HuggingFace Spaces
+ IS_SPACES = os.getenv("SPACE_ID") is not None
+
+ # Configure espeak-ng for HuggingFace Spaces
+ if IS_SPACES:
+     # On Spaces, espeak-ng is installed via packages.txt
+     # and is available system-wide
+     if os.path.exists("/usr/bin/espeak-ng"):
+         os.environ["PHONEMIZER_ESPEAK_PATH"] = "/usr/bin/espeak-ng"
+     if os.path.exists("/usr/lib/x86_64-linux-gnu/libespeak-ng.so"):
+         os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = "/usr/lib/x86_64-linux-gnu/libespeak-ng.so"
+     elif os.path.exists("/usr/lib/libespeak-ng.so"):
+         os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = "/usr/lib/libespeak-ng.so"
+ else:
+     # Local development - use the bundled espeak-ng at <repo root>/external/espeak-ng
+     espeak_path = Path(__file__).parent / "external" / "espeak-ng"
+     if espeak_path.exists():
+         os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = str(espeak_path / "libespeak-ng.dll")
+         os.environ["PHONEMIZER_ESPEAK_PATH"] = str(espeak_path)
+
+ print(f"πŸ”§ Environment: {'HuggingFace Spaces' if IS_SPACES else 'Local'}")
+ print(f"πŸ”Š PHONEMIZER_ESPEAK_PATH: {os.getenv('PHONEMIZER_ESPEAK_PATH', 'Not set')}")
+ print(f"πŸ“š PHONEMIZER_ESPEAK_LIBRARY: {os.getenv('PHONEMIZER_ESPEAK_LIBRARY', 'Not set')}")
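Because hf_config sets the PHONEMIZER_ESPEAK_* variables as an import side effect, it has to be imported before phonemizer is loaded; a sketch of the intended ordering (assuming app.py does this near the top, which is not shown in this hunk):

import hf_config                  # must run first: sets PHONEMIZER_ESPEAK_*
from phonemizer import phonemize  # now picks up the configured espeak-ng

print(phonemize("hello world", language="en-us"))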
packages.txt ADDED
@@ -0,0 +1,3 @@
+ espeak-ng
+ ffmpeg
+ libsndfile1
pre_startup.sh ADDED
@@ -0,0 +1,42 @@
+ #!/bin/bash
+
+ # Pre-startup script for HuggingFace Spaces
+ # This runs before the main application
+
+ echo "πŸš€ Initializing Music Generation Studio..."
+
+ # Verify espeak-ng installation
+ if command -v espeak-ng &> /dev/null; then
+     echo "βœ… espeak-ng is installed"
+     espeak-ng --version
+ else
+     echo "❌ espeak-ng not found"
+     exit 1
+ fi
+
+ # Verify ffmpeg
+ if command -v ffmpeg &> /dev/null; then
+     echo "βœ… ffmpeg is installed"
+     ffmpeg -version | head -1
+ else
+     echo "❌ ffmpeg not found"
+ fi
+
+ # Create necessary directories
+ mkdir -p outputs/music
+ mkdir -p outputs/mixed
+ mkdir -p models
+ mkdir -p logs
+
+ echo "βœ… Directories created"
+
+ # Check Python version
+ python --version
+
+ # Verify key dependencies
+ echo "πŸ“¦ Verifying Python packages..."
+ python -c "import torch; print(f'βœ… PyTorch {torch.__version__}')" || echo "❌ PyTorch not found"
+ python -c "import gradio; print(f'βœ… Gradio {gradio.__version__}')" || echo "❌ Gradio not found"
+ python -c "import phonemizer; print('βœ… phonemizer OK')" || echo "❌ phonemizer not found"
+
+ echo "βœ… Pre-startup checks complete"
requirements.txt ADDED
@@ -0,0 +1,46 @@
+ # Core dependencies for HuggingFace Spaces deployment
+ gradio==4.44.0
+ numpy>=1.24.0,<2.0.0
+ scipy>=1.10.0
+ librosa>=0.10.0
+ soundfile>=0.12.0
+ pydantic>=2.0.0
+ pyyaml>=6.0
+
+ # PyTorch - CPU mode for HuggingFace Spaces
+ torch>=2.4.0,<2.5.0
+ torchaudio>=2.4.0,<2.5.0
+
+ # DiffRhythm2 dependencies
+ torchdiffeq>=0.2.4
+ phonemizer>=3.2.0
+ muq>=0.1.0
+ jieba>=0.42.0
+ pypinyin>=0.50.0
+ cn2an>=0.5.0
+ onnxruntime>=1.15.0
+ pykakasi>=2.3.0
+ unidecode>=1.3.0
+ py3langid>=0.2.2
+
+ # AI Model dependencies
+ transformers==4.47.1
+ diffusers>=0.21.0
+ sentencepiece>=0.1.99
+ protobuf>=3.20.0,<5.0.0
+ accelerate>=0.20.0
+ einops>=0.7.0
+ omegaconf>=2.3.0
+
+ # Audio processing
+ pedalboard>=0.7.0
+ pydub>=0.25.1
+ resampy>=0.4.2
+
+ # Utilities
+ tqdm>=4.65.0
+ huggingface-hub>=0.17.0
+ safetensors>=0.3.0
+
+ # System dependencies note:
+ # espeak-ng is required by phonemizer and should be installed via packages.txt
setup_diffrhythm2_src.sh ADDED
@@ -0,0 +1,24 @@
+ #!/bin/bash
+ # Setup script for HuggingFace Spaces
+ # Clones DiffRhythm2 source code if not present
+
+ set -e
+
+ echo "πŸ”§ Setting up DiffRhythm2 source code..."
+
+ MODELS_DIR="models"
+ DR2_SRC_DIR="$MODELS_DIR/diffrhythm2_source"
+
+ # Create models directory
+ mkdir -p "$MODELS_DIR"
+
+ # Check if DiffRhythm2 source exists
+ if [ ! -d "$DR2_SRC_DIR" ]; then
+     echo "πŸ“₯ Cloning DiffRhythm2 source repository..."
+     git clone https://github.com/ASLP-lab/DiffRhythm2.git "$DR2_SRC_DIR"
+     echo "βœ… DiffRhythm2 source cloned"
+ else
+     echo "βœ… DiffRhythm2 source already exists"
+ fi
+
+ echo "βœ… Setup complete"
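Once cloned, the checkout presumably has to be made importable by the application; a speculative sketch of the path wiring (the actual import mechanism is not shown in this commit):

import sys
from pathlib import Path

# Match the directory layout created by the script above
dr2_src = Path('models') / 'diffrhythm2_source'
if dr2_src.exists():
    sys.path.insert(0, str(dr2_src))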