-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathimg2img_expansion_engine.py
More file actions
259 lines (212 loc) · 8.66 KB
/
img2img_expansion_engine.py
File metadata and controls
259 lines (212 loc) · 8.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
"""
Image-to-Image Expansion Engine
Platform-aware prompt expansion for image generation models
"""
from typing import Dict, Tuple, Optional, Sequence
from .platforms import get_platform_config, format_for_platform, get_negative_prompt_for_platform
class ImageToImageExpander:
    """Expands image descriptions and change requests into platform-optimized prompts.

    Builds an LLM (system_prompt, user_prompt) pair tailored to a target
    image-generation platform (FLUX, WAN 2.2, Hunyuan, Qwen, SDXL, ...),
    plus helpers for negative-prompt generation and LLM-response cleanup.
    """

    def __init__(self):
        # Placeholder for per-platform cached state; currently unused —
        # NOTE(review): confirm whether this cache is still needed.
        self.platforms = {}

    def expand_img2img_prompt(
        self,
        image_description: str,
        change_request: str,
        platform: str,
        aesthetic_controls: Optional[Dict] = None,
        custom_negatives: Optional[Sequence[str]] = None
    ) -> Tuple[str, str, Dict]:
        """
        Expand image-to-image prompt with platform awareness

        Args:
            image_description: Vision model's description of input image
            change_request: User's description of desired changes
            platform: Target platform (flux, wan22, hunyuan_image, etc.)
            aesthetic_controls: Style, lighting, composition controls
            custom_negatives: Additional negative terms. Recorded in the
                breakdown for reference; pass them to
                generate_negative_prompt() to build the negative prompt.

        Returns:
            Tuple of (system_prompt, user_prompt, breakdown_dict). The first
            two are fed to the expansion LLM; the breakdown is a summary of
            the inputs and platform settings for user reference.
        """
        platform_config = get_platform_config(platform)

        # Build the system prompt for LLM expansion
        system_prompt = self._build_expansion_prompt(
            platform,
            platform_config,
            aesthetic_controls
        )

        # Build user prompt combining image desc + changes
        user_prompt = self._build_user_prompt(
            image_description,
            change_request,
            platform_config
        )

        # Build breakdown for user reference
        breakdown = {
            "platform": platform,
            "platform_name": platform_config["name"],
            "prompt_style": platform_config["prompt_style"],
            "optimal_length": platform_config["optimal_length"],
            "image_description": image_description,
            "change_request": change_request,
            "aesthetic_controls": aesthetic_controls,
            # BUGFIX: custom_negatives was accepted but silently dropped;
            # record it so callers can see what was requested.
            "custom_negatives": list(custom_negatives) if custom_negatives else [],
            "quality_emphasis": bool(platform_config.get("quality_emphasis", True))
        }

        return system_prompt, user_prompt, breakdown

    def _build_expansion_prompt(
        self,
        platform: str,
        config: dict,
        aesthetic_controls: Optional[Dict]
    ) -> str:
        """Build LLM system prompt with platform-specific instructions.

        Args:
            platform: Platform key (e.g. "flux", "qwen_image_edit").
            config: Platform config dict; must contain "name", "description",
                "prompt_style" and "optimal_length"; may contain
                "preferences", "quality_tokens", "avoid", "quality_emphasis".
            aesthetic_controls: Optional user style/lighting/mood controls.

        Returns:
            Complete system prompt string for the expansion LLM.
        """
        # Quality emphasis defaults to on unless the platform opts out.
        quality_emphasis = bool(config.get("quality_emphasis", True))

        prompt = f"""You are an expert prompt engineer for {config['name']} image generation.
CRITICAL OUTPUT RULES:
1. Output ONLY the final prompt - no labels, explanations, or meta-commentary
2. Do NOT include phrases like "Here is...", "Prompt:", etc.
3. Write as a comma-separated list of descriptive elements
4. Start directly with the description
TARGET PLATFORM: {config['name']}
Platform Description: {config['description']}
Prompting Style: {config['prompt_style']}
Optimal Length: {config['optimal_length']}
"""

        # Add platform-specific preferences
        if config.get("preferences"):
            prompt += "\nPLATFORM PREFERENCES:\n"
            for pref in config["preferences"]:
                prompt += f"- {pref}\n"

        # Add quality tokens if enabled (cap at 5 to avoid prompt bloat)
        if quality_emphasis and config.get("quality_tokens"):
            prompt += f"\nQUALITY TOKENS TO USE: {', '.join(config['quality_tokens'][:5])}\n"

        # Add things to avoid
        if config.get("avoid"):
            prompt += "\nAVOID:\n"
            for avoid in config["avoid"]:
                prompt += f"- {avoid}\n"

        # Platform-specific instructions
        if platform == "qwen_image_edit":
            prompt += """
EDIT MODE INSTRUCTIONS:
- Be VERY concise (20-50 tokens max)
- Focus ONLY on what changes
- Use clear change language: "change X to Y", "add Z", "remove W"
- Do NOT re-describe unchanged elements
- Include preservation hints if needed: "keep background", "maintain composition"
"""
        elif platform in {"flux", "flux_kontex"}:
            prompt += """
FLUX FAMILY INSTRUCTIONS:
- Use natural, detailed language with cinematic storytelling
- Include style references when appropriate
- Use photography/artistic terms and sensory details
- Front-load important concepts
- Quality modifiers are optional; focus on evocative description
"""
        elif platform == "wan22":
            prompt += """
WAN 2.2 INSTRUCTIONS:
- Use technical cinematography terminology
- Structure: subject, setting, lighting, composition
- Be specific about lighting types and composition
- Medium length (50-100 tokens)
"""
        elif platform == "hunyuan_image":
            prompt += """
HUNYUAN INSTRUCTIONS:
- Clear, simple English (avoid complex vocabulary)
- Focus on photorealism
- Concise but descriptive (40-80 tokens)
- Direct, straightforward descriptions
"""
        elif platform == "qwen_image":
            prompt += """
QWEN INSTRUCTIONS:
- Natural, conversational language
- Balanced detail level (50-100 tokens)
- Good with diverse styles and cultural elements
- Professional but not overly technical
"""
        elif platform == "sd_xl":
            prompt += """
SDXL INSTRUCTIONS:
- Token-aware: 40-75 tokens optimal
- Front-load important concepts
- Quality tokens at the start
- Can use emphasis with (parentheses)
"""

        # Add aesthetic controls if provided
        if aesthetic_controls:
            prompt += self._format_aesthetic_controls(aesthetic_controls, platform)

        prompt += """
FINAL REMINDERS:
- Output ONLY the prompt text
- Follow platform-specific preferences
- Combine image description with requested changes seamlessly
- Use appropriate length for platform
"""
        return prompt

    def _build_user_prompt(
        self,
        image_description: str,
        change_request: str,
        config: dict
    ) -> str:
        """Build the user-role prompt for the LLM.

        Combines the current-image description and the change request with
        " | " separators; falls back to an enhance-only instruction when no
        change was requested. `config` is currently unused but kept for
        signature stability with the other builders.
        """
        parts = []
        if image_description:
            parts.append(f"Current image: {image_description}")
        if change_request:
            parts.append(f"Requested changes: {change_request}")
        else:
            parts.append("No changes requested - enhance and optimize the description for generation")
        return " | ".join(parts)

    def _format_aesthetic_controls(self, controls: Dict, platform: str) -> str:
        """Format aesthetic controls as a system-prompt section.

        Skips falsy values and the sentinel strings "auto"/"none".
        `platform` is currently unused but kept for signature stability.
        """
        formatted = "\n=== AESTHETIC CONTROLS ===\n"
        formatted += "User has specified these requirements. You MUST incorporate them:\n\n"

        # Human-readable labels for known control keys; unknown keys fall
        # back to a title-cased version of the key itself.
        control_map = {
            "art_style": "Art Style",
            "lighting_type": "Lighting",
            "composition": "Composition",
            "color_palette": "Color Palette",
            "mood": "Mood/Atmosphere",
            "detail_level": "Detail Level",
            "quality_preset": "Quality Preset"
        }

        for key, value in controls.items():
            # BUGFIX: str() guards against non-string values (e.g. ints),
            # which previously raised AttributeError on .lower().
            if value and str(value).lower() not in ["auto", "none", ""]:
                label = control_map.get(key, key.replace("_", " ").title())
                formatted += f"- {label}: {value}\n"

        formatted += "\nSeamlessly integrate these into the prompt.\n"
        return formatted

    def generate_negative_prompt(
        self,
        platform: str,
        custom_negatives: Optional[Sequence[str]] = None
    ) -> str:
        """Generate platform-optimized negative prompt.

        Thin wrapper over get_negative_prompt_for_platform(); custom
        negatives are merged by that helper.
        """
        return get_negative_prompt_for_platform(platform, custom_negatives)

    def parse_llm_response(self, response: str) -> str:
        """Clean and parse LLM response.

        Strips common preamble artifacts ("Here is the prompt:", etc.) and
        markdown code fences. Falls back to the original stripped response
        if cleaning leaves fewer than 20 characters (over-aggressive strip).
        """
        cleaned = response.strip()

        # Remove common artifacts: drop everything up to and including the
        # first artifact found, repeating until none remain.
        artifacts = [
            "here is the prompt:", "here's the prompt:",
            "prompt:", "final prompt:", "output:",
            "here is:", "here's:"
        ]
        cleaned_lower = cleaned.lower()
        for artifact in artifacts:
            if artifact in cleaned_lower:
                idx = cleaned_lower.index(artifact)
                cleaned = cleaned[idx + len(artifact):].strip()
                cleaned_lower = cleaned.lower()

        # Remove markdown code blocks if present (opening and closing fence)
        if cleaned.startswith("```"):
            lines = cleaned.split('\n')
            cleaned = '\n'.join(lines[1:-1]) if len(lines) > 2 else cleaned

        # Fallback: use original if cleaned is too short
        if len(cleaned) < 20:
            cleaned = response.strip()

        return cleaned