113 lines
3.4 KiB
Python
113 lines
3.4 KiB
Python
import re
|
|
|
|
|
|
def extract_html_content(text: str, stack: str = "react-tailwind") -> str:
|
|
"""
|
|
Extract code content from LLM response based on technology stack.
|
|
|
|
Args:
|
|
text: The raw text response from LLM
|
|
stack: Technology stack ("react-tailwind", "html-tailwind", "svg")
|
|
|
|
Returns:
|
|
str: Extracted code content
|
|
"""
|
|
# Remove markdown code blocks if present
|
|
text = re.sub(r"```[\w]*\n|```", "", text)
|
|
|
|
if stack == "svg":
|
|
# Extract SVG content
|
|
svg_match = re.search(r"(<svg.*?>.*?</svg>)", text, re.DOTALL)
|
|
if svg_match:
|
|
return svg_match.group(1)
|
|
elif stack == "react-tailwind":
|
|
# Extract React component content
|
|
react_match = re.search(r"(export default function.*?})\s*$", text, re.DOTALL)
|
|
if react_match:
|
|
return react_match.group(1)
|
|
# Alternative: look for const/function component definition
|
|
alt_match = re.search(
|
|
r"((?:const|function)\s+\w+\s*=?\s*(?:\([^)]*\))?\s*=>?\s*{.*?})\s*$",
|
|
text,
|
|
re.DOTALL,
|
|
)
|
|
if alt_match:
|
|
return alt_match.group(1)
|
|
|
|
# Default: try to extract content within <html> tags
|
|
html_match = re.search(r"(<html.*?>.*?</html>)", text, re.DOTALL)
|
|
if html_match:
|
|
return html_match.group(1)
|
|
|
|
# If no specific patterns match, try to extract any HTML-like content
|
|
body_match = re.search(r"(<body.*?>.*?</body>)", text, re.DOTALL)
|
|
if body_match:
|
|
return f"<html>\n{body_match.group(1)}\n</html>"
|
|
|
|
div_match = re.search(r"(<div.*?>.*?</div>)", text, re.DOTALL)
|
|
if div_match:
|
|
return f"<html>\n<body>\n{div_match.group(1)}\n</body>\n</html>"
|
|
|
|
# If no patterns match, clean up the text and return it
|
|
cleaned_text = text.strip()
|
|
print(
|
|
f"[Code Extraction] No specific pattern found for stack '{stack}'. Raw content:\n{cleaned_text}"
|
|
)
|
|
return cleaned_text
|
|
|
|
|
|
def clean_code_content(code: str) -> str:
|
|
"""
|
|
Clean and format the extracted code content.
|
|
|
|
Args:
|
|
code: Raw code content
|
|
|
|
Returns:
|
|
str: Cleaned and formatted code
|
|
"""
|
|
# Remove leading/trailing whitespace
|
|
code = code.strip()
|
|
|
|
# Remove extra blank lines
|
|
code = re.sub(r"\n\s*\n", "\n\n", code)
|
|
|
|
# Ensure proper indentation
|
|
lines = code.split("\n")
|
|
indent_level = 0
|
|
formatted_lines = []
|
|
|
|
for line in lines:
|
|
# Adjust indent level based on brackets/braces
|
|
stripped_line = line.strip()
|
|
if stripped_line.endswith("{"):
|
|
formatted_lines.append(" " * indent_level + stripped_line)
|
|
indent_level += 1
|
|
elif stripped_line.startswith("}"):
|
|
indent_level = max(0, indent_level - 1)
|
|
formatted_lines.append(" " * indent_level + stripped_line)
|
|
else:
|
|
formatted_lines.append(" " * indent_level + stripped_line)
|
|
|
|
return "\n".join(formatted_lines)
|
|
|
|
|
|
def extract_code_content(text: str, stack: str = "react-tailwind") -> str:
|
|
"""
|
|
Main function to extract and clean code content.
|
|
|
|
Args:
|
|
text: Raw text from LLM response
|
|
stack: Technology stack being used
|
|
|
|
Returns:
|
|
str: Final cleaned and formatted code
|
|
"""
|
|
# Extract the relevant code content
|
|
extracted_content = extract_html_content(text, stack)
|
|
|
|
# Clean and format the code
|
|
cleaned_content = clean_code_content(extracted_content)
|
|
|
|
return cleaned_content
|