in use-cases/model-fine-tuning-pipeline/data-preparation/gemma-it/src/dataprep.py [0:0]
def extract_product_details(text):
    """Build a plain-text product summary from the text before "Description:".

    The portion of ``text`` preceding the first "Description:" is split into
    lines (``<br>`` tags are treated as line breaks). Blank lines, the
    "Product Category:" line and the "Attributes:" line are dropped; the
    remaining lines are emitted one per line, followed by a
    "Product Details:" section listing each key/value pair of the JSON
    object found after "Attributes:".

    Args:
        text: Raw product text containing a "Description:" marker and an
            ``Attributes: {...}`` JSON object before it.

    Returns:
        The formatted summary, or an empty string when ``text`` has no
        "Description:" marker or no ``Attributes: {...}`` object before it.

    Raises:
        json.JSONDecodeError: If the text captured after "Attributes:" is
            not valid JSON.
    """
    # Only the content before the first "Description:" is considered.
    header_match = re.search(r"(.*?)Description:", text, re.DOTALL)
    if not header_match:
        return ""

    # Turn <br> tags into newlines so every field sits on its own line.
    cleaned_content = header_match.group(1).replace("<br>", "\n")

    # Non-greedy match: captures up to the first closing brace after
    # "Attributes:".
    match_attributes = re.search(
        r"Attributes:\s*(\{.*?\})", cleaned_content, re.DOTALL
    )
    if not match_attributes:
        return ""

    attributes = json.loads(match_attributes.group(1))

    # Both prefix checks run on the stripped line so that fields preceded by
    # whitespace (e.g. "<br> Product Category: ...") are filtered as well.
    skipped_prefixes = ("Product Category:", "Attributes:")
    stripped_lines = (line.strip() for line in cleaned_content.splitlines())
    kept_lines = [
        line
        for line in stripped_lines
        if line and not line.startswith(skipped_prefixes)
    ]

    parts = [f"{line}\n" for line in kept_lines]
    parts.append("Product Details:\n")
    parts.extend(f"- {key}: {value}\n" for key, value in attributes.items())
    return "".join(parts)