prompts/stories/build_openhermes_stories_prompts.py (42 lines of code) (raw):

"""Build story-generation prompts from OpenHermes samples and push them to the Hub.

Each sample's ``prompt`` field is stripped, truncated to ``EXTRACT_SIZE``
characters and inserted into one (or all) of the templates in ``STYLES``.
Every style that is built yields one ``prompt_<style>`` column.
"""

import argparse

# Prompt templates, keyed by generation style. The "<EXTRACT>" placeholder is
# replaced by the (truncated) seed text in build_prompt().
# The templates are written as adjacent string literals so that every space and
# the single newline inside "reddit_post" are explicit in the source.
STYLES = {
    "young_children_story": (
        "Write an educational story (3-5 paragraphs) targeted at young children "
        "using simple words. "
        "The story should be inspired from this text snippet: "
        "“<EXTRACT>” "
        "The story doesn’t have to be addressing everything in the snippet, "
        "it is there just for inspiration. "
        "The story should have the following features: "
        "- Science integration: embed basic science concepts within the story, "
        "explaining them through the characters' adventures and discoveries. "
        "For example, if the story includes a scene where characters are looking "
        "at the sky, you could have them wonder why it's blue and explain the "
        "physics behind in grade school level. "
        "- Dialogue: include at least one dialogue and insightful conversation. "
        "- Unexpected twist: conclude with a twist that doesn't resolve as hoped, "
        "but leaves a clear lesson about life and science. "
        'Do not start with classic sentences like "Once upon a time", be creative.'
    ),
    "problem_solving_story": (
        "Write a story that explores a situation slightly related to this text "
        "snippet: "
        "“<EXTRACT>” "
        "The story should unfold through the characters interactions, decisions, "
        "and the consequences of their actions. "
        "Aim to weave in common sense lessons and social cues. "
        "The narrative should cater to a diverse age group, including at least "
        "one dialogue and presenting both positive and negative outcomes. "
        'Do not start with classic sentences like "Once upon a time", be creative.'
    ),
    "reddit_post": (
        "Write a real-life story shared by someone in a reddit forum. "
        "The story should be somehow related to this text snippet: "
        "“<EXTRACT>” "
        "The story should include: "
        "- Niche interests or humor: dive into specific hobbies, interests, "
        "or humorous situations "
        "- An unexpected plot twist or engaging conflict: introduce a relatable "
        "yet challenging situation or dilemma that the author faced. \n"
        "- Reflection and insight: end with a resolution that offers a new "
        "understanding, a sense of community, or a personal revelation, "
        "much like the conclusions drawn in forum discussions. "
        "Start the story right away. "
        'Do not start with sentences like "Once upon a time" as this is a reddit '
        "post and not a novel, you should also avoid starting with classic "
        'sentences like "A few years ago" or "A few years back", be creative.'
    ),
}

# Maximum number of characters of the seed text inserted into a prompt.
EXTRACT_SIZE = 1000


def get_args(argv=None):
    """Parse the command-line options.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, as before.

    Returns:
        The parsed ``argparse.Namespace`` with ``repo_id``,
        ``generation_style`` and ``run_all_styles``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--repo_id", type=str, default="HuggingFaceTB/prompts_stories_openhermes"
    )
    # Restrict to known styles so a typo fails here with a clear message
    # instead of as a KeyError inside the mapping workers.
    parser.add_argument(
        "--generation_style",
        type=str,
        default="problem_solving_story",
        choices=list(STYLES),
    )
    parser.add_argument("--run_all_styles", action="store_true")
    return parser.parse_args(argv)


def build_prompt(x, style="problem_solving_story"):
    """Build the prompt for one sample in the given generation style.

    Args:
        x: Mapping with a ``"prompt"`` string holding the seed text.
        style: Key of ``STYLES`` selecting the template.

    Returns:
        A single-item dict ``{"prompt_<style>": <filled template>}``.

    Raises:
        KeyError: If ``style`` is not a key of ``STYLES``.
    """
    # Strip first, then truncate, so leading whitespace does not eat into the
    # character budget. Slicing already clamps to the string length.
    snippet = x["prompt"].strip()[:EXTRACT_SIZE]
    prompt = STYLES[style].replace("<EXTRACT>", snippet)
    return {f"prompt_{style}": prompt}


def _print_previews(ds, styles):
    """Print one example prompt per built style, separated by a dashed rule.

    Style number ``i`` is previewed from row ``i``. Only columns that were
    actually built are accessed, and rows beyond the end of ``ds`` are skipped.
    """
    for index, style in enumerate(styles):
        if index >= len(ds):
            break
        if index:
            print("-" * 100)
        print(ds[index][f"prompt_{style}"])


def main():
    """Load the source dataset, add the prompt column(s) and push the result."""
    args = get_args()

    # Imported here rather than at module level so that the pure helpers above
    # can be imported (and --help can run) without the `datasets` package.
    from datasets import load_dataset

    print("Loading openhermes data...")
    ds = load_dataset("HuggingFaceTB/openhermes_filtered", split="train", num_proc=36)

    if args.run_all_styles:
        styles = list(STYLES)
        suffix = ""
    else:
        styles = [args.generation_style]
        # A single-style run goes to its own repository.
        suffix = f"_{args.generation_style}"

    for style in styles:
        print(f"📖 Building prompts with a {style}...")
        ds = ds.map(build_prompt, num_proc=48, fn_kwargs={"style": style})

    print(ds)
    _print_previews(ds, styles)

    ds.push_to_hub(f"{args.repo_id}{suffix}", private=True)
    print(f"✅ Data available at {args.repo_id}{suffix}")


if __name__ == "__main__":
    main()