text/data/smoltalk/constraints/pipeline/pipeline.py (244 lines of code) (raw):

"""Distilabel pipeline that generates an IFEval-like instruction dataset.

Three steps are chained: a ``MagpieGenerator`` produces rows, an
``IFEvalInstructionIdListAssignator`` and an ``IFEvalKwargsAssignator`` then
process them using JSON-schema-constrained generation. When executed as a
script, the pipeline is run and the resulting distiset is pushed to the Hub.
"""

from distilabel.llms import vLLM
from distilabel.pipeline import Pipeline
from distilabel.steps.tasks import MagpieGenerator

from ifeval_tasks import (
    IFEvalInstructionIdListAssignator,
    IFEvalKwargsAssignator,
)
from json_schemas import (
    IFEVAL_INSTRUCTION_ID_LIST_JSON_SCHEMA,
    IFEVAL_RESPONSE_VERIFICATION_FUNCTION_ARGUMENTS_JSON_SCHEMA,
)
from system_prompts import IFEVAL_SYSTEM_PROMPT

# Maps each IFEval instruction id ("<group>:<name>") to a set of instruction
# ids; every set contains its own key.
# NOTE(review): judging by the name, each set lists the instructions that must
# not be combined with the key. The table is not referenced anywhere else in
# this script, so it is presumably consumed by another module -- confirm.
IFEVAL_INSTRUCTION_CONFLICTS = {
    # --- keywords ---------------------------------------------------------
    "keywords:existence": {"keywords:existence"},
    "keywords:frequency": {"keywords:frequency"},
    "keywords:forbidden_words": {"keywords:forbidden_words"},
    "keywords:letter_frequency": {"keywords:letter_frequency"},
    # --- language ---------------------------------------------------------
    "language:response_language": {
        "change_case:english_capital",
        "change_case:english_lowercase",
        "startend:end_checker",
        "keywords:frequency",
        "keywords:forbidden_words",
        "detectable_format:multiple_sections",
        "keywords:existence",
        "language:response_language",
    },
    # --- length constraints -----------------------------------------------
    "length_constraints:number_sentences": {
        "length_constraints:number_sentences",
    },
    "length_constraints:number_paragraphs": {
        "length_constraints:number_sentences",
        "length_constraints:nth_paragraph_first_word",
        "length_constraints:number_paragraphs",
    },
    "length_constraints:number_words": {
        "length_constraints:number_words",
    },
    "length_constraints:nth_paragraph_first_word": {
        "length_constraints:nth_paragraph_first_word",
        "length_constraints:number_paragraphs",
    },
    # --- detectable content -----------------------------------------------
    "detectable_content:number_placeholders": {
        "detectable_content:number_placeholders",
    },
    "detectable_content:postscript": {
        "detectable_content:postscript",
    },
    # --- detectable format ------------------------------------------------
    "detectable_format:number_bullet_lists": {
        "detectable_format:number_bullet_lists",
    },
    "detectable_format:constrained_response": {
        "startend:quotation",
        "length_constraints:number_words",
        "detectable_format:constrained_response",
        "change_case:english_capital",
        "startend:end_checker",
        "keywords:forbidden_words",
        "length_constraints:number_sentences",
        "combination:repeat_prompt",
        "combination:two_responses",
        "punctuation:no_comma",
        "detectable_format:number_highlighted_sections",
        "change_case:english_lowercase",
        "detectable_format:number_bullet_lists",
        "detectable_content:number_placeholders",
        "keywords:letter_frequency",
        "keywords:frequency",
        "length_constraints:number_paragraphs",
        "keywords:existence",
        "length_constraints:nth_paragraph_first_word",
        "detectable_format:title",
        "change_case:capital_word_frequency",
        "detectable_format:json_format",
        "detectable_format:multiple_sections",
        "detectable_content:postscript",
        "language:response_language",
    },
    "detectable_format:number_highlighted_sections": {
        "detectable_format:number_highlighted_sections",
    },
    "detectable_format:multiple_sections": {
        "detectable_format:multiple_sections",
        "detectable_format:number_highlighted_sections",
        "language:response_language",
    },
    "detectable_format:json_format": {
        "startend:quotation",
        "length_constraints:number_words",
        "detectable_format:constrained_response",
        "change_case:english_capital",
        "detectable_format:number_bullet_lists",
        "detectable_content:number_placeholders",
        "startend:end_checker",
        "keywords:letter_frequency",
        "keywords:frequency",
        "length_constraints:number_paragraphs",
        "length_constraints:nth_paragraph_first_word",
        "length_constraints:number_sentences",
        "language:response_language",
        "combination:repeat_prompt",
        "detectable_format:title",
        "change_case:capital_word_frequency",
        "combination:two_responses",
        "detectable_format:json_format",
        "punctuation:no_comma",
        "detectable_format:number_highlighted_sections",
        "detectable_format:multiple_sections",
        "detectable_content:postscript",
        "change_case:english_lowercase",
    },
    "detectable_format:title": {"detectable_format:title"},
    # --- combination ------------------------------------------------------
    "combination:two_responses": {
        "startend:quotation",
        "length_constraints:number_words",
        "detectable_format:constrained_response",
        "change_case:english_capital",
        "detectable_format:number_bullet_lists",
        "detectable_content:number_placeholders",
        "startend:end_checker",
        "keywords:letter_frequency",
        "keywords:frequency",
        "length_constraints:number_paragraphs",
        "length_constraints:nth_paragraph_first_word",
        "length_constraints:number_sentences",
        "combination:repeat_prompt",
        "change_case:capital_word_frequency",
        "combination:two_responses",
        "detectable_format:json_format",
        "detectable_format:number_highlighted_sections",
        "detectable_format:multiple_sections",
        "detectable_content:postscript",
        "change_case:english_lowercase",
    },
    "combination:repeat_prompt": {
        "startend:quotation",
        "length_constraints:number_words",
        "detectable_format:constrained_response",
        "change_case:english_capital",
        "detectable_format:number_bullet_lists",
        "detectable_content:number_placeholders",
        "startend:end_checker",
        "keywords:letter_frequency",
        "keywords:forbidden_words",
        "keywords:frequency",
        "length_constraints:number_paragraphs",
        "length_constraints:nth_paragraph_first_word",
        "length_constraints:number_sentences",
        "language:response_language",
        "combination:repeat_prompt",
        "change_case:capital_word_frequency",
        "combination:two_responses",
        "detectable_format:json_format",
        "detectable_format:number_highlighted_sections",
        "detectable_format:multiple_sections",
        "detectable_content:postscript",
        "change_case:english_lowercase",
    },
    # --- start / end ------------------------------------------------------
    "startend:end_checker": {"startend:end_checker"},
    # --- change case ------------------------------------------------------
    "change_case:capital_word_frequency": {
        "change_case:english_capital",
        "change_case:capital_word_frequency",
        "change_case:english_lowercase",
    },
    "change_case:english_capital": {"change_case:english_capital"},
    "change_case:english_lowercase": {
        "change_case:english_capital",
        "change_case:english_lowercase",
    },
    # --- punctuation / quotation --------------------------------------------
    "punctuation:no_comma": {"punctuation:no_comma"},
    "startend:quotation": {"startend:quotation", "detectable_format:title"},
}

# Every step uses the same checkpoint for both the model and its tokenizer.
_MODEL_ID = "Qwen/Qwen2.5-72B-Instruct"


def _qwen_llm(*, max_model_len, generation_kwargs, json_schema=None):
    """Create the ``vLLM`` instance used by one pipeline step.

    The model, tokenizer, magpie pre-query template, tensor-parallel size and
    prefix-caching flag are the same for every step; only the arguments below
    differ between them.

    Args:
        max_model_len: Value stored under ``extra_kwargs["max_model_len"]``.
        generation_kwargs: Dictionary handed over unchanged as the
            ``generation_kwargs`` argument.
        json_schema: When given, ``structured_output`` is set to
            ``{"format": "json", "schema": json_schema}``. When ``None`` the
            ``structured_output`` argument is not passed at all.

    Returns:
        A newly constructed ``vLLM`` object.
    """
    llm_arguments = {
        "model": _MODEL_ID,
        "tokenizer": _MODEL_ID,
        "magpie_pre_query_template": "qwen2",
        "extra_kwargs": {
            "tensor_parallel_size": 8,
            "max_model_len": max_model_len,
            "enable_prefix_caching": True,
        },
        "generation_kwargs": generation_kwargs,
    }
    if json_schema is not None:
        llm_arguments["structured_output"] = {
            "format": "json",
            "schema": json_schema,
        }
    return vLLM(**llm_arguments)


with Pipeline(name="ifeval-like-dataset").ray() as pipeline:
    # Step 1: generate up to 500000 rows in batches of 1000, driven by the
    # IFEval system prompt.
    instruction_generator = MagpieGenerator(
        llm=_qwen_llm(
            max_model_len=8192,
            generation_kwargs={
                "temperature": 0.8,
                "top_p": 1.0,
                "max_new_tokens": 1024,
                "stop": [
                    "<|im_start|>",
                    "<|im_end|>",
                    "<|endoftext|>",
                    "<tool_call>",
                ],
                # NOTE(review): presumably the token ids of the stop strings
                # above -- verify against the Qwen2.5 tokenizer.
                "stop_token_ids": [151643, 151644, 151645, 151657],
            },
        ),
        system_prompt=IFEVAL_SYSTEM_PROMPT,
        batch_size=1000,
        num_rows=500000,
    )

    # Step 2: output constrained to the instruction-id-list JSON schema.
    instruction_id_list_assignator = IFEvalInstructionIdListAssignator(
        llm=_qwen_llm(
            max_model_len=2048,
            generation_kwargs={
                "temperature": 0.2,
                "max_new_tokens": 256,
            },
            json_schema=IFEVAL_INSTRUCTION_ID_LIST_JSON_SCHEMA,
        ),
        input_batch_size=2000,
    )

    # Step 3: output constrained to the verification-function-arguments
    # JSON schema.
    instruction_kwargs_assignator = IFEvalKwargsAssignator(
        llm=_qwen_llm(
            max_model_len=2048,
            generation_kwargs={
                "temperature": 0.2,
                "max_new_tokens": 512,
            },
            json_schema=IFEVAL_RESPONSE_VERIFICATION_FUNCTION_ARGUMENTS_JSON_SCHEMA,
        ),
        input_batch_size=2000,
    )

    # Wire the steps: generator -> id-list assignator -> kwargs assignator.
    (
        instruction_generator
        >> instruction_id_list_assignator
        >> instruction_kwargs_assignator
    )


if __name__ == "__main__":
    # Run with caching enabled, then publish the result as a private dataset
    # together with this script.
    distiset = pipeline.run(use_cache=True)
    distiset.push_to_hub(
        "argilla-warehouse/tons-of-ifeval-like-data",
        include_script=True,
        private=True,
    )