def check_data_format

def check_data_format_errors()

in 5-4o_fine_tuning/data_validator.py [0:0]
52 lines of code
24 McCabe index (conditional complexity)

    def check_data_format_errors(self) -> dict:
        """
        Validate the chat fine-tuning format of the configured JSONL files.

        Checks ``self.train_file`` and, when it is set, ``self.validation_file``.
        Every problem found is counted per error type and logged; nothing is
        raised for malformed examples.

        Format validation checks:
        - Empty Dataset: The file contains no examples. Error type: empty_dataset.
        - Data Type Check: Checks whether each entry in the dataset is a dictionary (dict). Error type: data_type.
        - Presence of Message List: Checks that each entry has a non-empty messages list. Error type: missing_messages_list.
        - Message Type Check: Checks that each item of the messages list is a dictionary. Error type: message_data_type.
        - Message Keys Check: Validates that each message in the messages list contains the keys role and content. Error type: message_missing_key.
        - Unrecognized Keys in Messages: Logs if a message has keys other than role, content, weight, function_call, and name. Error type: message_unrecognized_key.
        - Role Validation: Ensures the role is one of "system", "user", "assistant", or "function". Error type: unrecognized_role.
        - Content Validation: Verifies that content is a string; it may be empty only when function_call is set. Error type: missing_content.
        - Assistant Message Presence: Checks that each conversation has at least one message from the assistant. Error type: example_missing_assistant_message.

        Returns:
            dict: Maps each checked file path to True when at least one format
            error was found in it, otherwise False.

        Raises:
            ValueError: If a file path does not end with ``.jsonl``.
            json.JSONDecodeError: If a line of a file is not valid JSON.

        # Example usage --> errors = validator.check_data_format_errors()
        """
        allowed_keys = ("role", "content", "name", "function_call", "weight")
        allowed_roles = ("system", "user", "assistant", "function")

        files_to_check = [
            self.train_file, self.validation_file] if self.validation_file else [self.train_file]

        data_format_errors = {file: False for file in files_to_check}

        for data_path in files_to_check:
            if not data_path.endswith('.jsonl'):
                raise ValueError(
                    f"The provided dataset path `{data_path}` is not a valid JSONL file.")

            # Load dataset. The encoding is explicit so the result does not
            # depend on the platform's default locale encoding.
            with open(data_path, encoding="utf-8") as f:
                dataset = [json.loads(line) for line in f]

            logger.info(
                f"Checking format errors in {data_path}")

            format_errors = defaultdict(int)

            # initial dataset stats
            logger.info(f"Number of examples: {len(dataset)}")
            if not dataset:
                # Nothing to preview; previously this crashed with IndexError.
                format_errors["empty_dataset"] += 1
            else:
                logger.info("First example:")
                first_example = dataset[0]
                first_messages = (
                    first_example.get("messages")
                    if isinstance(first_example, dict) else None)
                if isinstance(first_messages, list):
                    for message in first_messages:
                        logger.info(message)
                else:
                    logger.error(
                        "\033[91mNo messages found in the first example.\033[0m")

            # Format error checks
            for ex in dataset:
                if not isinstance(ex, dict):
                    format_errors["data_type"] += 1
                    continue

                messages = ex.get("messages", None)
                if not messages or not isinstance(messages, list):
                    format_errors["missing_messages_list"] += 1
                    continue

                for message in messages:
                    # A non-dict message cannot be inspected further; count it
                    # instead of failing on the `.get` calls below.
                    if not isinstance(message, dict):
                        format_errors["message_data_type"] += 1
                        continue

                    if "role" not in message or "content" not in message:
                        format_errors["message_missing_key"] += 1

                    if any(k not in allowed_keys for k in message):
                        format_errors["message_unrecognized_key"] += 1

                    if message.get("role", None) not in allowed_roles:
                        format_errors["unrecognized_role"] += 1

                    content = message.get("content", None)
                    function_call = message.get("function_call", None)

                    # NOTE(review): non-string content (e.g. None) is flagged
                    # even when function_call is present; kept as in the
                    # original check.
                    if (not content and not function_call) or not isinstance(content, str):
                        format_errors["missing_content"] += 1

                if not any(
                        isinstance(message, dict)
                        and message.get("role", None) == "assistant"
                        for message in messages):
                    format_errors["example_missing_assistant_message"] += 1

            if format_errors:
                data_format_errors[data_path] = True
                logger.error(f"\033[91mFound errors in {data_path}:\033[0m")
                for k, v in format_errors.items():
                    logger.error(f"\033[91m  {k}: {v}\033[0m")
            else:
                logger.info(f"\033[92mNo errors found for {data_path}\033[0m")
            logger.info("-----------------------------------")
        return data_format_errors