in src/openai/lib/_validators.py [0:0]
def long_examples_validator(df: pd.DataFrame) -> Remediation:
"""
This validator will suggest to the user to remove examples that are too long.
"""
immediate_msg = None
optional_msg = None
optional_fn = None # type: ignore
ft_type = infer_task_type(df)
if ft_type != "open-ended generation":
def get_long_indexes(d: pd.DataFrame) -> Any:
long_examples = d.apply(lambda x: len(x.prompt) + len(x.completion) > 10000, axis=1)
return d.reset_index().index[long_examples].tolist()
long_indexes = get_long_indexes(df)
if len(long_indexes) > 0:
immediate_msg = f"\n- There are {len(long_indexes)} examples that are very long. These are rows: {long_indexes}\nFor conditional generation, and for classification the examples shouldn't be longer than 2048 tokens."
optional_msg = f"Remove {len(long_indexes)} long examples"
def optional_fn(x: Any) -> Any:
long_indexes_to_drop = get_long_indexes(x)
if long_indexes != long_indexes_to_drop:
sys.stdout.write(
f"The indices of the long examples has changed as a result of a previously applied recommendation.\nThe {len(long_indexes_to_drop)} long examples to be dropped are now at the following indices: {long_indexes_to_drop}\n"
)
return x.drop(long_indexes_to_drop)
return Remediation(
name="long_examples",
immediate_msg=immediate_msg,
optional_msg=optional_msg,
optional_fn=optional_fn,
)