in utils/gradio_utils.py [0:0]
def expander_text_duplicates(dstats, column_id=""):
    """Render the collapsed "Text Duplicates" expander for a dataset.

    The expander always shows a short caption, a note (with a citation)
    about why duplicates matter, a divider and a section heading. Below
    that it shows either a "no duplicates" message or the duplicate
    fraction followed by a table of the duplicated items.

    Args:
        dstats: Statistics object; this function reads its
            ``duplicates_results``, ``dups_frac`` and ``dups_dict``
            attributes. (Presumably ``dups_dict`` maps each duplicated
            text to its count -- confirm against the producer.)
        column_id: Suffix appended verbatim to the expander title.
    """
    panel_title = f"Text Duplicates{column_id}"
    with st.expander(panel_title, expanded=False):
        st.caption(
            "Use this widget to identify text strings "
            "that appear more than once."
        )
        st.markdown(
            "A model's training and testing may be negatively "
            "affected by unwarranted duplicates "
            "([Lee et al., 2021](https://arxiv.org/abs/2107.06499))."
        )
        st.markdown("------")
        st.write(
            "### Here is the list of all the duplicated items "
            "and their counts in the dataset."
        )

        # Guard clause: nothing further to display without duplicates.
        if not dstats.duplicates_results:
            st.write("There are no duplicates in this dataset! 🥳")
            return

        st.write("The fraction of the data that is a duplicate is:")
        rounded_fraction = round(dstats.dups_frac, 4)
        st.write(str(rounded_fraction))

        # TODO: Check whether this conversion is slow for large inputs --
        # should the duplicates be stored as dataframes up front?
        # A dataframe is used here because it makes the table interactive.
        duplicates_table = ds_utils.counter_dict_to_df(dstats.dups_dict)
        st.dataframe(duplicates_table)