def expander_zipf()

in utils/gradio_utils.py [0:0]


def expander_zipf(dstats, column_id=""):
    """Render the "Zipf's Law Fit" expander for a dataset's vocabulary.

    Inside a collapsed ``st.expander`` this writes explanatory text, a
    one-column table of the fit results (alpha, KS distance, minimum rank),
    a one-sentence summary of the same values, the Zipf plot, and a warning
    when alpha or the minimum rank exceeds the thresholds checked below.

    Args:
        dstats: Object exposing ``z`` (the fit; its ``alpha``,
            ``ks_distance`` and ``xmin`` attributes are read here) and
            ``zipf_fig`` (the figure handed to ``st.plotly_chart``).
        column_id: Suffix appended to the expander title; also used as the
            name of the results table's index.

    Returns:
        None. Any ``Exception`` raised while rendering is logged and
        replaced by an "Under construction!" placeholder in the expander.
    """
    import logging

    z = dstats.z
    zipf_fig = dstats.zipf_fig
    with st.expander(
        f"Vocabulary Distribution{column_id}: Zipf's Law Fit", expanded=False
    ):
        try:
            zipf_caption = """This shows how close the observed language is to an ideal
            natural language distribution following [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law),
            calculated by minimizing the [Kolmogorov-Smirnov (KS) statistic](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test)."""

            # The summary uses round() + str() (so 1.5 renders as "1.5"),
            # while the table below uses fixed two-decimal formatting.
            alpha_text = str(round(z.alpha, 2))
            ks_text = str(round(z.ks_distance, 2))
            min_rank = int(z.xmin)
            zipf_summary = (
                f"The optimal alpha based on this dataset is: **{alpha_text}"
                f"**, with a KS distance of: **{ks_text}"
                f"**.  This was fit with a minimum rank value of: **{min_rank}"
                "**, which is the optimal rank *beyond which* the scaling regime of the power law fits best."
            )

            alpha_warning = "Your alpha value is a bit on the high side, which means that the distribution over words in this dataset is a bit unnatural. This could be due to non-language items throughout the dataset."
            xmin_warning = "The minimum rank for this fit is a bit on the high side, which means that the frequencies of your most common words aren't distributed as would be expected by Zipf's law."
            fit_results_table = pd.DataFrame.from_dict(
                {
                    "Alpha:": [f"{z.alpha:.2f}"],
                    "KS distance:": [f"{z.ks_distance:.2f}"],
                    "Min rank:": [str(min_rank)],
                },
                columns=["Results"],
                orient="index",
            )
            fit_results_table.index.name = column_id
            st.caption(
                "Use this widget for the counts of different words in your dataset, measuring the difference between the observed count and the expected count under Zipf's law."
            )
            st.markdown(zipf_caption)
            # Raw string: the LaTeX "\propto" must reach the renderer with
            # its backslash intact and without an invalid-escape warning.
            st.write(
                r"""
            A Zipfian distribution follows the power law: $p(x) \propto x^{-α}$
    with an ideal α value of 1."""
            )
            st.markdown(
                "In general, an alpha greater than 2 or a minimum rank greater than 10 (take with a grain of salt) means that your distribution is relatively _unnatural_ for natural language. This can be a sign of mixed artefacts in the dataset, such as HTML markup."
            )
            st.markdown(
                "Below, you can see the counts of each word in your dataset vs. the expected number of counts following a Zipfian distribution."
            )
            st.markdown("-----")
            st.write("### Here is your dataset's Zipf results:")
            st.dataframe(fit_results_table)
            st.write(zipf_summary)
            # TODO: Nice UI version of the content in the comments.
            # st.markdown("\nThe KS test p-value is < %.2f" % z.ks_test.pvalue)
            # if z.ks_test.pvalue < 0.01:
            #    st.markdown(
            #        "\n Great news! Your data fits a powerlaw with a minimum KS " "distance of %.4f" % z.distance)
            # else:
            #    st.markdown("\n Sadly, your data does not fit a powerlaw. =(")
            # st.markdown("Checking the goodness of fit of our observed distribution")
            # st.markdown("to the hypothesized power law distribution")
            # st.markdown("using a Kolmogorov–Smirnov (KS) test.")
            st.plotly_chart(zipf_fig, use_container_width=True)
            if z.alpha > 2:
                st.markdown(alpha_warning)
            # NOTE(review): the explanatory text above mentions a minimum
            # rank "greater than 10", but the warning fires at > 5 — confirm
            # which threshold is intended.
            if z.xmin > 5:
                st.markdown(xmin_warning)
        except Exception:
            # Best-effort widget: keep the placeholder, but catch Exception
            # (not a bare except) so KeyboardInterrupt/SystemExit and other
            # BaseException-based control flow still propagate, and log the
            # traceback so the failure is diagnosable.
            logging.getLogger(__name__).exception(
                "Failed to render the Zipf's law expander"
            )
            st.write("Under construction!")