def calc_fit()

in data_measurements/zipf/zipf.py [0:0]


    def calc_fit(self):
        """
        Fit the observed vocabulary counts to a zipfian (power-law)
        distribution using the powerlaw package.

        The fit is driven by the KS-distance, as that seems more appropriate
        than MLE here. The resulting fit object is stored on ``self.fit``;
        the observed pdf, the predicted pdf and the fitted power-law
        distribution are then passed on to ``self._set_fit_vars``.
        """
        logs.info("Fitting based on input vocab counts.")

        self._make_rank_column()
        # An alternative way of determining alpha might be the one given by
        # (Newman, 2005): alpha = 1 + n * sum(ln( xi / xmin )) ^ -1
        self.fit = powerlaw.Fit(
            self.observed_counts,
            fit_method="KS",
            discrete=True,
        )
        # Normalized histogram of the observed data. With discrete=True above
        # this should probably be thought of as a pmf rather than a pdf.
        # original_data=False restricts it to the data used for the fit
        # (within xmin and xmax). Per the powerlaw docs, the call returns the
        # bin edges first and the portion of the data within each bin second;
        # only the latter is used here.
        _bin_edges, empirical_pdf = self.fit.pdf(original_data=False)
        # The fitted theoretical distribution; see the 'Distribution' class
        # described at https://pythonhosted.org/powerlaw/#powerlaw.Fit.pdf
        power_law_model = self.fit.power_law
        # The second argument is the pdf of the theoretical distribution,
        # evaluated before the fit variables are recorded.
        self._set_fit_vars(
            empirical_pdf,
            power_law_model.pdf(),
            power_law_model,
        )