in data_measurements/zipf/zipf.py [0:0]
def calc_fit(self):
    """
    Fit the observed vocabulary counts to a Zipfian (power-law) distribution.

    The fit is done with the powerlaw package and stored on ``self.fit``.
    The empirical pdf of the fitted data, the pdf predicted by the fitted
    power law, and the fitted distribution object are then passed to
    ``self._set_fit_vars``.

    The fit minimizes the KS-distance (fit_method="KS"), as that seems
    more appropriate than MLE here.
    """
    logs.info("Fitting based on input vocab counts.")
    self._make_rank_column()
    # An alternative estimate of alpha is given by (Newman, 2005):
    #   alpha = 1 + n * sum(ln( xi / xmin )) ^ -1
    # Counts are integers, so the fit is discrete; strictly speaking the
    # quantities below are therefore closer to a pmf than a pdf.
    self.fit = powerlaw.Fit(
        self.observed_counts,
        discrete=True,
        fit_method="KS",
    )
    # Fit.pdf returns (bin_edges, probabilities):
    #   bin_edges:     the edges of the histogram bins.
    #   probabilities: the portion of the data falling within each bin,
    #                  i.e. the normalized histogram of the data.
    # original_data=False restricts this to the data actually used for the
    # fit (within xmin and xmax). Only the probabilities are needed here.
    # See: https://pythonhosted.org/powerlaw/#powerlaw.Fit.pdf
    _bin_edges, empirical_pdf = self.fit.pdf(original_data=False)
    # The fitted power law is a powerlaw 'Distribution' object; calling
    # its pdf() with no arguments gives the pdf of the theoretical
    # distribution.
    fitted_power_law = self.fit.power_law
    expected_pdf = fitted_power_law.pdf()
    self._set_fit_vars(empirical_pdf, expected_pdf, fitted_power_law)