def synthesize()

in eden/scm/contrib/synthrepo.py [0:0]
199 lines of code
45 McCabe index (conditional complexity)

def synthesize(ui, repo, descpath, **opts):
    """synthesize commits based on a model of an existing repository

    The model must have been generated by :hg:`analyze`. Commits will
    be generated randomly according to the probabilities described in
    the model. If --initfiles is set, the repository will be seeded with
    the given number of files following the modeled repository's directory
    structure.

    When synthesizing new content, commit descriptions, and user
    names, words will be chosen randomly from a dictionary that is
    presumed to contain one word per line. Use --dict to specify the
    path to an alternate dictionary to use.
    """
    # Developer notes (kept out of the docstring, which is presumably shown
    # as command help -- confirm):
    #   ui       -- used for progress/status output and the commit username
    #   repo     -- repository that receives the synthesized commits
    #   descpath -- location of the JSON model, opened via hg.openpath()
    #   opts     -- reads "dict", "initfiles" and "count"
    # Raises error.Abort when the model or the dictionary cannot be opened.
    try:
        fp = hg.openpath(ui, descpath)
    except Exception as err:
        # The readable message may sit on the wrapped exception (first
        # argument) or on the exception itself; fall back to str(err) rather
        # than failing while building the abort message.
        args = getattr(err, "args", None)
        reason = getattr(args[0], "strerror", None) if args else None
        if reason is None:
            reason = getattr(err, "strerror", None)
        if reason is None:
            reason = str(err)
        raise error.Abort("%s: %s" % (descpath, reason))
    try:
        desc = json.load(fp)
    finally:
        # Close the model even when it is not valid JSON.
        fp.close()

    def cdf(pairs):
        """Turn (value, weight) pairs into (values, cumulative fractions).

        Pairs are ordered by descending weight; an empty input yields two
        empty lists.
        """
        if not pairs:
            return [], []
        vals, weights = zip(*sorted(pairs, key=lambda x: x[1], reverse=True))
        total = float(sum(weights, 0))
        running, cumulative = 0, []
        for weight in weights:
            running += weight
            cumulative.append(running / total)
        return vals, cumulative

    lineschanged = cdf(desc["lineschanged"])
    fileschanged = cdf(desc["fileschanged"])
    filesadded = cdf(desc["filesadded"])
    dirsadded = cdf(desc["dirsadded"])
    filesremoved = cdf(desc["filesremoved"])
    linelengths = cdf(desc["linelengths"])
    parents = cdf(desc["parents"])
    p1distance = cdf(desc["p1distance"])
    p2distance = cdf(desc["p2distance"])
    interarrival = cdf(desc["interarrival"])
    linesinfilesadded = cdf(desc["linesinfilesadded"])
    tzoffset = cdf(desc["tzoffset"])

    dictfile = opts.get("dict") or "/usr/share/dict/words"
    try:
        fp = open(dictfile, "rU")
    except IOError as err:
        raise error.Abort("%s: %s" % (dictfile, err.strerror))
    with fp:
        words = fp.read().splitlines()

    initdirs = {}
    if desc["initdirs"]:
        for k, v in desc["initdirs"]:
            # Keep modeled ".hg" path components from colliding with
            # repository metadata directories.
            initdirs[k.encode("utf-8").replace(".hg", "_hg")] = v
        initdirs = renamedirs(initdirs, words)
    initdirscdf = cdf(initdirs)

    def pick(dist):
        """Draw one value from a (values, cumulative fractions) pair."""
        return dist[0][bisect.bisect_left(dist[1], random.random())]

    def pickpath():
        """Return a random file path inside a modeled directory."""
        return os.path.join(pick(initdirscdf), random.choice(words))

    def makeline(minimum=0):
        """Build a line of random words at least `minimum` characters long."""
        total = max(minimum, pick(linelengths))
        length, chosen = 0, []
        while length < total:
            w = random.choice(words)
            length += len(w) + 1
            chosen.append(w)
        return " ".join(chosen)

    # Acquire inside the try block so that the locks are always released,
    # even when synthesizing a commit raises.
    wlock = lock = None
    try:
        wlock = repo.wlock()
        lock = repo.lock()

        nevertouch = {".hgsub", ".hgignore", ".hgtags"}

        progress = ui.progress
        _synthesizing = _("synthesizing")
        _files = _("initial files")
        _changesets = _("changesets")

        # Synthesize a single initial revision adding files to the repo
        # according to the modeled directory structure.
        initcount = int(opts["initfiles"])
        if initcount and initdirs:
            pctx = repo[None].parents()[0]
            dirs = set(pctx.dirs())
            files = {}

            def validpath(path):
                # Don't pick filenames which are already directory names.
                if path in dirs:
                    return False
                # Don't pick directories which were used as file names.
                while path:
                    if path in files:
                        return False
                    path = os.path.dirname(path)
                return True

            for i in xrange(0, initcount):
                ui.progress(_synthesizing, i, unit=_files, total=initcount)

                path = pickpath()
                while not validpath(path):
                    path = pickpath()
                files[path] = "%s contents\n" % path
                # Register every new ancestor directory so later picks
                # cannot reuse one of them as a file name.
                parent = os.path.dirname(path)
                while parent and parent not in dirs:
                    dirs.add(parent)
                    parent = os.path.dirname(parent)

            def filectxfn(repo, memctx, path):
                return context.memfilectx(repo, memctx, path, files[path])

            ui.progress(_synthesizing, None)
            message = "synthesized wide repo with %d files" % (len(files),)
            mc = context.memctx(
                repo,
                [pctx.node(), nullid],
                message,
                pycompat.iterkeys(files),
                filectxfn,
                ui.username(),
                "%d %d" % util.makedate(),
            )
            initnode = mc.commit()
            hexfn = hex if ui.debugflag else short
            ui.status(
                _("added commit %s with %d files\n") % (hexfn(initnode), len(files))
            )

        # Synthesize incremental revisions to the repository, adding repo
        # depth.
        count = int(opts["count"])
        heads = set(map(repo.changelog.rev, repo.heads()))
        for i in xrange(count):
            progress(_synthesizing, i, unit=_changesets, total=count)

            node = repo.changelog.node
            revs = len(repo)

            def pickhead(heads, distance):
                """Pick a head near a modeled distance from the tip."""
                if heads:
                    lheads = sorted(heads)
                    rev = revs - min(pick(distance), revs)
                    if rev < lheads[-1]:
                        rev = lheads[bisect.bisect_left(lheads, rev)]
                    else:
                        rev = lheads[-1]
                    return rev, node(rev)
                return nullrev, nullid

            r1 = revs - min(pick(p1distance), revs)
            p1 = node(r1)

            # the number of heads will grow without bound if we use a pure
            # model, so artificially constrain their proliferation
            toomanyheads = len(heads) > random.randint(1, 20)
            if p2distance[0] and (pick(parents) == 2 or toomanyheads):
                r2, p2 = pickhead(heads.difference([r1]), p2distance)
            else:
                r2, p2 = nullrev, nullid

            pl = [p1, p2]
            pctx = repo[r1]
            mf = pctx.manifest()
            mfk = mf.keys()
            changes = {}
            if mfk:
                for __ in xrange(pick(fileschanged)):
                    for __ in xrange(10):
                        fctx = pctx.filectx(random.choice(mfk))
                        path = fctx.path()
                        if not (
                            path in nevertouch
                            or fctx.isbinary()
                            or "l" in fctx.flags()
                        ):
                            break
                    else:
                        # Ten attempts found only protected, binary or
                        # symlink entries: skip instead of editing one.
                        continue
                    lines = fctx.data().splitlines()
                    add, remove = pick(lineschanged)
                    for __ in xrange(remove):
                        if not lines:
                            break
                        del lines[random.randrange(0, len(lines))]
                    for __ in xrange(add):
                        lines.insert(random.randint(0, len(lines)), makeline())
                    changes[path] = "\n".join(lines) + "\n"
                # NOTE(review): this loop only selects a candidate path;
                # nothing records it, so no file is removed by the commit
                # built below. Confirm whether removals should be recorded.
                for __ in xrange(pick(filesremoved)):
                    path = random.choice(mfk)
                    for __ in xrange(10):
                        path = random.choice(mfk)
                        if path not in changes:
                            break
            # "" stands for the repository root as a candidate parent.
            dirs = list(pctx.dirs())
            dirs.insert(0, "")
            for __ in xrange(pick(filesadded)):
                pathstr = ""
                # Retry until the new name is not an existing directory.
                while pathstr in dirs:
                    path = [random.choice(dirs)]
                    if pick(dirsadded):
                        path.append(random.choice(words))
                    path.append(random.choice(words))
                    pathstr = "/".join(filter(None, path))
                data = (
                    "\n".join(makeline() for __ in xrange(pick(linesinfilesadded)))
                    + "\n"
                )
                changes[pathstr] = data

            def filectxfn(repo, memctx, path):
                if path not in changes:
                    return None
                return context.memfilectx(repo, memctx, path, changes[path])

            if not changes:
                continue
            if revs:
                date = repo["tip"].date()[0] + pick(interarrival)
            else:
                date = time.time() - (86400 * count)
            # dates in mercurial must be positive, fit in 32-bit signed
            # integers.
            date = min(0x7FFFFFFF, max(0, date))
            user = random.choice(words) + "@" + random.choice(words)
            mc = context.memctx(
                repo,
                pl,
                makeline(minimum=2),
                sorted(changes),
                filectxfn,
                user,
                "%d %d" % (date, pick(tzoffset)),
            )
            newnode = mc.commit()
            heads.add(repo.changelog.rev(newnode))
            heads.discard(r1)
            heads.discard(r2)
    finally:
        # Release in the reverse order of acquisition.
        if lock is not None:
            lock.release()
        if wlock is not None:
            wlock.release()