def grep()

in eden/scm/edenscm/mercurial/commands/__init__.py [0:0]
211 lines of code
22 McCabe index (conditional complexity)

def grep(ui, repo, pattern, *pats, **opts):
    """search for a pattern in tracked files in the working directory

    The default regexp style is POSIX basic regexps. If no FILE parameters are
    passed in, the current directory and its subdirectories will be searched.

    For the old 'hg grep', which searches through history, see 'histgrep'."""
    # Developer notes (kept out of the docstring, which is user-facing help):
    #
    # Arguments:
    #   ui      - used for configuration lookups, output, paging and warnings.
    #   repo    - the repository whose working directory is searched.
    #   pattern - the expression handed to grep (and to biggrep, if enabled).
    #   pats    - optional file patterns; when empty, "." is searched.
    #   opts    - command-line flags (context, ignore_case, include, ...).
    #
    # Returns the result of _rungrep() when a local grep is run, 0 when the
    # biggrep results needed no local follow-up, and None when the biggrep
    # corpus revision is still unknown locally after pulling.
    #
    # Raises error.Abort when --invert-match is combined with biggrep, or when
    # the biggrep revision header cannot be parsed.
    #
    # XXX: The current implementation heavily depends on external programs like
    # grep, xargs and biggrep.  Command-line flag support is a bit messy.  It
    # does not look like a source control command (ex. no --rev support).
    # Ideally, `grep` is just `histgrep -r 'wdir()'` and they can be merged.
    # Possible future work are:
    # - For "searching many files in a single revision" use-case, utilize
    #   ripgrep's logic. That "single revision" includes "wdir()". Get rid
    #   of shelling out to grep or xargs.
    # - For "searching few files in a range of revisions" use-case, maybe
    #   try using fastannotate logic.
    # - Find a cleaner way to integrate with FB-only biggrep and fallback
    #   gracefully.
    grepcommandstr = ui.config("grep", "command")
    # Use shlex.split() to split up grepcommandstr into multiple arguments.
    # this allows users to specify a command plus arguments (e.g., "grep -i").
    # We don't use a real shell to execute this, which ensures we won't do
    # bad stuff if their command includes redirects, semicolons, or other
    # special characters etc.
    cmd = shlex.split(grepcommandstr) + [
        "--no-messages",
        "--binary-files=without-match",
        "--with-filename",
        "--regexp=" + pattern,
    ]

    biggrepclient = ui.config(
        "grep",
        "biggrepclient",
        "/usr/local/fbprojects/packages/biggrep.client/stable/biggrep_client",
    )
    biggreptier = ui.config("grep", "biggreptier", "biggrep.master")
    biggrepcorpus = ui.config("grep", "biggrepcorpus")

    # If true, we'll use biggrepclient to perform the grep against some
    # externally maintained index.  We don't provide an implementation
    # of that tool with this repo, just the optional client interface.
    biggrep = ui.configbool("grep", "usebiggrep", None)

    if biggrep is None:
        # Not configured explicitly: enable biggrep only when this is an eden
        # repo, a corpus is configured and the client binary exists.
        if (
            "eden" in repo.requirements
            and biggrepcorpus
            and os.path.exists(biggrepclient)
        ):
            biggrep = True

    # Ask big grep to strip out the corpus dir (stripdir) and to include
    # the corpus revision on the first line.
    # NOTE: indices 0 (client) and 4 (pattern) are patched in place below.
    biggrepcmd = [
        biggrepclient,
        "--stripdir",
        "-r",
        "--expression",
        pattern,
        biggreptier,
        biggrepcorpus,
        "re2",
    ]

    # Flags collected in `args` are passed to both grep and biggrep; flags
    # appended directly to `cmd` only apply to the local grep.
    args = []

    if opts.get("after_context"):
        args.append("-A")
        args.append(opts.get("after_context"))
    if opts.get("before_context"):
        args.append("-B")
        args.append(opts.get("before_context"))
    if opts.get("context"):
        args.append("-C")
        args.append(opts.get("context"))
    if opts.get("ignore_case"):
        args.append("-i")
    if opts.get("files_with_matches"):
        args.append("-l")
    if opts.get("line_number"):
        cmd.append("-n")
    if opts.get("invert_match"):
        if biggrep:
            raise error.Abort("Cannot use invert_match option with big grep")
        cmd.append("-v")
    if opts.get("word_regexp"):
        cmd.append("-w")
        biggrepcmd[4] = "\\b%s\\b" % pattern
    if opts.get("extended_regexp"):
        cmd.append("-E")
        # re2 is already mostly compatible by default, so there are no options
        # to apply for this.
    if opts.get("fixed_strings"):
        cmd.append("-F")
        # using bgs rather than bgr switches the engine to fixed string matches
        biggrepcmd[0] = "bgs"
    if opts.get("perl_regexp"):
        cmd.append("-P")
        # re2 is already mostly pcre compatible, so there are no options
        # to apply for this.

    biggrepcmd += args
    cmd += args

    # color support, using the color extension
    colormode = getattr(ui, "_colormode", "")
    if colormode == "ansi":
        cmd.append("--color=always")
        biggrepcmd.append("--color=on")

    # Copy match specific options
    match_opts = {k: opts.get(k) for k in ("include", "exclude") if k in opts}

    reporoot = os.path.dirname(repo.path)
    wctx = repo[None]

    if not pats:
        # Search everything in the current directory
        m = scmutil.match(wctx, ["."], match_opts)
        # Scope biggrep to the cwd equivalent path, relative to the root
        # of its corpus.
        if repo.getcwd():
            biggrepcmd += ["-f", repo.getcwd()]
    else:
        # Search using the specified patterns
        m = scmutil.match(wctx, pats, match_opts)
        # Scope biggrep to the same set of patterns.  Ideally we'd have
        # a way to translate the matcher object to a regex, but we don't
        # so we cross fingers and hope that the patterns are simple filenames.
        biggrepcmd += [
            "-f",
            "(%s)"
            % "|".join(
                [os.path.normpath(os.path.join(repo.getcwd(), f)) for f in pats]
            ),
        ]

    # Add '--' to make sure grep recognizes all remaining arguments
    # (passed in by xargs) as filenames.
    cmd.append("--")

    if biggrep:
        p = subprocess.Popen(
            biggrepcmd,
            bufsize=-1,
            close_fds=util.closefds,
            stdout=subprocess.PIPE,
            cwd=reporoot,
        )
        # Only stdout is piped, so the stderr half of the result is unused.
        out = p.communicate()[0]
        lines = pycompat.decodeutf8(out.rstrip()).split("\n")

        # The first line is the revision header; drop its leading marker
        # character before parsing.
        revisionline = lines[0][1:]
        corpusrev = _biggrepcorpusrev(revisionline)
        if not corpusrev:
            # Covers empty biggrep output as well as headers that carry no
            # usable revision, instead of failing with an unpacking error.
            raise error.Abort(
                _("unable to resolve biggrep revision: %s") % revisionline,
                hint=_("pass `--config grep.usebiggrep=False` to bypass biggrep"),
            )

        lines = lines[1:]

        resultsbyfile = {}
        includelineno = opts.get("line_number")
        fileswithmatches = opts.get("files_with_matches")

        for line in lines:
            try:
                filename, lineno, colno, context = line.split(":", 3)
            except ValueError:
                # Fewer than four ':'-separated fields: not a regular match.
                binaryfile = re.match("Binary file (.*) matches", line)
                if binaryfile:
                    filename = binaryfile.group(1)
                    lineno = 0
                    colno = 0
                    context = None
                elif fileswithmatches:
                    filename = line
                    # Not used for -l output; reset so values from a previous
                    # iteration cannot leak into this one.
                    lineno = colno = context = None
                else:
                    # If we couldn't parse the line, just pass it thru
                    ui.write(line)
                    ui.write("\n")
                    continue

            unescapedfilename = util.stripansiescapes(filename)

            # filter to just the files that match the list supplied
            # by the caller
            if m(unescapedfilename):
                # relativize the path to the CWD.  Note that `filename` will
                # often have escape sequences, so we do a substring replacement
                filename = filename.replace(unescapedfilename, m.rel(unescapedfilename))

                fileresults = resultsbyfile.setdefault(unescapedfilename, [])

                # re-assemble the output
                if fileswithmatches:
                    fileresults.append("%s\n" % filename)
                elif lineno == 0 and colno == 0 and context is None:
                    # Take care with binary file matches!
                    fileresults.append("Binary file %s matches\n" % filename)
                elif includelineno:
                    fileresults.append("%s:%s:%s\n" % (filename, lineno, context))
                else:
                    fileresults.append("%s:%s\n" % (filename, context))

        # Now check to see what has changed since the corpusrev
        # we're going to need to grep those and stitch the results together
        try:
            corpusbin = bin(corpusrev)
            changes = repo.status(corpusbin, None, m)
        except error.RepoLookupError:
            # We don't have the rev locally, so go get it.
            if not ui.quiet:
                ui.write_err(_("pulling biggrep corpus commit %s\n") % (hex(corpusbin)))

            # Redirect the pull output to stderr so that we don't break folks
            # that are parsing the `hg grep` output.  The swap happens before
            # the try block so that `fout` is always bound in the finally
            # clause.
            fout = ui.fout.swap(ui.ferr)
            try:
                pull = cmdutil.findcmd("pull", table)[1][0]
                pull(ui, repo)
            finally:
                ui.fout.swap(fout)

            # Try to resolve that rev again now
            try:
                changes = repo.status(corpusbin, None, m)
            except error.RepoLookupError:
                # print the results we've gathered so far.  We're not sure
                # how things differ, so we'll follow up with a warning.
                ui.pager("grep")
                for fileresults in resultsbyfile.values():
                    for resultline in fileresults:
                        ui.write(resultline)

                ui.warn(
                    _(
                        "The results above are based on revision %s\n"
                        "which is not available locally and thus may be inaccurate.\n"
                        "To get accurate results, run `hg pull` and re-run "
                        "your grep.\n"
                    )
                    % corpusrev
                )
                return

        # which files we're going to search locally
        filestogrep = set()

        # files that have been changed or added need to be searched again
        for f in changes.modified:
            resultsbyfile.pop(f, None)
            filestogrep.add(f)
        for f in changes.added:
            resultsbyfile.pop(f, None)
            filestogrep.add(f)

        # files that have been removed since the corpus rev cannot match
        for f in changes.removed:
            resultsbyfile.pop(f, None)
        for f in changes.deleted:
            resultsbyfile.pop(f, None)

        # Having filtered out the changed files from the big grep results,
        # we can now print those that remain.
        ui.pager("grep")
        for fileresults in resultsbyfile.values():
            for resultline in fileresults:
                ui.write(resultline)

        # pass on any changed files to the local grep
        if filestogrep:
            # Ensure that the biggrep results are flushed before we
            # start to intermingle with the local grep process output
            ui.flush()
            return _rungrep(ui, cmd, sorted(filestogrep), m)

        return 0

    # Local-only search: grep the clean, modified and added files selected by
    # the matcher, skipping symlinks.
    islink = repo.wvfs.islink
    status = repo.dirstate.status(m, False, True, False)
    files = sorted(status.clean + status.modified + status.added)
    files = [file for file in files if not islink(file)]

    ui.pager("grep")
    return _rungrep(ui, cmd, files, m)


def _biggrepcorpusrev(revisionline):
    """Extract the corpus revision from a biggrep revision header.

    Biggrep has two output formats. If the query only hit one shard, it
    returns a "HASH:timestamp" format indicating the revision and time of
    the shard's snapshot. If it hits multiple shards, it returns a
    "name1=HASH:timestamp,name2=HASH:timestamp,name3=..." format.  The
    ":timestamp" suffix is optional in both formats.

    `revisionline` is the header with its leading marker character already
    removed.  Returns the revision as a string, or None when the header does
    not contain one.
    """
    if "=" not in revisionline:
        return revisionline.split(":", 1)[0] or None

    corpusrevs = []
    for shard in revisionline.split(","):
        # Everything after the first "=" is "HASH" or "HASH:timestamp";
        # shards without an "=" yield an empty string and are skipped.
        info = shard.partition("=")[2]
        shardrev = info.split(":", 1)[0]
        if shardrev:
            corpusrevs.append(shardrev)

    if not corpusrevs:
        return None

    # Pick the smallest so our choice of revision is deterministic
    return min(corpusrevs)