def intent_disambiguation()

in src/dfcx_scrapi/tools/validation_util.py [0:0]
65 lines of code
8 McCabe index (conditional complexity)

    def intent_disambiguation(self, agent_id, refresh=False, flow=None):
        """Obtains the intent disambiguation tasks from the validation tool.

        Validation results are converted to a dataframe and every row whose
        detail reports intents sharing overly similar training phrases is
        parsed into (intent, training phrase) pairs. All pairs parsed from
        the same validation row share one ``disambig_id``.

        Args:
          agent_id: Agent identifier handed to ``self.agents`` to fetch
            (or refresh) the validation results.
          refresh: (optional) False means validation results are pulled
            as is. True means the validation tool is refreshed then
            results are pulled.
          flow: (optional) If specified, results are returned only for the
            indicated flow display name.

        Returns:
          None if no validation row reports clashing training phrases.
          Otherwise a dictionary of intent disambiguation validation
          results in two dataframes:
            extended: All intent disambiguation validation results as
              separate instances. If 5 training phrases conflict
              in 5 intents they will be shown as 5 rows. Columns: flow,
              disambig_id, intent, training_phrase, intents, intent_count.
            compact: Only the first instance of a conflict for each
              (flow, disambig_id) grouping. If 5 training phrases conflict
              in 5 intents only the first training phrase will show.
              Columns: flow, disambig_id, training_phrase, intents,
              conflicting_tp_count, intent_count.
        """
        if refresh:
            validation = self.agents.validate_agent(agent_id)
        else:
            validation = self.agents.get_validation_result(agent_id=agent_id)

        validation_df = self.validation_results_to_dataframe(validation)
        if flow:
            validation_df = validation_df[validation_df["flow"] == flow]

        phrase = "Multiple intents share training phrases which are too similar"
        # Compiled once; the patterns are greedy but "." never crosses a
        # newline, so each match stays within a single line of the detail.
        intent_pattern = re.compile("Intent '(.*)': training phrase ")
        tp_pattern = re.compile("training phrase '(.*)'")

        flows, disambig_ids, intents_list, tp_list = [], [], [], []
        next_id = 0
        rows = zip(validation_df["detail"], validation_df["flow"])
        for detail, row_flow in rows:
            # Rows without a textual detail (None/NaN) cannot describe a
            # conflict, so skip them instead of failing inside the regex.
            if not isinstance(detail, str) or phrase not in detail:
                continue
            training_phrases = tp_pattern.findall(detail)
            intents_list.extend(intent_pattern.findall(detail))
            tp_list.extend(training_phrases)
            # One id per validation row, repeated for each phrase found.
            disambig_ids.extend([next_id] * len(training_phrases))
            flows.extend([row_flow] * len(training_phrases))
            next_id += 1

        extraction = pd.DataFrame(
            {
                "flow": flows,
                "disambig_id": disambig_ids,
                "intent": intents_list,
                "training_phrase": tp_list,
            }
        )

        if extraction.empty:
            logging.info(
                "Validation results do not contain clashing intent phrases.")
            return None

        intent_options = (
            extraction.groupby(["disambig_id"])["intent"]
            .apply(list)
            .reset_index()
            .rename(columns={"intent": "intents"})
        )
        # De-duplicate while keeping first-seen order so that the output is
        # reproducible between runs (a set would order names arbitrarily).
        intent_options["intents"] = intent_options["intents"].map(
            lambda names: list(dict.fromkeys(names))
        )

        extraction = pd.merge(
            extraction, intent_options, on=["disambig_id"], how="left"
        )

        internal = extraction.copy()
        internal["intent_count"] = internal["intents"].map(len)

        external = (
            extraction.groupby(["flow", "disambig_id"])
            .agg(
                {
                    "training_phrase": "first",
                    "intents": "first",
                    "intent": "count",
                }
            )
            .reset_index()
            .rename(columns={"intent": "conflicting_tp_count"})
        )
        external["intent_count"] = external["intents"].map(len)

        return {"extended": internal, "compact": external}