def determine_sample

def determine_sample_type()

in evals/elsuite/identifying_variables/renderers/corrset.py [0:0]
24 lines of code
12 McCabe index (conditional complexity)

    def determine_sample_type(self, sample: Sample) -> Tuple[str, List[Set[str]]]:
        """
        Classify a sample by the shape of its observed correlation structure.

        Each tree found in the sample's causal graph yields one correlation
        set: the variables of that tree minus any unobserved variables. A
        variable counts as unobserved when its "sparsity_rate" metadata is
        strictly greater than SPARSITY_FOR_UNOBS. Trees that end up with no
        observed variables are dropped entirely.

        Args:
            sample: The sample whose `causal_graph` and `variable_metadata`
                are inspected.

        Returns:
            A `(label, correlation_sets)` pair, where `label` is one of
                - "single_correl_set": exactly one correlation set remains.
                - "many_correl_sets": more than one correlation set remains and
                  at least one of them holds two or more variables.
                - "only_ind": every remaining correlation set holds a single
                  variable (this label is also produced when no correlation
                  set remains at all).
            and `correlation_sets` is the list of non-empty sets of observed
            variables, one per surviving tree.
        """
        trees = graph_utils.find_graph_trees(sample.causal_graph)

        metadata = sample.variable_metadata
        hidden_vars = {
            name
            for name in metadata
            if metadata[name]["extra"]["sparsity_rate"] > SPARSITY_FOR_UNOBS
        }

        # Correlations involving unobserved variables cannot be seen, so strip
        # those variables out; a tree left with nothing observed contributes
        # no correlation set.
        observed_sets = [
            visible
            for visible in (set(tree) - hidden_vars for tree in trees)
            if visible
        ]

        if len(observed_sets) == 1:
            return "single_correl_set", observed_sets

        # Any set with two or more observed variables means real correlations
        # exist; otherwise every variable stands alone.
        has_correlated_group = any(len(group) > 1 for group in observed_sets)
        label = "many_correl_sets" if has_correlated_group else "only_ind"
        return label, observed_sets