def _get_features()

in simulation/decai/simulation/data/titanic_data_loader.py [0:0]


    def _get_features(self, data: pd.DataFrame):
        """
        Map the data to numbers.
        Also uses some ideas from https://triangleinequality.wordpress.com/2013/09/08/basic-feature-engineering-with-the-titanic-data/

        :param data: The data without labels.
        :return: The data mapped to numbers.
        """
        data.drop(columns=['PassengerId', 'Ticket'], inplace=True)
        # , 'Name', 'Ticket', 'Cabin', 'Embarked'
        title_tuples = (
            (' Mr. ', ' Sir. ', ' Don. ', ' Major. ', ' Capt. ', ' Jonkheer. ', ' Rev. ', ' Col. '),
            (' Mrs. ', ' Countess. ', ' Mme. ', ' Lady. '),
            (' Miss. ', ' Mlle. ', ' Ms. '),
            (' Master. ',),
            (' Dr. ',),
        )
        title_to_num = {
            ' Mr. ': 0,
            ' Mrs. ': 1,
            ' Miss. ': 2,
            ' Master. ': 3,
        }

        def _get_title(row):
            result = None
            name = row['Name']
            for index, titles in enumerate(title_tuples):
                for t in titles:
                    if t in name:
                        result = titles[0]
            if result == ' Dr. ':
                if row['Sex'] == 'male':
                    result = ' Mr. '
                else:
                    result = ' Mrs. '
            assert result is not None, f"No title found in {row}."
            result = title_to_num[result]
            return result

        def _get_cabin(row):
            result = -1
            cabin = row['Cabin']
            if isinstance(cabin, str):
                for c in 'ABCDEFGT':
                    if c in cabin:
                        result = ord(c) - ord('A')
                        break
            return result

        result = []
        for index, row in data.iterrows():
            if row['Sex'] == 'male':
                sex = 0
            else:
                sex = 1

            family_size = row['SibSp'] + row['Parch']
            datum = [
                row['Pclass'],
                sex,
                _get_title(row),
                family_size,

                # These features did not help:
                # _get_cabin(row),
                # row['Age'],
                # row['Parch'],
                # row['SibSp'],
                # row['Fare'],
                # row['Fare'] / (family_size + 1),
            ]
            result.append(datum)

        return result