in simulation/decai/simulation/data/titanic_data_loader.py [0:0]
def _get_features(self, data: pd.DataFrame):
"""
Map the data to numbers.
Also uses some ideas from https://triangleinequality.wordpress.com/2013/09/08/basic-feature-engineering-with-the-titanic-data/
:param data: The data without labels.
:return: The data mapped to numbers.
"""
data.drop(columns=['PassengerId', 'Ticket'], inplace=True)
# , 'Name', 'Ticket', 'Cabin', 'Embarked'
title_tuples = (
(' Mr. ', ' Sir. ', ' Don. ', ' Major. ', ' Capt. ', ' Jonkheer. ', ' Rev. ', ' Col. '),
(' Mrs. ', ' Countess. ', ' Mme. ', ' Lady. '),
(' Miss. ', ' Mlle. ', ' Ms. '),
(' Master. ',),
(' Dr. ',),
)
title_to_num = {
' Mr. ': 0,
' Mrs. ': 1,
' Miss. ': 2,
' Master. ': 3,
}
def _get_title(row):
result = None
name = row['Name']
for index, titles in enumerate(title_tuples):
for t in titles:
if t in name:
result = titles[0]
if result == ' Dr. ':
if row['Sex'] == 'male':
result = ' Mr. '
else:
result = ' Mrs. '
assert result is not None, f"No title found in {row}."
result = title_to_num[result]
return result
def _get_cabin(row):
result = -1
cabin = row['Cabin']
if isinstance(cabin, str):
for c in 'ABCDEFGT':
if c in cabin:
result = ord(c) - ord('A')
break
return result
result = []
for index, row in data.iterrows():
if row['Sex'] == 'male':
sex = 0
else:
sex = 1
family_size = row['SibSp'] + row['Parch']
datum = [
row['Pclass'],
sex,
_get_title(row),
family_size,
# These features did not help:
# _get_cabin(row),
# row['Age'],
# row['Parch'],
# row['SibSp'],
# row['Fare'],
# row['Fare'] / (family_size + 1),
]
result.append(datum)
return result