in archived/fraud_detection_using_graph_neural_networks/data-generation/generate_data.py [0:0]
# Lookup tables for the synthetic labelling rules used by _fraud_probability.
_WEBMAIL_DOMAINS = frozenset({"hotmail.com", "gmail.com", "yahoo.com"})
_RISKY_CARD_TYPES = frozenset({"Diners Club / Carte Blanche", "JCB 15 digit", "Maestro"})
_RISKY_DEVICE_SUFFIXES = ("16", "78", "23")
# "006" is intentionally absent, matching the original rule set.
_RISKY_CARD_SUFFIXES = ("001", "002", "003", "004", "005", "007", "008", "009")
_RISKY_DOTTED_SUFFIXES = (".227", ".104", ".251", ".181")


def _repeat_per_card(values, counts):
    """Repeat each per-card value once for every transaction of that card."""
    return list(itertools.chain.from_iterable([value] * count for value, count in zip(values, counts)))


def _fraud_probability(card_no, card_type, email, product_type, amount, ip_address, phone_no, device_id):
    """Return the probability with which one transaction is labelled as fraud.

    All identifier arguments are expected to be strings; a missing identity
    value arrives as the string "nan", which matches none of the suffix rules.
    """
    if email in _WEBMAIL_DOMAINS:
        if product_type == "45":
            return 0.9
        if device_id.endswith(_RISKY_DEVICE_SUFFIXES):
            return 0.1
        return 0.05

    if amount > 3000:
        return 0.8
    if card_type not in _RISKY_CARD_TYPES:
        return 0.0001
    # NOTE(review): the phone-number rule reuses the dotted, IP-style suffixes;
    # kept unchanged so the generated label distribution stays the same.
    if card_no.endswith(_RISKY_CARD_SUFFIXES) or phone_no.endswith(_RISKY_DOTTED_SUFFIXES):
        return 0.3
    if ip_address.endswith(_RISKY_DOTTED_SUFFIXES):
        return 0.2
    return 0.1


def gen_fraud_data(num_unique_ccs=NUM_UNIQUE_CCS, start_trans_date=START_TRANS_DATE, end_trans_date=END_TRANS_DATE):
    """Generate synthetic transaction and identity tables with fraud labels.

    Args:
        num_unique_ccs: Number of distinct credit cards to simulate. Each card
            gets a number of transactions drawn from a ceiled exponential
            distribution (scale=3).
        start_trans_date: Earliest possible transaction timestamp.
        end_trans_date: Latest possible transaction timestamp.

    Returns:
        A ``(transactions, identity)`` tuple of DataFrames. ``transactions`` is
        sorted by ``TransactionDT`` and carries the generated ``isFraud``
        column; ``identity`` has the columns ``TransactionID``, ``IpAddress``,
        ``PhoneNo`` and ``DeviceID``.

    Side effects:
        Prints the overall fraud ratio to stdout.
    """
    fake = Faker()

    # Per-card attributes; every transaction of a card shares these values.
    cc_nums = [fake.credit_card_number() for _ in range(num_unique_ccs)]
    cc_types = [fake.credit_card_provider() for _ in range(num_unique_ccs)]
    num_trans_per_cc = np.ceil(np.random.exponential(scale=3, size=num_unique_ccs)).astype(np.int32)
    cc_ipv4 = [fake.ipv4() for _ in range(num_unique_ccs)]
    cc_phone_number = [fake.phone_number() for _ in range(num_unique_ccs)]
    cc_device_id = [fake.msisdn() for _ in range(num_unique_ccs)]

    # Computed once instead of re-summing the array for every column.
    total_trans = int(num_trans_per_cc.sum())

    data = {
        'TransactionID': [fake.uuid4() for _ in range(total_trans)],
        'TransactionDT': [
            fake.date_time_between_dates(datetime_start=start_trans_date, datetime_end=end_trans_date)
            for _ in range(total_trans)
        ],
        'card_no': _repeat_per_card(cc_nums, num_trans_per_cc),
        'card_type': _repeat_per_card(cc_types, num_trans_per_cc),
        'email_domain': [fake.ascii_email().split("@")[1] for _ in range(total_trans)],
        'ProductCD': np.random.choice(['45', 'AB', 'L', 'Y', 'T'], size=total_trans),
        'TransactionAmt': np.abs(np.ceil(np.random.exponential(scale=10, size=total_trans) * 100)).astype(np.int32),
    }
    transactions = pd.DataFrame(data).sort_values(by=['TransactionDT'])

    # To make the identity table smaller than the transactions table (which may
    # be more realistic in practice), lower the 1.0 multiplier in `size` below.
    identity_transactions_idx = np.random.choice(
        transactions.shape[0], size=int(transactions.shape[0] * 1.0), replace=False
    )
    id_data = {
        'IpAddress': _repeat_per_card(cc_ipv4, num_trans_per_cc),
        'PhoneNo': _repeat_per_card(cc_phone_number, num_trans_per_cc),
        'DeviceID': _repeat_per_card(cc_device_id, num_trans_per_cc),
    }
    identity = pd.DataFrame(id_data)
    # Assignment aligns on the index label, so each identity row receives the
    # TransactionID of the same card even though `transactions` was re-sorted.
    identity["TransactionID"] = transactions.TransactionID
    assert identity.shape[0] == transactions.shape[0]

    identity = identity.loc[identity_transactions_idx].reset_index(drop=True)
    identity = identity[["TransactionID", "IpAddress", "PhoneNo", "DeviceID"]]
    # The identity table must NOT be rebuilt from `id_data` here: doing so drops
    # the TransactionID column and makes the merge below fail with a KeyError.

    # Join the two tables for the convenience of generating the 'isFraud' label.
    full_two_df = transactions[
        ["TransactionID", "card_no", "card_type", "email_domain", "ProductCD", "TransactionAmt"]
    ].merge(identity, on='TransactionID', how='left')

    # One uniform draw per row, in row order, so seeded runs stay reproducible.
    is_fraud = []
    for row in full_two_df.itertuples(index=False):
        probability = _fraud_probability(
            card_no=str(row.card_no),
            card_type=row.card_type,
            email=row.email_domain,
            product_type=row.ProductCD,
            amount=row.TransactionAmt,
            ip_address=str(row.IpAddress),
            phone_no=str(row.PhoneNo),
            device_id=str(row.DeviceID),
        )
        is_fraud.append(int(np.random.uniform() < probability))

    # Guard against an empty data set (e.g. num_unique_ccs == 0).
    fraud_ratio = sum(is_fraud) / len(is_fraud) if is_fraud else 0.0
    print("fraud ratio", fraud_ratio)

    # `is_fraud` is a plain list, so it is assigned positionally; the left merge
    # keeps the row order of `transactions`.
    transactions['isFraud'] = is_fraud
    return transactions, identity