in src/smallmatter/typecheck/_pdcheck.py [0:0]
def check_columns(df: "pd.DataFrame", max_item_to_show: int = 10) -> "pd.DataFrame":
    """Summarize every column of ``df`` in a one-row-per-column dataframe.

    Args:
        df: The dataframe to inspect. Duplicate column labels are supported;
            each physical column gets its own summary row.
        max_item_to_show: Maximum number of unique values rendered in the
            ``sample_value`` column.

    Returns:
        A dataframe with the columns:

        * ``column``: the column label.
        * ``dtype``: dtype of the column after dropping missing values.
        * ``uniq_cnt``: number of unique values as reported by ``unique()``.
        * ``data_cnt``: total number of rows.
        * ``nan_cnt``: number of missing values (``data_cnt`` minus
          ``count()``).
        * ``sample_value``: JSON-encoded string of the first
          ``max_item_to_show`` unique values.
    """
    # NOTE(review): the dtype is read from `dropna()` to keep the original
    # behavior. This presumably yields the same dtype as the column itself
    # (dropping NaN does not turn a float64 column back into int64) -- confirm
    # whether a real "int-like" inference was intended.
    labels = []
    dtypes = []
    unique_counts = []
    row_counts = []
    missing_counts = []
    samples = []

    for position, label in enumerate(df.columns):
        # Positional access always yields a Series, even when the label is
        # duplicated (label access would return a DataFrame in that case).
        series = df.iloc[:, position]
        uniques = series.unique()
        n_rows = len(series)

        labels.append(label)
        dtypes.append(series.dropna().dtype)
        unique_counts.append(len(uniques))
        row_counts.append(n_rows)
        missing_counts.append(n_rows - series.count())
        # Render as a string, otherwise Jupyter displays the list without
        # padding spaces. `default=str` keeps values that are not JSON-native
        # (dates, Decimals, Timestamps, ...) from aborting the whole summary;
        # this is display-only output, so stringifying them is intentional.
        samples.append(json.dumps(uniques[:max_item_to_show].tolist(), default=str))

    summary = {
        "column": labels,
        "dtype": dtypes,
        "uniq_cnt": unique_counts,
        "data_cnt": row_counts,
        "nan_cnt": missing_counts,
        "sample_value": samples,
    }
    return pd.DataFrame(
        summary,
        columns=["column", "dtype", "uniq_cnt", "data_cnt", "nan_cnt", "sample_value"],
    )