in sig-contributor-experience/surveys/k8s_survey_analysis/prepare_2019.py [0:0]
def get_df(file_name=None):
    """Load the 2019 contributor survey CSV and return a cleaned DataFrame.

    Args:
        file_name: Optional path to the survey CSV. When falsy, the default
            relative path to the public 2019 survey export is used.

    Returns:
        A pandas DataFrame whose column names have spaces replaced by
        underscores and question marks removed, whose answer columns have
        been cleaned as described in the loop below, and which carries an
        extra ``date_taken`` column holding the date part of ``End_Date``.
    """
    fn = file_name or '2019_survey/2019 Kubernetes Contributor Experience Survey PUBLIC.csv'

    # The file is read with two header rows; process_header is defined outside
    # this function and is called for its in-place effect on df (its return
    # value is not used). Presumably it flattens the two-row header into the
    # plain string column names relied on below -- TODO confirm.
    df = pd.read_csv(fn, header=[0, 1], skipinitialspace=True)
    process_header(df)

    # Shorten the long question texts. Each key appears exactly once: repeated
    # keys in a dict literal silently override one another.
    df = df.rename(columns={
        "How long have you been contributing to Kubernetes?": "Contributing_Length",
        "What level of the Contributor Ladder do you consider yourself to be on? (pick the highest if you are in multiple OWNERs files)": "Level_of_Contributor",
        "What region of the world are you in?": "World_Region",
        "Are you interested in advancing to the next level of the Contributor Ladder?": "Interested_in_next_level",
        "How many other open source projects not in the Kubernetes ecosystem do you contribute to? (example: nodejs, debian)": "Contribute_to_other_OSS",
        "Does your employer support your contributions to Kubernetes?": "Upstream_supported_at_employer",
        "Blocker: Other (please specify)": "Other blockers (please specify)",
    })

    def _leading_int(val):
        """Return val as an int, falling back to its first character."""
        try:
            return int(val)
        except ValueError:
            # Not a plain number: use the first character of the answer.
            return int(val[0])

    # Prefixes of columns where any answer counts as 1 and a missing one as 0.
    flag_prefixes = (
        "Contribute:",
        "Check for news:",
        "Attended:",
        "Attending:",
        "Would attend if:",
    )
    # Prefixes of columns whose answers are converted to integers.
    int_prefixes = ("Blocker:", "Use freq:", "Agree:")

    # Clean data. The prefixes are mutually exclusive, so at most one branch
    # applies to any column.
    for col in df.columns:
        if col.startswith("Useful:"):
            df = df.assign(**{col: df[col].fillna(0)})
        elif col.startswith(flag_prefixes):
            df = df.assign(**{col: np.where(df[col].isna(), 0, 1)})
        elif col.startswith('Upstream'):
            df = df.assign(**{col: df[col].fillna("Didn't Answer")})
        elif col.startswith(int_prefixes) and col != "Blocker: Other (please specify)":
            # The free-text "other" blocker column is never converted; missing
            # answers are skipped via na_action.
            df[col] = df[col].map(_leading_int, na_action="ignore")

    # Normalise column names so they can be used as attribute names.
    df = df.rename(columns={
        col: (
            col.replace(" ", "_")
            .replace("?", "")
            .replace('Most_Important_Project', 'Most_Important_Proj')
            .replace('Most_Important_Prj', 'Most_Important_Proj')
        )
        for col in df.columns
    })

    end_dates = pd.to_datetime(df.End_Date)
    df = df.assign(date_taken=end_dates.dt.date)
    return df