# get_df()
#
# in sig-contributor-experience/surveys/k8s_survey_analysis/prepare_2019.py [0:0]


def get_df(file_name=None):
    """Load the 2019 contributor survey CSV and return a cleaned DataFrame.

    The CSV is read with a two-row header and passed to ``process_header``
    (defined elsewhere in this module). Long question-style column names are
    then shortened, answer columns are cleaned by prefix, column names are
    normalised, and a ``date_taken`` column is derived from ``End_Date``.

    Args:
        file_name: Path of the CSV file to read. Any falsy value (``None``,
            ``""``) selects the default 2019 survey file path.

    Returns:
        The cleaned ``pandas.DataFrame``. Column names have spaces replaced
        by underscores and question marks removed, and the frame carries an
        extra ``date_taken`` column holding the date part of ``End_Date``.
    """
    default_csv = '2019_survey/2019 Kubernetes Contributor Experience Survey PUBLIC.csv'
    csv_path = file_name if file_name else default_csv

    df = pd.read_csv(csv_path, header=[0, 1], skipinitialspace=True)
    # The return value is not used, so process_header presumably rewrites the
    # two-level header on ``df`` in place -- TODO confirm against its definition.
    process_header(df)

    df = df.rename(columns={
        "How long have you been contributing to Kubernetes?": "Contributing_Length",
        # The same question is listed with one and with two spaces before the
        # parenthesis; both spellings map to the same short name.
        "What level of the Contributor Ladder do you consider yourself to be on? (pick the highest if you are in multiple OWNERs files)": "Level_of_Contributor",
        "What level of the Contributor Ladder do you consider yourself to be on?  (pick the highest if you are in multiple OWNERs files)": "Level_of_Contributor",
        # This key used to appear twice ("World_Region", then "World Region").
        # The later entry silently won and its space was turned back into an
        # underscore by the final normalisation, so one entry gives the same
        # final column name.
        "What region of the world are you in?": "World_Region",
        "Are you interested in advancing to the next level of the Contributor Ladder?": "Interested_in_next_level",
        "How many other open source projects not in the Kubernetes ecosystem do you contribute to? (example: nodejs, debian)": "Contribute_to_other_OSS",
        "Does your employer support your contributions to Kubernetes?": "Upstream_supported_at_employer",
        # Renamed so that it no longer carries the "Blocker:" prefix and is
        # therefore skipped by the integer conversion below.
        "Blocker: Other (please specify)": "Other blockers (please specify)",
    })

    def leading_int(val):
        """Return ``val`` as an int, falling back to its first character."""
        try:
            return int(val)
        except ValueError:
            # Not parseable as a whole; only the first character is converted.
            return int(val[0])

    # Columns where a missing answer becomes 0 and any answer becomes 1.
    flag_prefixes = (
        "Contribute:",
        "Check for news:",
        "Attended:",
        "Attending:",
        "Would attend if:",
    )
    # Columns whose non-missing answers are converted to integers.
    rating_prefixes = ("Blocker:", "Use freq:", "Agree:")

    # Clean data. The prefixes are mutually exclusive, so an elif chain is
    # equivalent to testing each one independently. Columns are assigned in
    # place: ``df`` is a private copy produced by ``rename`` above, so copying
    # the whole frame for every column is unnecessary.
    for col in df.columns:
        if col.startswith("Useful:"):
            df[col] = df[col].fillna(0)
        elif col.startswith(flag_prefixes):
            df[col] = np.where(df[col].isna(), 0, 1)
        elif col.startswith('Upstream'):
            df[col] = df[col].fillna("Didn't Answer")
        elif col.startswith(rating_prefixes):
            # Missing values are left untouched by na_action="ignore".
            df[col] = df[col].map(leading_int, na_action="ignore")

    def normalise(label):
        """Make a column label underscore-separated and unify 'Proj' spellings."""
        return (
            label.replace(" ", "_")
            .replace("?", "")
            .replace('Most_Important_Project', 'Most_Important_Proj')
            .replace('Most_Important_Prj', 'Most_Important_Proj')
        )

    df = df.rename(columns={label: normalise(label) for label in df.columns})

    end_dates = pd.to_datetime(df.End_Date)
    return df.assign(date_taken=end_dates.dt.date)