def preprocess()

in functions/source/preprocess/preprocess.py [0:0]


def preprocess(obj):
    """
    Load an Excel workbook from an S3 object and return its patient and
    temperature sheets as dataframes.

    The workbook must contain the sheets ``Sheet1`` (patients), ``Sheet2``
    (temperature) and ``Sheet3`` (organisms). The column names of all three
    sheets are converted to lower case. For every patient row, the distinct
    ``organism_desc_src`` values of the organism rows that share its
    ``(mrn, encntr_num)`` pair are joined (separator: a comma followed by a
    newline, in order of first appearance) and stored in a new ``organism``
    column. Patients without any organism row get a missing value (NaN) in
    that column.

    Parameters
    ----------
    obj : mapping
        S3 ``get_object``-style response; ``obj['Body'].read()`` must return
        the raw bytes of the workbook.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The patients dataframe (including the ``organism`` column) and the
        temperature dataframe.

    Raises
    ------
    KeyError
        If one of the expected sheets or key columns is missing.
    """
    # Local import: keeps the function self-contained without relying on a
    # module-level ``io`` import.
    from io import BytesIO

    data = obj['Body'].read()
    # Wrap the raw bytes in a file-like object; handing bytes directly to
    # read_excel is deprecated in recent pandas releases.
    sheets = pd.read_excel(BytesIO(data), sheet_name=None, engine='openpyxl')
    dataframe_patients = sheets['Sheet1']
    dataframe_temperature = sheets['Sheet2']
    dataframe_organism = sheets['Sheet3']

    for frame in (dataframe_patients, dataframe_temperature, dataframe_organism):
        frame.rename(str.lower, axis='columns', inplace=True)

    # Build the joined organism text once per (mrn, encntr_num) pair instead
    # of doing a MultiIndex lookup per patient row. Rows without a
    # description are skipped so they cannot break the string join.
    described = dataframe_organism.dropna(subset=['organism_desc_src'])
    grouped = described.groupby(['mrn', 'encntr_num'], sort=False)['organism_desc_src']
    organisms_by_encounter = {
        key: ",\n".join(map(str, descriptions.unique()))
        for key, descriptions in grouped
    }

    # A plain dict lookup tolerates patients that have no organism rows
    # (they get NaN) and works whether or not the organism keys are unique.
    missing = float('nan')
    patient_keys = zip(dataframe_patients['mrn'], dataframe_patients['encntr_num'])
    dataframe_patients['organism'] = [
        organisms_by_encounter.get(key, missing) for key in patient_keys
    ]

    return dataframe_patients, dataframe_temperature