in cluster-trace-gpu-v2020/analysis/utils.py [0:0]
def get_dfa(dft, dfj, dfi, dfg):
    """Join the four input tables into a single analysis DataFrame.

    The frames are merged in this order:

    1. ``dft`` with ``dfj`` on ``job_name`` (inner join; colliding
       columns coming from ``dfj`` get the ``_j`` suffix).
    2. The result with ``get_dfia(dfi)`` on ``job_name`` and ``task_name``
       (inner join; colliding columns from the right get the ``_i`` suffix).
    3. The result with ``dfg`` (minus its ``user`` column) on ``inst_id``
       (left join, so rows without a match in ``dfg`` are kept).

    Derived columns added along the way:

    * ``runtime``      -- ``end_time - start_time``
    * ``duration_min`` -- ``runtime_i / 60``
    * ``wait_time``    -- ``start_time_i - start_time``
    * ``start_date``   -- ``start_time`` converted to a ``pd.Timestamp``
      (``unit='s'``, ``tz='Asia/Shanghai'``)

    Rows whose ``start_time`` is 0 have both ``start_time`` and ``end_time``
    replaced with NaN before ``runtime`` is computed.

    NOTE(review): judging by the inline comments, ``dft`` looks like the
    task table, ``dfj`` the job table and ``dfi`` the instance table; the
    exact schemas (and what ``get_dfia`` returns) are not visible here --
    confirm against the callers.

    Args:
        dft: DataFrame with at least ``job_name``; ``start_time`` and
            ``end_time`` must exist after the merge with ``dfj``.
        dfj: DataFrame with at least ``job_name``.
        dfi: DataFrame passed unchanged to ``get_dfia``.
        dfg: DataFrame with ``inst_id``; expected to supply a ``group``
            column (presumably -- the code reads ``group`` after this merge).

    Returns:
        The merged DataFrame with the derived columns described above and
        missing ``group`` values filled from ``user``.
    """
    print('dft + dfj ...')
    dfa = dft.merge(dfj, on=['job_name'], suffixes=['', '_j'])

    # Build the "start_time == 0" mask once, *before* touching the column.
    # Re-evaluating it after start_time has been set to NaN would match
    # nothing (NaN != 0) and leave end_time untouched.
    not_started = dfa['start_time'] == 0
    # Series.mask upcasts integer columns as needed instead of relying on
    # in-place assignment of NaN into a possibly-integer column.
    dfa['start_time'] = dfa['start_time'].mask(not_started)
    dfa['end_time'] = dfa['end_time'].mask(not_started)
    dfa['runtime'] = dfa['end_time'] - dfa['start_time']

    print('dft + dfj + dfi ...')
    dfia = get_dfia(dfi)
    dfa = dfa.merge(dfia, on=['job_name', 'task_name'], suffixes=['', '_i'])
    dfa['duration_min'] = dfa['runtime_i'] / 60  # duration of instances
    dfa['wait_time'] = dfa['start_time_i'] - dfa['start_time']  # task wait time
    # task start time as a timezone-aware timestamp
    dfa['start_date'] = dfa['start_time'].apply(
        pd.Timestamp, unit='s', tz='Asia/Shanghai')

    print('dft + dfj + dfi + dfg ...')
    # Drop dfg's own 'user' column so it cannot collide with the one already
    # present in dfa; how='left' keeps rows that have no match in dfg.
    dfg_columns = [col for col in dfg.columns if col != 'user']
    dfa = dfa.merge(dfg[dfg_columns], on='inst_id', how='left')
    # Rows without a group fall back to their user value.
    dfa['group'] = dfa['group'].fillna(dfa['user'])
    return dfa