# code/workflow/implementations/autopilot/bp_bias_analysis_stage.py
from urllib.parse import urlparse

import boto3
import pandas as pd

s3 = boto3.client("s3")


def create_merged_dataset(s3_src, s3_dst, target_name):
    """Merge every CSV shard under s3_src into a single CSV at s3_dst/merged.csv."""
    parsed = urlparse(s3_src, allow_fragments=False)
    if parsed.query:
        prefix = parsed.path.lstrip("/") + "?" + parsed.query
    else:
        prefix = parsed.path.lstrip("/")
    files = []
    kwargs = {"Bucket": parsed.netloc, "Prefix": prefix, "MaxKeys": 100}
    is_first = True
    while True:  # page through the listing; one call returns at most MaxKeys objects
        resp = s3.list_objects_v2(**kwargs)
        for obj in resp.get("Contents", []):
            path = "s3://{}/{}".format(parsed.netloc, obj["Key"])
            if is_first:
                # Keep the header row from the first shard only.
                files.append(pd.read_csv(path))
                is_first = False
            else:
                # Drop each later shard's header row; reuse the first shard's
                # column names so pd.concat aligns the columns.
                files.append(pd.read_csv(path, skiprows=1, header=None,
                                          names=files[0].columns))
        if not resp.get("IsTruncated"):
            break
        kwargs["ContinuationToken"] = resp["NextContinuationToken"]
    df = pd.concat(files)
    df[target_name] = df[target_name].astype(int)
    dst = f"{s3_dst}/merged.csv"
    df.to_csv(dst, index=False)  # writing to an s3:// URI requires s3fs
    return dst
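

# A minimal usage sketch, not from the source: the bucket, prefixes, and
# "target" column name below are hypothetical placeholders for whatever the
# surrounding pipeline stage actually passes in.
if __name__ == "__main__":
    merged_uri = create_merged_dataset(
        s3_src="s3://example-bucket/batch-predictions/",  # hypothetical input prefix
        s3_dst="s3://example-bucket/bias-analysis",       # hypothetical output prefix
        target_name="target",                             # hypothetical label column
    )
    print(merged_uri)  # e.g. s3://example-bucket/bias-analysis/merged.csv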