# create_merged_dataset()
#
# in code/workflow/implementations/autopilot/bp_bias_analysis_stage.py [0:0]

def create_merged_dataset(s3_src, s3_dst, target_name):
    """Merge every CSV object under an S3 prefix into one CSV and upload it.

    The first object's header row supplies the column names; subsequent
    objects have their header rows skipped so they are not duplicated as
    data rows.

    Args:
        s3_src: S3 URI (``s3://bucket/prefix``) whose objects are merged.
        s3_dst: S3 URI of the destination "directory" for the merged file.
        target_name: Column to cast to ``int`` in the merged frame.

    Returns:
        The S3 URI of the written ``merged.csv``.
    """
    parsed = urlparse(s3_src, allow_fragments=False)
    prefix = parsed.path.lstrip('/')
    if parsed.query:
        prefix += '?' + parsed.query

    frames = []
    # NOTE(review): `s3` is a module-level boto3 client — presumably created
    # elsewhere in this file.
    list_kwargs = {'Bucket': parsed.netloc, 'Prefix': prefix}
    while True:
        resp = s3.list_objects_v2(**list_kwargs)
        # `Contents` is absent when nothing matches the prefix.
        for obj in resp.get('Contents', []):
            uri = "s3://{}/{}".format(parsed.netloc, obj["Key"])
            if not frames:
                # First file: keep its header row as the column names.
                frames.append(pd.read_csv(uri))
            else:
                # Subsequent files: drop the header row. Fix: the original
                # passed skiprows/header to str.format() (silently ignored)
                # and never cleared its is_first flag, so headers leaked in
                # as data rows.
                part = pd.read_csv(uri, skiprows=1, header=None)
                # header=None yields integer column labels; realign to the
                # first frame's columns so concat stacks rows, not columns.
                part.columns = frames[0].columns
                frames.append(part)
        # Paginate: the original capped at MaxKeys=100 and silently dropped
        # any objects past the first page.
        if resp.get('IsTruncated'):
            list_kwargs['ContinuationToken'] = resp['NextContinuationToken']
        else:
            break

    df = pd.concat(frames)
    df[target_name] = df[target_name].astype(int)
    dst = f"{s3_dst}/merged.csv"
    df.to_csv(dst, index=False)
    return dst