pysparksamples/clinical_job.py [97:154]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
filters = {
    "op": "and",
    "content": [
        {
            "op": "in",
            "content": {
                "field": "cases.project.project_id",
                "value": [project_id]
            }
        },
        {
            "op": "in",
            "content": {
                "field": "files.data_type",
                "value": [data_type]
            }
        },
        {
            "op": "in",
            "content": {
                "field": "files.data_category",
                "value": [data_category]
            }
        },
        {
            "op": "in",
            "content": {
                "field": "files.data_format",
                "value": [data_format]
            }
        }
    ]
}

# With a GET request, the filters parameter needs to be converted
# from a dictionary to a JSON-formatted string

params = {
    "filters": json.dumps(filters),
    "fields": fields,
    "format": "JSON",
    "size": size
}

## Query the files endpoint and get back a JSON response

query_response = requests.get(files_endpt, params=params)

json_response = json.loads(query_response.content.decode("utf-8"))["data"]["hits"]

## Parallel read of the JSON response into a Spark DataFrame

df = spark.read.json(sc.parallelize([json_response]))
#df2 = df.repartition(8)
uf = df.select("id", "cases.submitter_id")

# Wrap get_data as a UDF that maps (id, submitter_id) to an S3 path
urldf = udf(get_data)

## Construct the list of S3 input paths

inputpath = uf.withColumn('Result', urldf('id', 'submitter_id'))
inputlist = list(inputpath.select('Result').toPandas()['Result'])
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
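
The excerpt relies on several names defined earlier in the file: files_endpt,
fields, size, the spark/sc session pair, and the get_data function that is
wrapped as a UDF. A minimal sketch of that surrounding setup follows; the
endpoint URL, the field list, and the S3 key scheme inside get_data are
illustrative assumptions, not taken from the excerpt.

import json
import requests
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf

spark = SparkSession.builder.appName("clinical_job").getOrCreate()
sc = spark.sparkContext

files_endpt = "https://api.gdc.cancer.gov/files"  # public GDC files endpoint
fields = "file_id,cases.submitter_id"             # hypothetical field list
size = 100                                        # maximum number of hits

def get_data(file_id, submitter_id):
    # Hypothetical id-to-S3-path mapping; the real bucket and key layout
    # live elsewhere in the repo. Note that cases is an array in the GDC
    # response, so submitter_id may arrive here as a list of values.
    return "s3://example-bucket/{}/{}".format(submitter_id, file_id)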



pysparksamples/mutation_job.py [98:155]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
filters = {
    "op": "and",
    "content": [
        {
            "op": "in",
            "content": {
                "field": "cases.project.project_id",
                "value": [project_id]
            }
        },
        {
            "op": "in",
            "content": {
                "field": "files.data_type",
                "value": [data_type]
            }
        },
        {
            "op": "in",
            "content": {
                "field": "files.data_category",
                "value": [data_category]
            }
        },
        {
            "op": "in",
            "content": {
                "field": "files.data_format",
                "value": [data_format]
            }
        }
    ]
}

# With a GET request, the filters parameter needs to be converted
# from a dictionary to a JSON-formatted string

params = {
    "filters": json.dumps(filters),
    "fields": fields,
    "format": "JSON",
    "size": size
}

## Query the files endpoint and get back a JSON response

query_response = requests.get(files_endpt, params=params)

json_response = json.loads(query_response.content.decode("utf-8"))["data"]["hits"]

## Parallel read of the JSON response into a Spark DataFrame

df = spark.read.json(sc.parallelize([json_response]))
#df2 = df.repartition(8)
uf = df.select("id", "cases.submitter_id")

# Wrap get_data as a UDF that maps (id, submitter_id) to an S3 path
urldf = udf(get_data)

## Construct the list of S3 input paths

inputpath = uf.withColumn('Result', urldf('id', 'submitter_id'))
inputlist = list(inputpath.select('Result').toPandas()['Result'])
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
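
The two spans above are verbatim duplicates (clinical_job.py [97:154] and
mutation_job.py [98:155]), so the query-and-path-listing logic is a natural
candidate for a shared helper. A rough sketch follows; the module and
function names are invented here, not taken from the repo.

# pysparksamples/gdc_query.py -- hypothetical shared module
import json
import requests
from pyspark.sql.functions import udf

def build_input_list(spark, files_endpt, fields, size, project_id,
                     data_type, data_category, data_format, get_data):
    """Query the GDC files endpoint and return a list of S3 input paths."""
    filters = {
        "op": "and",
        "content": [
            {"op": "in", "content": {"field": field, "value": [value]}}
            for field, value in [
                ("cases.project.project_id", project_id),
                ("files.data_type", data_type),
                ("files.data_category", data_category),
                ("files.data_format", data_format),
            ]
        ],
    }
    params = {"filters": json.dumps(filters), "fields": fields,
              "format": "JSON", "size": size}
    hits = requests.get(files_endpt, params=params).json()["data"]["hits"]
    df = spark.read.json(spark.sparkContext.parallelize([hits]))
    uf = df.select("id", "cases.submitter_id")
    urldf = udf(get_data)
    inputpath = uf.withColumn("Result", urldf("id", "submitter_id"))
    return list(inputpath.select("Result").toPandas()["Result"])

Each job would then reduce to a single call such as
inputlist = build_input_list(spark, files_endpt, fields, size, project_id,
data_type, data_category, data_format, get_data).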



