pysparksamples/clinical_job.py [40:91]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
## Glue / Spark handles used by the rest of the job

sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

## Resolve the argument list passed to the job on the command line

args = getResolvedOptions(sys.argv, ["JOB_NAME", "project", "output_bucket"])

## Remote endpoints: the GDC endpoint for files, and the NCI endpoint
## that is queried for the S3 URL

files_endpt = "https://api.gdc.cancer.gov/files"
indexd_endpt = "https://nci-crdc.datacommons.io/index/index/"

## S3 bucket name and the boto3 S3 resource handle

s3_tcga_bucket = "tcga-2-open"
s3 = boto3.resource("s3")

## Per-run values taken from the resolved arguments

output_bucket = args["output_bucket"]
project_id = args["project"]


## method to query the NCI endpoint for the S3 path
## Inputs to this method are the UUID and submitter ID from the GDC endpoint query

def get_data(uuid, sample_submitter_id):
    """Return the S3 URL that the NCI endpoint lists for a file UUID.

    Requests ``<indexd_endpt>/<uuid>``, decodes the JSON body, and selects
    the entries of its ``"urls"`` list that start with ``s3://``.

    Args:
        uuid: File UUID (string) appended to ``indexd_endpt`` to form the
            request URL.
        sample_submitter_id: Not used by this function; kept so existing
            callers that pass it keep working.

    Returns:
        The first ``s3://`` URL in the response. A diagnostic line is
        printed when the response does not hold exactly one such URL.

    Raises:
        IndexError: If the response lists no ``s3://`` URL at all.
    """
    # indexd_endpt may already end with "/"; strip it so the request path
    # does not contain an accidental "//" before the UUID.
    query_response = requests.get(indexd_endpt.rstrip("/") + "/" + uuid)
    urls_response = json.loads(query_response.content.decode("utf-8"))["urls"]
    s3_urls = [x for x in urls_response if x.startswith("s3://")]
    if len(s3_urls) != 1:
        print("Something weird with UUID " + uuid + " returned " + str(s3_urls))
    if not s3_urls:
        # Fail with the UUID in the message rather than the bare
        # "list index out of range" produced by indexing an empty list.
        raise IndexError("No s3:// URL found for UUID " + uuid)
    return s3_urls[0]

## Fields to be returned as a comma separated list

## The field names are joined straight into the comma separated string;
## `fields` ends up as a str, not a list.
fields = ','.join((
    "file_name",
    "cases.primary_site",
    "cases.case_id",
    "cases.project.project_id",
    "cases.submitter_id",
    "cases.samples.submitter_id",
    "cases.samples.sample_id",
))

## NOTE(review): presumably the number of records requested per query -
## confirm against the code that builds the request.
size = 5000
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


pysparksamples/copy_number_job.py [39:90]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
## Glue / Spark handles used by the rest of the job

sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

## Resolve the argument list passed to the job on the command line

args = getResolvedOptions(sys.argv, ["JOB_NAME", "project", "output_bucket"])

## Remote endpoints: the GDC endpoint for files, and the NCI endpoint
## that is queried for the S3 URL

files_endpt = "https://api.gdc.cancer.gov/files"
indexd_endpt = "https://nci-crdc.datacommons.io/index/index/"

## S3 bucket name and the boto3 S3 resource handle

s3_tcga_bucket = "tcga-2-open"
s3 = boto3.resource("s3")

## Per-run values taken from the resolved arguments

output_bucket = args["output_bucket"]
project_id = args["project"]

## method to query the NCI endpoint for the S3 path
## Inputs to this method are the UUID and submitter ID from the GDC endpoint query

def get_data(uuid, sample_submitter_id):
    """Return the S3 URL that the NCI endpoint lists for a file UUID.

    Requests ``<indexd_endpt>/<uuid>``, decodes the JSON body, and selects
    the entries of its ``"urls"`` list that start with ``s3://``.

    Args:
        uuid: File UUID (string) appended to ``indexd_endpt`` to form the
            request URL.
        sample_submitter_id: Not used by this function; kept so existing
            callers that pass it keep working.

    Returns:
        The first ``s3://`` URL in the response. A diagnostic line is
        printed when the response does not hold exactly one such URL.

    Raises:
        IndexError: If the response lists no ``s3://`` URL at all.
    """
    # indexd_endpt may already end with "/"; strip it so the request path
    # does not contain an accidental "//" before the UUID.
    query_response = requests.get(indexd_endpt.rstrip("/") + "/" + uuid)
    urls_response = json.loads(query_response.content.decode("utf-8"))["urls"]
    s3_urls = [x for x in urls_response if x.startswith("s3://")]
    if len(s3_urls) != 1:
        print("Something weird with UUID " + uuid + " returned " + str(s3_urls))
    if not s3_urls:
        # Fail with the UUID in the message rather than the bare
        # "list index out of range" produced by indexing an empty list.
        raise IndexError("No s3:// URL found for UUID " + uuid)
    return s3_urls[0]
    
## Fields to be returned as a comma separated list

## The field names are joined straight into the comma separated string;
## `fields` ends up as a str, not a list.
fields = ','.join((
    "file_name",
    "cases.primary_site",
    "cases.case_id",
    "cases.project.project_id",
    "cases.submitter_id",
    "cases.samples.submitter_id",
    "cases.samples.sample_id",
))

## NOTE(review): presumably the number of records requested per query -
## confirm against the code that builds the request.
size = 5000
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -