parquet_cli/ingest_s3/__main__.py (68 lines of code) (raw):
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import base64
import logging
import os
# Blank out these settings before the parquet_flask import that follows.
# NOTE(review): presumably parquet_flask expects these variables to exist when
# it is imported, even though this CLI never uses their values -- confirm.
os.environ.update({
    'master_spark_url': '',
    'spark_app_name': '',
    'parquet_file_name': '',
    'in_situ_schema': '',
    'authentication_type': '',
    'authentication_key': '',
    'parquet_metadata_tbl': '',
})
from parquet_flask.cdms_lambda_func.lambda_func_env import LambdaFuncEnv
class IngestS3Entry:
    """Command-line entry point that ingests S3 files into CDMS one at a time.

    The command-line options are parsed, copied into the environment variables
    named by ``LambdaFuncEnv``, and then every ``.json`` / ``.json.gz`` object
    yielded by ``AwsS3.get_child_s3_files`` for the requested bucket and prefix
    is handed to ``IngestS3ToCdms``.
    """

    # Names of the command-line options that are specific to this script
    # (the remaining option names come from ``LambdaFuncEnv``).
    BUCKET_NAME_KEY = 'BUCKET_NAME'
    KEY_PREFIX_KEY = 'KEY_PREFIX'

    def __get_args(self) -> argparse.Namespace:
        """Parse the command-line options of this script.

        :return: namespace whose attribute names are the option names
                 (``LambdaFuncEnv`` constants, ``BUCKET_NAME_KEY`` and
                 ``KEY_PREFIX_KEY``); every value is a string.
        """
        parser = argparse.ArgumentParser(description="Ingesting 1 or more S3 files into Parquet. Note that AWS environment variables should be set before running this")
        parser.add_argument(f'--{LambdaFuncEnv.CDMS_DOMAIN}',
                            help="CDMS Flask domain where ingestion endpoint resides. Need to include `/insitu` prefix",
                            metavar="http://localhost:9801/insitu",
                            required=True)
        parser.add_argument(f'--{LambdaFuncEnv.CDMS_BEARER_TOKEN}',
                            help="plain-text security token that is set in CDMS Flask pod during K8s deployment. Check in Dockerfile",
                            metavar="mock-token",
                            required=True)
        parser.add_argument(f'--{LambdaFuncEnv.PARQUET_META_TBL_NAME}',
                            help="dynamo DB table where parquet file ingestion records are stored. Check in Values.yaml",
                            metavar="cdms_parquet_meta_dev_v1",
                            required=True)
        parser.add_argument(f'--{self.BUCKET_NAME_KEY}',
                            help="name of S3 bucket",
                            metavar="icoads-bucket",
                            required=True)
        parser.add_argument(f'--{self.KEY_PREFIX_KEY}',
                            help="s3 prefix. It will ingest all files starting with this prefix. If all files need to be ingested, pass empty value. If only 1 file needs to be ingested, pass the exact file path",
                            metavar='2021/01/01/samplefile.json.gz',
                            required=True)
        parser.add_argument(f'--{LambdaFuncEnv.LOG_LEVEL}',
                            help="python log level in integer.",
                            default='10',
                            metavar='10',
                            required=False)
        return parser.parse_args()

    def start(self):
        """Parse the options, export them to the environment and ingest each file.

        A failure on one file is reported on stdout and does not stop the
        remaining files from being processed.

        :return: None
        :raises ValueError: if the log level option is not an integer string
        """
        options = self.__get_args()
        log_level = getattr(options, LambdaFuncEnv.LOG_LEVEL)
        logging.basicConfig(level=int(log_level),
                            format="%(asctime)s [%(levelname)s] [%(name)s::%(lineno)d] %(message)s")
        os.environ[LambdaFuncEnv.LOG_LEVEL] = log_level
        os.environ[LambdaFuncEnv.CDMS_DOMAIN] = getattr(options, LambdaFuncEnv.CDMS_DOMAIN)
        # The token is given in plain text on the command line and is stored
        # base64-encoded in the environment.
        plain_token = getattr(options, LambdaFuncEnv.CDMS_BEARER_TOKEN)
        os.environ[LambdaFuncEnv.CDMS_BEARER_TOKEN] = base64.standard_b64encode(plain_token.encode()).decode()
        os.environ[LambdaFuncEnv.PARQUET_META_TBL_NAME] = getattr(options, LambdaFuncEnv.PARQUET_META_TBL_NAME)
        bucket_name = getattr(options, self.BUCKET_NAME_KEY)
        key_prefix = getattr(options, self.KEY_PREFIX_KEY)

        # NOTE(review): imported only after the environment variables above are
        # set -- presumably these modules read them; confirm before moving.
        from parquet_flask.cdms_lambda_func.ingest_s3_to_cdms.ingest_s3_to_cdms import IngestS3ToCdms
        from parquet_flask.aws.aws_s3 import AwsS3

        logger = logging.getLogger(__name__)
        accepted_suffixes = ('.json', '.json.gz')
        s3 = AwsS3()
        # The second element of each yielded pair (named ``size`` originally) is not needed here.
        for key, _size in s3.get_child_s3_files(bucket_name, key_prefix,
                                                lambda x: x['Key'].endswith(accepted_suffixes)):
            try:
                print(f'working on: {key}')
                IngestS3ToCdms().start(event={'s3_url': f's3://{bucket_name}/{key}'})
            except Exception as e:
                # Best effort: report the failure and carry on with the next file.
                print(f'error while processing: s3://{bucket_name}/{key}. details: {str(e)}')
                # Keep the stack trace available for diagnosis without changing stdout.
                logger.debug('traceback for failed ingestion of s3://%s/%s', bucket_name, key, exc_info=True)
# Run the ingestion only when this module is executed as a script.
#
# Sample usage:
#   python3 -m parquet_cli.ingest_s3 \
#       --LOG_LEVEL 30 \
#       --CDMS_DOMAIN https://doms.jpl.nasa.gov/insitu \
#       --CDMS_BEARER_TOKEN Mock-Token \
#       --PARQUET_META_TBL_NAME cdms_parquet_meta_dev_v1 \
#       --BUCKET_NAME cdms-dev-ncar-in-situ-stage \
#       --KEY_PREFIX cdms_icoads_2017-01-01.json
if __name__ == '__main__':
    entry_point = IngestS3Entry()
    entry_point.start()