in aws_emr_launch/constructs/emr_constructs/cluster_configuration.py
def add_spark_jars(self, code: emr_code.EMRCode, jars_in_code: List[str]) -> "ClusterConfiguration":
    if self._rehydrated:
        raise ReadOnlyClusterConfigurationError()
    self._configuration_artifacts.append(
        {"Bucket": code.deployment_bucket.bucket_name, "Path": os.path.join(code.deployment_prefix, "*")}
    )
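    # Track this code's S3 location with the configuration's artifacts;
    # the "*" wildcard covers everything under the deployment prefix.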
    # We use a nested Construct to avoid Construct id collisions.
    # First, generate an id for the Construct from bucket_name and deployment_prefix,
    # using a hash to avoid potential problems with Tokens in the bucket_name.
    hasher = hashlib.md5()
    hasher.update(os.path.join(code.deployment_bucket.bucket_name, code.deployment_prefix).encode("utf-8"))
    token = base64.urlsafe_b64encode(hasher.digest()).decode()
    construct_id = f"EmrCode_SparkJar_{token}"
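    # The md5 digest is 16 bytes, so the URL-safe base64 token is a fixed
    # 24-character string (padded with "=="), deterministic for a given
    # bucket/prefix pair.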
    # Then attempt to find a previous Construct with this id.
    construct: Optional[core.Construct] = cast(Optional[core.Construct], self.node.try_find_child(construct_id))
    # If we didn't find a previous Construct, create a new one.
    construct = core.Construct(self, construct_id) if construct is None else construct
    bucket_path = code.resolve(construct)["S3Path"]
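    # resolve() binds the code to the (possibly reused) nested Construct and
    # returns its deployment metadata; "S3Path" is the S3 location under which
    # the jars were deployed.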
    for jar in jars_in_code:
        self._spark_jars.append(os.path.join(bucket_path, jar))
    config = self.config
    config["Configurations"] = self.update_configurations(
        config["Configurations"], "spark-defaults", {"spark.jars": ",".join(self._spark_jars)}
    )
    self.update_config(config)
    return self
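
Because the method returns self, jar registration chains fluently with other
add_* calls. A minimal usage sketch follows; the names "code_asset", "config",
and the jar paths are hypothetical stand-ins, not taken from this repo,
assuming an emr_code.EMRCode instance deployed elsewhere and an existing,
non-rehydrated ClusterConfiguration:

# Hypothetical usage sketch.
# "code_asset" is an emr_code.EMRCode whose deployment bucket/prefix hold the jars;
# "config" is an existing ClusterConfiguration (must not be rehydrated/read-only).
config = config.add_spark_jars(code_asset, ["etl.jar", "libs/udfs.jar"])
# spark-defaults now carries a spark.jars entry listing the resolved S3 paths.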