Spark Custom Config
# Defining Parameters
# Fill in your own values before running. The guards below stop execution
# early while any value is still unset (None or an empty string).
# Avoid committing real credentials to source control.
s3_access = None  # S3 access key
s3_secret = None  # S3 secret key
s3_bucket_uri = None  # E.g. "s3a://sample-bucket/boston.csv"

# NOTE: assert statements are skipped when Python runs with -O, so these
# are placeholder guards rather than hard validation.
assert s3_access, "Please enter an access key"
assert s3_secret, "Please enter a secret key"
assert s3_bucket_uri, "Please enter s3 bucket uri"
# Endpoint of the object store to connect to.
# AWS S3:
s3_endpoint = "s3.amazonaws.com"
# Other S3-compatible stores (e.g. Minio) use their own service URL instead:
# s3_endpoint = "http://prt-svc-sampleobj.prt-ns.svc.cluster.local"

# Extra Spark settings (spark.hadoop.fs.s3a.*) carrying the credentials
# and endpoint defined above.
extra_spark_conf = {
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.access.key": s3_access,
    "spark.hadoop.fs.s3a.secret.key": s3_secret,
    "spark.hadoop.fs.s3a.endpoint": s3_endpoint,
}
import practicuscore as prt
# Request a Spark session from Practicus AI, handing it the custom S3
# settings via extra_spark_conf.
spark = prt.engines.get_spark_session(extra_spark_conf=extra_spark_conf)
# Load the CSV at the configured URI; no reader options (header, schema,
# ...) are passed, so the reader's defaults apply.
df = spark.read.csv(s3_bucket_uri)
# Peek at the loaded data. The result is not assigned — presumably this is
# the last expression of a notebook cell, where it would be displayed.
df.head()
Previous: Insurance With Remote Worker | Next: Spark Object Storage