I created a network:
docker network create app-tier --driver bridge
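For reference, the network (and, once the stack is up, the containers attached to it) can be inspected with:

docker network inspect app-tier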
and used this Docker Compose file:
networks:
  default:
    external:
      name: app-tier

services:
  minio:
    image: 'bitnami/minio:latest'
    container_name: my-minio-server
    environment:
      - MINIO_ROOT_USER=theroot
      - MINIO_ROOT_PASSWORD=theroot123
    ports:
      - '9000:9000'
      - '9001:9001'
    volumes:
      - ${HOME}/minio/data:/data

  spark:
    image: docker.io/bitnami/spark:3
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    ports:
      - '8080:8080'
      - '7077:7077'
    volumes:
      - ./conf/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf

  spark-worker1:
    image: docker.io/bitnami/spark:3
    links:
      - "spark:spark"
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    ports:
      - '7181:8081'
    volumes:
      - ./work1:/opt/bitnami/spark/work
      - ./conf/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf

  spark-worker2:
    image: docker.io/bitnami/spark:3
    links:
      - "spark:spark"
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    ports:
      - '7182:8082'
    volumes:
      - ./work2:/opt/bitnami/spark/work
      - ./conf/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
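I bring the stack up from the directory containing this file (assuming it is saved as docker-compose.yml):

docker compose up -d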
I connected to the MinIO console at http://127.0.0.1:9001 with the credentials above and created a service account and an "asiatrip" bucket.
The service account has the following keys:
s3accessKeyAws = "n1Z8USynE2uOBJmc"
s3secretKeyAws = "RjK4uL35tFNTROo2WsPVZhA77AJ5qJEx"
I can connect to it successfully via the MinIO client:
docker run -it --rm --name minio-client \
--env MINIO_SERVER_HOST="my-minio-server" \
--env MINIO_SERVER_ACCESS_KEY="theroot" \
--env MINIO_SERVER_SECRET_KEY="theroot123" \
--network app-tier --volume $HOME/mcconf:/.mc \
bitnami/minio-client alias set minio http://my-minio-server:9000 n1Z8USynE2uOBJmc RjK4uL35tFNTROo2WsPVZhA77AJ5qJEx --api S3v4
and list buckets with
docker run -it --rm --name minio-client \
--env MINIO_SERVER_HOST="my-minio-server" \
--env MINIO_SERVER_ACCESS_KEY="theroot" \
--env MINIO_SERVER_SECRET_KEY="theroot123" \
--network app-tier --volume $HOME/mcconf:/.mc \
bitnami/minio-client ls minio
I can also use MinIO from a Jupyter container on the same network:
docker run -it --network app-tier -p 8888:8888 jupyter/scipy-notebook:latest
after installing the minio package with
!pip install minio
and executing this Python script:
from minio import Minio
from minio.error import S3Error

client = Minio(
    "my-minio-server:9000",
    access_key="n1Z8USynE2uOBJmc",
    secret_key="RjK4uL35tFNTROo2WsPVZhA77AJ5qJEx",
    secure=False,
)

# Make the 'asiatrip' bucket if it does not exist.
found = client.bucket_exists("asiatrip")
if not found:
    client.make_bucket("asiatrip")
else:
    print("Bucket 'asiatrip' already exists")

list(client.list_objects("asiatrip"))
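For completeness, the addresses.csv file used later can be uploaded from the same notebook; a minimal sketch, assuming the file sits in the notebook's working directory:

# Upload the local CSV into the bucket (arguments: bucket, object name, local path).
client.fput_object("asiatrip", "addresses.csv", "addresses.csv")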
So everything seems to be set up correctly.
On the host I installed hadoop-3.3.2 and spark-3.2.1-bin-without-hadoop, and set up my environment as follows:
export HADOOP_HOME=$HOME/Downloads/hadoop-3.3.2
export SPARK_HOME=$HOME/Downloads/spark-3.2.1-bin-without-hadoop
export PATH=$SPARK_HOME/bin:$HADOOP_HOME/bin:$PATH
export HADOOP_OPTIONAL_TOOLS="hadoop-aws"
export SPARK_DIST_CLASSPATH=$(hadoop classpath)
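If this wiring is right, the hadoop-aws and bundled AWS SDK jars should appear on the expanded classpath; a quick sanity check (assuming standard Hadoop tooling):

hadoop classpath --glob | tr ':' '\n' | grep -i aws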
When I run this Python file, miniospark.py:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Test json") \
    .getOrCreate()

s3accessKeyAws = "n1Z8USynE2uOBJmc"
s3secretKeyAws = "RjK4uL35tFNTROo2WsPVZhA77AJ5qJEx"
connectionTimeOut = "1000"
s3endPointLoc = "http://127.0.0.1:9000"
sourceBucket = "asiatrip"

# Point the S3A connector at the MinIO endpoint.
hadoopConf = spark.sparkContext._jsc.hadoopConfiguration()
hadoopConf.set("fs.s3a.endpoint", s3endPointLoc)
hadoopConf.set("fs.s3a.access.key", s3accessKeyAws)
hadoopConf.set("fs.s3a.secret.key", s3secretKeyAws)
hadoopConf.set("fs.s3a.connection.timeout", connectionTimeOut)
hadoopConf.set("fs.s3a.path.style.access", "true")
hadoopConf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoopConf.set("fs.s3a.connection.ssl.enabled", "false")

# spark.sql.debug.maxToStringFields is a Spark SQL option, not a Hadoop one,
# so it belongs on the Spark conf rather than the Hadoop configuration.
spark.conf.set("spark.sql.debug.maxToStringFields", "100")

inputPath = f"s3a://{sourceBucket}/addresses.csv"
outputPath = f"s3a://{sourceBucket}/output_survey.csv"

df = spark.read.option("header", "true").csv(inputPath)
df.write.mode("overwrite").parquet(outputPath)

spark.stop()
with
spark-submit miniospark.py
it works fine with this addresses.csv file in the asiatrip bucket:
a,b
1,2
3,4
6,7
8,9
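For reference, the same S3A settings can also be supplied when the session is built, via the spark.hadoop.* prefix, instead of going through the private _jsc handle; a minimal sketch with the same values:

from pyspark.sql import SparkSession

# spark.hadoop.* keys are copied into the underlying Hadoop configuration.
spark = (
    SparkSession.builder
    .appName("Test json")
    .config("spark.hadoop.fs.s3a.endpoint", "http://127.0.0.1:9000")
    .config("spark.hadoop.fs.s3a.access.key", "n1Z8USynE2uOBJmc")
    .config("spark.hadoop.fs.s3a.secret.key", "RjK4uL35tFNTROo2WsPVZhA77AJ5qJEx")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .getOrCreate()
)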
When I submit it as
spark-submit --master spark://127.0.0.1:7077 miniospark.py
with the endpoint changed to
s3endPointLoc = "http://my-minio-server:9000"
it gives up after some time because it cannot resolve my-minio-server:
2022-05-18 15:12:32,246 WARN streaming.FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: s3a://asiatrip/addresses.csv.
org.apache.hadoop.fs.s3a.AWSClientIOException: getFileStatus on s3a://asiatrip/addresses.csv: com.amazonaws.SdkClientException: Unable to execute HTTP request: my-minio-server: nodename nor servname provided, or not known: Unable to execute HTTP request: my-minio-server: nodename nor servname provided, or not known
I am on an x64 Mac with Docker Desktop.