2

I have created a network

docker network create app-tier --driver bridge

and used this docker compose file

# Compose stack: one MinIO server plus a Spark master and two workers.
# All services join the pre-created external bridge network "app-tier"
# (created beforehand with: docker network create app-tier --driver bridge).
networks:
  default:
    # NOTE(review): "external: name:" is legacy Compose syntax; current Compose
    # prefers `external: true` plus a sibling `name: app-tier` — verify against
    # the Compose version in use.
    external:
      name: app-tier

services:
  minio:
    image: 'bitnami/minio:latest'
    # Fixed container name so other containers on app-tier can resolve it
    # as "my-minio-server".
    container_name: my-minio-server
    environment:
      - MINIO_ROOT_USER=theroot
      - MINIO_ROOT_PASSWORD=theroot123
    ports:
      # 9000 = S3 API, 9001 = web console.
      - '9000:9000'
      - '9001:9001'
    volumes:
      # Persist bucket data on the host.
      - ${HOME}/minio/data:/data
  spark:
    image: docker.io/bitnami/spark:3
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    ports:
      # 8080 = master web UI, 7077 = master RPC (spark:// submit endpoint).
      - '8080:8080'
      - '7077:7077'
    volumes:
      - ./conf/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
  spark-worker1:
    image: docker.io/bitnami/spark:3
    # NOTE(review): "links" is legacy and unnecessary here — services on the
    # same network already resolve each other by service name.
    links:
      - "spark:spark"
    environment:
      - SPARK_MODE=worker
      # Worker registers with the master by its service name.
      - SPARK_MASTER_URL=spark://spark:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    ports:
      # Host 7181 -> container 8081 (default Spark worker web UI port).
      - '7181:8081'
    volumes:
      - ./work1:/opt/bitnami/spark/work
      - ./conf/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
  spark-worker2:
    image: docker.io/bitnami/spark:3
    # NOTE(review): legacy "links" — see spark-worker1.
    links:
      - "spark:spark"
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    ports:
      # NOTE(review): container port 8082 is mapped, but nothing here sets
      # SPARK_WORKER_WEBUI_PORT=8082; the worker UI presumably still listens
      # on 8081, so this mapping may expose nothing — TODO confirm.
      - '7182:8082'
    volumes:
      - ./work2:/opt/bitnami/spark/work
      - ./conf/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf

I connected to minio at http://127.0.0.1:9001 with the above credentials and I created a service account and an "asiatrip" bucket.

It has the following

s3accessKeyAws = "n1Z8USynE2uOBJmc"
s3secretKeyAws = "RjK4uL35tFNTROo2WsPVZhA77AJ5qJEx"

I can successfully connect to it via the minio client

docker run -it --rm --name minio-client \
    --env MINIO_SERVER_HOST="my-minio-server" \
    --env MINIO_SERVER_ACCESS_KEY="theroot" \
    --env MINIO_SERVER_SECRET_KEY="theroot123" \
    --network app-tier --volume $HOME/mcconf:/.mc  \
    bitnami/minio-client alias set minio http://my-minio-server:9000 n1Z8USynE2uOBJmc RjK4uL35tFNTROo2WsPVZhA77AJ5qJEx --api S3v4

and

docker run -it --rm --name minio-client \
    --env MINIO_SERVER_HOST="my-minio-server" \
    --env MINIO_SERVER_ACCESS_KEY="theroot" \
    --env MINIO_SERVER_SECRET_KEY="theroot123" \
    --network app-tier --volume $HOME/mcconf:/.mc  \
    bitnami/minio-client ls minio

I also can use minio via a docker jupyter in that network

docker run -it --network app-tier -p 8888:8888 jupyter/scipy-notebook:latest

after installing minio package with

!pip install minio

and execute python script

# Connect to the MinIO S3 API from inside the app-tier network, over plain
# HTTP (secure=False), using the service-account credentials.
from minio import Minio
from minio.error import S3Error

minio_client = Minio(
    "my-minio-server:9000",
    access_key="n1Z8USynE2uOBJmc",
    secret_key="RjK4uL35tFNTROo2WsPVZhA77AJ5qJEx",
    secure=False,
)

# Create the 'asiatrip' bucket unless it is already present.
if minio_client.bucket_exists("asiatrip"):
    print("Bucket 'asiatrip' already exists")
else:
    minio_client.make_bucket("asiatrip")

# Materialize the lazy object listing to force the request.
list(minio_client.list_objects("asiatrip"))

So everything seems to be set up correctly.

I installed hadoop-3.3.2 and spark-3.2.1-bin-without-hadoop

I setup my env as follows

export HADOOP_HOME=$HOME/Downloads/hadoop-3.3.2
export SPARK_HOME=$HOME/Downloads/spark-3.2.1-bin-without-hadoop
export PATH=$SPARK_HOME/bin:$HADOOP_HOME/bin:$PATH
export HADOOP_OPTIONAL_TOOLS="hadoop-aws"
export SPARK_DIST_CLASSPATH=$(hadoop classpath)

when I run this python file as

# miniospark.py — read a CSV from a MinIO bucket via S3A and write it back
# as Parquet. Submit with:
#   spark-submit miniospark.py
#   spark-submit --master spark://127.0.0.1:7077 miniospark.py
from pyspark.sql import SparkSession

spark = SparkSession\
    .builder\
    .appName("Test json")\
    .getOrCreate()

# Service-account credentials created in the MinIO console.
s3accessKeyAws = "n1Z8USynE2uOBJmc"
s3secretKeyAws = "RjK4uL35tFNTROo2WsPVZhA77AJ5qJEx"
connectionTimeOut = "1000"  # fs.s3a.connection.timeout, milliseconds
# NOTE(review): 127.0.0.1 only resolves from the driver host; executors running
# inside Docker need a hostname they can resolve (e.g. http://my-minio-server:9000).
s3endPointLoc = "http://127.0.0.1:9000"
sourceBucket = "asiatrip"

# Point the Hadoop S3A connector at the local MinIO endpoint.
hadoopConf = spark.sparkContext._jsc.hadoopConfiguration()
hadoopConf.set("fs.s3a.endpoint", s3endPointLoc)
hadoopConf.set("fs.s3a.access.key", s3accessKeyAws)
hadoopConf.set("fs.s3a.secret.key", s3secretKeyAws)
hadoopConf.set("fs.s3a.connection.timeout", connectionTimeOut)
hadoopConf.set("fs.s3a.path.style.access", "true")  # MinIO needs path-style URLs
hadoopConf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoopConf.set("fs.s3a.connection.ssl.enabled", "false")

# Fix: spark.sql.debug.maxToStringFields is a Spark SQL runtime conf, not a
# Hadoop property — setting it on hadoopConfiguration() was a silent no-op.
spark.conf.set("spark.sql.debug.maxToStringFields", "100")

inputPath = f"s3a://{sourceBucket}/addresses.csv"
# NOTE(review): the output keeps a .csv suffix but is written as Parquet below.
outputPath = f"s3a://{sourceBucket}/output_survey.csv"
# Fix: .csv() selects the CSV source itself, so the earlier
# .format("s3selectCSV") (an EMR-only source) was overridden and is dropped.
df = spark.read.option("header", "true").csv(inputPath)
df.write.mode("overwrite").parquet(outputPath)
spark.stop()

as

spark-submit miniospark.py

it works fine for the addresses.csv file

a,b
1,2
3,4
6,7
8,9

in asiatrip bucket.

When I submit as

spark-submit --master spark://127.0.0.1:7077  miniospark.py

with

s3endPointLoc = "http://my-minio-server:9000"

It gives up after some time because it cannot resolve my-minio-server.

2022-05-18 15:12:32,246 WARN streaming.FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: s3a://asiatrip/addresses.csv.
org.apache.hadoop.fs.s3a.AWSClientIOException: getFileStatus on s3a://asiatrip/addresses.csv: com.amazonaws.SdkClientException: Unable to execute HTTP request: my-minio-server: nodename nor servname provided, or not known: Unable to execute HTTP request: my-minio-server: nodename nor servname provided, or not known

I am on a Mac x64 with Docker Desktop

1 Answer

0

After a lot of trial and error, I have a solution:

Do yourself a favor and change the following line in /etc/hosts

127.0.0.1   localhost

to

127.0.0.1   localhost my-minio-server