I have created a network with
```bash
docker network create app-tier --driver bridge
```
and used this Docker Compose file:
```yaml
networks:
  default:
    external:
      name: app-tier

services:
  minio:
    image: 'bitnami/minio:latest'
    container_name: my-minio-server
    environment:
      - MINIO_ROOT_USER=theroot
      - MINIO_ROOT_PASSWORD=theroot123
    ports:
      - '9000:9000'
      - '9001:9001'
    volumes:
      - ${HOME}/minio/data:/data
  spark:
    image: docker.io/bitnami/spark:3
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    ports:
      - '8080:8080'
      - '7077:7077'
    volumes:
      - ./conf/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
  spark-worker1:
    image: docker.io/bitnami/spark:3
    links:
      - "spark:spark"
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    ports:
      - '7181:8081'
    volumes:
      - ./work1:/opt/bitnami/spark/work
      - ./conf/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
  spark-worker2:
    image: docker.io/bitnami/spark:3
    links:
      - "spark:spark"
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    ports:
      - '7182:8082'
    volumes:
      - ./work2:/opt/bitnami/spark/work
      - ./conf/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
```
I connected to MinIO at http://127.0.0.1:9001 with the credentials above and created a service account and an "asiatrip" bucket.
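As a quick sanity check from the host that the published MinIO ports are actually reachable, a small probe like the following can help (a minimal sketch; it only tests the TCP port mappings, not credentials or bucket policy):

```python
import socket

# Probe the two MinIO ports published by the compose file above.
# Run this on the host; it verifies only the port mappings.
for port in (9000, 9001):
    with socket.create_connection(("127.0.0.1", port), timeout=3):
        print(f"127.0.0.1:{port} is reachable")
```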
The service account has the following keys:
```
s3accessKeyAws = "n1Z8USynE2uOBJmc"
s3secretKeyAws = "RjK4uL35tFNTROo2WsPVZhA77AJ5qJEx"
```
I can successfully connect to it via the MinIO client:
```bash
docker run -it --rm --name minio-client \
  --env MINIO_SERVER_HOST="my-minio-server" \
  --env MINIO_SERVER_ACCESS_KEY="theroot" \
  --env MINIO_SERVER_SECRET_KEY="theroot123" \
  --network app-tier --volume $HOME/mcconf:/.mc \
  bitnami/minio-client alias set minio http://my-minio-server:9000 n1Z8USynE2uOBJmc RjK4uL35tFNTROo2WsPVZhA77AJ5qJEx --api S3v4
```
and
```bash
docker run -it --rm --name minio-client \
  --env MINIO_SERVER_HOST="my-minio-server" \
  --env MINIO_SERVER_ACCESS_KEY="theroot" \
  --env MINIO_SERVER_SECRET_KEY="theroot123" \
  --network app-tier --volume $HOME/mcconf:/.mc \
  bitnami/minio-client ls minio
```
I can also use MinIO from a Jupyter container on the same network:
```bash
docker run -it --network app-tier -p 8888:8888 jupyter/scipy-notebook:latest
```
after installing the minio package with
```
!pip install minio
```
and running this Python script:
```python
from minio import Minio
from minio.error import S3Error

client = Minio(
    "my-minio-server:9000",
    access_key="n1Z8USynE2uOBJmc",
    secret_key="RjK4uL35tFNTROo2WsPVZhA77AJ5qJEx",
    secure=False,
)

# Make 'asiatrip' bucket if it does not exist.
found = client.bucket_exists("asiatrip")
if not found:
    client.make_bucket("asiatrip")
else:
    print("Bucket 'asiatrip' already exists")

list(client.list_objects("asiatrip"))
```
So everything seems set.
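Since the Spark job below reads addresses.csv from the bucket, the test file can be uploaded with the same client (a sketch; it assumes addresses.csv sits in the notebook's working directory):

```python
from minio import Minio

# Same connection details as the script above.
client = Minio(
    "my-minio-server:9000",
    access_key="n1Z8USynE2uOBJmc",
    secret_key="RjK4uL35tFNTROo2WsPVZhA77AJ5qJEx",
    secure=False,
)

# "addresses.csv" is assumed to exist in the current directory.
client.fput_object("asiatrip", "addresses.csv", "addresses.csv")

# Confirm the upload.
for obj in client.list_objects("asiatrip"):
    print(obj.object_name)
```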
I installed hadoop-3.3.2 and spark-3.2.1-bin-without-hadoop on the host.
I set up my environment as follows:
```bash
export HADOOP_HOME=$HOME/Downloads/hadoop-3.3.2
export SPARK_HOME=$HOME/Downloads/spark-3.2.1-bin-without-hadoop
export PATH=$SPARK_HOME/bin:$HADOOP_HOME/bin:$PATH
export HADOOP_OPTIONAL_TOOLS="hadoop-aws"
export SPARK_DIST_CLASSPATH=$(hadoop classpath)
```
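To confirm that SPARK_DIST_CLASSPATH really put the hadoop-3.3.2 jars (including hadoop-aws) on Spark's classpath, one quick check is to ask the JVM for its Hadoop version (a diagnostic sketch; `_jvm` is an internal PySpark handle, not a stable API):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("classpath-check").getOrCreate()

# Should print 3.3.2 if the environment above took effect.
print(spark.sparkContext._jvm.org.apache.hadoop.util.VersionInfo.getVersion())

# If hadoop-aws is on the classpath, this loads; a ClassNotFoundException
# here means the S3A filesystem is missing.
spark.sparkContext._jvm.java.lang.Class.forName("org.apache.hadoop.fs.s3a.S3AFileSystem")

spark.stop()
```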
When I run this Python file, miniospark.py:
```python
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Test json") \
    .getOrCreate()

s3accessKeyAws = "n1Z8USynE2uOBJmc"
s3secretKeyAws = "RjK4uL35tFNTROo2WsPVZhA77AJ5qJEx"
connectionTimeOut = "1000"
s3endPointLoc = "http://127.0.0.1:9000"
sourceBucket = "asiatrip"

spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.endpoint", s3endPointLoc)
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", s3accessKeyAws)
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", s3secretKeyAws)
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.timeout", connectionTimeOut)
spark.sparkContext._jsc.hadoopConfiguration().set("spark.sql.debug.maxToStringFields", "100")
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.ssl.enabled", "false")

inputPath = f"s3a://{sourceBucket}/addresses.csv"
outputPath = f"s3a://{sourceBucket}/output_survey.csv"

df = spark.read.option("header", "true").format("s3selectCSV").csv(inputPath)
df.write.mode("overwrite").parquet(outputPath)

spark.stop()
```
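Equivalently, the same S3A options can be passed at session-build time with the `spark.hadoop.` prefix, which avoids reaching into the private `_jsc` handle (a sketch with the same values as above):

```python
from pyspark.sql import SparkSession

# Keys prefixed with "spark.hadoop." are copied into the Hadoop
# configuration, so this mirrors the hadoopConfiguration().set() calls.
spark = (
    SparkSession.builder
    .appName("Test json")
    .config("spark.hadoop.fs.s3a.endpoint", "http://127.0.0.1:9000")
    .config("spark.hadoop.fs.s3a.access.key", "n1Z8USynE2uOBJmc")
    .config("spark.hadoop.fs.s3a.secret.key", "RjK4uL35tFNTROo2WsPVZhA77AJ5qJEx")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)
```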
Submitting it as
```bash
spark-submit miniospark.py
```
it works fine for the addresses.csv file
```
a,b
1,2
3,4
6,7
8,9
```
in the asiatrip bucket.
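To double-check the write, the objects under the output path can be listed with the MinIO client from earlier (a sketch; Spark writes a directory of part files plus a _SUCCESS marker under that name):

```python
from minio import Minio

# From the host, via the published port.
client = Minio(
    "127.0.0.1:9000",
    access_key="n1Z8USynE2uOBJmc",
    secret_key="RjK4uL35tFNTROo2WsPVZhA77AJ5qJEx",
    secure=False,
)

# List the part files Spark wrote under the "output_survey.csv" prefix.
for obj in client.list_objects("asiatrip", prefix="output_survey.csv/", recursive=True):
    print(obj.object_name, obj.size)
```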
When I submit it as
```bash
spark-submit --master spark://127.0.0.1:7077 miniospark.py
```
with
```python
s3endPointLoc = "http://my-minio-server:9000"
```
it gives up after some time because it cannot resolve my-minio-server:
```
2022-05-18 15:12:32,246 WARN streaming.FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: s3a://asiatrip/addresses.csv.
org.apache.hadoop.fs.s3a.AWSClientIOException: getFileStatus on s3a://asiatrip/addresses.csv: com.amazonaws.SdkClientException: Unable to execute HTTP request: my-minio-server: nodename nor servname provided, or not known: Unable to execute HTTP request: my-minio-server: nodename nor servname provided, or not known
```
I am on an x64 Mac with Docker Desktop.
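For what it's worth, the container name really is only resolvable inside the app-tier Docker network; a quick check from the host reproduces the same resolution failure the S3A client reports (a minimal sketch, run outside any container):

```python
import socket

# The published port makes 127.0.0.1 work from the host, but the Docker
# network alias "my-minio-server" is expected to fail with gaierror here.
for host in ("127.0.0.1", "my-minio-server"):
    try:
        print(host, "->", socket.gethostbyname(host))
    except socket.gaierror as exc:
        print(host, "-> cannot resolve:", exc)
```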