I'm trying to deploy a project I built for a module in my master's degree. In this project I have to modify some files so that Spark runs with several Spark workers, Cassandra, Flask and a few more services.
The problem appears while running my spark-submit container.
This is the docker-compose.yaml I'm using:
version: "3"
services:
  zookeeper:
    container_name: zookeeper
    image: wurstmeister/zookeeper
    ports:
      - "2181:2181"
    hostname: zookeeper
    networks:
      - red1
  kafka:
    container_name: kafka
    image: wurstmeister/kafka:2.12-2.3.0
    ports:
      - "9092:9092"
    depends_on:
      - zookeeper
    hostname: kafka
    environment:
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
      KAFKA_LISTENERS: PLAINTEXT://kafka:9092
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      KAFKA_CREATE_TOPICS: "flight_delay_classification_request:1:1"
    networks:
      - red1
  mongo:
    container_name: mongo
    image: mongo:4.4.2
    ports:
      - "27017:27017"
    depends_on:
      - kafka
    hostname: mongo
    networks:
      - red1
  mongo_seed:
    image: jlmendo11/mongo_seed
    container_name: mongo_seed
    depends_on:
      - mongo
    networks:
      - red1
    environment:
      - MONGO_HOST=mongo
      - MONGO_PORT=27017
    command:
      - "mongoimport --host mongo --port 27017 -d agile_data_science -c origin_dest_distances --mode upsert --type json --file /origin_dest_distances.jsonl"
    restart: on-failure
  spark-master:
    image: bde2020/spark-master:3.3.0-hadoop3.3
    container_name: spark-master
    ports:
      - "7077:7077"
      - "9001:9001"
      - "8080:8080"
    environment:
      - SPARK_LOCAL_IP=spark-master
      - SPARK_WORKLOAD=master
    volumes:
      - ../models:/models
      - ../flight_prediction/target/scala-2.12:/target/scala-2.12
    networks:
      - red1
  spark-worker-1:
    image: bde2020/spark-worker:3.3.0-hadoop3.3
    container_name: spark-worker-1
    depends_on:
      - spark-master
    ports:
      - "8081:8081"
    environment:
      - SPARK_MASTER=spark://spark-master:7077
      - SPARK_WORKLOAD=worker
      - SPARK_LOCAL_IP=spark-worker-1
    networks:
      - red1
    volumes:
      - ../models:/models
      - ../flight_prediction/target/scala-2.12:/target/scala-2.12
  spark-worker-2:
    image: bde2020/spark-worker:3.3.0-hadoop3.3
    container_name: spark-worker-2
    depends_on:
      - spark-master
    ports:
      - "8082:8081"
    environment:
      - SPARK_MASTER=spark://spark-master:7077
      - SPARK_WORKLOAD=worker
      - SPARK_LOCAL_IP=spark-worker-2
    networks:
      - red1
    volumes:
      - ../models:/models
      - ../flight_prediction/target/scala-2.12:/target/scala-2.12
  spark-submit:
    image: bde2020/spark-submit:3.3.0-hadoop3.3
    container_name: spark-submit
    depends_on:
      - spark-master
      - spark-worker-1
      - spark-worker-2
    ports:
      - "4040:4040"
    environment:
      - SPARK_MASTER=spark://spark-master:7077
      - SPARK_WORKLOAD=submitter
      - SPARK_LOCAL_IP=spark-submit
      - CLASSPATH=/scala-2.12
    command: bash -c "sleep 15; /spark/bin/spark-submit --class "es.upm.dit.ging.predictor.MakePrediction" --master spark://spark-master:7077 --packages com.datastax.spark:spark-cassandra-connector_2.12:3.2.0, org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.4, com.datastax.cassandra:cassandra-driver-core:4.0.0 --jars scala-2.12/flight_prediction_2.12-0.1.jar"
    # command: bash -c "sleep 15; /spark/bin/spark-submit --class "es.upm.dit.ging.predictor.MakePrediction" --master spark://spark-master:7077 --packages com.datastax.spark:spark-cassandra-connector_2.12:3.2.0, org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0, com.datastax.oss:java-driver-core_4.17.0 target/scala-2.12/flight_prediction_2.12-0.1.jar"
    networks:
      - red1
    restart: on-failure
    volumes:
      - ../flight_prediction/target/scala-2.12:/scala-2.12
  flask:
    container_name: flask
    image: jlmendo11/flask2
    ports:
      - "5000:5000"
      - "9200:9200"
    depends_on:
      - mongo_seed
      - spark-master
    hostname: flask
    networks:
      - red1
    restart: on-failure
  cassandra:
    container_name: cassandra
    image: cassandra:3
    ports:
      - "9042:9042"
    networks:
      - red1
    volumes:
      - ./cassandra/init.cql:/scripts/init.cql
  cassandra_init:
    container_name: cassandra_init
    image: nuvo/docker-cqlsh
    depends_on:
      - cassandra
    networks:
      - red1
    volumes:
      - ./cassandra/init.cql:/scripts/init.cql
    restart: on-failure
networks:
  red1:
    driver: bridge
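I bring the stack up and check the spark-submit output with the usual Compose commands, something like:
docker-compose up -d
# the failure appears in the output of this container
docker-compose logs -f spark-submit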
And this is my build.sbt file:
name := "flight_prediction"
version := "0.1"
scalaVersion := "2.12.10"
val sparkVersion = "3.3.4"
mainClass in Compile := Some("es.upm.dit.ging.predictor.MakePrediction")
resolvers ++= Seq(
  "apache-snapshots" at "https://repository.apache.org/snapshots/"
)
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % sparkVersion,
  "org.apache.spark" %% "spark-sql" % sparkVersion,
  "org.apache.spark" %% "spark-mllib" % sparkVersion,
  "org.apache.spark" %% "spark-streaming" % sparkVersion,
  "org.apache.spark" %% "spark-hive" % sparkVersion,
  "org.apache.spark" %% "spark-sql-kafka-0-10" % "3.3.4",
  "org.mongodb.spark" %% "mongo-spark-connector" % "10.1.1",
  "com.datastax.spark" %% "spark-cassandra-connector" % "3.2.0",
  "com.datastax.cassandra" % "cassandra-driver-core" % "4.0.0"
)
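For reference, the jar that the compose file mounts into the Spark containers is the one produced by a plain package of this project (a sketch, assuming the default sbt layout):
cd flight_prediction
sbt package
# writes target/scala-2.12/flight_prediction_2.12-0.1.jar,
# which docker-compose mounts at /scala-2.12 inside spark-submit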
I've tried several Spark versions because I've read this could be a version mismatch, but I couldn't track down the cause because the error message is not very informative.

I solved this: apparently the problem was the spaces between the commas and the package coordinates in the --packages list of the command field.
It was like this:
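--packages com.datastax.spark:spark-cassandra-connector_2.12:3.2.0, org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.4, com.datastax.cassandra:cassandra-driver-core:4.0.0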
It should be like this:
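--packages com.datastax.spark:spark-cassandra-connector_2.12:3.2.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.4,com.datastax.cassandra:cassandra-driver-core:4.0.0
spark-submit expects --packages to be a single comma-separated argument, so a space after a comma ends that argument and the remaining coordinates are no longer read as part of --packages.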