I built the jupyter/pyspark-notebook Docker image. I installed geomesa_pyspark and tried to run the following example commands from the official guide.
import geomesa_pyspark
conf = geomesa_pyspark.configure(
    jars=['/usr/local/spark/jars/geomesa-accumulo-spark-runtime_2.11-2.0.0.jar'],
    packages=['geomesa_pyspark', 'pytz'],
    spark_home='/usr/local/spark/').\
    setAppName('MyTestApp')

conf.get('spark.master')

from pyspark.sql import SparkSession

spark = (SparkSession
    .builder
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)
The same problem occurs every time I run this in the notebook:
Exception Traceback (most recent call last)
<ipython-input-4-eca73e557583> in <module>
22 from pyspark.sql import SparkSession
23
---> 24 spark = ( SparkSession
25 .builder
26 .config(conf=conf)
/usr/local/spark/python/pyspark/sql/session.py in getOrCreate(self)
226 sparkConf.set(key, value)
227 # This SparkContext may be an existing one.
--> 228 sc = SparkContext.getOrCreate(sparkConf)
229 # Do not update `SparkConf` for existing `SparkContext`, as it's shared
230 # by all sessions.
/usr/local/spark/python/pyspark/context.py in getOrCreate(cls, conf)
382 with SparkContext._lock:
383 if SparkContext._active_spark_context is None:
--> 384 SparkContext(conf=conf or SparkConf())
385 return SparkContext._active_spark_context
386
/usr/local/spark/python/pyspark/context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
142 " is not allowed as it is a security risk.")
143
--> 144 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
145 try:
146 self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
/usr/local/spark/python/pyspark/context.py in _ensure_initialized(cls, instance, gateway, conf)
329 with SparkContext._lock:
330 if not SparkContext._gateway:
--> 331 SparkContext._gateway = gateway or launch_gateway(conf)
332 SparkContext._jvm = SparkContext._gateway.jvm
333
/usr/local/spark/python/pyspark/java_gateway.py in launch_gateway(conf, popen_kwargs)
106
107 if not os.path.isfile(conn_info_file):
--> 108 raise Exception("Java gateway process exited before sending its port number")
109
110 with open(conn_info_file, "rb") as info:
Exception: Java gateway process exited before sending its port number
In the container logs on Portainer I instead see this exception:
Exception in thread "main" org.apache.spark.SparkException: When running with master 'yarn' either HADOOP_CONF_DIR or YARN_CONF_DIR must be set in the environment.
at org.apache.spark.deploy.SparkSubmitArguments.error(SparkSubmitArguments.scala:631)
at org.apache.spark.deploy.SparkSubmitArguments.validateSubmitArguments(SparkSubmitArguments.scala:271)
at org.apache.spark.deploy.SparkSubmitArguments.validateArguments(SparkSubmitArguments.scala:234)
at org.apache.spark.deploy.SparkSubmitArguments.<init>(SparkSubmitArguments.scala:119)
at org.apache.spark.deploy.SparkSubmit$$anon$2$$anon$3.<init>(SparkSubmit.scala:1013)
at org.apache.spark.deploy.SparkSubmit$$anon$2.parseArguments(SparkSubmit.scala:1013)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:85)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1030)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1039)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
I think JAVA_HOME is set correctly (JAVA_HOME="/usr/lib/jvm/java-1.11.0-openjdk-amd64"). From the logs I guess that HADOOP_HOME should also be set, but although the Docker image specifications list an installed Hadoop version, I can't find a Hadoop installation anywhere inside the running container. One strange thing is that if I run the same commands from the pyspark shell instead of from the notebook, they complete without errors.
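For completeness, this is a quick check (just a sketch, not from the guide) that can be run in a notebook cell to see which of these variables the kernel actually sees, and what master the SparkConf resolves to:

import os

# Environment variables mentioned in the errors above, as seen by the
# notebook kernel (None means the variable is not set in this process).
for var in ('JAVA_HOME', 'SPARK_HOME', 'HADOOP_CONF_DIR', 'YARN_CONF_DIR', 'HADOOP_HOME'):
    print(var, '=', os.environ.get(var))

# 'conf' is the SparkConf built with geomesa_pyspark.configure(...) above;
# the Portainer log suggests the master ends up as 'yarn'.
print('spark.master =', conf.get('spark.master'))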
You don't say which versions of Spark and GeoMesa you installed, but at a minimum you will need to make sure that the jars path in your code points to the GeoMesa Spark runtime JAR that matches the versions you actually installed, as shown in the sketch below.
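In addition, your Portainer log shows spark-submit being launched with master 'yarn', which requires HADOOP_CONF_DIR or YARN_CONF_DIR to be set. If you do not actually have a YARN cluster and just want Spark to run inside the notebook container, a sketch like the following (setting the master explicitly; adjust the JAR filename to your GeoMesa/Scala versions) may help:

import geomesa_pyspark
from pyspark.sql import SparkSession

# The JAR path below is the one from your question; it must match the
# GeoMesa and Scala versions installed in the image.
conf = geomesa_pyspark.configure(
    jars=['/usr/local/spark/jars/geomesa-accumulo-spark-runtime_2.11-2.0.0.jar'],
    packages=['geomesa_pyspark', 'pytz'],
    spark_home='/usr/local/spark/').\
    setAppName('MyTestApp')

# Force a local master so Spark does not try to submit to YARN, which is
# what triggers the HADOOP_CONF_DIR / YARN_CONF_DIR error in the container.
conf = conf.setMaster('local[*]')

spark = (SparkSession
    .builder
    .config(conf=conf)
    .getOrCreate()
)

If you do intend to run against YARN, the alternative is to install/mount the Hadoop client configuration in the notebook container and export HADOOP_CONF_DIR (or YARN_CONF_DIR) in the environment of the Jupyter kernel, not just in the shell.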