NATS JetStream connector for Spark (custom jar) example doesn't work

I am new to the Apache Spark environment and am trying to create a test setup. I am using a NATS JetStream server as the message broker for my IoT messages. I found this GitHub repo from NATS itself with a Scala connector for Spark: https://github.com/nats-io/nats-spark-connector/tree/main/load_balanced. According to the README, the connector can be used as follows:

val spark = SparkSession
  .builder()
  .master("spark://localhost:7077")
  .appName("NatsReaderTest")
  .config("spark.logConf", "false")
  .config("spark.jars",
  "/some_path_to_the_connector_jar/nats-spark-connector_2.12-0.1.jar,"
  + "/some_path_to_the_Nats_jar/jnats-2.13.2.jar"
  )
  .config("spark.executor.instances", "2")
  .config("spark.cores.max", "4")
  .config("spark.executor.memory", "2g")
  .getOrCreate()
val initDF = spark
    .readStream
    .format("nats")
    .option("nats.host", host)
    .option("nats.port", port)
    .option("nats.stream.name", "TestStream")
    .option("nats.stream.subjects", "subject1, subject2")
    // wait 90 seconds for an ack before resending a message
    .option("nats.msg.ack.wait.secs", 90)
    .option("nats.num.listeners", 2)
    // Each listener will fetch 10 messages at a time
    .option("nats.msg.fetch.batch.size", 10)
    .load()

However, I would like to use PySpark, since my colleagues don't know Java or Scala. Is something similar possible from PySpark? Can anyone help me with a simple test example?

(Or does anyone have a good idea of how to use PySpark with NATS in general?)
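From what I understand, a data source written in Scala should be usable from PySpark without any Python bindings, because .format("nats") is resolved on the JVM side and the Python API only forwards the format name and options. So the Python-specific part should just be getting both jars onto the classpath. Once a session exists, I would expect a sanity check like this to work from the notebook (the class name below is my guess from the repo's source tree, so treat it as a placeholder):

# Ask the driver JVM whether the connector class is visible on the classpath.
# "natsconnector.NatsStreamProvider" is a placeholder -- the actual provider
# class is listed in the jar's META-INF/services registration.
spark._jvm.java.lang.Class.forName("natsconnector.NatsStreamProvider")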

I am using a Spark/Jupyter container in Docker as my environment.
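Because the session is created from a notebook, the jars have to be supplied before the JVM starts. One way I believe this can be done is via PYSPARK_SUBMIT_ARGS (the paths are placeholders for wherever the jars live inside the container):

import os

# Must run before the first SparkSession/SparkContext is created in this
# kernel; the trailing "pyspark-shell" token is required by PySpark.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--jars /home/jovyan/jars/nats-spark-connector-balanced-assembly-1.2.2.jar,"
    "/home/jovyan/jars/jnats-2.13.2.jar "
    "pyspark-shell"
)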

I have tried a more or less direct translation of the Scala example, but I get stuck on errors:

spark = SparkSession \
    .builder \
    .master("spark://localhost:4040") \
    .appName("Nats Jetstream Concept") \
    .config("spark.logConf", "true") \
    .config("spark.jars", 
            "./Nats_Spark_Connector_Scala/nats-spark-connector/load_balanced/target/scala-2.12/nats-spark-connector-balanced-assembly-1.2.2.jar," 
            +"./Nats_Spark_Connector_Scala/jnats-2.13.2.jar") \
    .config("spark.executor.instances", "1") \
    .config("spark.executor.memory", "1g") \
    .config("spark.cores.max", "1") \
    .getOrCreate()

initDF = spark.readStream.format("nats") \
    .option("nats.host", "localhost") \
    .option("nats.port", "4222") \
    .option("nats.stream.name", "CalculationStream") \
    .option("nats.stream.subjects", "calc.calcId123") \
    .option("nats.msg.ack.wait.secs", 90) \
    .option("nats.num.listeners", 1) \
    .option("nats.msg.fetch.batch.size", 1) \
    .load()
initDF.writeStream \
    .format("console") \
    .start() \
    .awaitTermination()

Py4JJavaError                             Traceback (most recent call last)
Cell In[39], line 10
      1 # initDF.writeStream \
      2 #     .format("nats") \
      3 #     .option("checkpointLocation", "Users/spark_checkpoint") \
      4 #     .option("nats.host", "0.0.0.0") \
      5 #     .option("nats.port", "4222") \
      6 #     .start
      8 initDF.writeStream \
      9     .format("console") \
---> 10     .start() \
     11     .awaitTermination()

File /opt/conda/envs/vintecc/lib/python3.11/site-packages/pyspark/sql/streaming/readwriter.py:1527, in DataStreamWriter.start(self, path, format, outputMode, partitionBy, queryName, **options)
   1525     self.queryName(queryName)
   1526 if path is None:
-> 1527     return self._sq(self._jwrite.start())
   1528 else:
   1529     return self._sq(self._jwrite.start(path))

File /opt/conda/envs/vintecc/lib/python3.11/site-packages/py4j/java_gateway.py:1322, in JavaMember.__call__(self, *args)
   1316 command = proto.CALL_COMMAND_NAME +\
   1317     self.command_header +\
   1318     args_command +\
   1319     proto.END_COMMAND_PART
   1321 answer = self.gateway_client.send_command(command)
-> 1322 return_value = get_return_value(
   1323     answer, self.gateway_client, self.target_id, self.name)
   1325 for temp_arg in temp_args:
   1326     if hasattr(temp_arg, "_detach"):

File /opt/conda/envs/vintecc/lib/python3.11/site-packages/pyspark/errors/exceptions/captured.py:179, in capture_sql_exception.<locals>.deco(*a, **kw)
    177 def deco(*a: Any, **kw: Any) -> Any:
    178     try:
--> 179         return f(*a, **kw)
    180     except Py4JJavaError as e:
    181         converted = convert_exception(e.java_exception)

File /opt/conda/envs/vintecc/lib/python3.11/site-packages/py4j/protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
    324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
    325 if answer[1] == REFERENCE_TYPE:
--> 326     raise Py4JJavaError(
    327         "An error occurred while calling {0}{1}{2}.\n".
    328         format(target_id, ".", name), value)
    329 else:
    330     raise Py4JError(
    331         "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
    332         format(target_id, ".", name, value))

Py4JJavaError: An error occurred while calling o332.start.
: java.lang.IllegalStateException: RpcEnv has been stopped
    at org.apache.spark.rpc.netty.Dispatcher.registerRpcEndpoint(Dispatcher.scala:60)
    at org.apache.spark.rpc.netty.NettyRpcEnv.setupEndpoint(NettyRpcEnv.scala:136)
    at org.apache.spark.sql.execution.streaming.state.StateStoreCoordinatorRef$.forDriver(StateStoreCoordinator.scala:72)
    at org.apache.spark.sql.streaming.StreamingQueryManager.<init>(StreamingQueryManager.scala:52)
    at org.apache.spark.sql.internal.BaseSessionStateBuilder.streamingQueryManager(BaseSessionStateBuilder.scala:339)
    at org.apache.spark.sql.internal.BaseSessionStateBuilder.$anonfun$build$4(BaseSessionStateBuilder.scala:377)
    at org.apache.spark.sql.internal.SessionState.streamingQueryManager$lzycompute(SessionState.scala:100)
    at org.apache.spark.sql.internal.SessionState.streamingQueryManager(SessionState.scala:100)
    at org.apache.spark.sql.streaming.DataStreamWriter.startQuery(DataStreamWriter.scala:422)
    at org.apache.spark.sql.streaming.DataStreamWriter.startInternal(DataStreamWriter.scala:410)
    at org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:251)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
    at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.base/java.lang.reflect.Method.invoke(Method.java:568)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
    at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
    at java.base/java.lang.Thread.run(Thread.java:833)