Hello,
While trying to run the TitanicTrainingData notebook with the Spark engine, I'm getting a ClassNotFoundException for HiveException when inserting into a feature group.
import hsfs
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Spark configuration for reaching the Hopsworks cluster from an external client:
# TLS material, HopsFS, and the Hive metastore.
conf_spark = SparkConf()
conf_spark.set("spark.driver.host", "127.0.0.1")
conf_spark.set("spark.hadoop.fs.hopsfs.impl", "io.hops.hopsfs.client.HopsFileSystem")
conf_spark.set("spark.hadoop.hops.ipc.server.ssl.enabled", "true")
conf_spark.set("spark.hadoop.hops.ssl.hostname.verifier", "ALLOW_ALL")
conf_spark.set("spark.hadoop.hops.rpc.socket.factory.class.default",
               "io.hops.hadoop.shaded.org.apache.hadoop.net.HopsSSLSocketFactory")
conf_spark.set("spark.hadoop.client.rpc.ssl.enabled.protocol", "TLSv1.2")
conf_spark.set("spark.hadoop.hops.ssl.keystores.passwd.name", "<path-to-pw>/material_passwd")
conf_spark.set("spark.hadoop.hops.ssl.keystore.name", "<path-to-store>/keyStore.jks")
conf_spark.set("spark.hadoop.hops.ssl.trustore.name", "<path-to-store>/trustStore.jks")
# Directory with the Hopsworks client / Hive jars downloaded from the cluster.
conf_spark.set("spark.sql.hive.metastore.jars", "<path-to-jars>/hopswork_jars")
conf_spark.set("spark.hadoop.hive.metastore.uris", "thrift://IP:9085")

# sc = SparkContext.getOrCreate(conf_spark)
spark_session = SparkSession \
    .builder \
    .enableHiveSupport() \
    .config(conf=conf_spark) \
    .getOrCreate()

# Connect to the Hopsworks feature store with the Spark engine.
# IP, PROJECT_NAME, API_KEY and training_csv_path are placeholders.
connection = hsfs.connection(host=IP,
                             port=443,
                             engine='spark',
                             project=PROJECT_NAME,
                             api_key_value=API_KEY)
fs = connection.get_feature_store()

# Load the Titanic training data and keep only the cleaned feature columns.
raw_df = spark_session.read.csv(training_csv_path, header=True)
clean_train_df = raw_df.select('survived', 'pclass', 'sex', 'fare', 'age', 'sibsp', 'parch') \
    .fillna({'age': 30})

titantic_fg = fs.create_feature_group(name="titanic_training_all_features",
                                      version=1,
                                      description="titanic training dataset with clean features",
                                      time_travel_format=None)
titantic_fg.insert(clean_train_df)
The above code raises the following exception while saving the dataframe to the feature group:
Py4JJavaError: An error occurred while calling o827.saveAsTable.
: java.lang.NoClassDefFoundError: org/apache/hadoop/hive/ql/metadata/HiveException
    at java.base/java.lang.Class.getDeclaredConstructors0(Native Method)
    at java.base/java.lang.Class.privateGetDeclaredConstructors(Class.java:3137)
    at java.base/java.lang.Class.getConstructors(Class.java:1943)
    at org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedClientLoader.scala:291)
    at org.apache.spark.sql.hive.HiveUtils$.newClientForMetadata(HiveUtils.scala:492)
    at org.apache.spark.sql.hive.HiveUtils$.newClientForMetadata(HiveUtils.scala:352)
    at org.apache.spark.sql.hive.HiveExternalCatalog.client$lzycompute(HiveExternalCatalog.scala:71)
    at org.apache.spark.sql.hive.HiveExternalCatalog.client(HiveExternalCatalog.scala:70)
    at org.apache.spark.sql.hive.HiveExternalCatalog.$anonfun$databaseExists$1(HiveExternalCatalog.scala:224)
    at scala.runtime.java8.JFunction0$mcZ$sp.apply(JFunction0$mcZ$sp.java:23)
    at org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:102)
    at org.apache.spark.sql.hive.HiveExternalCatalog.databaseExists(HiveExternalCatalog.scala:224)
    at org.apache.spark.sql.internal.SharedState.externalCatalog$lzycompute(SharedState.scala:134)
    at org.apache.spark.sql.internal.SharedState.externalCatalog(SharedState.scala:124)
    at org.apache.spark.sql.hive.HiveSessionStateBuilder.externalCatalog(HiveSessionStateBuilder.scala:44)
    at org.apache.spark.sql.hive.HiveSessionStateBuilder.$anonfun$catalog$1(HiveSessionStateBuilder.scala:59)
    at org.apache.spark.sql.catalyst.catalog.SessionCatalog.externalCatalog$lzycompute(SessionCatalog.scala:98)
    at org.apache.spark.sql.catalyst.catalog.SessionCatalog.externalCatalog(SessionCatalog.scala:98)
    at org.apache.spark.sql.catalyst.catalog.SessionCatalog.tableExists(SessionCatalog.scala:462)
    at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:692)
    at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:626)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.base/java.lang.reflect.Method.invoke(Method.java:566)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:238)
    at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.hive.ql.metadata.HiveException
    at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:471)
    at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:588)
    at org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1.doLoadClass(IsolatedClientLoader.scala:247)
    at org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1.loadClass(IsolatedClientLoader.scala:236)
    at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:521)
    ... 32 more
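As a sanity check (my own debugging, not part of the notebook), the missing class can be probed directly through py4j; I'd expect this to raise the same ClassNotFoundException whenever the Hive jars are not visible to the driver JVM:

# Ask the driver JVM to load the class named in the stack trace. If the
# hive-exec jar is missing from the classpath, this fails the same way.
spark_session._jvm.java.lang.Class.forName(
    "org.apache.hadoop.hive.ql.metadata.HiveException"
)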
Reading from the feature group also fails afterwards:
titanic_df = titantic_fg.read()
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/var/folders/dd/2rm5s2w54zs70s3d5mwsn1kh0000gq/T/ipykernel_57495/2691049954.py in <module>
----> 1 a = titantic_fg.read()
~/opt/anaconda3/envs/py38misc/lib/python3.8/site-packages/hsfs/feature_group.py in read(self, wallclock_time, online, dataframe_type, read_options)
571 `RestAPIError`. No data is available for feature group with this commit date, If time travel enabled.
572 """
--> 573 engine.get_instance().set_job_group(
574 "Fetching Feature group",
575 "Getting feature group: {} from the featurestore {}".format(
~/opt/anaconda3/envs/py38misc/lib/python3.8/site-packages/hsfs/engine/spark.py in set_job_group(self, group_id, description)
76
77 def set_job_group(self, group_id, description):
---> 78 self._spark_session.sparkContext.setJobGroup(group_id, description)
79
80 def register_on_demand_temporary_table(self, on_demand_fg, alias):
~/opt/anaconda3/envs/py38misc/lib/python3.8/site-packages/pyspark/context.py in setJobGroup(self, groupId, description, interruptOnCancel)
1140 Cancelled
1141 """
-> 1142 self._jsc.setJobGroup(groupId, description, interruptOnCancel)
1143
1144 def setLocalProperty(self, key, value):
AttributeError: 'NoneType' object has no attribute 'setJobGroup'
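The AttributeError looks like a follow-on failure rather than a separate bug: the traceback shows that self._jsc on the cached SparkContext is None by the time read() is called, i.e. the Python context has lost its JVM handle after the failed insert. A quick way to confirm, reusing the names from the snippet above:

# self._jsc being None is exactly what the traceback reports; checking it
# directly on the session that hsfs cached at connection time.
sc = spark_session.sparkContext
print(sc._jsc)  # None, matching the AttributeError above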
Versions:
- Python - 3.8
- pyspark - 3.1.2
- hsfs[hive] - 2.3.2
- hopsworks server install - 2.3.0-SNAPSHOT
- hive2 - 3.0.0.8-SNAPSHOT
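One thing I am not sure about is the metastore-jars setting: as far as I understand, a value of spark.sql.hive.metastore.jars that is neither "builtin" nor "maven" is treated as a classpath, and a bare directory on a classpath exposes only .class files, not the jars inside it. So I would also try the glob variant below (the trailing /* and the assumption that hive-exec-*.jar sits in that directory are mine):

# Assumption: <path-to-jars>/hopswork_jars contains hive-exec-*.jar, which is
# where org.apache.hadoop.hive.ql.metadata.HiveException lives. The trailing /*
# lets the classpath entry pick up the jars rather than the bare directory.
conf_spark.set("spark.sql.hive.metastore.jars", "<path-to-jars>/hopswork_jars/*")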