
I am trying to create an external table in the Hive metastore using the Apache Hudi framework. It is able to connect to the Hive metastore, but after the connection it throws an exception when trying to create the table.


import org.apache.hudi.DataSourceWriteOptions
import org.apache.hudi.config.HoodieWriteConfig

dataFrame.writeStream
  .format("org.apache.hudi")
  .option(HoodieWriteConfig.TABLE_NAME, tableName)
  .option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY, tableName)
  .option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
  .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY, "true")
  .option(DataSourceWriteOptions.HIVE_AUTO_CREATE_DATABASE_OPT_KEY, "true")
  .option(DataSourceWriteOptions.HIVE_ASSUME_DATE_PARTITION_OPT_KEY, "false")
  .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "partition_id")
  .option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY, "partition_id")
  .option(DataSourceWriteOptions.HIVE_URL_OPT_KEY, "jdbc:hive2://localhost:10000")
  .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, key)
  .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, combineKey)
  .option("checkpointLocation", "/tmp/test/checkpoint")
  .option("spark.kryo.registrationRequired", "true")
  .option("hoodie.upsert.shuffle.parallelism", "1")
  .outputMode("append")
  .start("s3a://testbucket/test")

Dependencies:

scalaVersion := "2.12.1"

libraryDependencies += "org.apache.spark" %% "spark-core" % "3.1.1"
libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.1.1"
libraryDependencies += "org.apache.spark" %% "spark-sql-kafka-0-10" % "3.1.1"
libraryDependencies += "org.apache.spark" %% "spark-streaming" % "3.1.1" % "provided"
libraryDependencies += "org.apache.spark" %% "spark-streaming-kafka-0-10" % "3.1.1"
libraryDependencies += "org.apache.hudi" %% "hudi-spark-bundle" % "0.7.0"
libraryDependencies += "org.apache.hadoop" % "hadoop-common" % "3.1.4"
libraryDependencies += "org.apache.hadoop" % "hadoop-hdfs" % "3.1.1"
libraryDependencies += "org.apache.hadoop" % "hadoop-client" % "3.1.1"
libraryDependencies += "org.apache.hadoop" % "hadoop-aws" % "3.1.1"
libraryDependencies += "org.apache.spark" %% "spark-hive" % "3.1.1"
libraryDependencies += "org.apache.hive" % "hive-jdbc" % "3.1.1"
libraryDependencies += "org.apache.hive" % "hive-metastore" % "3.1.1"
libraryDependencies += "org.apache.hive" % "hive-exec" % "3.1.1"
dependencyOverrides += "org.apache.hadoop" % "hadoop-common" % "3.1.1"
dependencyOverrides += "org.apache.commons" % "commons-lang3" % "3.9"

I got the following exception:

org.apache.hudi.hive.HoodieHiveSyncException: Failed in executing SQL CREATE EXTERNAL TABLE  IF NOT EXISTS `default`.......' org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' STORED AS INPUTFORMAT 'org.apache.hudi.hadoop.HoodieParquetInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' LOCATION 's3a://testbucket/test'

Caused by: org.apache.hive.service.cli.HiveSQLException: Error running query: java.lang.NoClassDefFoundError: org/apache/hadoop/fs/StreamCapabilities
    at org.apache.hive.jdbc.Utils.verifySuccess(Utils.java:300) ~[hive-jdbc-3.1.1.jar:3.1.1]
    at org.apache.hive.jdbc.Utils.verifySuccessWithInfo(Utils.java:286) ~[hive-jdbc-3.1.1.jar:3.1.1]
    at org.apache.hive.jdbc.HiveStatement.runAsyncOnServer(HiveStatement.java:324) ~[hive-jdbc-3.1.1.jar:3.1.1]
    at org.apache.hive.jdbc.HiveStatement.execute(HiveStatement.java:265) ~[hive-jdbc-3.1.1.jar:3.1.1]
    at org.apache.hudi.hive.HoodieHiveClient.updateHiveSQL(HoodieHiveClient.java:367) ~[hudi-spark-bundle_2.12-0.7.0.jar:0.7.0]
    ... 37 more

1 Answer


It looks like a jar version mismatch: a NoClassDefFoundError for org.apache.hadoop.fs.StreamCapabilities usually means an older Hadoop jar, one that predates that class, is on the classpath of whichever process runs the Hive sync (here the error comes back through the Hive JDBC connection, so it may be the HiveServer2 side). You could also open a Hudi GitHub issue to get a timely response from the community.
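As a sketch of what aligning the jars could look like on the build side (the mismatch may equally be on the HiveServer2 classpath, which this cannot fix), you can pin every Hadoop artifact to a single version in build.sbt. The 3.1.1 version below is only an assumption; it has to match whatever Hadoop your Hive server and cluster actually ship.

// Hypothetical build.sbt fragment: force all Hadoop artifacts to one version so
// classes such as org.apache.hadoop.fs.StreamCapabilities resolve consistently.
// hadoopVersion is an assumed value; set it to the Hadoop version of your cluster.
val hadoopVersion = "3.1.1"

dependencyOverrides ++= Seq(
  "org.apache.hadoop" % "hadoop-common" % hadoopVersion,
  "org.apache.hadoop" % "hadoop-hdfs"   % hadoopVersion,
  "org.apache.hadoop" % "hadoop-client" % hadoopVersion,
  "org.apache.hadoop" % "hadoop-aws"    % hadoopVersion
)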
