from pyspark.sql.column import Column, _to_java_column
from pyspark.sql.types import _parse_datatype_json_string

def ext_from_xml(xml_column, schema, options={}):
    """Parse an XML string column with spark-xml's from_xml via the JVM gateway."""
    java_column = _to_java_column(xml_column.cast('string'))
    java_schema = spark._jsparkSession.parseDataType(schema.json())
    scala_map = spark._jvm.org.apache.spark.api.python.PythonUtils.toScalaMap(options)
    jc = spark._jvm.com.databricks.spark.xml.functions.from_xml(
        java_column, java_schema, scala_map)
    return Column(jc)

def ext_schema_of_xml_df(df, options={}):
    """Infer the schema of a single-column DataFrame of XML strings."""
    assert len(df.columns) == 1
    scala_options = spark._jvm.org.apache.spark.api.python.PythonUtils.toScalaMap(options)
    # Reach into the Scala companion object of the spark-xml package.
    java_xml_module = getattr(getattr(
        spark._jvm.com.databricks.spark.xml, "package$"), "MODULE$")
    java_schema = java_xml_module.schema_of_xml_df(df._jdf, scala_options)
    return _parse_datatype_json_string(java_schema.json())

df = spark.read.format('delta').table("tablename")
payloadSchema = ext_schema_of_xml_df(df.select("Columnname"))
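For context, once the schema inference succeeds, I intend to apply it roughly like this (a sketch; the "parsed" output column name and the import are my own additions):

from pyspark.sql.functions import col

# Hypothetical next step: parse each XML payload with the inferred schema.
parsed = df.withColumn("parsed", ext_from_xml(col("Columnname"), payloadSchema))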
Running this, I get the following error:
java.lang.NoClassDefFoundError: Could not initialize class com.databricks.spark.xml.util.PermissiveMode$
Note: the Maven coordinate used is com.databricks:spark-xml_2.12:0.14.0.
I am looking for a version of com.databricks:spark-xml that is compatible with Databricks Runtime 10.4 LTS (which includes Apache Spark 3.2.1 and Scala 2.12).
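For anyone trying to reproduce this outside Databricks, the same coordinate can be pulled onto the classpath with the --packages flag (assuming Maven Central resolution):

pyspark --packages com.databricks:spark-xml_2.12:0.14.0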