I'm using the approach given here to flatten a DataFrame in Spark SQL. Here is my code:
package com.acme.etl.xml import org.apache.spark.sql.types._ import org.apache.spark.sql.{Column, SparkSession} object RuntimeError { def main(args: Array[String]): Unit = { val spark = SparkSession.builder().appName("FlattenSchema").getOrCreate() val rowTag = "idocData" val dataFrameReader = spark.read .option("rowTag", rowTag) val xmlUri = "bad_011_1.xml" val df = dataFrameReader .format("xml") .load(xmlUri) val schema: StructType = df.schema val columns: Array[Column] = flattenSchema(schema) val df2 = df.select(columns: _*) } def flattenSchema(schema: StructType, prefix: String = null) : Array[Column] = { schema.fields.flatMap(f => { val colName: String = if (prefix == null) f.name else prefix + "." + f.name val dataType = f.dataType dataType match { case st: StructType => flattenSchema(st, colName) case _: StringType => Array(new org.apache.spark.sql.Column(colName)) case _: LongType => Array(new org.apache.spark.sql.Column(colName)) case _: DoubleType => Array(new org.apache.spark.sql.Column(colName)) case arrayType: ArrayType => arrayType.elementType match { case structType: StructType => flattenSchema(structType, colName) } case _ => Array(new org.apache.spark.sql.Column(colName)) } }) } } Much of the time, this works fine. But for the XML given below:
<Receive xmlns="http://Microsoft.LobServices.Sap/2007/03/Idoc/3/ORDERS05/ZORDERS5/702/Receive"> <idocData> <E2EDP01008GRP xmlns="http://Microsoft.LobServices.Sap/2007/03/Types/Idoc/3/ORDERS05/ZORDERS5/702"> <E2EDPT1001GRP> <E2EDPT2001> <DATAHEADERCOLUMN_DOCNUM>0000000141036013</DATAHEADERCOLUMN_DOCNUM> </E2EDPT2001> <E2EDPT2001> <DATAHEADERCOLUMN_DOCNUM>0000000141036013</DATAHEADERCOLUMN_DOCNUM> </E2EDPT2001> </E2EDPT1001GRP> </E2EDP01008GRP> <E2EDP01008GRP xmlns="http://Microsoft.LobServices.Sap/2007/03/Types/Idoc/3/ORDERS05/ZORDERS5/702"> </E2EDP01008GRP> </idocData> </Receive> this exception occurs:
Exception in thread "main" org.apache.spark.sql.AnalysisException: cannot resolve '`E2EDP01008GRP`.`E2EDPT1001GRP`.`E2EDPT2001`['DATAHEADERCOLUMN_DOCNUM']' due to data type mismatch: argument 2 requires integral type, however, ''DATAHEADERCOLUMN_DOCNUM'' is of string type.;; 'Project [E2EDP01008GRP#0.E2EDPT1001GRP.E2EDPT2001[DATAHEADERCOLUMN_DOCNUM] AS DATAHEADERCOLUMN_DOCNUM#3, E2EDP01008GRP#0._VALUE AS _VALUE#4, E2EDP01008GRP#0._xmlns AS _xmlns#5] +- Relation[E2EDP01008GRP#0] XmlRelation(<function0>,Some(/Users/paulreiners/s3/cdi-events-partition-staging/content_acme_purchase_order_json_v1/bad_011_1.xml),Map(rowtag -> idocData, path -> /Users/paulreiners/s3/cdi-events-partition-staging/content_acme_purchase_order_json_v1/bad_011_1.xml),null) What is causing this?