The input text file is shown below.
The SBT library dependencies are shown below for reference.
scalaVersion := "2.11.12"
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.3.0"
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.3.0"
The Scala program is provided below.
// The converted ORC file is shown below.
import org.apache.spark.sql.{SaveMode, SparkSession}

/** Converts a delimiter-separated text file to the ORC columnar format using Spark SQL. */
object DelimitedToORCConverter {

  /** Entry point: converts a hard-coded pipe-delimited sample file to ORC. */
  def main(args: Array[String]): Unit = {
    val inputDelimitedFile = "C:\\data\\delimited.txt"
    val outputORCFile = "C:\\data\\out_data_delimited"
    val delimiter = "|" // pipe character
    ConvertFile(inputDelimitedFile, outputORCFile, delimiter)
  }

  /**
   * Reads a delimited text file (first row treated as the header) and writes it
   * out as ORC, overwriting any previous output.
   *
   * @param inputDelimitedFile path of the delimited input file
   * @param outputORCFile      output directory for the ORC files (overwritten if present)
   * @param delimiter          field separator used in the input file (pipe in this example)
   */
  def ConvertFile(inputDelimitedFile: String, outputORCFile: String, delimiter: String): Unit = {
    // "native" selects the Spark 2.3+ vectorized ORC implementation instead of the
    // older Hive-based one.
    val spark = SparkSession.builder()
      .master("local")
      .appName("DelimitedToORCConverter")
      .config("spark.sql.orc.impl", "native")
      .getOrCreate()

    try {
      val df = spark
        .read
        .format("csv")
        .option("delimiter", delimiter) // pipe-delimited in this example (not TAB)
        .option("header", "true")       // first line supplies column names
        .load(inputDelimitedFile)

      df
        .write
        .mode(SaveMode.Overwrite)
        // NOTE(review): "header" appears to be a CSV-writer option; the ORC writer
        // likely ignores it. Kept for compatibility with the original behavior.
        .option("header", "true")
        .orc(outputORCFile)
    } finally {
      // Release the local Spark context even if the conversion fails.
      spark.stop()
    }
  }
}
That's all!
No comments:
Post a Comment