Saturday, July 25, 2020

Azure Databricks - Spark DataFrame GROUPING & AGGREGATION cheat sheet for SQL developers
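
All snippets below use the Scala API and assume a SparkSession named spark (available by default in Databricks notebooks) plus two pre-loaded DataFrames: df, a weekly sales table with Store, Dept, Date and Weekly_Sales columns, and dfStores, a store dimension with Store and Type columns. A minimal setup sketch is shown here; the CSV paths are placeholders, not real mount points.

import org.apache.spark.sql.functions._

// weekly sales facts: Store, Dept, Date, Weekly_Sales, ...
val df = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("/mnt/demo/sales.csv") // placeholder path

// store dimension: Store, Type, ...
val dfStores = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("/mnt/demo/stores.csv") // placeholder path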

// SELECT Store, COUNT(1) FROM df GROUP BY Store
df.groupBy("Store").count.show(5)



// number of sales records per store
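// SELECT Store, COUNT(Store) AS NumOfRecords FROM df GROUP BY Store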
df.groupBy("Store").agg(count("Store").alias("NumOfRecords")).show(5)



// distinct number of departments per store
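// SELECT Store, COUNT(DISTINCT Dept) AS NumOfDepts FROM df GROUP BY Store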
df.groupBy("Store").agg(countDistinct("Dept").alias("NumOfDepts")).show(5)



// average weekly sales per store
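// SELECT Store, AVG(Weekly_Sales) AS AvgWeeklySales FROM df GROUP BY Store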
df.groupBy("Store").agg(avg("Weekly_Sales").alias("AvgWeeklySales")).show(5)



// dates (weeks) with the highest total sales
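// SELECT Date, SUM(Weekly_Sales) AS Weekly_Sales FROM df GROUP BY Date ORDER BY Weekly_Sales DESC LIMIT 10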
df.groupBy("Date").agg(sum("Weekly_Sales").alias("Weekly_Sales")).sort(desc("Weekly_Sales")).show(10)



// dates (weeks) with the lowest total sales
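// SELECT Date, SUM(Weekly_Sales) AS Weekly_Sales FROM df GROUP BY Date ORDER BY Weekly_Sales ASC LIMIT 10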
df.groupBy("Date").agg(sum("Weekly_Sales").alias("Weekly_Sales")).sort(asc("Weekly_Sales")).show(10)



// chain multiple aggregates at once - store-wise minimum, maximum, and average weekly sales (average rounded to 2 decimal places)
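// SELECT Store, MIN(Weekly_Sales), MAX(Weekly_Sales), ROUND(AVG(Weekly_Sales), 2) FROM df GROUP BY Store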
val groupByColName = "Store"
val colName = "Weekly_Sales"

df.groupBy(groupByColName)
  .agg(min(colName),
       max(colName),
       round(avg(colName), 2))
  .show(5)



// join sales with stores DataFrame
// inner join on the "Store" column
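// SELECT * FROM df INNER JOIN dfStores USING (Store)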
df.join(dfStores, Seq("Store"), "inner").show(5)



// join sales with stores DataFrame (inner join on "Store" column),
// count the number of distinct stores of each Type,
// get the average weekly sales for each store Type,
// and sort the result by Type
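// SELECT Type, COUNT(DISTINCT Store) AS NumberOfStores,
//        ROUND(AVG(Weekly_Sales), 2) AS AvgSalesByType
// FROM df INNER JOIN dfStores USING (Store)
// GROUP BY Type ORDER BY Type ASC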
df.join(dfStores, Seq("Store"), "inner")
  .groupBy("Type")
  .agg(countDistinct("Store").alias("NumberOfStores"),
       round(avg("Weekly_Sales"), 2).alias("AvgSalesByType"))
  .orderBy(asc("Type"))
  .show()
