// SELECT Store, COUNT(1) FROM df GROUP BY Store
df.groupBy("Store").count.show(5)
// number of sales records per store
df.groupBy("Store").agg(count("Store").alias("NumOfRecords")).show(5)
// distinct number of departments per store
df.groupBy("Store").agg(countDistinct("Dept").alias("NumOfDepts")).show(5)
// average weekly sales per store
df.groupBy("Store").agg(avg("Weekly_Sales").alias("AvgWeeklySales")).show(5)
// months with highest sales
df.groupBy("Date").agg(sum("Weekly_Sales").alias("Weekly_Sales")).sort(desc("Weekly_Sales")).show(10)
// months with lowest sales
df.groupBy("Date").agg(sum("Weekly_Sales").alias("Weekly_Sales")).sort(asc("Weekly_Sales")).show(10)
// chain aggregates all at once - store wise minimum and maximum weekly sales (rounded to 2 decimal places)
val groupByColName = "Store"
val colName = "Weekly_Sales"
df.groupBy(groupByColName)
.agg(min(colName),
max(colName),
round(avg(colName), 2))
.show(5)
// join sales with stores dataframe
// inner join on "Store" column
df.join(dfStores, Seq("Store"), "inner").show(5)
// join sales with stores dataframe
// inner join on "Store" column
// count number of stores of each Type
// also get the average sales by each store Type
// also sort them by Type
df.join(dfStores, Seq("Store"), "inner").groupBy("Type").agg(countDistinct("Store").alias("NumberOfStores"), round(avg("Weekly_Sales"), 2).alias("AvgSalesByType")).orderBy(asc("Type")).show()
No comments:
Post a Comment