payments = [('15-02-2022',100),('16-02-2022',500),('17-02-2022',900),('18-02-2022',300),('19-02-
# Build the (Date, Payments) DataFrame and parse the 'dd-MM-yyyy' strings
# into a date column in one chained expression.
df_payments = (
    spark.createDataFrame(payments, ['Date', 'Payments'])
    .withColumn('Date', to_date(col('Date'), 'dd-MM-yyyy'))
)
# Order rows by date so lag() returns the previous row's payment.
# NOTE: a window without partitionBy moves all rows into a single partition.
windowSpec = Window.orderBy(col('Date'))
df_payments = (
    df_payments
    .withColumn('Lag_Payments', lag('Payments', 1).over(windowSpec))
    .withColumn(
        'variance_flag',
        # No explicit null branch: when the lagged value is NULL (first row)
        # neither comparison is true, so when() without otherwise() yields NULL.
        # The previous `== "null"` check compared against the *string* "null",
        # never matched, and its string result could turn the flag column
        # into a string instead of an integer.
        when(col('Lag_Payments') > col('Payments'), -1)
        .when(col('Lag_Payments') < col('Payments'), 1),
    )
    .drop('Lag_Payments')
)
| Date|Payments|variance_flag|
|2022-02-15| 100| null|
|2022-02-16| 500| 1|
|2022-02-17| 900| 1|
|2022-02-18| 300| -1|
|2022-02-19| 400| 1|
|2022-02-20| 120| -1|
|2022-02-21| 1000| 1|
-- Attach the previous row's payment (by date) to every row, then flag the
-- change: NULL when there is no previous row, 1 for an increase, -1 otherwise
-- (equal payments also fall into the -1 branch).
with payments_with_prev as (
    select
        date,
        Payments,
        lag(Payments, 1) over (order by date) as prev_payment
    from t1
)
select
    date,
    Payments,
    case
        when prev_payment is null then null
        when Payments > prev_payment then 1
        else -1
    end as variance_flag
from payments_with_prev;
Using the "lag" window function on the "payments" column, we can achieve this output.
-- Compare each payment with the previous row's payment in ascending date order.
-- (Ordering by date desc would make lag() return the *later* row's payment and
-- invert every flag.)
-- A row with no previous payment gets NULL instead of falling through to -1.
select
    p.*,
    case
        when lag(payment) over (order by date) is null then null
        when payment > lag(payment) over (order by date) then 1
        else -1
    end as variance_flag
from payment as p
Output :
# Sample match results as (Team1, Team2, Winner) tuples.
lst_data = [("India","Aus","India"),("Srilanka","Aus","Aus"),("Srilanka","India","India")]
schema = ["Team1","Team2","Winner"]
df = spark.createDataFrame(lst_data,schema)

# Number of appearances of each team in the first and in the second slot.
df1 = df.groupBy("team1").count().withColumnRenamed("team1","team")
df2 = df.groupBy("team2").count().withColumnRenamed("team2","team")

# NOTE(review): the right-hand side of df3 was truncated in the original text.
# Reconstructed as: stack both per-slot counts and add them up per team.
df3 = (
    df1.union(df2)
    .groupBy("team")
    .sum("count")
    .withColumnRenamed("sum(count)", "total_match")
)

# Wins per team.
df4 = df.groupBy("Winner").count().withColumnRenamed("count","total_win")

# NOTE(review): the join condition and the step before withColumn were truncated
# in the original text. Reconstructed as a left join so that a team with no wins
# keeps its row, with total_win filled as 0 so total_loss is not NULL.
df5 = (
    df3.join(df4, df3.team == df4.Winner, "left")
    .drop("Winner")
    .fillna(0, subset=["total_win"])
    .withColumn("total_loss", (col("total_match") - col("total_win")))
)
// Sample match results as (team1, team2, win) tuples.
val a = List(
  ("India", "Aus", "India"),
  ("Japan", "Aus", "Aus"),
  ("Japan", "India", "India")
)
// Turn the tuples into a three-column DataFrame.
val df = a.toDF("team1", "team2", "win")
|team1|team2| win|
|India| Aus|India|
|Japan| Aus| Aus|
// Stack the team1 and team2 columns into a single column; union keeps the
// first select's column name, so the result column is still called team1.
val df2 = (
  df.select("team1")
    .union(df.select("team2"))
)
// Rows per team = matches played by that team.
val df3 = (
  df2.groupBy("team1")
    .count()
    .withColumnRenamed("count", "Total_Matches")
)
|India| 2|
| Aus| 2|
|Japan| 2|
// Rows per value of the win column = matches won by that team.
val df4 = (
  df.groupBy("win")
    .count()
    .withColumnRenamed("count", "winner")
)
| win|winner|
|India| 2|
| Aus| 1|
|India| 2| 2| 0|
| Aus| 2| 1| 1|
|Japan| 2| 0| 2|
The query below should give the solution if you use Spark SQL:
-- teams_cte: one row per team per match it appears in (as team1 or team2),
-- with won = 1 when that team is the match winner and 0 otherwise.
-- NOTE(review): `table` is a reserved word in most dialects; substitute the
-- real name of the matches table.
with teams_cte as (
    select
        team1 as team,
        case when team1 = winner then 1 else 0 end as won
    from table
    union all
    select
        team2 as team,
        case when team2 = winner then 1 else 0 end as won
    from table
)
-- Per team: matches played, matches won, and the difference as losses.
select
    team,
    count(*) as tot_matches,
    sum(won) as total_won,
    count(*) - sum(won) as total_loss
from teams_cte
group by team;
This is a very good question. I was asked this question in one of my interviews, but in SQL. I just attempted
to solve it using PySpark.