# PySpark samples
# Source: .../pyspark/pyspark-select-first-row-of-each-group/
# (URL fragment only — domain was lost in the original note; verify source)
# --- Select the highest-paid employee per department ------------------
# Builds a small employee DataFrame, exports it to Excel, then uses a
# SQL window function (row_number) to keep the top earner per department.
data = [
    ("James", "Sales", 3000), ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100), ("Maria", "Finance", 3000),
    ("Raman", "Finance", 3000), ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900), ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
]
df = spark.createDataFrame(data, ["Name", "Department", "Salary"])
df.show()

# Round-trip through pandas to write an Excel file.
# NOTE(review): to_excel needs an Excel engine (openpyxl) installed.
pandas_df = df.toPandas()
pandas_df.to_excel("tmp.xlsx")

# Register a temp view and keep rn == 1 (top salary) per department.
# FIX: the original split this SQL string across physical lines without
# continuation/closing quotes — a SyntaxError. A triple-quoted string
# holds the whole statement safely.
df.createOrReplaceTempView("EMP")
spark.sql("""
    SELECT Name, Department, Salary
    FROM (
        SELECT *,
               row_number() OVER (PARTITION BY department ORDER BY salary DESC) AS rn
        FROM EMP
    ) tmp
    WHERE rn <= 1
""").show()

# Load a sample JSON file, rebinding df to the new DataFrame.
df = spark.read.json("1mb.json")
# ----------------------------------
# --- Build a PySpark DataFrame from an RDD of typed rows --------------
# FIX: date/datetime must be imported BEFORE they are called; the
# original only imported them further down the file, which would raise
# NameError at this point.
from datetime import date, datetime

rdd = spark.sparkContext.parallelize([
    (1, 4., 'GFG1', date(2000, 8, 1), datetime(2000, 8, 1, 12, 0)),
    (2, 8., 'GFG2', date(2000, 6, 2), datetime(2000, 6, 2, 12, 0)),
    (3, 5., 'GFG3', date(2000, 5, 3), datetime(2000, 5, 3, 12, 0)),
])
df = spark.createDataFrame(rdd, schema=['a', 'b', 'c', 'd', 'e'])
df  # bare expression: only echoes the DataFrame repr in a REPL/notebook

# show table
df.show()
# show schema (column names and inferred types)
df.printSchema()
# ---------------------------------------------------
# Imports for date/datetime literals.
# NOTE(review): this import appears AFTER the date()/datetime() calls
# earlier in the file — it belongs at the top of the file, otherwise
# those calls raise NameError when the file runs top to bottom.
from datetime import datetime, date
# Display the DataFrame contents as a formatted table (default: 20 rows).
df.show()
# Print the DataFrame's column names and inferred types.
df.printSchema()
# -------------------------------------------------------