from pyspark import SparkContext
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import year
# Employee records: (emp_id, name, superior_emp_id, year_joined,
# emp_dept_id, gender, salary).  A superior_emp_id of -1 and a salary of -1
# are sentinel values ("no superior" / "unknown salary"); emp_dept_id is a
# string key that joins against deptDF.dept_id.
emp = [
    (1, "Smith", -1, "2018", "10", "M", 3000),
    (2, "Rose", 1, "2010", "20", "M", 4000),
    (3, "Williams", 1, "2010", "10", "M", 1000),
    (4, "Jones", 2, "2005", "10", "F", 2000),
    (5, "Brown", 2, "2010", "40", "", -1),
    (6, "Brown", 2, "2010", "50", "", -1),
]
empColumns = ["emp_id", "name", "superior_emp_id", "year_joined",
              "emp_dept_id", "gender", "salary"]

# Fix: the original script referenced an undefined name `spark` (it imported
# SparkContext/SQLContext/HiveContext but never created a session), which
# raises NameError at runtime.  getOrCreate() reuses an already-active
# session, so this line is safe to execute more than once.
spark = SparkSession.builder.appName("emp-dept-demo").getOrCreate()

empDF = spark.createDataFrame(data=emp, schema=empColumns)
empDF.printSchema()
empDF.show(truncate=False)
# Department lookup table: (dept_name, dept_id).  dept_id is an int here
# while empDF.emp_dept_id is a string — a join would need a cast; noted for
# downstream users of these frames.
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
deptColumns = ["dept_name", "dept_id"]

# Fix: `spark` was never defined in the original script.  getOrCreate()
# returns the existing active session if one was already built (e.g. by the
# employee section above), so this is idempotent.
spark = SparkSession.builder.appName("emp-dept-demo").getOrCreate()

deptDF = spark.createDataFrame(data=dept, schema=deptColumns)
deptDF.printSchema()
# NOTE(review): removed a stray base64-encoded blob that was pasted here raw.
# It decoded to an exact duplicate of this script and, as unquoted text, made
# the whole file a Python syntax error.