Pyspark Notes
resources/zipcodes.json
[{
"RecordNumber": 2,
"Zipcode": 704,
"ZipCodeType": "STANDARD",
"City": "PASEO COSTA DEL SUR",
"State": "PR"
},
{
"RecordNumber": 10,
"Zipcode": 709,
"ZipCodeType": "STANDARD",
"City": "BDA SAN LUIS",
"State": "PR"
}]
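# A minimal sketch (assumed usage): the JSON above, saved as resources/zipcodes.json,
# is a multiline JSON array; spark.read.json() expects one object per line by default,
# so the multiline option is needed here.
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
dfJson = spark.read.option("multiline", "true").json("resources/zipcodes.json")
dfJson.printSchema()
dfJson.show()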
resources/simple-zipcodes.csv
RecordNumber,Country,City,Zipcode,State
1,US,PARC PARQUE,704,PR
2,US,PASEO COSTA DEL SUR,704,PR
10,US,BDA SAN LUIS,709,PR
49347,US,HOLT,32564,FL
49348,US,HOMOSASSA,34487,FL
61391,US,CINGULAR WIRELESS,76166,TX
61392,US,FORT WORTH,76177,TX
61393,US,FT WORTH,76177,TX
54356,US,SPRUCE PINE,35585,AL
76511,US,ASH HILL,27007,NC
4,US,URB EUGENE RICE,704,PR
39827,US,MESA,85209,AZ
39828,US,MESA,85210,AZ
49345,US,HILLIARD,32046,FL
49346,US,HOLDER,34445,FL
3,US,SECT LANAUSSE,704,PR
54354,US,SPRING GARDEN,36275,AL
54355,US,SPRINGVILLE,35146,AL
76512,US,ASHEBORO,27203,NC
76513,US,ASHEBORO,27204,NC
resources/small_zipcode.csv
id,zipcode,type,city,state,population
1,704,STANDARD,,PR,30100
2,704,,PASEO COSTA DEL SUR,PR,
3,709,,BDA SAN LUIS,PR,3700
4,76166,UNIQUE,CINGULAR WIRELESS,TX,84000
5,76177,STANDARD,,TX,
resources/test.txt
Project Gutenberg’s
Alice’s Adventures in Wonderland
by Lewis Carroll
This eBook is for the use
of anyone anywhere
at no cost and with
Alice’s Adventures in Wonderland
by Lewis Carroll
This eBook is for the use
of anyone anywhere
at no cost and with
This eBook is for the use
of anyone anywhere
at no cost and with
convert-column-python-list.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
data = [("James","Smith","USA","CA"),("Michael","Rose","USA","NY"), \
("Robert","Williams","USA","CA"),("Maria","Jones","USA","FL") \
]
columns=["firstname","lastname","country","state"]
df=spark.createDataFrame(data=data,schema=columns)
df.show()
print(df.collect())
states1=df.rdd.map(lambda x: x[3]).collect()
print(states1)
#['CA', 'NY', 'CA', 'FL']
from collections import OrderedDict
res = list(OrderedDict.fromkeys(states1))
print(res)
#['CA', 'NY', 'FL']
#Example 2
states2=df.rdd.map(lambda x: x.state).collect()
print(states2)
#['CA', 'NY', 'CA', 'FL']
states3=df.select(df.state).collect()
print(states3)
#[Row(state='CA'), Row(state='NY'), Row(state='CA'), Row(state='FL')]
states4=df.select(df.state).rdd.flatMap(lambda x: x).collect()
print(states4)
#['CA', 'NY', 'CA', 'FL']
states5=df.select(df.state).toPandas()['state']
states6=list(states5)
print(states6)
#['CA', 'NY', 'CA', 'FL']
pandDF=df.select(df.state,df.firstname).toPandas()
print(list(pandDF['state']))
print(list(pandDF['firstname']))
currentdate.py
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 24 22:42:50 2019
@author: prabha
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import to_timestamp, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
schema = StructType([
StructField("seq", StringType(), True)])
dates = ['1']
df = spark.createDataFrame([(d,) for d in dates], schema=schema)
df.show()
pandas-pyspark-dataframe.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
import pandas as pd
from pyspark.sql import SparkSession
data = [['Scott', 50], ['Jeff', 45], ['Thomas', 54],['Ann',34]]
# Build the pandas DataFrame first (column names assumed)
pandasDF = pd.DataFrame(data, columns=['Name', 'Age'])
# print dataframe.
print(pandasDF)
spark = SparkSession.builder \
.master("local[1]") \
.appName("SparkByExamples.com") \
.getOrCreate()
sparkDF=spark.createDataFrame(pandasDF)
sparkDF.printSchema()
sparkDF.show()
#sparkDF=spark.createDataFrame(pandasDF.astype(str))
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
mySchema = StructType([ StructField("First Name", StringType(), True)\
,StructField("Age", IntegerType(), True)])
sparkDF2 = spark.createDataFrame(pandasDF,schema=mySchema)
sparkDF2.printSchema()
sparkDF2.show()
spark.conf.set("spark.sql.execution.arrow.enabled","true")
spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled","true")
pandasDF2=sparkDF2.select("*").toPandas()
print(pandasDF2)
test=spark.conf.get("spark.sql.execution.arrow.enabled")
print(test)
test123=spark.conf.get("spark.sql.execution.arrow.pyspark.fallback.enabled")
print(test123)
pyspark-add-month.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
data = [('James','Smith','M',3000),
('Anna','Rose','F',4100),
('Robert','Williams','M',6200),
]
columns = ["firstname","lastname","gender","salary"]
df = spark.createDataFrame(data=data, schema = columns)
df.show()
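# The add_months() call the file name promises is missing; a minimal sketch below.
# The date column and its values are illustrative assumptions, not from the original notes.
from pyspark.sql.functions import col, add_months, date_add, date_sub
dfDates = spark.createDataFrame([("2019-01-23",1),("2019-06-24",2)], ["date","increment"])
dfDates.select(col("date"),
add_months(col("date"), 3).alias("add_months"),
date_add(col("date"), 4).alias("date_add"),
date_sub(col("date"), 4).alias("date_sub")).show()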
pyspark-aggregate.py
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 14 10:20:19 2020
@author: prabha
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import approx_count_distinct,collect_list
from pyspark.sql.functions import collect_set,sum,avg,max,countDistinct,count
from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness
from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
from pyspark.sql.functions import variance,var_samp, var_pop
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
# Sample data (assumed) so the aggregate examples below run
simpleData = [("James", "Sales", 3000), ("Michael", "Sales", 4600),
("Robert", "Sales", 4100), ("Maria", "Finance", 3000),
("James", "Sales", 3000), ("Scott", "Finance", 3300),
("Jen", "Finance", 3900), ("Jeff", "Marketing", 3000),
("Kumar", "Marketing", 2000), ("Saif", "Sales", 4100)
]
schema = ["employee_name", "department", "salary"]
df = spark.createDataFrame(data=simpleData, schema=schema)
df.show(truncate=False)
print("approx_count_distinct: " + \
str(df.select(approx_count_distinct("salary")).collect()[0][0]))
df.select(collect_list("salary")).show(truncate=False)
df.select(collect_set("salary")).show(truncate=False)
print("count: "+str(df.select(count("salary")).collect()[0]))
df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)
df.select(kurtosis("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(skewness("salary")).show(truncate=False)
df.select(stddev("salary"), stddev_samp("salary"), \
stddev_pop("salary")).show(truncate=False)
df.select(sum("salary")).show(truncate=False)
df.select(sumDistinct("salary")).show(truncate=False)
df.select(variance("salary"),var_samp("salary"),var_pop("salary")) \
.show(truncate=False)
pyspark-array-string.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[1]") \
.appName('SparkByExamples.com') \
.getOrCreate()
columns = ["name","languagesAtSchool","currentState"]
data = [("James,,Smith",["Java","Scala","C++"],"CA"), \
("Michael,Rose,",["Spark","Java","C++"],"NJ"), \
("Robert,,Williams",["CSharp","VB"],"NV")]
df = spark.createDataFrame(data=data,schema=columns)
df.printSchema()
df.show(truncate=False)
df.createOrReplaceTempView("ARRAY_STRING")
spark.sql("select name, concat_ws(',',languagesAtSchool) as languagesAtSchool,currentState from
ARRAY_STRING").show(truncate=False)
pyspark-arraytype.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, ArrayType
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
arrayCol = ArrayType(StringType(),False)
data = [
("James,,Smith",["Java","Scala","C++"],["Spark","Java"],"OH","CA"),
("Michael,Rose,",["Spark","Java","C++"],["Spark","Java"],"NY","NJ"),
("Robert,,Williams",["CSharp","VB"],["Spark","Python"],"UT","NV")
]
schema = StructType([
StructField("name",StringType(),True),
StructField("languagesAtSchool",ArrayType(StringType()),True),
StructField("languagesAtWork",ArrayType(StringType()),True),
StructField("currentState", StringType(), True),
StructField("previousState", StringType(), True)
])
df = spark.createDataFrame(data=data,schema=schema)
df.printSchema()
df.show()
'''
@author: sparkbyexamples.com
'''
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
data = [("James","Smith","USA","CA"),
("Michael","Rose","USA","NY"),
("Robert","Williams","USA","CA"),
("Maria","Jones","USA","FL")
]
columns = ["firstname","lastname","country","state"]
df = spark.createDataFrame(data = data, schema = columns)
df.printSchema()
df.show(truncate=False)
# Broadcast a small lookup dict of state codes so state_convert() below works
states = {"NY":"New York", "CA":"California", "FL":"Florida"}
broadcastStates = spark.sparkContext.broadcast(states)
def state_convert(code):
    return broadcastStates.value[code]
result = df.rdd.map(lambda x: (x[0],x[1],x[2],state_convert(x[3]))).collect()
print(result)
filterDf = df.where(df['state'].isin(list(broadcastStates.value.keys())))
filterDf.show()
pyspark-cast-column.py
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 13 21:08:30 2020
@author: NNK
"""
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
simpleData = [("James",34,"2006-01-01","true","M",3000.60),
("Michael",33,"1980-01-10","true","F",3300.80),
("Robert",37,"06-01-1992","false","M",5000.50)
]
columns = ["firstname","age","jobStartDate","isGraduated","gender","salary"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df.printSchema()
df.show(truncate=False)
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, BooleanType, DateType
df2 = df.withColumn("age", col("age").cast(StringType())) \
.withColumn("isGraduated", col("isGraduated").cast(BooleanType())) \
.withColumn("jobStartDate", col("jobStartDate").cast(DateType()))
df2.printSchema()
df3 = df2.selectExpr("cast(age as int) age",
"cast(isGraduated as string) isGraduated",
"cast(jobStartDate as string) jobStartDate")
df3.printSchema()
df3.show(truncate=False)
df3.createOrReplaceTempView("CastExample")
df4 = spark.sql("SELECT STRING(age),BOOLEAN(isGraduated),DATE(jobStartDate) from CastExample")
df4.printSchema()
df4.show(truncate=False)
pyspark-change-string-double.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
simpleData = [("James","34","true","M","3000.6089"),
("Michael","33","true","F","3300.8067"),
("Robert","37","false","M","5000.5034")
]
columns = ["firstname","age","isGraduated","gender","salary"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df.printSchema()
df.show(truncate=False)
#df.withColumn("salary",round(df.salary.cast(DoubleType()),2)).show(truncate=False).printSchema(
)
df.selectExpr("firstname","isGraduated","cast(salary as double) salary").printSchema()
df.createOrReplaceTempView("CastExample")
spark.sql("SELECT firstname,isGraduated,DOUBLE(salary) as salary from
CastExample").printSchema()
#df.select("firstname",expr(df.age),"isGraduated",col("salary").cast('float').alias("salary")).show()
pyspark-collect.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
dept = [("Finance",10), \
("Marketing",20), \
("Sales",30), \
("IT",40) \
]
deptColumns = ["dept_name","dept_id"]
deptDF = spark.createDataFrame(data=dept, schema = deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)
dataCollect = deptDF.collect()
print(dataCollect)
dataCollect2 = deptDF.select("dept_name").collect()
print(dataCollect2)
data=[("James","Bond","100",None),
("Ann","Varsa","200",'F'),
("Tom Cruise","XXX","400",''),
("Tom Brand",None,"400",'M')]
columns=["fname","lname","id","gender"]
df=spark.createDataFrame(data,columns)
#alias
from pyspark.sql.functions import expr
df.select(df.fname.alias("first_name"), \
df.lname.alias("last_name"), \
expr(" fname ||','|| lname").alias("fullName") \
).show()
#asc, desc
df.sort(df.fname.asc()).show()
df.sort(df.fname.desc()).show()
#cast
df.select(df.fname,df.id.cast("int")).printSchema()
#between
df.filter(df.id.between(100,300)).show()
#contains
df.filter(df.fname.contains("Cruise")).show()
#startswith, endswith()
df.filter(df.fname.startswith("T")).show()
df.filter(df.fname.endswith("Cruise")).show()
#eqNullSafe
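# Sketch: eqNullSafe() does null-safe equality (NULL <=> NULL is true);
# the gender column above contains a None, so the second filter keeps that row.
df.select(df.fname, df.gender.eqNullSafe('M').alias("eq_M")).show()
df.filter(df.gender.eqNullSafe(None)).show()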
#like , rlike
df.select(df.fname,df.lname,df.id) \
.filter(df.fname.like("%om")).show()
#over
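# Sketch: over() applies a window spec; row_number per gender is an assumed example.
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
windowSpec = Window.partitionBy("gender").orderBy("fname")
df.withColumn("row_number", row_number().over(windowSpec)).show()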
#substr
df.select(df.fname.substr(1,2).alias("substr")).show()
#isin
li=["100","200"]
df.select(df.fname,df.lname,df.id) \
.filter(df.id.isin(li)) \
.show()
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, MapType
# Nested sample data (assumed) matching the schema below
data = [(("James","Bond"),["Java","C#"],{'hair':'black','eye':'brown'}),
(("Ann","Varsa"),[".NET","Python"],{'hair':'brown','eye':'black'})]
schema = StructType([
StructField('name', StructType([
StructField('fname', StringType(), True),
StructField('lname', StringType(), True)])),
StructField('languages', ArrayType(StringType()),True),
StructField('properties', MapType(StringType(),StringType()),True)
])
df=spark.createDataFrame(data,schema)
df.printSchema()
#getItem()
df.select(df.languages.getItem(1)).show()
df.select(df.properties.getItem("hair")).show()
df.select(df.name.getField("fname")).show()
#dropFields
#from pyspark.sql.functions import col
#df.withColumn("name1",col("name").dropFields(["fname"])).show()
#withField
#from pyspark.sql.functions import lit
#df.withColumn("name",df.name.withField("fname",lit("AA"))).show()
data=[("James",23),("Ann",40)]
df=spark.createDataFrame(data).toDF("name.fname","gender")
df.printSchema()
df.show()
from pyspark.sql import Row
from pyspark.sql.functions import col
# Rebuild df with a struct column (sample values assumed) so the prop accessors below work
df = spark.createDataFrame([Row(name="James", prop=Row(hair="black", eye="blue")),
Row(name="Ann", prop=Row(hair="grey", eye="black"))])
df.select(df.prop.hair).show()
df.select(df["prop.hair"]).show()
df.select(col("prop.hair")).show()
df.select(col("prop.*")).show()
# Column operators
data=[(100,2,1),(200,3,4),(300,4,4)]
df=spark.createDataFrame(data).toDF("col1","col2","col3")
df.select(df.col1 + df.col2).show()
df.select(df.col1 - df.col2).show()
df.select(df.col1 * df.col2).show()
df.select(df.col1 / df.col2).show()
df.select(df.col1 % df.col2).show()
pyspark-convert-map-to-columns.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
dataDictionary = [
('James',{'hair':'black','eye':'brown'}),
('Michael',{'hair':'brown','eye':None}),
('Robert',{'hair':'red','eye':'black'}),
('Washington',{'hair':'grey','eye':'grey'}),
('Jefferson',{'hair':'brown','eye':''})
]
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, MapType
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
schema = StructType([
StructField('name', StringType(), True),
StructField('properties', MapType(StringType(), StringType()), True)
])
df = spark.createDataFrame(data=dataDictionary, schema=schema)
df.printSchema()
df.show(truncate=False)
df3=df.rdd.map(lambda x: \
(x.name,x.properties["hair"],x.properties["eye"])) \
.toDF(["name","hair","eye"])
df3.printSchema()
df3.show()
df.withColumn("hair",df.properties.getItem("hair")) \
.withColumn("eye",df.properties.getItem("eye")) \
.drop("properties") \
.show()
df.withColumn("hair",df.properties["hair"]) \
.withColumn("eye",df.properties["eye"]) \
.drop("properties") \
.show()
# Functions
from pyspark.sql.functions import explode,map_keys,col
keysDF = df.select(explode(map_keys(df.properties))).distinct()
keysList = keysDF.rdd.map(lambda x:x[0]).collect()
keyCols = list(map(lambda x: col("properties").getItem(x).alias(str(x)), keysList))
df.select(df.name, *keyCols).show()
pyspark-convert_columns-to-map.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
data = [ ("36636","Finance",3000,"USA"),
("40288","Finance",5000,"IND"),
("42114","Sales",3900,"USA"),
("39192","Marketing",2500,"CAN"),
("34534","Sales",6500,"USA") ]
schema = StructType([
StructField('id', StringType(), True),
StructField('dept', StringType(), True),
StructField('salary', IntegerType(), True),
StructField('location', StringType(), True)
])
df = spark.createDataFrame(data=data,schema=schema)
df.printSchema()
df.show(truncate=False)
# Using countDistrinct()
from pyspark.sql.functions import countDistinct
df2=df.select(countDistinct("Dept","Salary"))
df2.show()
print("Distinct Count of Department & Salary: "+ str(df2.collect()[0][0]))
df.createOrReplaceTempView("PERSON")
spark.sql("select distinct(count(*)) from PERSON").show()
pyspark-create-dataframe-dictionary.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
dataDictionary = [
('James',{'hair':'black','eye':'brown'}),
('Michael',{'hair':'brown','eye':None}),
('Robert',{'hair':'red','eye':'black'}),
('Washington',{'hair':'grey','eye':'grey'}),
('Jefferson',{'hair':'brown','eye':''})
]
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, MapType
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
schema = StructType([
StructField('name', StringType(), True),
StructField('properties', MapType(StringType(), StringType()), True)
])
df = spark.createDataFrame(data=dataDictionary, schema=schema)
df.printSchema()
df.show(truncate=False)
df3=df.rdd.map(lambda x: \
(x.name,x.properties["hair"],x.properties["eye"])) \
.toDF(["name","hair","eye"])
df3.printSchema()
df3.show()
df.withColumn("hair",df.properties.getItem("hair")) \
.withColumn("eye",df.properties.getItem("eye")) \
.drop("properties") \
.show()
df.withColumn("hair",df.properties["hair"]) \
.withColumn("eye",df.properties["eye"]) \
.drop("properties") \
.show()
# Functions
from pyspark.sql.functions import explode,map_keys,col
keysDF = df.select(explode(map_keys(df.properties))).distinct()
keysList = keysDF.rdd.map(lambda x:x[0]).collect()
keyCols = list(map(lambda x: col("properties").getItem(x).alias(str(x)), keysList))
df.select(df.name, *keyCols).show()
pyspark-create-dataframe.py
# -*- coding: utf-8 -*-
'''
Created on Sat Jan 11 19:38:27 2020
@author: sparkbyexamples.com
'''
import pyspark
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
from pyspark.sql.functions import *
columns = ["language","users_count"]
data = [("Java", "20000"), ("Python", "100000"), ("Scala", "3000")]
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
rdd = spark.sparkContext.parallelize(data)
dfFromRDD1 = rdd.toDF()
dfFromRDD1.printSchema()
dfFromRDD1 = rdd.toDF(columns)
dfFromRDD1.printSchema()
dfFromRDD2 = spark.createDataFrame(rdd).toDF(*columns)
dfFromRDD2.printSchema()
dfFromData2 = spark.createDataFrame(data).toDF(*columns)
dfFromData2.printSchema()
'''
@author: sparkbyexamples.com
'''
import pyspark
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType,StructField, StringType
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
#Using List
dept = [("Finance",10),
("Marketing",20),
("Sales",30),
("IT",40)
]
deptColumns = ["dept_name","dept_id"]
deptDF = spark.createDataFrame(data=dept, schema = deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)
deptSchema = StructType([
StructField('firstname', StringType(), True),
StructField('middlename', StringType(), True),
StructField('lastname', StringType(), True)
])
pyspark-current-date-timestamp.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
# Create SparkSession
from pyspark.sql import SparkSession
spark = SparkSession.builder \
.appName('SparkByExamples.com') \
.getOrCreate()
data=[["1"]]
df=spark.createDataFrame(data,["id"])
from pyspark.sql.functions import *
#SQL
spark.sql("select current_date(), current_timestamp()") \
.show(truncate=False)
#SQL
spark.sql("select date_format(current_date(),'MM-dd-yyyy') as date_format ," + \
"to_timestamp(current_timestamp(),'MM-dd-yyyy HH mm ss SSS') as to_timestamp") \
.show(truncate=False)
pyspark-dataframe-flatMap.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
columns = ["name","languagesAtSchool","currentState"]
data = [("James,,Smith",["Java","Scala","C++"],"CA"), \
("Michael,Rose,",["Spark","Java","C++"],"NJ"), \
("Robert,,Williams",["CSharp","VB"],"NV")]
df = spark.createDataFrame(data=data,schema=columns)
df.printSchema()
df.show(truncate=False)
#Flatmap
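# DataFrames have no flatMap(); explode() on the array column is the usual
# equivalent. A minimal sketch:
from pyspark.sql.functions import explode
df2 = df.select(df.name, explode(df.languagesAtSchool))
df2.printSchema()
df2.show(truncate=False)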
pyspark-dataframe-repartition.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com') \
.master("local[5]").getOrCreate()
df=spark.range(0,20)
print(df.rdd.getNumPartitions())
df.write.mode("overwrite").csv("c:/tmp/partition.csv")
df2 = df.repartition(6)
print(df2.rdd.getNumPartitions())
df3 = df.coalesce(2)
print(df3.rdd.getNumPartitions())
df4 = df.groupBy("id").count()
print(df4.rdd.getNumPartitions())
pyspark-dataframe.py
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 14 10:20:19 2020
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from pyspark.sql.types import StructType, StructField, StringType,IntegerType
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
print(spark)
pyspark-date-string.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
# Create SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
spark = SparkSession.builder \
.appName('SparkByExamples.com') \
.getOrCreate()
df=spark.createDataFrame([["1"]],["id"])
df.select(current_date().alias("current_date"), \
date_format(current_date(),"yyyy MM dd").alias("yyyy MM dd"), \
date_format(current_timestamp(),"MM/dd/yyyy hh:mm").alias("MM/dd/yyyy"), \
date_format(current_timestamp(),"yyyy MMM dd").alias("yyyy MMMM dd"), \
date_format(current_timestamp(),"yyyy MMMM dd E").alias("yyyy MMMM dd E") \
).show()
#SQL
#current_date()
df.select(current_date().alias("current_date")
).show(1)
#date_format()
# Rebuild df with a date "input" column (values assumed) for the remaining examples
data = [["1","2020-02-01"],["2","2019-03-01"],["3","2021-03-01"]]
df = spark.createDataFrame(data, ["id","input"])
df.select(col("input"),
date_format(col("input"), "MM-dd-yyyy").alias("date_format")
).show()
#to_date()
df.select(col("input"),
to_date(col("input"), "yyy-MM-dd").alias("to_date")
).show()
#datediff()
df.select(col("input"),
datediff(current_date(),col("input")).alias("datediff")
).show()
#months_between()
df.select(col("input"),
months_between(current_date(),col("input")).alias("months_between")
).show()
#trunc()
df.select(col("input"),
trunc(col("input"),"Month").alias("Month_Trunc"),
trunc(col("input"),"Year").alias("Month_Year"),
trunc(col("input"),"Month").alias("Month_Trunc")
).show()
df.select(col("input"),
year(col("input")).alias("year"),
month(col("input")).alias("month"),
next_day(col("input"),"Sunday").alias("next_day"),
weekofyear(col("input")).alias("weekofyear")
).show()
df.select(col("input"),
dayofweek(col("input")).alias("dayofweek"),
dayofmonth(col("input")).alias("dayofmonth"),
dayofyear(col("input")).alias("dayofyear"),
).show()
#current_timestamp()
# df2 with string timestamps (values assumed) matching the to_timestamp pattern below
data = [["1","02-01-2020 11 01 19 06"],["2","03-01-2019 12 01 19 406"],["3","03-01-2021 12 01 19 406"]]
df2 = spark.createDataFrame(data, ["id","input"])
df2.select(current_timestamp().alias("current_timestamp")
).show(1,truncate=False)
#to_timestamp()
df2.select(col("input"),
to_timestamp(col("input"), "MM-dd-yyyy HH mm ss SSS").alias("to_timestamp")
).show(truncate=False)
#hour, minute,second
data=[["1","2020-02-01 11:01:19.06"],["2","2019-03-01 12:01:19.406"],["3","2021-03-01
12:01:19.406"]]
df3=spark.createDataFrame(data,["id","input"])
df3.select(col("input"),
hour(col("input")).alias("hour"),
minute(col("input")).alias("minute"),
second(col("input")).alias("second")
).show(truncate=False)
pyspark-datediff.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
# Create SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
spark = SparkSession.builder \
.appName('SparkByExamples.com') \
.getOrCreate()
data = [("1","2019-07-01"),("2","2019-06-24"),("3","2019-08-24")]
df=spark.createDataFrame(data=data,schema=["id","date"])
df.select(
col("date"),
current_date().alias("current_date"),
datediff(current_date(),col("date")).alias("datediff")
).show()
df.withColumn("datesDiff", datediff(current_date(),col("date"))) \
.withColumn("montsDiff", months_between(current_date(),col("date"))) \
.withColumn("montsDiff_round",round(months_between(current_date(),col("date")),2)) \
.withColumn("yearsDiff",months_between(current_date(),col("date"))/lit(12)) \
.withColumn("yearsDiff_round",round(months_between(current_date(),col("date"))/lit(12),2)) \
.show()
data2 = [("1","07-01-2019"),("2","06-24-2019"),("3","08-24-2019")]
df2=spark.createDataFrame(data=data2,schema=["id","date"])
df2.select(
to_date(col("date"),"MM-dd-yyyy").alias("date"),
current_date().alias("endDate")
)
#SQL
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
# Sample data (assumed) for the distinct/dropDuplicates examples
data = [("James", "Sales", 3000), ("Michael", "Sales", 4600),
("Robert", "Sales", 4100), ("Maria", "Finance", 3000),
("James", "Sales", 3000), ("Scott", "Finance", 3300),
("Jen", "Finance", 3900), ("Jeff", "Marketing", 3000),
("Kumar", "Marketing", 2000), ("Saif", "Sales", 4100)
]
columns = ["employee_name", "department", "salary"]
df = spark.createDataFrame(data=data, schema=columns)
df.show(truncate=False)
distinctDF = df.distinct()
print("Distinct count: "+str(distinctDF.count()))
distinctDF.show(truncate=False)
df2 = df.dropDuplicates()
print("Distinct count: "+str(df2.count()))
df2.show(truncate=False)
dropDisDF = df.dropDuplicates(["department","salary"])
print("Distinct count of department salary : "+str(dropDisDF.count()))
dropDisDF.show(truncate=False)
pyspark-drop-column.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
simpleData = (("James","","Smith","36636","NewYork",3100), \
("Michael","Rose","","40288","California",4300), \
("Robert","","Williams","42114","Florida",1400), \
("Maria","Anne","Jones","39192","Florida",5500), \
("Jen","Mary","Brown","34561","NewYork",3000) \
)
columns= ["firstname","middlename","lastname","id","location","salary"]
df.printSchema()
df.show(truncate=False)
df.drop("firstname") \
.printSchema()
df.drop(col("firstname")) \
.printSchema()
df.drop(df.firstname) \
.printSchema()
df.drop("firstname","middlename","lastname") \
.printSchema()
cols = ("firstname","middlename","lastname")
df.drop(*cols) \
.printSchema()
pyspark-drop-null.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
filePath="resources/small_zipcode.csv"
df = spark.read.options(header='true', inferSchema='true') \
.csv(filePath)
df.printSchema()
df.show(truncate=False)
df.na.drop().show(truncate=False)
df.na.drop(how="any").show(truncate=False)
df.na.drop(subset=["population","type"]) \
.show(truncate=False)
df.dropna().show(truncate=False)
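# na.drop() also takes thresh: keep rows having at least N non-null values (sketch)
df.na.drop(thresh=2).show(truncate=False)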
pyspark-empty-data-frame.py
# -*- coding: utf-8 -*-
'''
Created on Sat Jan 11 19:38:27 2020
@author: sparkbyexamples.com
'''
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
schema = StructType([
StructField('firstname', StringType(), True),
StructField('middlename', StringType(), True),
StructField('lastname', StringType(), True)
])
df = spark.createDataFrame(spark.sparkContext.emptyRDD(),schema)
df.printSchema()
df1 = spark.sparkContext.parallelize([]).toDF(schema)
df1.printSchema()
df3 = spark.createDataFrame([], schema)
df3.printSchema()
pyspark-explode-array-map.py
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 24 22:42:50 2019
@author: Naveen
"""
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('pyspark-by-examples').getOrCreate()
arrayData = [
('James',['Java','Scala'],{'hair':'black','eye':'brown'}),
('Michael',['Spark','Java',None],{'hair':'brown','eye':None}),
('Robert',['CSharp',''],{'hair':'red','eye':''}),
('Washington',None,None),
('Jefferson',['1','2'],{})
]
df = spark.createDataFrame(data=arrayData, schema = ['name','knownLanguages','properties'])
df.printSchema()
df.show()
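# The explode calls the file name promises are missing; a minimal sketch on the
# array and map columns created above:
from pyspark.sql.functions import explode
df.select(df.name, explode(df.knownLanguages)).show()
df.select(df.name, explode(df.properties)).show()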
"""END"""
pyspark-explode-nested-array.py
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 24 22:42:50 2019
@author: Naveen
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, flatten
spark = SparkSession.builder.appName('pyspark-by-examples').getOrCreate()
arrayArrayData = [
("James",[["Java","Scala","C++"],["Spark","Java"]]),
("Michael",[["Spark","Java","C++"],["Spark","Java"]]),
("Robert",[["CSharp","VB"],["Spark","Python"]])
]
""" """
df.select(df.name,explode(df.subjects)).show(truncate=False)
"""END"""
pyspark-expr.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
# Sample data (assumed) for the expr() example
data = [("2019-01-23",1),("2019-06-24",2),("2019-09-20",3)]
df = spark.createDataFrame(data).toDF("date","increment")
# Add
df.select(df.date,df.increment,
expr("increment + 5 as new_increment")
).show()
data = [
("James",None,"M"),
("Anna","NY","F"),
("Julia",None,None)
]
columns = ["name","state","gender"]
df =spark.createDataFrame(data,columns)
df.printSchema()
df.show()
df.filter("state is NULL").show()
df.filter(df.state.isNull()).show()
df.filter(col("state").isNull()).show()
df.createOrReplaceTempView("DATA")
spark.sql("SELECT * FROM DATA where STATE IS NULL").show()
spark.sql("SELECT * FROM DATA where STATE IS NULL AND GENDER IS NULL").show()
spark.sql("SELECT * FROM DATA where STATE IS NOT NULL").show()
pyspark-filter.py
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 13 21:08:30 2020
@author: NNK
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, ArrayType
from pyspark.sql.functions import col,array_contains
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
arrayStructureData = [
(("James","","Smith"),["Java","Scala","C++"],"OH","M"),
(("Anna","Rose",""),["Spark","Java","C++"],"NY","F"),
(("Julia","","Williams"),["CSharp","VB"],"OH","F"),
(("Maria","Anne","Jones"),["CSharp","VB"],"NY","M"),
(("Jen","Mary","Brown"),["CSharp","VB"],"NY","M"),
(("Mike","Mary","Williams"),["Python","VB"],"OH","M")
]
arrayStructureSchema = StructType([
StructField('name', StructType([
StructField('firstname', StringType(), True),
StructField('middlename', StringType(), True),
StructField('lastname', StringType(), True)
])),
StructField('languages', ArrayType(StringType()), True),
StructField('state', StringType(), True),
StructField('gender', StringType(), True)
])
df = spark.createDataFrame(data = arrayStructureData, schema = arrayStructureSchema)
df.printSchema()
df.show(truncate=False)
#Equals
df.filter(df.state == "OH") \
.show(truncate=False)
#Not equals
df.filter(~(df.state == "OH")) \
.show(truncate=False)
df.filter(df.state != "OH") \
.show(truncate=False)
df.filter(col("state") == "OH") \
.show(truncate=False)
df.filter("gender == 'M'") \
.show(truncate=False)
#IS IN
li=["OH","CA","DE"]
df.filter(df.state.isin(li)).show()
#IS NOT IN
df.filter(~df.state.isin(li)).show()
df.filter(array_contains(df.languages,"Java")) \
.show(truncate=False)
df.filter(df.name.lastname == "Williams") \
.show(truncate=False)
df.filter(df.state.startswith("N")).show()
df.filter(df.state.endswith("H")).show()
df.filter(df.state.like("N%")).show()
pyspark-filter2.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
# Sample data (assumed) for the like/rlike examples
data2 = [(2,"Michael Rose"),(3,"Robert Williams"),(4,"Rames Rose"),(5,"Rames rose")]
df2 = spark.createDataFrame(data=data2, schema=["id","name"])
df2.filter(df2.name.like("%rose%")).show()
df2.filter(df2.name.rlike("(?i)^*rose$")).show()
pyspark-filter-null.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
data = [
("James",None,"M"),
("Anna","NY","F"),
("Julia",None,None)
]
columns = ["name","state","gender"]
df = spark.createDataFrame(data,columns)
df.show()
df.filter("state is NULL").show()
df.filter(df.state.isNull()).show()
df.filter(col("state").isNull()).show()
df.na.drop("state").show()
pyspark-groupby-sort.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum, col, desc
spark = SparkSession.builder \
.appName('SparkByExamples.com') \
.getOrCreate()
simpleData = [("James","Sales","NY",90000,34,10000),
("Michael","Sales","NV",86000,56,20000),
("Robert","Sales","CA",81000,30,23000),
("Maria","Finance","CA",90000,24,23000),
("Raman","Finance","DE",99000,40,24000),
("Scott","Finance","NY",83000,36,19000),
("Jen","Finance","NY",79000,53,15000),
("Jeff","Marketing","NV",80000,25,18000),
("Kumar","Marketing","NJ",91000,50,21000)
]
schema = ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data=simpleData, schema = schema)
df.printSchema()
df.show(truncate=False)
df.groupBy("state").sum("salary").show()
dfGroup=df.groupBy("state") \
.agg(sum("salary").alias("sum_salary"))
dfGroup.show(truncate=False)
df.groupBy("state") \
.agg(sum("salary").alias("sum_salary")) \
.filter(col("sum_salary") > 100000) \
.sort(desc("sum_salary")) \
.show()
df.createOrReplaceTempView("EMP")
spark.sql("select state, sum(salary) as sum_salary from EMP " +
"group by state having sum_salary > 100000 " +
"order by sum_salary desc").show()
df.groupBy("state") \
.sum("salary") \
.withColumnRenamed("sum(salary)", "sum_salary") \
.show()
df.groupBy("state") \
.sum("salary") \
.select(col("state"),col("sum(salary)").alias("sum_salary")) \
.show()
pyspark-groupby.py
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 14 10:20:19 2020
@author: prabha
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,sum,avg,max
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
simpleData = [("James","Sales","NY",90000,34,10000),
("Michael","Sales","NY",86000,56,20000),
("Robert","Sales","CA",81000,30,23000),
("Maria","Finance","CA",90000,24,23000),
("Raman","Finance","CA",99000,40,24000),
("Scott","Finance","NY",83000,36,19000),
("Jen","Finance","NY",79000,53,15000),
("Jeff","Marketing","CA",80000,25,18000),
("Kumar","Marketing","NY",91000,50,21000)
]
schema = ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data=simpleData, schema = schema)
df.printSchema()
df.show(truncate=False)
df.groupBy("department").sum("salary").show(truncate=False)
df.groupBy("department").count().show(truncate=False)
df.groupBy("department","state") \
.sum("salary","bonus") \
.show(truncate=False)
df.groupBy("department") \
.agg(sum("salary").alias("sum_salary"), \
avg("salary").alias("avg_salary"), \
sum("bonus").alias("sum_bonus"), \
max("bonus").alias("max_bonus") \
)\
.show(truncate=False)
df.groupBy("department") \
.agg(sum("salary").alias("sum_salary"), \
avg("salary").alias("avg_salary"), \
sum("bonus").alias("sum_bonus"), \
max("bonus").alias("max_bonus")) \
.where(col("sum_bonus") >= 50000) \
.show(truncate=False)
pyspark-join-two-dataframes.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
# Create SparkSession
from pyspark.sql import SparkSession
spark = SparkSession.builder \
.appName('SparkByExamples.com') \
.getOrCreate()
#EMP DataFrame
empData = [(1,"Smith",10), (2,"Rose",20),
(3,"Williams",10), (4,"Jones",30)
]
empColumns = ["emp_id","name","emp_dept_id"]
empDF = spark.createDataFrame(empData,empColumns)
empDF.show()
#DEPT DataFrame
deptData = [("Finance",10), ("Marketing",20),
("Sales",30),("IT",40)
]
deptColumns = ["dept_name","dept_id"]
deptDF=spark.createDataFrame(deptData,deptColumns)
deptDF.show()
#Address DataFrame
addData=[(1,"1523 Main St","SFO","CA"),
(2,"3453 Orange St","SFO","NY"),
(3,"34 Warner St","Jersey","NJ"),
(4,"221 Cavalier St","Newark","DE"),
(5,"789 Walnut St","Sandiago","CA")
]
addColumns = ["emp_id","addline1","city","state"]
addDF = spark.createDataFrame(addData,addColumns)
addDF.show()
#SQL
empDF.createOrReplaceTempView("EMP")
deptDF.createOrReplaceTempView("DEPT")
addDF.createOrReplaceTempView("ADD")
#
df1 = spark.createDataFrame(
[(1, "A"), (2, "B"), (3, "C")],
["A1", "A2"])
df2 = spark.createDataFrame(
[(1, "F"), (2, "B")],
["B1", "B2"])
"""
@author: prabha
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
emp = [(1,"Smith",-1,"2018","10","M",3000), \
(2,"Rose",1,"2010","20","M",4000), \
(3,"Williams",1,"2010","10","M",1000), \
(4,"Jones",2,"2005","10","F",2000), \
(5,"Brown",2,"2010","40","",-1), \
(6,"Brown",2,"2010","50","",-1) \
]
empColumns = ["emp_id","name","superior_emp_id","year_joined", \
"emp_dept_id","gender","salary"]
dept = [("Finance",10), \
("Marketing",20), \
("Sales",30), \
("IT",40) \
]
deptColumns = ["dept_name","dept_id"]
deptDF = spark.createDataFrame(data=dept, schema = deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"inner") \
.show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"outer") \
.show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"full") \
.show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"fullouter") \
.show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"left") \
.show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"leftouter") \
.show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"right") \
.show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"rightouter") \
.show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"leftsemi") \
.show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"leftanti") \
.show(truncate=False)
empDF.alias("emp1").join(empDF.alias("emp2"), \
col("emp1.superior_emp_id") == col("emp2.emp_id"),"inner") \
.select(col("emp1.emp_id"),col("emp1.name"), \
col("emp2.emp_id").alias("superior_emp_id"), \
col("emp2.name").alias("superior_emp_name")) \
.show(truncate=False)
empDF.createOrReplaceTempView("EMP")
deptDF.createOrReplaceTempView("DEPT")
pyspark-left-anti-join.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('machinelearninggeeks.com').getOrCreate()
emp = [(1,"Smith",-1,"2018","10","M",3000), \
(2,"Rose",1,"2010","20","M",4000), \
(3,"Williams",1,"2010","10","M",1000), \
(4,"Jones",2,"2005","10","F",2000), \
(5,"Brown",2,"2010","40","",-1), \
(6,"Brown",2,"2010","50","",-1) \
]
empColumns = ["emp_id","name","superior_emp_id","year_joined", \
"emp_dept_id","gender","salary"]
dept = [("Finance",10), \
("Marketing",20), \
("Sales",30), \
("IT",40) \
]
deptColumns = ["dept_name","dept_id"]
deptDF = spark.createDataFrame(data=dept, schema = deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"left").show(truncate=False)
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,"leftouter").show(truncate=False)
empDF.createOrReplaceTempView("EMP")
deptDF.createOrReplaceTempView("DEPT")
spark.sql("SELECT e.* FROM EMP e LEFT ANTI JOIN DEPT d ON e.emp_dept_id == d.dept_id") \
.show(truncate=False)
pyspark-lit.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
data = [("111",50000),("222",60000),("333",40000)]
columns= ["EmpId","Salary"]
df = spark.createDataFrame(data = data, schema = columns)
df.printSchema()
df.show(truncate=False)
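# The lit() usage the file name promises is missing; a minimal sketch using the
# EmpId/Salary DataFrame above:
from pyspark.sql.functions import col, lit
df2 = df.select(col("EmpId"), col("Salary"), lit("1").alias("lit_value1"))
df2.show(truncate=False)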
data = [('James','Smith','M',30),
('Anna','Rose','F',41),
('Robert','Williams','M',62),
]
columns = ["firstname","lastname","gender","salary"]
df = spark.createDataFrame(data=data, schema = columns)
df.show()
from pyspark.sql.functions import concat_ws,col,lit
df.select(concat_ws(",",df.firstname,df.lastname).alias("name"), \
df.gender,lit(df.salary*2).alias("new_salary")).show()
print(df.collect())
rdd=df.rdd.map(lambda x:
(x[0]+","+x[1],x[2],x[3]*2)
)
df2=rdd.toDF(["name","gender","new_salary"] )
df2.show()
def func1(x):
firstName=x.firstname
lastName=x.lastname
name=firstName+","+lastName
gender=x.gender.lower()
salary=x.salary*2
return (name,gender,salary)
rdd2=df.rdd.map(lambda x: func1(x))
#Foreach example
def f(x): print(x)
df.rdd.foreach(f)
df.rdd.foreach(lambda x:
print("Data ==>"+x["firstname"]+","+x["lastname"]+","+x["gender"]+","+str(x["salary"]*2))
)
dataCollect=df.rdd.toLocalIterator()
for row in dataCollect:
print(row['firstname'] + "," +row['lastname'])
import pandas as pd
pandasDF = df.toPandas()
for index, row in pandasDF.iterrows():
print(row['firstname'], row['gender'])
pyspark-mappartitions.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
columns = ["firstname","lastname","gender","salary"]
df = spark.createDataFrame(data=data, schema = columns)
df.show()
#Example 1 mapPartitions()
def reformat(partitionData):
for row in partitionData:
yield [row.firstname+","+row.lastname,row.salary*10/100]
df.rdd.mapPartitions(reformat).toDF().show()
#Example 2 mapPartitions()
def reformat2(partitionData):
updatedData = []
for row in partitionData:
name=row.firstname+","+row.lastname
bonus=row.salary*10/100
updatedData.append([name,bonus])
return iter(updatedData)
df2=df.rdd.mapPartitions(reformat2).toDF(["name","bonus"])
df2.show()
pyspark-maptype-dataframe-column.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
dataDictionary = [
('James',{'hair':'black','eye':'brown'}),
('Michael',{'hair':'brown','eye':None}),
('Robert',{'hair':'red','eye':'black'}),
('Washington',{'hair':'grey','eye':'grey'}),
('Jefferson',{'hair':'brown','eye':''})
]
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, MapType
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
schema = StructType([
StructField('name', StringType(), True),
StructField('properties', MapType(StringType(), StringType()), True)
])
df = spark.createDataFrame(data=dataDictionary, schema=schema)
df.printSchema()
df.show(truncate=False)
df3=df.rdd.map(lambda x: \
(x.name,x.properties["hair"],x.properties["eye"])) \
.toDF(["name","hair","eye"])
df3.printSchema()
df3.show()
df.withColumn("hair",df.properties.getItem("hair")) \
.withColumn("eye",df.properties.getItem("eye")) \
.drop("properties") \
.show()
df.withColumn("hair",df.properties["hair"]) \
.withColumn("eye",df.properties["eye"]) \
.drop("properties") \
.show()
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum
spark = SparkSession.builder \
.appName('SparkByExamples.com') \
.getOrCreate()
simpleData = [("James","Sales","NY",90000,34,10000),
("Michael","Sales","NV",86000,56,20000),
("Robert","Sales","CA",81000,30,23000),
("Maria","Finance","CA",90000,24,23000),
("Raman","Finance","DE",99000,40,24000),
("Scott","Finance","NY",83000,36,19000),
("Jen","Finance","NY",79000,53,15000),
("Jeff","Marketing","NV",80000,25,18000),
("Kumar","Marketing","NJ",91000,50,21000)
]
schema = ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data=simpleData, schema = schema)
df.printSchema()
df.show(truncate=False)
dfSort=df.sort(df.state,df.salary).groupBy(df.state).agg(sum(df.salary))
dfSort.show()
pyspark-orderby.py
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 20 07:45:04 2020
@author: NNK
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, asc,desc
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
simpleData = [("James","Sales","NY",90000,34,10000), \
("Michael","Sales","NY",86000,56,20000), \
("Robert","Sales","CA",81000,30,23000), \
("Maria","Finance","CA",90000,24,23000), \
("Raman","Finance","CA",99000,40,24000), \
("Scott","Finance","NY",83000,36,19000), \
("Jen","Finance","NY",79000,53,15000), \
("Jeff","Marketing","CA",80000,25,18000), \
("Kumar","Marketing","NY",91000,50,21000) \
]
columns= ["employee_name","department","state","salary","age","bonus"]
df.printSchema()
df.show(truncate=False)
df.sort("department","state").show(truncate=False)
df.sort(col("department"),col("state")).show(truncate=False)
df.orderBy("department","state").show(truncate=False)
df.orderBy(col("department"),col("state")).show(truncate=False)
df.sort(df.department.asc(),df.state.asc()).show(truncate=False)
df.sort(col("department").asc(),col("state").asc()).show(truncate=False)
df.orderBy(col("department").asc(),col("state").asc()).show(truncate=False)
df.sort(df.department.asc(),df.state.desc()).show(truncate=False)
df.sort(col("department").asc(),col("state").desc()).show(truncate=False)
df.orderBy(col("department").asc(),col("state").desc()).show(truncate=False)
df.createOrReplaceTempView("EMP")
df.select("employee_name",asc("department"),desc("state"),"salary","age","bonus").show(truncate=
False)
pyspark-parallelize.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
rdd=spark.sparkContext.parallelize([1,2,3,4,5])
rddCollect = rdd.collect()
print("Number of Partitions: "+str(rdd.getNumPartitions()))
print("Action: First element: "+str(rdd.first()))
print(rddCollect)
emptyRDD = spark.sparkContext.emptyRDD()
emptyRDD2 = spark.sparkContext.parallelize([])
print("Is Empty RDD : "+str(emptyRDD2.isEmpty()))
pyspark-partitionby.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
# Create SparkSession
from pyspark.sql import SparkSession
spark = SparkSession.builder \
.appName('SparkByExamples.com') \
.getOrCreate()
df=spark.read.option("header",True) \
.csv("C:/apps/sparkbyexamples/src/pyspark-examples/resources/simple-zipcodes.csv")
df.show()
print(df.rdd.getNumPartitions())
df.write.option("header",True) \
.partitionBy("state") \
.mode("overwrite") \
.csv("c:/tmp/zipcodes-state")
df.write.option("header",True) \
.partitionBy("state","city") \
.mode("overwrite") \
.csv("c:/tmp/zipcodes-state-city")
df=df.repartition(2)
print(df.rdd.getNumPartitions())
df.write.option("header",True) \
.partitionBy("state") \
.mode("overwrite") \
.csv("c:/tmp/zipcodes-state-more")
dfPartition=spark.read.option("header",True)\
.csv("c:/tmp/zipcodes-state")
dfPartition.printSchema()
dfSinglePart=spark.read.option("header",True) \
.csv("c:/tmp/zipcodes-state/state=AL/city=SPRINGVILLE")
dfSinglePart.printSchema()
dfSinglePart.show()
parqDF = spark.read.option("header",True) \
.csv("c:/tmp/zipcodes-state")
parqDF.createOrReplaceTempView("ZIPCODE")
spark.sql("select * from ZIPCODE where state='AL' and city = 'SPRINGVILLE'") \
.show()
df.write.option("header",True) \
.option("maxRecordsPerFile", 2) \
.partitionBy("state") \
.mode("overwrite") \
.csv("/tmp/zipcodes-state-maxrecords")
pyspark-pivot.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
columns= ["Product","Amount","Country"]
df = spark.createDataFrame(data = data, schema = columns)
df.printSchema()
df.show(truncate=False)
pivotDF = df.groupBy("Product").pivot("Country").sum("Amount")
pivotDF.printSchema()
pivotDF.show(truncate=False)
pivotDF = df.groupBy("Product","Country") \
.sum("Amount") \
.groupBy("Product") \
.pivot("Country") \
.sum("sum(Amount)")
pivotDF.printSchema()
pivotDF.show(truncate=False)
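# The reverse operation (unpivot) as a sketch, using stack(); the country column
# names follow the sample data assumed above:
from pyspark.sql.functions import expr
unPivotDF = pivotDF.select("Product",
expr("stack(3, 'Canada', Canada, 'China', China, 'Mexico', Mexico) as (Country,Total)")) \
.where("Total is not null")
unPivotDF.show(truncate=False)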
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
dept = [("Finance",10), \
("Marketing",20), \
("Sales",30), \
("IT",40) \
]
rdd=spark.sparkContext.parallelize(dept)
print(rdd)
dataColl=rdd.collect()
print(dataColl)
deptDF = spark.createDataFrame(data=dept, schema=["dept_name","dept_id"])
dataCollect = deptDF.collect()
print(dataCollect)
dataCollect2 = deptDF.select("dept_name").collect()
print(dataCollect2)
"""
@author: NNK
"""
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
data = [("James","","Smith","36636","M",60000),
("Michael","Rose","","40288","M",70000),
("Robert","","Williams","42114","",400000),
("Maria","Anne","Jones","39192","F",500000),
("Jen","Mary","Brown","","F",0)]
columns = ["first_name","middle_name","last_name","dob","gender","salary"]
pysparkDF = spark.createDataFrame(data = data, schema = columns)
pysparkDF.printSchema()
pysparkDF.show(truncate=False)
pandasDF = pysparkDF.toPandas()
print(pandasDF)
schemaStruct = StructType([
StructField('name', StructType([
StructField('firstname', StringType(), True),
StructField('middlename', StringType(), True),
StructField('lastname', StringType(), True)
])),
StructField('dob', StringType(), True),
StructField('gender', StringType(), True),
StructField('salary', StringType(), True)
])
# Nested sample data (assumed) matching schemaStruct
dataStruct = [(("James","","Smith"),"36636","M","3000"),
(("Michael","Rose",""),"40288","M","4000"),
(("Robert","","Williams"),"42114","M","4000"),
(("Maria","Anne","Jones"),"39192","F","4000")]
df = spark.createDataFrame(data=dataStruct, schema=schemaStruct)
pandasDF2 = df.toPandas()
print(pandasDF2)
pyspark-range-partition.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
# Create SparkSession
from pyspark.sql import SparkSession
spark = SparkSession.builder \
.appName('SparkByExamples.com') \
.getOrCreate()
data = [(1,10),(2,20),(3,10),(4,20),(5,10),
(6,30),(7,50),(8,50),(9,50),(10,30),
(11,10),(12,10),(13,40),(14,40),(15,40),
(16,40),(17,50),(18,10),(19,40),(20,40)
]
df=spark.createDataFrame(data,["id","value"])
df.repartition(3,"value").explain(True)
df.repartition("value") \
.write.option("header",True) \
.mode("overwrite") \
.csv("c:/tmp/range-partition")
df.repartitionByRange("value").explain(True)
df.repartitionByRange(3,"value").explain(True)
df.repartitionByRange(3,"value") \
.write.option("header",True) \
.mode("overwrite") \
.csv("c:/tmp/range-partition-count")
pyspark-rdd-actions.py
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 14 10:20:19 2020
"""
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
# Two sample RDDs for the actions below (inputRDD values assumed)
inputRDD = spark.sparkContext.parallelize([("Z", 1),("A", 20),("B", 30),("C", 40),("B", 30),("B", 60)])
listRdd = spark.sparkContext.parallelize([1,2,3,4,5,3,2])
#aggregate
seqOp = (lambda x, y: x + y)
combOp = (lambda x, y: x + y)
agg=listRdd.aggregate(0, seqOp, combOp)
print(agg) # output 20
#aggregate 2
seqOp2 = (lambda x, y: (x[0] + y, x[1] + 1))
combOp2 = (lambda x, y: (x[0] + y[0], x[1] + y[1]))
agg2=listRdd.aggregate((0, 0), seqOp2, combOp2)
print(agg2) # output (20,7)
agg2=listRdd.treeAggregate(0,seqOp, combOp)
print(agg2) # output 20
#fold
from operator import add
foldRes=listRdd.fold(0, add)
print(foldRes) # output 20
#reduce
redRes=listRdd.reduce(add)
print(redRes) # output 20
#Collect
data = listRdd.collect()
print(data)
#countByValue, countByValueApprox
print("countByValue : "+str(listRdd.countByValue()))
#first
print("first : "+str(listRdd.first()))
#Output: first : 1
print("first : "+str(inputRDD.first()))
#Output: first : (Z,1)
#top
print("top : "+str(listRdd.top(2)))
#Output: take : 5,4
print("top : "+str(inputRDD.top(2)))
#Output: take : (Z,1),(C,40)
#min
print("min : "+str(listRdd.min()))
#Output: min : 1
print("min : "+str(inputRDD.min()))
#Output: min : (A,20)
#max
print("max : "+str(listRdd.max()))
#Output: max : 5
print("max : "+str(inputRDD.max()))
#Output: max : (Z,1)
pyspark-rdd-broadcast.py
# -*- coding: utf-8 -*-
'''
@author: sparkbyexamples.com
'''
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
data = [("James","Smith","USA","CA"),
("Michael","Rose","USA","NY"),
("Robert","Williams","USA","CA"),
("Maria","Jones","USA","FL")
]
rdd = spark.sparkContext.parallelize(data)
# Broadcast a small lookup dict of state codes so state_convert() works
states = {"NY":"New York", "CA":"California", "FL":"Florida"}
broadcastStates = spark.sparkContext.broadcast(states)
def state_convert(code):
    return broadcastStates.value[code]
result = rdd.map(lambda x: (x[0],x[1],x[2],state_convert(x[3]))).collect()
print(result)
pyspark-rdd-map.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
data = ["Project",
"Gutenberg’s",
"Alice’s",
"Adventures",
"in",
"Wonderland",
"Project",
"Gutenberg’s",
"Adventures",
"in",
"Wonderland",
"Project",
"Gutenberg’s"]
rdd=spark.sparkContext.parallelize(data)
rdd2=rdd.map(lambda x: (x,1))
for element in rdd2.collect():
print(element)
data = [('James','Smith','M',30),
('Anna','Rose','F',41),
('Robert','Williams','M',62),
]
columns = ["firstname","lastname","gender","salary"]
df = spark.createDataFrame(data=data, schema = columns)
df.show()
rdd2=df.rdd.map(lambda x:
(x[0]+","+x[1],x[2],x[3]*2)
)
df2=rdd2.toDF(["name","gender","new_salary"] )
df2.show()
def func1(x):
firstName=x.firstname
lastName=x.lastname
name=firstName+","+lastName
gender=x.gender.lower()
salary=x.salary*2
return (name,gender,salary)
df.rdd.map(lambda x: func1(x)).toDF().show()
df.rdd.map(func1).toDF().show()
pyspark-rdd-reduceByKey.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
# (word, 1) pairs to reduce by key (sample values assumed)
data = [('Project', 1), ('Gutenberg’s', 1), ('Alice’s', 1), ('Adventures', 1),
('in', 1), ('Wonderland', 1), ('Project', 1), ('Gutenberg’s', 1),
('Adventures', 1), ('in', 1), ('Wonderland', 1), ('Project', 1), ('Gutenberg’s', 1)]
rdd=spark.sparkContext.parallelize(data)
rdd2=rdd.reduceByKey(lambda a,b: a+b)
for element in rdd2.collect():
    print(element)
'''
@author: sparkbyexamples.com
'''
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
dept = [("Finance",10),
("Marketing",20),
("Sales",30),
("IT",40)
]
rdd = spark.sparkContext.parallelize(dept)
df = rdd.toDF()
df.printSchema()
df.show(truncate=False)
deptColumns = ["dept_name","dept_id"]
df2 = rdd.toDF(deptColumns)
df2.printSchema()
df2.show(truncate=False)
pyspark-rdd-wordcount-2.py
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 14 10:20:19 2020
"""
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
# Read the word data (path assumed; resources/test.txt is shown earlier in these notes)
rdd = spark.sparkContext.textFile("resources/test.txt")
#Flatmap
rdd2=rdd.flatMap(lambda x: x.split(" "))
for element in rdd2.collect():
print(element)
#map
rdd3=rdd2.map(lambda x: (x,1))
for element in rdd3.collect():
print(element)
#reduceByKey
rdd4=rdd3.reduceByKey(lambda a,b: a+b)
for element in rdd4.collect():
print(element)
#map
rdd5 = rdd4.map(lambda x: (x[1],x[0])).sortByKey()
for element in rdd5.collect():
print(element)
#filter
rdd6 = rdd5.filter(lambda x : 'a' in x[1])
for element in rdd6.collect():
print(element)
pyspark-rdd.py
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 14 10:20:19 2020
"""
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
print(spark)
rdd=spark.sparkContext.parallelize([1,2,3,4,56])
print("RDD count :"+str(rdd.count()))
rdd = spark.sparkContext.emptyRDD()
print(rdd)
rdd2 = spark.sparkContext.parallelize([])
print(rdd2)
pyspark-read-csv.py
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 13 21:08:30 2020
@author: NNK
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType
from pyspark.sql.functions import col,array_contains
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
df = spark.read.csv("C:/apps/sparkbyexamples/src/pyspark-examples/resources/zipcodes.csv")
df.printSchema()
df2 = spark.read.option("header",True) \
.csv("C:/apps/sparkbyexamples/src/pyspark-examples/resources/zipcodes.csv")
df2.printSchema()
schema = StructType() \
.add("RecordNumber",IntegerType(),True) \
.add("Zipcode",IntegerType(),True) \
.add("ZipCodeType",StringType(),True) \
.add("City",StringType(),True) \
.add("State",StringType(),True) \
.add("LocationType",StringType(),True) \
.add("Lat",DoubleType(),True) \
.add("Long",DoubleType(),True) \
.add("Xaxis",IntegerType(),True) \
.add("Yaxis",DoubleType(),True) \
.add("Zaxis",DoubleType(),True) \
.add("WorldRegion",StringType(),True) \
.add("Country",StringType(),True) \
.add("LocationText",StringType(),True) \
.add("Location",StringType(),True) \
.add("Decommisioned",BooleanType(),True) \
.add("TaxReturnsFiled",StringType(),True) \
.add("EstimatedPopulation",IntegerType(),True) \
.add("TotalWages",IntegerType(),True) \
.add("Notes",StringType(),True)
df_with_schema = spark.read.format("csv") \
.option("header", True) \
.schema(schema) \
.load("C:/apps/sparkbyexamples/src/pyspark-examples/resources/zipcodes.csv")
df_with_schema.printSchema()
df2.write.option("header",True) \
.csv("/tmp/spark_output/zipcodes123")
pyspark-read-json.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, BooleanType, DoubleType
spark = SparkSession.builder \
.master("local[1]") \
.appName("SparkByExamples.com") \
.getOrCreate()
# Schema matching the zipcodes JSON shown at the top of these notes
schema = StructType([
StructField("RecordNumber", IntegerType(), True),
StructField("Zipcode", IntegerType(), True),
StructField("ZipCodeType", StringType(), True),
StructField("City", StringType(), True),
StructField("State", StringType(), True)
])
df_with_schema = spark.read.schema(schema) \
.json("resources/zipcodes.json")
df_with_schema.printSchema()
df_with_schema.show()
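# Sketch (file name assumed): multiline JSON needs the multiline option, since
# spark.read.json() expects one JSON object per line by default.
multiline_df = spark.read.option("multiline","true") \
.json("resources/multiline-zipcode.json")
multiline_df.show()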
'''
@author: sparkbyexamples.com
'''
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
from pyspark.sql.functions import *
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
dataDF = [(('James','','Smith'),'1991-04-01','M',3000),
(('Michael','Rose',''),'2000-05-19','M',4000),
(('Robert','','Williams'),'1978-09-05','M',4000),
(('Maria','Anne','Jones'),'1967-12-01','F',4000),
(('Jen','Mary','Brown'),'1980-02-17','F',-1)
]
schema = StructType([
StructField('name', StructType([
StructField('firstname', StringType(), True),
StructField('middlename', StringType(), True),
StructField('lastname', StringType(), True)
])),
StructField('dob', StringType(), True),
StructField('gender', StringType(), True),
StructField('salary', IntegerType(), True)
])
df = spark.createDataFrame(data=dataDF, schema=schema)
# schema2 renames the nested name fields via a struct cast
# (assumed definition; schema2 was not included in these notes)
schema2 = StructType([
    StructField("fname", StringType(), True),
    StructField("mname", StringType(), True),
    StructField("lname", StringType(), True)
])
df.select(col("name").cast(schema2),
          col("dob"),
          col("gender"),
          col("salary")) \
  .printSchema()
''' Example 6
not working in PySpark: this is Scala code, kept for reference (see the PySpark equivalent below)
val old_columns = Seq("dob","gender","salary","fname","mname","lname")
val new_columns = Seq("DateOfBirth","Sex","salary","firstName","middleName","lastName")
val columnsList = old_columns.zip(new_columns).map(f=>{col(f._1).as(f._2)})
val df5 = df4.select(columnsList:_*)
df5.printSchema()
'''
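A working PySpark equivalent of the Scala snippet above (a sketch: df4, a DataFrame with the six flattened columns listed, is assumed to exist):
old_columns = ["dob","gender","salary","fname","mname","lname"]
new_columns = ["DateOfBirth","Sex","salary","firstName","middleName","lastName"]
columnsList = [col(old).alias(new) for old, new in zip(old_columns, new_columns)]
df5 = df4.select(*columnsList)
df5.printSchema()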
''' Example 7 '''
# toDF() takes the new column names as varargs, so the list must be unpacked
newColumns = ["newCol1","newCol2","newCol3","newCol4"]
df.toDF(*newColumns) \
    .printSchema()
pyspark-repace-null.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
from pyspark.sql import SparkSession
spark = SparkSession.builder \
.master("local[1]") \
.appName("SparkByExamples.com") \
.getOrCreate()
filePath="resources/small_zipcode.csv"
df = spark.read.options(header='true', inferSchema='true') \
.csv(filePath)
df.printSchema()
df.show(truncate=False)
df.fillna(value=0).show()
df.fillna(value=0,subset=["population"]).show()
df.na.fill(value=0).show()
df.na.fill(value=0,subset=["population"]).show()
df.fillna(value="").show()
df.na.fill(value="").show()
df.fillna("unknown",["city"]) \
.fillna("",["type"]).show()
df.na.fill("unknown",["city"]) \
.na.fill("",["type"]).show()
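Beyond filling nulls, specific values can be replaced too (a minimal sketch; na.replace() is a standard DataFrameNaFunctions method, and the PR/Puerto Rico mapping is illustrative):
df.na.replace("PR", "Puerto Rico", "state").show()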
pyspark-repartition.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com') \
.master("local[5]").getOrCreate()
df = spark.range(0,20)
print(df.rdd.getNumPartitions())
spark.conf.set("spark.sql.shuffle.partitions", "500")
rdd = spark.sparkContext.parallelize(range(0,20))
print("From local[5] : "+str(rdd.getNumPartitions()))
rdd1 = spark.sparkContext.parallelize(range(0,25), 6)
print("parallelize : "+str(rdd1.getNumPartitions()))
"""rddFromFile = spark.sparkContext.textFile("src/main/resources/test.txt",10)
print("TextFile : "+str(rddFromFile.getNumPartitions())) """
rdd1.saveAsTextFile("c://tmp/partition2")
rdd2 = rdd1.repartition(4)
print("Repartition size : "+str(rdd2.getNumPartitions()))
rdd2.saveAsTextFile("c://tmp/re-partition2")
rdd3 = rdd1.coalesce(4)
print("Repartition size : "+str(rdd3.getNumPartitions()))
rdd3.saveAsTextFile("c:/tmp/coalesce2")
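To inspect how elements actually land in partitions after repartition() and coalesce() (a minimal sketch; glom() turns each partition into a list):
print(rdd2.glom().collect())
print(rdd3.glom().collect())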
pyspark-row.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
row=Row("James",40)
print(row[0] +","+str(row[1]))
row2=Row(name="Alice", age=11)
print(row2.name)
#PySpark Example
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
rdd2 = spark.sparkContext.parallelize([],10)
data = [Row(name="James,,Smith",lang=["Java","Scala","C++"],state="CA"),
Row(name="Michael,Rose,",lang=["Spark","Java","C++"],state="NJ"),
Row(name="Robert,,Williams",lang=["CSharp","VB"],state="NV")]
#RDD Example 1
rdd=spark.sparkContext.parallelize(data)
collData=rdd.collect()
print(collData)
for row in collData:
    print(row.name + "," + str(row.lang))
# RDD Example 2
Person=Row("name","lang","state")
data = [Person("James,,Smith",["Java","Scala","C++"],"CA"),
Person("Michael,Rose,",["Spark","Java","C++"],"NJ"),
Person("Robert,,Williams",["CSharp","VB"],"NV")]
rdd=spark.sparkContext.parallelize(data)
collData=rdd.collect()
print(collData)
for person in collData:
    print(person.name + "," + str(person.lang))
#DataFrame Example 1
columns = ["name","languagesAtSchool","currentState"]
df=spark.createDataFrame(data)
df.printSchema()
df.show()
collData=df.collect()
print(collData)
for row in collData:
    print(row.name + "," + str(row.lang))
#DataFrame Example 2
data = [("James,,Smith",["Java","Scala","C++"],"CA"),
("Michael,Rose,",["Spark","Java","C++"],"NJ"),
("Robert,,Williams",["CSharp","VB"],"NV")]
columns = ["name","languagesAtSchool","currentState"]
df=spark.createDataFrame(data).toDF(*columns)
df.printSchema()
for row in df.collect():
    print(row.name)
pyspark-sampling.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
df=spark.range(100)
'''sample() '''
print(df.sample(0.06).collect())
print(df.sample(0.1,123).collect())
print(df.sample(0.1,123).collect())
print(df.sample(0.1,456).collect())
print("withReplacement Examples")
print(df.sample(True,0.3,123).collect())
print(df.sample(0.3,123).collect())
'''sampleBy() '''
print("sampleBy Examples")
df2=df.select((df.id % 3).alias("key"))
print(df2.sampleBy("key", {0: 0.1, 1: 0.2},0).collect())
print("RDD Examples")
'''RDD'''
rdd = spark.sparkContext.range(0,100)
print(rdd.sample(False,0.1,0).collect())
print(rdd.sample(True,0.3,123).collect())
"""
@author: NNK
"""
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
data = [("James","Smith","USA","CA"),
("Michael","Rose","USA","NY"),
("Robert","Williams","USA","CA"),
("Maria","Jones","USA","FL")
]
columns = ["firstname","lastname","country","state"]
df = spark.createDataFrame(data = data, schema = columns)
df.show(truncate=False)
df.select("firstname","lastname").show()
df.select(df.columns[:3]).show(3)
df.select(df.columns[2:4]).show(3)
df.select(df.colRegex("`^.*name*`")).show()
data = [
(("James",None,"Smith"),"OH","M"),
(("Anna","Rose",""),"NY","F"),
(("Julia","","Williams"),"OH","F"),
(("Maria","Anne","Jones"),"NY","M"),
(("Jen","Mary","Brown"),"NY","M"),
(("Mike","Mary","Williams"),"OH","M")
]
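The nested-name rows above are left dangling in these notes; a minimal sketch to build the DataFrame and select nested struct columns (the schema definition is assumed, following the struct pattern used elsewhere in these notes):
from pyspark.sql.types import StructType, StructField, StringType
schema = StructType([
    StructField('name', StructType([
        StructField('firstname', StringType(), True),
        StructField('middlename', StringType(), True),
        StructField('lastname', StringType(), True)
    ])),
    StructField('state', StringType(), True),
    StructField('gender', StringType(), True)
])
df2 = spark.createDataFrame(data=data, schema=schema)
df2.select("name.firstname", "name.lastname").show(truncate=False)
df2.select("name.*").show(truncate=False)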
spark = SparkSession.builder \
.master("local[1]") \
.appName("SparkByExamples.com") \
.getOrCreate()
data = [('Scott', 50), ('Jeff', 45), ('Thomas', 54),('Ann',34)]
sparkDF=spark.createDataFrame(data,["name","age"])
sparkDF.printSchema()
sparkDF.show()
print((sparkDF.count(), len(sparkDF.columns)))
def sparkShape(dataFrame):
    return (dataFrame.count(), len(dataFrame.columns))
import pyspark
pyspark.sql.dataframe.DataFrame.shape = sparkShape
print(sparkDF.shape())
import pandas as pd
pandasDF=sparkDF.toPandas()
print(pandasDF.shape)
pyspark-show-top-n-rows.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
simpleData = [("James",34),("Ann",34),
("Michael",33),("Scott",53),
("Robert",37),("Chad",27)
]
columns = ["firstname","age",]
df = spark.createDataFrame(data = simpleData, schema = columns)
df.show()
# take() returns the first `num` rows as a list of Row objects;
# internally it calls limit and collect (an action returning Array[T]).
print(df.take(2))
# Note: take() should only be used if the resulting list is expected
# to be small, as all the data is loaded into the driver's memory.
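Besides take(), a few equivalent ways to fetch the top rows (a minimal sketch; head(), first(), and limit() are standard DataFrame methods):
print(df.head(2))      # same as take(2)
print(df.first())      # first Row only
df.limit(2).show()     # transformation; returns a new DataFrame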
pyspark-sparksession.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[1]") \
.appName('SparkByExamples.com') \
.getOrCreate()
print("First SparkContext:");
print("APP Name :"+spark.sparkContext.appName);
print("Master :"+spark.sparkContext.master);
sparkSession2 = SparkSession.builder \
.master("local[1]") \
.appName("SparkByExample-test") \
.getOrCreate();
print("Second SparkContext:")
print("APP Name :"+sparkSession2.sparkContext.appName);
print("Master :"+sparkSession2.sparkContext.master);
sparkSession3 = SparkSession.newSession
print("Second SparkContext:")
print("APP Name :"+sparkSession3.sparkContext.appName);
print("Master :"+sparkSession3.sparkContext.master);
pyspark-split-function.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col
spark=SparkSession.builder.appName("sparkbyexamples").getOrCreate()
data = [('James','','Smith','1991-04-01'),
('Michael','Rose','','2000-05-19'),
('Robert','','Williams','1978-09-05'),
('Maria','Anne','Jones','1967-12-01'),
('Jen','Mary','Brown','1980-02-17')
]
columns=["firstname","middlename","lastname","dob"]
df=spark.createDataFrame(data,columns)
df.printSchema()
df.show(truncate=False)
df1 = df.withColumn('year', split(df['dob'], '-').getItem(0)) \
.withColumn('month', split(df['dob'], '-').getItem(1)) \
.withColumn('day', split(df['dob'], '-').getItem(2))
df1.printSchema()
df1.show(truncate=False)
"""
df4=spark.createDataFrame([("20-13-2012-monday",)], ['date',])
df4.select(split(df4.date,'^([\d]+-[\d]+-[\d])').alias('date'),
regexp_replace(split(df4.date,'^([\d]+-[\d]+-[\d]+)').getItem(1),'-','').alias('day')).show()
"""
df4 = spark.createDataFrame([('oneAtwoBthree',)], ['str',])
df4.select(split(df4.str, '[AB]').alias('str')).show()
df4.select(split(df4.str, '[AB]',2).alias('str')).show()
df4.select(split(df4.str, '[AB]',1).alias('str')).show()
pyspark-sql-case-when.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
columns = ["name","gender","salary"]
df = spark.createDataFrame(data = data, schema = columns)
df.show()
#Using When otherwise
from pyspark.sql.functions import when,col
df2 = df.withColumn("new_gender", when(df.gender == "M","Male")
.when(df.gender == "F","Female")
.when(df.gender.isNull() ,"")
.otherwise(df.gender))
df2.show()
df2=df.select(col("*"),when(df.gender == "M","Male")
.when(df.gender == "F","Female")
.when(df.gender.isNull() ,"")
.otherwise(df.gender).alias("new_gender"))
df2.show()
# Using SQL Case When
from pyspark.sql.functions import expr
df3 = df.withColumn("new_gender", expr("CASE WHEN gender = 'M' THEN 'Male' " +
"WHEN gender = 'F' THEN 'Female' WHEN gender IS NULL THEN ''" +
"ELSE gender END"))
df3.show()
df.createOrReplaceTempView("EMP")
spark.sql("select name, CASE WHEN gender = 'M' THEN 'Male' " +
"WHEN gender = 'F' THEN 'Female' WHEN gender IS NULL THEN ''" +
"ELSE gender END as new_gender from EMP").show()
pyspark-string-date.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
# Create SparkSession
from pyspark.sql import SparkSession
spark = SparkSession.builder \
.appName('SparkByExamples.com') \
.getOrCreate()
#SQL
spark.sql("select to_date('02-03-2013','MM-dd-yyyy') date").show()
pyspark-string-timestamp.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
# Create SparkSession
from pyspark.sql import SparkSession
spark = SparkSession.builder \
.appName('SparkByExamples.com') \
.getOrCreate()
df=spark.createDataFrame(
data = [ ("1","2019-06-24 12:01:19.000")],
schema=["id","input_timestamp"])
df.printSchema()
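The actual string-to-timestamp conversion (a minimal sketch; to_timestamp is the standard function for this):
from pyspark.sql.functions import to_timestamp, col
df.withColumn("timestamp", to_timestamp(col("input_timestamp"))) \
    .printSchema()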
columns=["name","dob_year","gender","salary"]
df=spark.createDataFrame(data,columns)
df.printSchema()
df.show(truncate=False)
df.createOrReplaceTempView("PERSON")
spark.sql("select SPLIT(name,',') as NameArray from PERSON") \
.show()
pyspark-struct-to-map.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,ArrayType,MapType
from pyspark.sql.functions import col,struct,when
spark = SparkSession.builder.master("local[1]") \
.appName('SparkByExamples.com') \
.getOrCreate()
data = [("James","","Smith","36636","M",3000),
("Michael","Rose","","40288","M",4000),
("Robert","","Williams","42114","M",4000),
("Maria","Anne","Jones","39192","F",4000),
("Jen","Mary","Brown","","F",-1)
]
schema = StructType([
StructField("firstname",StringType(),True),
StructField("middlename",StringType(),True),
StructField("lastname",StringType(),True),
StructField("id", StringType(), True),
StructField("gender", StringType(), True),
StructField("salary", IntegerType(), True)
])
df = spark.createDataFrame(data=data,schema=schema)
df.printSchema()
df.show(truncate=False)
structureData = [
(("James","","Smith"),"36636","M",3100),
(("Michael","Rose",""),"40288","M",4300),
(("Robert","","Williams"),"42114","M",1400),
(("Maria","Anne","Jones"),"39192","F",5500),
(("Jen","Mary","Brown"),"","F",-1)
]
structureSchema = StructType([
StructField('name', StructType([
StructField('firstname', StringType(), True),
StructField('middlename', StringType(), True),
StructField('lastname', StringType(), True)
])),
StructField('id', StringType(), True),
StructField('gender', StringType(), True),
StructField('salary', IntegerType(), True)
])
df2 = spark.createDataFrame(data=structureData,schema=structureSchema)
df2.printSchema()
df2.show(truncate=False)
updatedDF = df2.withColumn("OtherInfo",
struct(col("id").alias("identifier"),
col("gender").alias("gender"),
col("salary").alias("salary"),
when(col("salary").cast(IntegerType()) < 2000,"Low")
.when(col("salary").cast(IntegerType()) < 4000,"Medium")
.otherwise("High").alias("Salary_Grade")
)).drop("id","gender","salary")
updatedDF.printSchema()
updatedDF.show(truncate=False)
arrayStructureSchema = StructType([
StructField('name', StructType([
StructField('firstname', StringType(), True),
StructField('middlename', StringType(), True),
StructField('lastname', StringType(), True)
])),
StructField('hobbies', ArrayType(StringType()), True),
StructField('properties', MapType(StringType(),StringType()), True)
])
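The notes stop before the struct-to-map conversion the file name promises; a minimal sketch using create_map over the OtherInfo struct built above (the map key names are illustrative, and salary is cast to string so all map values share one type):
from pyspark.sql.functions import create_map, lit
dfMap = updatedDF.withColumn("OtherInfoMap", create_map(
        lit("identifier"), col("OtherInfo.identifier"),
        lit("gender"), col("OtherInfo.gender"),
        lit("salary"), col("OtherInfo.salary").cast("string"),
        lit("salary_grade"), col("OtherInfo.Salary_Grade")
    )).drop("OtherInfo")
dfMap.printSchema()
dfMap.show(truncate=False)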
pyspark-time-diff.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
# Create SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, round, to_timestamp, to_date, current_timestamp, unix_timestamp
spark = SparkSession.builder \
    .appName('SparkByExamples.com') \
    .getOrCreate()
dates = [("1","2019-07-01 12:01:19.111"),
("2","2019-06-24 12:01:19.222"),
("3","2019-11-16 16:44:55.406"),
("4","2019-11-16 16:50:59.406")
]
df = spark.createDataFrame(data=dates, schema=["id","from_timestamp"])
df2 = df.withColumn('from_timestamp',to_timestamp(col('from_timestamp')))\
    .withColumn('end_timestamp', current_timestamp())\
    .withColumn('DiffInSeconds',unix_timestamp("end_timestamp") -
                unix_timestamp('from_timestamp'))
df2.show(truncate=False)
df2.withColumn('DiffInMinutes',round(col('DiffInSeconds')/60))\
    .show(truncate=False)
df2.withColumn('DiffInHours',round(col('DiffInSeconds')/3600))\
    .show(truncate=False)
data= [("12:01:19.000","13:01:19.000"),
("12:01:19.000","12:02:19.000"),
("16:44:55.406","17:44:55.406"),
("16:50:59.406","16:44:59.406")]
df3 = spark.createDataFrame(data=data, schema=["from_timestamp","to_timestamp"])
df3.withColumn("from_timestamp",to_timestamp(col("from_timestamp"),"HH:mm:ss.SSS")) \
.withColumn("to_timestamp",to_timestamp(col("to_timestamp"),"HH:mm:ss.SSS")) \
.withColumn("DiffInSeconds", col("from_timestamp").cast("long") -
col("to_timestamp").cast("long")) \
.withColumn("DiffInMinutes",round(col("DiffInSeconds")/60)) \
.withColumn("DiffInHours",round(col("DiffInSeconds")/3600)) \
.show(truncate=False)
df3 = spark.createDataFrame(
data=[("1","07-01-2019 12:01:19.406")],
schema=["id","input_timestamp"]
)
df3.withColumn("input_timestamp",to_timestamp(col("input_timestamp"),"MM-dd-yyyy
HH:mm:ss.SSS")) \
.withColumn("current_timestamp",current_timestamp().alias("current_timestamp")) \
.withColumn("DiffInSeconds",current_timestamp().cast("long") -
col("input_timestamp").cast("long")) \
.withColumn("DiffInMinutes",round(col("DiffInSeconds")/60)) \
.withColumn("DiffInHours",round(col("DiffInSeconds")/3600)) \
.withColumn("DiffInDays",round(col("DiffInSeconds")/24*3600)) \
.show(truncate=False)
#SQL
# Create SparkSession
spark = SparkSession.builder \
.appName('SparkByExamples.com') \
.getOrCreate()
df=spark.createDataFrame(
data = [ ("1","2019-06-24 12:01:19.000")],
schema=["id","input_timestamp"])
df.printSchema()
df.withColumn("ts",to_timestamp(col("input_timestamp"))) \
.withColumn("datetype",to_date(col("ts"))) \
.show(truncate=False)
pyspark-types.py
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 14 10:20:19 2020
"""
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, MapType
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
# MapType example (assumed definition; mapType was not included in these notes)
mapType = MapType(StringType(), IntegerType())
print(mapType.keyType)
print(mapType.valueType)
print(mapType.valueContainsNull)
data = [("James","","Smith","36","M",3000),
("Michael","Rose","","40","M",4000),
("Robert","","Williams","42","M",4000),
("Maria","Anne","Jones","39","F",4000),
("Jen","Mary","Brown","","F",-1)
]
schema = StructType([
StructField("firstname",StringType(),True),
StructField("middlename",StringType(),True),
StructField("lastname",StringType(),True),
StructField("age", StringType(), True),
StructField("gender", StringType(), True),
StructField("salary", IntegerType(), True)
])
df = spark.createDataFrame(data=data,schema=schema)
df.printSchema()
df.show(truncate=False)
pyspark-udf.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
columns = ["Seqno","Name"]
data = [("1", "john jones"),
("2", "tracey smith"),
("3", "amy sanders")]
df = spark.createDataFrame(data=data,schema=columns)
df.show(truncate=False)
def convertCase(str):
    resStr=""
    arr = str.split(" ")
    for x in arr:
        resStr = resStr + x[0:1].upper() + x[1:len(x)] + " "
    return resStr

# Wrap the Python function as a UDF (this line was missing from the notes)
convertUDF = udf(lambda z: convertCase(z), StringType())
df.select(col("Seqno"), \
    convertUDF(col("Name")).alias("Name") ) \
.show(truncate=False)
@udf(returnType=StringType())
def upperCase(str):
    return str.upper()
columns = ["Seqno","Name"]
data = [("1", "john jones"),
("2", "tracey smith"),
("3", "amy sanders"),
('4',None)]
df2 = spark.createDataFrame(data=data,schema=columns)
df2.show(truncate=False)
df2.createOrReplaceTempView("NAME_TABLE2")
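df2 contains a None name, which makes a plain UDF like convertUDF fail at runtime; a minimal sketch of a null-safe variant registered for SQL use on the view above (the _nullsafeUDF name is illustrative):
spark.udf.register("_nullsafeUDF", lambda str: convertCase(str) if str is not None else "", StringType())
spark.sql("select _nullsafeUDF(Name) from NAME_TABLE2").show(truncate=False)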
pyspark-union.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
simpleData = [("James","Sales","NY",90000,34,10000), \
("Michael","Sales","NY",86000,56,20000), \
("Robert","Sales","CA",81000,30,23000), \
("Maria","Finance","CA",90000,24,23000) \
]
columns= ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df.printSchema()
df.show(truncate=False)
simpleData2 = [("James","Sales","NY",90000,34,10000), \
("Maria","Finance","CA",90000,24,23000), \
("Jen","Finance","NY",79000,53,15000), \
("Jeff","Marketing","CA",80000,25,18000), \
("Kumar","Marketing","NY",91000,50,21000) \
]
columns2= ["employee_name","department","state","salary","age","bonus"]
df2.printSchema()
df2.show(truncate=False)
unionDF = df.union(df2)
unionDF.show(truncate=False)
disDF = df.union(df2).distinct()
disDF.show(truncate=False)
unionAllDF = df.unionAll(df2)
unionAllDF.show(truncate=False)
pyspark-unix-time.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
# Create SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_unixtime
spark = SparkSession.builder \
    .appName('SparkByExamples.com') \
    .getOrCreate()
# Sample epoch-second inputs (assumed; df2 was not included in these notes)
df2 = spark.createDataFrame(
    [(1593786389, 1593786389, 1593786389, 1593786389)],
    ["timestamp_1","timestamp_2","timestamp_3","timestamp_4"])
df3=df2.select(
from_unixtime(col("timestamp_1")).alias("timestamp_1"),
from_unixtime(col("timestamp_2"),"MM-dd-yyyy HH:mm:ss").alias("timestamp_2"),
from_unixtime(col("timestamp_3"),"MM-dd-yyyy").alias("timestamp_3"),
from_unixtime(col("timestamp_4")).alias("timestamp_4")
)
df3.printSchema()
df3.show(truncate=False)
pyspark-update-column.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
data = [('James','Smith','M',3000),('Anna','Rose','F',4100),
        ('Robert','Williams','NA',6200),(None,'Rob','F',6200)]
columns = ["firstname","lastname","gender","salary"]
df = spark.createDataFrame(data=data, schema = columns)
df.show()
df2=df.withColumn("salary", df.salary*3)
df2.show()
df4=df.withColumn("salary",df.salary.cast("String"))
df4.printSchema()
df.createOrReplaceTempView("PER")
df5=spark.sql("select firstname,gender,salary*3 as salary from PER")
df5.show()
pyspark-when-otherwise.py
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 13 21:08:30 2020
@author: NNK
"""
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
data = [("James","","Smith","36636","M",60000),
("Michael","Rose","","40288","M",70000),
("Robert","","Williams","42114","",400000),
("Maria","Anne","Jones","39192","F",500000),
("Jen","Mary","Brown","","F",0)]
columns = ["first_name","middle_name","last_name","dob","gender","salary"]
df = spark.createDataFrame(data = data, schema = columns)
df.printSchema()
df.show(truncate=False)
data2 = [(66, "a", "4"), (67, "a", "0"), (70, "b", "4"), (71, "d", "4")]
df5 = spark.createDataFrame(data = data2, schema = ["id", "code", "amt"])
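The file name promises when/otherwise, but the notes stop at the sample data; a minimal sketch over the df built above:
from pyspark.sql.functions import when, col
df.withColumn("new_gender",
        when(col("gender") == "M", "Male")
        .when(col("gender") == "F", "Female")
        .otherwise("Unknown")) \
    .show(truncate=False)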
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import col, avg, sum, min, max, row_number
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
# Sample data with a department column (assumed; the original notes omit it)
simpleData = [("James","Sales",3000),("Michael","Sales",4600),
              ("Robert","Sales",4100),("Maria","Finance",3000),
              ("Scott","Finance",3300),("Jen","Finance",3900)]
df = spark.createDataFrame(simpleData, ["employee_name","department","salary"])
df.printSchema()
df.show(truncate=False)
windowSpec = Window.partitionBy("department").orderBy("salary")
df.withColumn("row_number",row_number().over(windowSpec)) \
    .show(truncate=False)
windowSpecAgg = Window.partitionBy("department")
df.withColumn("row",row_number().over(windowSpec)) \
.withColumn("avg", avg(col("salary")).over(windowSpecAgg)) \
.withColumn("sum", sum(col("salary")).over(windowSpecAgg)) \
.withColumn("min", min(col("salary")).over(windowSpecAgg)) \
.withColumn("max", max(col("salary")).over(windowSpecAgg)) \
.where(col("row")==1).select("department","avg","sum","min","max") \
.show()
pyspark-withcolumn.py
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 14 10:20:19 2020
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from pyspark.sql.types import StructType, StructField, StringType,IntegerType
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
data = [('James','','Smith','1991-04-01','M',3000),
('Michael','Rose','','2000-05-19','M',4000),
('Robert','','Williams','1978-09-05','M',4000),
('Maria','Anne','Jones','1967-12-01','F',4000),
('Jen','Mary','Brown','1980-02-17','F',-1)
]
columns = ["firstname","middlename","lastname","dob","gender","salary"]
df = spark.createDataFrame(data=data, schema = columns)
df.printSchema()
df.show(truncate=False)
df2 = df.withColumn("salary",col("salary").cast("Integer"))
df2.printSchema()
df2.show(truncate=False)
df3 = df.withColumn("salary",col("salary")*100)
df3.printSchema()
df3.show(truncate=False)
df.withColumnRenamed("gender","sex") \
.show(truncate=False)
df4.drop("CopiedColumn") \
.show(truncate=False)
dataStruct = [(("James","","Smith"),"36636","M","3000"), \
(("Michael","Rose",""),"40288","M","4000"), \
(("Robert","","Williams"),"42114","M","4000"), \
(("Maria","Anne","Jones"),"39192","F","4000"), \
(("Jen","Mary","Brown"),"","F","-1") \
]
schemaStruct = StructType([
StructField('name', StructType([
StructField('firstname', StringType(), True),
StructField('middlename', StringType(), True),
StructField('lastname', StringType(), True)
])),
StructField('dob', StringType(), True),
StructField('gender', StringType(), True),
StructField('salary', StringType(), True)
])
"""
columns = ["name","address"]
data = [("Robert, Smith", "1 Main st, Newark, NJ, 92537"), \
("Maria, Garcia","3456 Walnut st, Newark, NJ, 94732")]
newDF = dfFromData.map(f=>{
nameSplit = f.getAs[String](0).split(",")
addSplit = f.getAs[String](1).split(",")
(nameSplit(0),nameSplit(1),addSplit(0),addSplit(1),addSplit(2),addSplit(3))
})
finalDF = newDF.toDF("First Name","Last Name",
"Address Line1","City","State","zipCode")
finalDF.printSchema()
finalDF.show(false)
"""
python-pandas.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
import pandas as pd
data = [["James","","Smith",30,"M",60000],
["Michael","Rose","",50,"M",70000],
["Robert","","Williams",42,"",400000],
["Maria","Anne","Jones",38,"F",500000],
["Jen","Mary","Brown",45,None,0]]
columns = ['First Name', 'Middle Name','Last Name','Age','Gender','Salary']
pandasDF = pd.DataFrame(data=data, columns=columns)
# print dataframe
print(pandasDF)
pdCount=pandasDF.count()
print(pdCount)
print(pandasDF.max())
print(pandasDF.mean())
schema.py
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 24 22:42:50 2019
@author: prabha
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType,DateType
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
schema = StructType([
StructField("city", StringType(), True),
StructField("dates", StringType(), True),
StructField("population", IntegerType(), True)])
# Sample inputs (assumed; these lists were not included in the notes)
cities = ["New York", "Chicago", "Houston"]
dates = ["2019-10-01", "2019-10-02", "2019-10-03"]
population = [8398748, 2705994, 2325502]
# Dataframe:
df = spark.createDataFrame(list(zip(cities, dates, population)), schema=schema)
df.show(truncate=False)
spark-repartition-2.py
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
# Create SparkSession
spark = SparkSession.builder \
.appName('SparkByExamples.com') \
.getOrCreate()
df=spark.read.option("header",True) \
.csv("C:/apps/sparkbyexamples/src/pyspark-examples/resources/simple-zipcodes.csv")
newDF=df.repartition(3)
print(newDF.rdd.getNumPartitions())
newDF.write.option("header",True).mode("overwrite") \
.csv("/tmp/zipcodes-state")
df2=df.repartition(4,"state")
df2.write.option("header",True).mode("overwrite") \
.csv("/tmp/zipcodes-state-3states")
df3=df.repartition("state")
df3.write.option("header",True).mode("overwrite") \
.csv("/tmp/zipcodes-state-allstates")
timediff.py
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 24 22:42:50 2019
@author: prabha
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, round
from pyspark.sql.functions import to_timestamp, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
schema = StructType([
    StructField("input_timestamp", StringType(), True)])
# Sample input (assumed; df was not created in the notes)
df = spark.createDataFrame([("2019-07-01 12:01:19",)], schema=schema)
df.withColumn('input_timestamp',to_timestamp(col('input_timestamp')))\
.withColumn('current_timestamp', current_timestamp().alias('current_timestamp'))\
.withColumn('DiffInSeconds',current_timestamp().cast(LongType()) -
col('input_timestamp').cast(LongType()))\
.withColumn('DiffInMinutes',round(col('DiffInSeconds')/60))\
.withColumn('DiffInHours',round(col('DiffInSeconds')/3600))\
    .withColumn('DiffInDays',round(col('DiffInSeconds')/(24*3600)))\
.show()