Sample pyspark

Minor modifications to the example provided in the Spark docs.

  • Changed builder() to builder
  • Provided the appName: appName("SimpleApp")
  • Removed the master(master)
from pyspark.sql import SparkSession

# Count the lines containing the letters 'a' and 'b' in a text file using a
# Spark DataFrame, then print both counts.

# Should be some file on your system.
# NOTE(review): this path ends in a directory separator; point it at an
# actual text file (or confirm that reading the whole directory is intended).
logFile = "/home/hadoop/spark/"

# `builder` is an attribute (not a method); no explicit master is set here, so
# it is taken from the launch environment (e.g. spark-submit).
spark = SparkSession.builder.appName("SimpleApp").getOrCreate()

# Each line of the input becomes one row with a single string column named
# `value`. Cache the DataFrame because it is scanned twice below.
logData = spark.read.text(logFile).cache()

numAs = logData.filter(logData.value.contains('a')).count()
numBs = logData.filter(logData.value.contains('b')).count()

print("Lines with a: %i, lines with b: %i" % (numAs, numBs))

# Release the session's resources once the job is done.
spark.stop()


