Unable to create dataframe from json dstream using pyspark

Question

Unable to create dataframe from json dstream using pyspark

I am trying to create a data frame from json to dstream, but the code below does not give the correct data frame -

import sys
import json
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext
def getSqlContextInstance(sparkContext):
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']

if __name__ == "__main__":
    if len(sys.argv) != 3:
        raise IOError("Invalid usage; the correct format is:\nquadrant_count.py <hostname> <port>")

# Initialize a SparkContext with a name
spc = SparkContext(appName="jsonread")
sqlContext = SQLContext(spc)
# Create a StreamingContext with a batch interval of 2 seconds
stc = StreamingContext(spc, 2)
# Checkpointing feature
stc.checkpoint("checkpoint")
# Creating a DStream to connect to hostname:port (like localhost:9999)
lines = stc.socketTextStream(sys.argv[1], int(sys.argv[2]))
lines.pprint()
parsed = lines.map(lambda x: json.loads(x))
def process(time, rdd):
    print("========= %s =========" % str(time))
    try:
        # Get the singleton instance of SQLContext
        sqlContext = getSqlContextInstance(rdd.context)
        # Convert RDD[String] to RDD[Row] to DataFrame
        rowRdd = rdd.map(lambda w: Row(word=w))
        wordsDataFrame = sqlContext.createDataFrame(rowRdd)
        # Register as table
        wordsDataFrame.registerTempTable("mytable")
        testDataFrame = sqlContext.sql("select summary from mytable")
        print(testDataFrame.show())
        print(testDataFrame.printSchema())
    except:
        pass
parsed.foreachRDD(process)
stc.start()
# Wait for the computation to terminate
stc.awaitTermination()

There are no errors, but when running the script, it successfully reads json from the streaming context, but does not display the values in the summary or in the dataframe.

The JSON example I'm trying to read is

{"reviewerID": "A2IBPI20UZIR0U", "asin": "1384719342", "reviewerName": " \" , , , , ... "," ": [0, 0], "reviewText": " , . -. . , , , , "," ": 5.0," ":" "," unixReviewTime ": 1393545600, "reviewTime": "02 28, 2014"}

, . .

+5

json python apache-spark pyspark dstream

SUNIL KUMAR C 27 . '16 11:06

:

2480

JSON Curl / Test Spring REST?

1419

DataFrame pandas

1349

Python JSON?

1033