I am running the program below on Spark 1.3.1. Spark Streaming scans the directory in HDFS for new files and processes them as they become available. I read that the best way to do this is to move files from an existing HDFS location so that the operation is atomic.
I start my streaming job, add a bunch of small files to a random HDFS directory, and then move these files from the HDFS source directory to the watched HDFS directory (all with simple shell commands). But my streaming work does not recognize them as new files and therefore does not process them.
I am currently using textFileStream, but I am open to use fileStream. However i get errors with thisval lines = ssc.fileStream[LongWritable, Text, TextInputFormat]("hdfs:///name/spark-streaming/data/", (p: Path)=>true, false)
package com.com.spark.prototype
import java.io.FileInputStream
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark._
import org.apache.spark.streaming._
import com.twitter.algebird.HyperLogLogMonoid
import org.apache.hadoop.io._
object HLLStreamingHDFSTest {
def functionToCreateContext(): StreamingContext = {
val conf = new SparkConf().set("spark.executor.extraClassPath", "/home/hadoop/spark/conf:/home/hadoop/conf:/home/hadoop/spark/classpath/emr/*:/home/hadoop/spark/classpath/emrfs/*:/home/hadoop/share/hadoop/common/lib/*:/home/hadoop/share/hadoop/common/lib/hadoop-lzo.jar")
val ssc = new StreamingContext(conf, Seconds(5))
ssc.checkpoint("/name/spark-streaming/checkpointing")
val lines = ssc.textFileStream("hdfs:///name/spark-streaming/data/")
val hll = new HyperLogLogMonoid(15)
var globalHll = hll.zero
val users = lines.map(_.toString().toCharArray.map(_.toByte))
val approxUsers = users.mapPartitions(ids => {
ids.map(id => hll(id))
}).reduce(_ + _)
approxUsers.foreachRDD(rdd => {
if (rdd.count() != 0) {
val partial = rdd.first()
globalHll += partial
println()
println()
println("Estimated distinct users this batch: %d".format(partial.estimatedSize.toInt))
println("Estimated distinct users this batch: %d".format(globalHll.estimatedSize.toInt))
println()
println("Approx distinct users this batch: %s".format(partial.approximateSize.toString))
println("Approx distinct users overall: %s".format(globalHll.approximateSize.toString))
}
})
ssc
}
def main(args: Array[String]): Unit = {
val context = StreamingContext.getOrCreate("hdfs:///name/spark-streaming/checkpointing", functionToCreateContext _)
context.start()
context.awaitTermination()
}
}