Scaling Dataflow Pipelines
Hi, Reuven Lax here. I am a member of the Dataflow development team, where I lead the design and implementation of our streaming runner. Before Dataflow, I led the team that built MillWheel for a number of years. MillWheel was described in a 2013 VLDB paper and is the foundation for the streaming technology underlying Dataflow.
, . , . , , -, . - , . , Dataflow. , Tyler Akidau's 101 102 . , VLDB 2015.
, , , . , . : Dataflow, , , . , DirectPipelineRunner, . , - . , . , . , Dataflow, , . , .
, , - , . - , Dataflow , . . .
, .
import javax.json.Json;
...
PCollection<OutType> output = input.apply(ParDo.of(new DoFn<InType, OutType>() {
public void processElement(ProcessContext c) {
JsonReader reader = Json.createReader();
...
}
}));
, , - , .
- , , - . , ssh , . , :
java.util.zip.ZipFile.getEntry(ZipFile.java:308)
java.util.jar.JarFile.getEntry(JarFile.java:240)
java.util.jar.JarFile.getJarEntry(JarFile.java:223)
sun.misc.URLClassPath$JarLoader.getResource(URLClassPath.java:1005)
sun.misc.URLClassPath$JarLoader.findResource(URLClassPath.java:983)
sun.misc.URLClassPath$1.next(URLClassPath.java:240)
sun.misc.URLClassPath$1.hasMoreElements(URLClassPath.java:250)
java.net.URLClassLoader$3$1.run(URLClassLoader.java:601)
java.net.URLClassLoader$3$1.run(URLClassLoader.java:599)
java.security.AccessController.doPrivileged(Native Method)
java.net.URLClassLoader$3.next(URLClassLoader.java:598)
java.net.URLClassLoader$3.hasMoreElements(URLClassLoader.java:623)
sun.misc.CompoundEnumeration.next(CompoundEnumeration.java:45)
sun.misc.CompoundEnumeration.hasMoreElements(CompoundEnumeration.java:54)
java.util.ServiceLoader$LazyIterator.hasNextService(ServiceLoader.java:354)
java.util.ServiceLoader$LazyIterator.hasNext(ServiceLoader.java:393)
java.util.ServiceLoader$1.hasNext(ServiceLoader.java:474)
javax.json.spi.JsonProvider.provider(JsonProvider.java:89)
javax.json.Json.createReader(Json.java:208)
<.....>.processElement(<filename>.java:174)
Json.createReader , JsonProvider. , JAR. !
JsonReaderFactory -. , JsonReaderFactory , Dataflow's startBundle. , , - . . , , . , , , !
GroupByKey, GroupByKey <_ PCollection , . - Count, Top, Combine .. - GroupByKey . "" , (, GCE ), , .
DoFn, a GroupByKey, KV<KeyType, Iterable<ValueType>>. , ( ) Iterable. , , , . - , . ,
p.apply(Read.from(new UserWebEventSource())
.apply(new ExtractBrowserString())
.apply(Window.<Event>into(FixedWindow.of(1, Duration.standardSeconds(1))))
.apply(GroupByKey.<String, Event>create())
.apply(ParDo.of(new ProcessEventsByBrowser()));
- , . ( Chrome, IE, Firefox, Safari), - , CPU. , . , , . You've , Dataflow , API.
- DoFn ParDo ParDo. Dataflow GroupByKey, parallelism ( , , , ); , , , .
, , , ; . (Combine.globally(), Count.globally(), Top.largest(), .). , . , parallelism, Combine.PerKey.withHotKeyFanout Combine.Globally.withFanout. . , "" , , .
Dataflow . , . PCollection, ( GroupByKey) . , , Dataflow . , , sessions. .
, , Dataflow , . - , 24- - , , . (, ) - ( ). . :
pcollection.apply(Window.into(FixedWindows.of(1, TimeUnit.DAYS)))
.apply(GroupByKey.<KeyType, ValueType>create())
.apply(ParDo.of(new DoFn<KV<KeyType, Iterable<ValueType>>, Long>() {
public void processElement(ProcessContext c) {
c.output(c.element().size());
}
}));
... ...
pcollection.apply(Window.into(FixedWindows.of(1, TimeUnit.DAYS)))
.apply(Count.perKey());
... . Dataflow , , , , . , Dataflow , , !
, triggers API. , , .
, . , , , .
, . . , , , - , , , , . . , Dataflowing!