Apache Flink - แนวคิด API

Flink มีชุด API ที่สมบูรณ์ซึ่งนักพัฒนาสามารถทำการเปลี่ยนแปลงได้ทั้งข้อมูลแบทช์และแบบเรียลไทม์ การเปลี่ยนแปลงที่หลากหลายรวมถึงการทำแผนที่การกรองการเรียงลำดับการเข้าร่วมการจัดกลุ่มและการรวมกลุ่ม การแปลงเหล่านี้โดย Apache Flink ดำเนินการกับข้อมูลแบบกระจาย ให้เราพูดคุยเกี่ยวกับข้อเสนอ APIs Apache Flink ที่แตกต่างกัน

API ชุดข้อมูล

Dataset API ใน Apache Flink ใช้เพื่อดำเนินการกับข้อมูลเป็นกลุ่มในช่วงเวลาหนึ่ง API นี้สามารถใช้ได้ใน Java, Scala และ Python สามารถใช้การเปลี่ยนแปลงประเภทต่างๆในชุดข้อมูลเช่นการกรองการทำแผนที่การรวมการรวมและการจัดกลุ่ม

ชุดข้อมูลถูกสร้างขึ้นจากแหล่งที่มาเช่นไฟล์ในเครื่องหรือโดยการอ่านไฟล์จากแหล่งที่มาเฉพาะและข้อมูลผลลัพธ์สามารถเขียนบนซิงก์ต่างๆเช่นไฟล์แบบกระจายหรือเทอร์มินัลบรรทัดคำสั่ง API นี้รองรับทั้งภาษาโปรแกรม Java และ Scala

นี่คือโปรแกรม Wordcount ของ Dataset API -

public class WordCountProg {
   public static void main(String[] args) throws Exception {
      final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

      DataSet<String> text = env.fromElements(
      "Hello",
      "My Dataset API Flink Program");

      DataSet<Tuple2<String, Integer>> wordCounts = text
      .flatMap(new LineSplitter())
      .groupBy(0)
      .sum(1);

      wordCounts.print();
   }

   public static class LineSplitter implements FlatMapFunction<String, Tuple2<String, Integer>> {
      @Override
      public void flatMap(String line, Collector<Tuple2<String, Integer>> out) {
         for (String word : line.split(" ")) {
            out.collect(new Tuple2<String, Integer>(word, 1));
         }
      }
   }
}

DataStream API

API นี้ใช้สำหรับจัดการข้อมูลในสตรีมแบบต่อเนื่อง คุณสามารถดำเนินการต่างๆเช่นการกรองการทำแผนที่การกำหนดหน้าต่างการรวมข้อมูลสตรีม มีแหล่งข้อมูลต่างๆในสตรีมข้อมูลนี้เช่นคิวข้อความไฟล์ซ็อกเก็ตสตรีมและข้อมูลผลลัพธ์สามารถเขียนบนซิงก์ต่างๆเช่นเทอร์มินัลบรรทัดคำสั่ง ทั้งภาษาโปรแกรม Java และ Scala รองรับ API นี้

นี่คือโปรแกรม Wordcount แบบสตรีมของ DataStream API ซึ่งคุณมีสตรีมจำนวนคำอย่างต่อเนื่องและข้อมูลจะถูกจัดกลุ่มในหน้าต่างที่สอง

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;
public class WindowWordCountProg {
   public static void main(String[] args) throws Exception {
      StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
      DataStream<Tuple2<String, Integer>> dataStream = env
      .socketTextStream("localhost", 9999)
      .flatMap(new Splitter())
      .keyBy(0)
      .timeWindow(Time.seconds(5))
      .sum(1);
      dataStream.print();
      env.execute("Streaming WordCount Example");
   }
   public static class Splitter implements FlatMapFunction<String, Tuple2<String, Integer>> {
      @Override
      public void flatMap(String sentence, Collector<Tuple2<String, Integer>> out) throws Exception {
         for (String word: sentence.split(" ")) {
            out.collect(new Tuple2<String, Integer>(word, 1));
         }
      }
   }
}