zoukankan      html  css  js  c++  java
  • flink 1.11.2 学习笔记(1)-wordCount

    一、pom依赖

      1 <?xml version="1.0" encoding="UTF-8"?>
      2 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
      3          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
      4     <modelVersion>4.0.0</modelVersion>
      5 
      6     <groupId>com.cnblogs.yjmyzz</groupId>
      7     <artifactId>spring-boot-demo</artifactId>
      8     <version>0.0.1-SNAPSHOT</version>
      9 
     10     <properties>
     11         <java.version>1.8</java.version>
     12         <flink.version>1.11.2</flink.version>
     13     </properties>
     14 
     15     <dependencies>
     16 
     17         <!--lombok可选-->
     18         <dependency>
     19             <groupId>org.projectlombok</groupId>
     20             <artifactId>lombok</artifactId>
     21             <version>1.18.12</version>
     22         </dependency>
     23 
     24         <!-- spring应用最小依赖-可选 -->
     25         <dependency>
     26             <groupId>org.springframework</groupId>
     27             <artifactId>spring-context</artifactId>
     28             <version>5.2.4.RELEASE</version>
     29         </dependency>
     30 
     31         <!-- flink -->
     32         <dependency>
     33             <groupId>org.apache.flink</groupId>
     34             <artifactId>flink-core</artifactId>
     35             <version>${flink.version}</version>
     36         </dependency>
     37 
     38         <dependency>
     39             <groupId>org.apache.flink</groupId>
     40             <artifactId>flink-java</artifactId>
     41             <version>${flink.version}</version>
     42         </dependency>
     43 
     44         <dependency>
     45             <groupId>org.apache.flink</groupId>
     46             <artifactId>flink-scala_2.12</artifactId>
     47             <version>${flink.version}</version>
     48         </dependency>
     49 
     50         <dependency>
     51             <groupId>org.apache.flink</groupId>
     52             <artifactId>flink-clients_2.12</artifactId>
     53             <version>${flink.version}</version>
     54         </dependency>
     55 
     56         <dependency>
     57             <groupId>org.apache.flink</groupId>
     58             <artifactId>flink-test-utils-junit</artifactId>
     59             <version>${flink.version}</version>
     60         </dependency>
     61     </dependencies>
     62 
     63     <build>
     64         <plugins>
     65             <plugin>
     66                 <artifactId>maven-compiler-plugin</artifactId>
     67                 <version>3.1</version>
     68                 <configuration>
     69                     <source>1.8</source>
     70                     <target>1.8</target>
     71                 </configuration>
     72             </plugin>
     73 
     74             <!-- Scala Compiler -->
     75             <plugin>
     76                 <groupId>net.alchim31.maven</groupId>
     77                 <artifactId>scala-maven-plugin</artifactId>
     78                 <version>4.4.0</version>
     79                 <executions>
     80                     <!-- Run scala compiler in the process-resources phase, so that dependencies on
     81                         scala classes can be resolved later in the (Java) compile phase -->
     82                     <execution>
     83                         <id>scala-compile-first</id>
     84                         <phase>process-resources</phase>
     85                         <goals>
     86                             <goal>compile</goal>
     87                         </goals>
     88                     </execution>
     89                 </executions>
     90                 <configuration>
     91                     <jvmArgs>
     92                         <jvmArg>-Xms128m</jvmArg>
     93                         <jvmArg>-Xmx512m</jvmArg>
     94                     </jvmArgs>
     95                 </configuration>
     96             </plugin>
     97 
     98         </plugins>
     99     </build>
    100 
    101 </project>
    View Code

    二、WordCount(批处理版本)

     1 import org.apache.flink.api.common.functions.FlatMapFunction;
     2 import org.apache.flink.api.java.DataSet;
     3 import org.apache.flink.api.java.ExecutionEnvironment;
     4 import org.apache.flink.api.java.tuple.Tuple2;
     5 import org.apache.flink.util.Collector;
     6 
     7 /**
     8  * WordCount批处理版本
     9  */
    10 public class BatchWordCount {
    11 
    12     public static void main(String[] args) throws Exception {
    13 
    14         // 1 设置环境
    15         final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    16 
    17         // 2. 定义数据
    18         String wordFilePath = "/Users/jimmy/Downloads/word.txt";
    19         DataSet<String> text = env.readTextFile(wordFilePath);
    20 
    21         // 3. 处理逻辑
    22         DataSet<Tuple2<String, Integer>> counts =
    23 
    24                 text.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
    25                     @Override
    26                     public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
    27                         //将每行按word拆分
    28                         String[] tokens = value.toLowerCase().split("\W+");
    29 
    30                         //收集(类似:map-reduce思路)
    31                         for (String token : tokens) {
    32                             if (token.length() > 0) {
    33                                 out.collect(new Tuple2<>(token, 1));
    34                             }
    35                         }
    36                     }
    37                 })
    38                         //按Tuple2里的第0项,即:word分组
    39                         .groupBy(0)
    40                         //然后对Tuple2里的第1项求合
    41                         .sum(1);
    42 
    43         // 4. 打印结果
    44         counts.print();
    45 
    46     }
    47 }
    View Code

    注:数据文件/Users/jimmy/Downloads/word.txt的位置,大家可根据实际情况调整,该文件的内容类似:

    hello java
    hello flink
    

      

    三、WordCount(流式处理版本)

     1 import org.apache.flink.api.common.functions.FlatMapFunction;
     2 import org.apache.flink.api.java.tuple.Tuple2;
     3 import org.apache.flink.streaming.api.datastream.DataStream;
     4 import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
     5 import org.apache.flink.util.Collector;
     6 
     7 public class StreamWordCount {
     8 
     9     public static void main(String[] args) throws Exception {
    10         // 1 设置环境
    11         final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    12 
    13         // 2. 定义数据
    14         DataStream<String> text = env.socketTextStream("127.0.0.1", 9999);
    15 
    16         // 3. 处理逻辑
    17         DataStream<Tuple2<String, Integer>> counts =
    18 
    19                 text.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
    20                     @Override
    21                     public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
    22                         //将每行按word拆分
    23                         String[] tokens = value.toLowerCase().split("\b+");
    24 
    25                         //收集(类似:map-reduce思路)
    26                         for (String token : tokens) {
    27                             if (token.trim().length() > 0) {
    28                                 out.collect(new Tuple2<>(token.trim(), 1));
    29                             }
    30                         }
    31                     }
    32                 })
    33                         //按Tuple2里的第0项,即:word分组
    34                         .keyBy(value -> value.f0)
    35                         //然后对Tuple2里的第1项求合
    36                         .sum(1);
    37 
    38         // 4. 打印结果
    39         counts.print();
    40 
    41         // execute program
    42         env.execute("Streaming WordCount");
    43 
    44     }
    45 }
    View Code

    注:运行前,先在终端命令行输入nc -l 9999,开启一个tcp server作为流式处理的数据源,然后再运行上面的代码,然后命令行输入一些文本,即可看到输出

     

  • 相关阅读:
    AI.框架理论.语义网.语言间距.孤单
    图像局部显著性—点特征(Fast)
    图像的全局特征--HOG特征、DPM特征
    图像局部显著性—点特征(FREAK)
    iis7服务器隐藏index.php
    php命名空间
    thinkphp5.0 composer安装phpmailer
    php输入流简单小例子
    js、php判断手机PC
    thinkphp5.0 空模块、空控制器、空方法
  • 原文地址:https://www.cnblogs.com/yjmyzz/p/flink-wordcount-demo.html
Copyright © 2011-2022 走看看