zoukankan      html  css  js  c++  java
  • Spark RDD 多流程操作练习

    name_phone.txt
    
    
    bob 15700079421
    amy 18700079458
    alice 17730079427
    tom 16700379451
    lulu 18800074423
    nick 14400033426
    
    
    name_addr.txt
    
    bob shanghai#200000
    amy beijing#100000
    alice shanghai#200000
    tom beijing#100000
    lulu hangzhou#310000
    nick shanghai#200000

    当然，源文件和目标文件都在 HDFS 中（代码里的相对路径会解析到 HDFS 的用户主目录，例如 /user/root）。

    代码

    package sparkstreaming_action.rdd.operation
    
    import org.apache.spark.SparkConf
    import org.apache.spark.SparkContext
    
    /**
     * Multi-stage RDD exercise.
     *
     * Reads two whitespace-separated text files keyed by name (name -> address#postcode and
     * name -> phone), joins them, writes one HTML snippet per person to the "UserInfo"
     * directory, and prints how many people share each postcode.
     *
     * The entry point is an explicit `main` method rather than `extends App`: the Spark
     * documentation advises against `scala.App` because its delayed initialisation may not
     * work correctly with Spark applications.
     */
    object RDDOperation {

      /**
       * Loads a text file of "key value" lines as a pair RDD.
       *
       * Lines with fewer than two space-separated tokens (for example a blank trailing line)
       * are skipped instead of failing the job with an ArrayIndexOutOfBoundsException.
       * Tokens after the second one are ignored.
       *
       * @param sc   active Spark context
       * @param path path of the input file, resolved by the configured file system
       * @return pair RDD of (first token, second token)
       */
      private def readPairs(sc: SparkContext, path: String) =
        sc.textFile(path)
          .map(_.split(" "))
          .filter(_.length >= 2)
          .map(tokens => (tokens(0), tokens(1)))

      /**
       * Runs the whole pipeline and always stops the Spark context afterwards.
       *
       * @param args command-line arguments (unused)
       */
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf()
          .setAppName("rddOperation")
          // Master URL as shown in the master's web UI (host:port); executors are
          // allocated on the worker nodes. A value set here on SparkConf takes
          // precedence over flags passed to spark-submit.
          .setMaster("spark://192.168.199.120:7077")
        // Create spark context
        val sc = new SparkContext(conf)
        try {
          val txtNameAddr = "name_addr.txt"
          val txtNamePhone = "name_phone.txt"

          // rdd1: (name, "city#postcode"); cached because two separate jobs read it below.
          val rddNameAddr = readPairs(sc, txtNameAddr)
          rddNameAddr.cache()

          // rdd2: (name, phone); consumed by a single job, so caching would buy nothing.
          val rddNamePhone = readPairs(sc, txtNamePhone)

          // rdd3: pair RDDs join on their key, no explicit "on" clause is needed:
          // (a, 1) join (a, 'y') => (a, (1, 'y'))
          val rddNameAddrPhone = rddNameAddr.join(rddNamePhone)

          // rdd4: one HTML fragment per joined record.
          val rddHtml = rddNameAddrPhone.map { case (name, (addr, phone)) =>
            s"<h2>姓名:${name}</h2><p>地址:${addr}</p><p>电话:${phone}</p>"
          }
          rddHtml.saveAsTextFile("UserInfo")

          // rdd5: (postcode, 1); addresses without a "#postcode" suffix are skipped.
          val rddPostcode = rddNameAddr
            .map(_._2.split("#"))
            .filter(_.length >= 2)
            .map(parts => (parts(1), 1))

          // rdd6: number of people per postcode, printed on the driver.
          val rddPostcodeCount = rddPostcode.reduceByKey(_ + _)
          rddPostcodeCount.collect().foreach(println)
        } finally {
          // Release cluster resources even when one of the jobs above fails.
          sc.stop()
        }
      }
    }

    pom文件为

    <?xml version="1.0" encoding="UTF-8"?>
    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
    
        <groupId>com.spark_smart</groupId>
        <artifactId>smart</artifactId>
        <version>1.0-SNAPSHOT</version>
    
                <!-- Libraries required to compile and run the RDD example. -->
                <dependencies>
                    <!-- Spark core 2.0.0; the _2.11 artifact suffix is the Scala binary version. -->
                    <dependency>
                        <groupId>org.apache.spark</groupId>
                        <artifactId>spark-core_2.11</artifactId>
                        <version>2.0.0</version>
                    </dependency>
                    <dependency><!-- Logging: log4j 1.2 implementation -->
                        <groupId>log4j</groupId>
                        <artifactId>log4j</artifactId>
                        <version>1.2.17</version>
                    </dependency>
                    <!-- SLF4J binding that routes SLF4J log calls to log4j 1.2. -->
                    <dependency>
                        <groupId>org.slf4j</groupId>
                        <artifactId>slf4j-log4j12</artifactId>
                        <version>1.7.12</version>
                    </dependency>
                </dependencies>
    
    
    <!--    <build>-->
    <!--        <plugins>-->
    <!--            &lt;!&ndash; mixed scala/java compile &ndash;&gt;-->
    <!--            <plugin>-->
    <!--                <groupId>org.scala-tools</groupId>-->
    <!--                <artifactId>maven-scala-plugin</artifactId>-->
    <!--                <executions>-->
    <!--                    <execution>-->
    <!--                        <id>compile</id>-->
    <!--                        <goals>-->
    <!--                            <goal>compile</goal>-->
    <!--                        </goals>-->
    <!--                        <phase>compile</phase>-->
    <!--                    </execution>-->
    <!--                    <execution>-->
    <!--                        <id>test-compile</id>-->
    <!--                        <goals>-->
    <!--                            <goal>testCompile</goal>-->
    <!--                        </goals>-->
    <!--                        <phase>test-compile</phase>-->
    <!--                    </execution>-->
    <!--                    <execution>-->
    <!--                        <phase>process-resources</phase>-->
    <!--                        <goals>-->
    <!--                            <goal>compile</goal>-->
    <!--                        </goals>-->
    <!--                    </execution>-->
    <!--                </executions>-->
    <!--            </plugin>-->
    <!--            <plugin>-->
    <!--                <artifactId>maven-compiler-plugin</artifactId>-->
    <!--                <configuration>-->
    <!--                    <source>1.7</source>-->
    <!--                    <target>1.7</target>-->
    <!--                </configuration>-->
    <!--            </plugin>-->
    <!--            &lt;!&ndash; for fatjar &ndash;&gt;-->
    <!--            <plugin>-->
    <!--                <groupId>org.apache.maven.plugins</groupId>-->
    <!--                <artifactId>maven-assembly-plugin</artifactId>-->
    <!--                <version>2.2</version>-->
    <!--                <configuration>-->
    <!--                    <descriptorRefs>-->
    <!--                        <descriptorRef>jar-with-dependencies</descriptorRef>-->
    <!--                    </descriptorRefs>-->
    <!--                </configuration>-->
    <!--                <executions>-->
    <!--                    <execution>-->
    <!--                        <id>assemble-all</id>-->
    <!--                        <phase>package</phase>-->
    <!--                        <goals>-->
    <!--                            <goal>single</goal>-->
    <!--                        </goals>-->
    <!--                    </execution>-->
    <!--                </executions>-->
    <!--            </plugin>-->
    <!--            <plugin>-->
    <!--                <groupId>org.apache.maven.plugins</groupId>-->
    <!--                <artifactId>maven-jar-plugin</artifactId>-->
    <!--                <configuration>-->
    <!--                    <archive>-->
    <!--                        <manifest>-->
    <!--                            <addClasspath>true</addClasspath>-->
    <!--                            <mainClass>com.sparkstreaming.action.main.WordFreq</mainClass>-->
    <!--                        </manifest>-->
    <!--                    </archive>-->
    <!--                </configuration>-->
    <!--            </plugin>-->
    <!--        </plugins>-->
    <!--    </build>-->
    <!--    <repositories>-->
    <!--        <repository>-->
    <!--            <id>aliyunmaven</id>-->
    <!--            <url>http://maven.aliyun.com/nexus/content/groups/public/</url>-->
    <!--        </repository>-->
    <!--    </repositories>-->
    
    </project>
    提交代码
    ./bin/spark-submit  --class sparkstreaming_action.rdd.operation.RDDOperation --num-executors 4    --driver-memory 512M --executor-memory 512M --executor-cores 1 --conf spark.default.parallelism=1000 /root/spark/spark/smart.jar

    结果

    [root@min01 spark]# hdfs dfs -cat  /user/root/UserInfo/*
    19/07/14 19:44:17 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
    <h2>姓名:tom</h2><p>地址:beijing#100000</p><p>电话:16700379451</p>
    <h2>姓名:alice</h2><p>地址:shanghai#200000</p><p>电话:17730079427</p>
    <h2>姓名:nick</h2><p>地址:shanghai#200000</p><p>电话:14400033426</p>
    <h2>姓名:lulu</h2><p>地址:hangzhou#310000</p><p>电话:18800074423</p>
    <h2>姓名:amy</h2><p>地址:beijing#100000</p><p>电话:18700079458</p>
    <h2>姓名:bob</h2><p>地址:shanghai#200000</p><p>电话:15700079421</p>
    RUSH B
  • 相关阅读:
    Codeforces Round #536 E. Lunar New Year and Red Envelopes /// 贪心 记忆化搜索 multiset取最大项
    牛客网暑期ACM多校训练营(第五场) F
    关于线段树or 树状树状 在二维平面搞事情!Orz
    ZOJ 3822 ( 2014牡丹江区域赛D题) (概率dp)
    HDU4336 Card Collector (概率dp+状压dp)
    POJ 2151 Check the difficulty of problems(概率DP)
    CF E2
    HDU4089 Activation(概率DP+处理环迭代式子)
    HDU4035 Maze (概率DP)
    牛客 Rabbit的数列 (线段树维护值为x的个数+区间覆盖)
  • 原文地址:https://www.cnblogs.com/tangsonghuai/p/11185449.html
Copyright © 2011-2022 走看看