zoukankan      html  css  js  c++  java
  • spark编写word count

    创建SparkContext对象的时候需要传递SparkConf对象,SparkConf至少需要包含spark.master和spark.app.name这两个参数,不然的话程序不能正常运行

    object WordCount {

      /** Entry point: reads a text file from HDFS, splits each line into words,
        * counts occurrences per word and writes the result back to HDFS.
        *
        * Note: SparkConf must carry at least spark.master and spark.app.name,
        * otherwise SparkContext creation fails.
        */
      def main(args: Array[String]): Unit = {

        val conf = new SparkConf()
        // Application name, shown in the Spark UI
        conf.setAppName("WC")
        // "local" runs the job in-process so it can be launched from an IDE;
        // local[k] / local[*] select the number of worker threads
        conf.setMaster("local")
        // For cluster mode, package a jar and submit it to the Spark master instead:
        // conf.setMaster("spark://m1:7077")

        // Memory available to each executor
        conf.set("spark.executor.memory", "512m")

        val sc = new SparkContext(conf)

        sc.textFile("hdfs://m1:9000/words.txt").flatMap(_.split(" ")).map((_, 1))
          .reduceByKey(_+_).saveAsTextFile("hdfs://m1:9000/wcOutPut/")
        sc.stop()
      }

    }
    

    maven pom.xml如下

    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
        <modelVersion>4.0.0</modelVersion>
        <groupId>wordcount</groupId>
        <artifactId>wordcount</artifactId>
        <version>1.0-SNAPSHOT</version>
        <inceptionYear>2008</inceptionYear>
        <!-- Build properties -->

        <properties>
            <maven.compiler.source>1.7</maven.compiler.source>
            <maven.compiler.target>1.7</maven.compiler.target>
            <encoding>UTF-8</encoding>
            <scala.version>2.10.6</scala.version>
            <scala.compat.version>2.10</scala.compat.version>
        </properties>

        <!-- Dependencies: Scala runtime, Spark core/streaming, Hadoop client.
             The _2.10 suffix must stay in sync with scala.compat.version. -->
        <dependencies>
            <dependency>
                <groupId>org.scala-lang</groupId>
                <artifactId>scala-library</artifactId>
                <version>${scala.version}</version>
            </dependency>

            <dependency>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-core_2.10</artifactId>
                <version>1.6.3</version>
            </dependency>

            <dependency>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-streaming_2.10</artifactId>
                <version>1.6.3</version>
            </dependency>

            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-client</artifactId>
                <version>2.6.5</version>
            </dependency>
        </dependencies>

        <!-- Build configuration -->
        <build>
            <sourceDirectory>src/main/scala</sourceDirectory>
            <testSourceDirectory>src/test/scala</testSourceDirectory>
            <plugins>
                <!-- Compiles Scala sources with Maven -->
                <plugin>
                    <groupId>net.alchim31.maven</groupId>
                    <artifactId>scala-maven-plugin</artifactId>
                    <version>3.2.0</version>
                    <executions>
                        <execution>
                            <goals>
                                <goal>compile</goal>
                                <goal>testCompile</goal>
                            </goals>
                            <configuration>
                                <args>
                                    <!-- NOTE(review): -make:transitive exists only on Scala 2.10
                                         scalac; it was removed in 2.11+ and must be dropped if
                                         scala.version is ever upgraded. -->
                                    <arg>-make:transitive</arg>
                                    <arg>-dependencyfile</arg>
                                    <arg>${project.build.directory}/.scala_dependencies</arg>
                                </args>
                            </configuration>
                        </execution>
                    </executions>
                </plugin>
                <!-- Runs unit tests during the test phase of the Maven lifecycle -->
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-surefire-plugin</artifactId>
                    <version>2.18.1</version>
                    <configuration>
                        <useFile>false</useFile>
                        <disableXmlReport>true</disableXmlReport>
                        <includes>
                            <include>**/*Test.*</include>
                            <include>**/*Suite.*</include>
                        </includes>
                    </configuration>
                </plugin>

                <!-- Builds an uber-jar (dependencies shaded in) for spark-submit.
                     Signature files are excluded so the merged jar is not rejected
                     as tampered. -->
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-shade-plugin</artifactId>
                    <version>2.3</version>
                    <executions>
                        <execution>
                            <phase>package</phase>
                            <goals>
                                <goal>shade</goal>
                            </goals>
                            <configuration>
                                <filters>
                                    <filter>
                                        <artifact>*:*</artifact>
                                        <excludes>
                                            <exclude>META-INF/*.SF</exclude>
                                            <exclude>META-INF/*.DSA</exclude>
                                            <exclude>META-INF/*.RSA</exclude>
                                        </excludes>
                                    </filter>
                                </filters>
                                <transformers>
                                    <transformer
                                            implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                        <!-- NOTE(review): must be the fully-qualified name of the
                                             WordCount object. The original said cn.itcast.spark.WordCount,
                                             which contradicts the spark-submit command in this post
                                             (-\-class wordcount.WordCount); aligned to the latter.
                                             Verify against the actual package declaration. -->
                                        <mainClass>wordcount.WordCount</mainClass>
                                    </transformer>
                                </transformers>
                            </configuration>
                        </execution>
                    </executions>
                </plugin>
            </plugins>
        </build>
    </project>
    

    打包提交spark集群运行

    # Submit the packaged uber-jar to the Spark standalone cluster.
    # Original listing lacked line continuations, so each line would have
    # been executed as a separate shell command.
    bin/spark-submit \
    --class wordcount.WordCount \
    --master spark://m1:7077 \
    --executor-memory 512M \
    --total-executor-cores 2 \
    /home/hadoop/wordcount-1.0-SNAPSHOT.jar
    

    本地运行如果hdfs权限有问题,则可以按如下配置

     

      

  • 相关阅读:
    201871010119-帖佼佼 实验三 结对项目—《D{0-1}KP 实例数据集算法实验平台》项目报告
    201871010119 帖佼佼 实验二 个人项目—《背包问题》 项目报告
    201871010119-帖佼佼 实验一 软件工程准备—初识软件工程
    201871010119-帖佼佼《面向对象程序设计(java)》课程学习总结
    201871010119-帖佼佼《面向对象程序设计(java)》第十七周学习总结
    201871010119-帖佼佼《面向对象程序设计(java)》第十六周学习总结
    201871010119-帖佼佼《面向对象程序设计(java)》第十五周学习总结
    201871010119-帖佼佼《面向对象程序设计(java)》第十四周学习总结
    201871010119-帖佼佼《面向对象程序设计(java)》第十三周学习总结
    201871010121-王方 实验四 团队作业1:软件研发团队组建
  • 原文地址:https://www.cnblogs.com/heml/p/6144801.html
Copyright © 2011-2022 走看看