When creating a SparkContext you must pass it a SparkConf object. The SparkConf must set at least spark.master and spark.app.name (via setMaster and setAppName, or equivalent configuration); otherwise the application will not start.
package wordcount

import org.apache.spark.{SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    // Set the application name
    conf.setAppName("WC")
    // Set the master. "local" runs in local mode directly from the IDE;
    // "local[k]" or "local[*]" can also be used to control the number of worker threads.
    conf.setMaster("local")
    // For Spark cluster (standalone) mode, package the application as a jar and submit it to the cluster:
    // conf.setMaster("spark://m1:7077")
    // Set the amount of memory each executor may use
    conf.set("spark.executor.memory", "512m")

    val sc = new SparkContext(conf)
    sc.textFile("hdfs://m1:9000/words.txt")
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .saveAsTextFile("hdfs://m1:9000/wcOutPut/")
    sc.stop()
  }
}
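For quick verification in local mode you can skip HDFS entirely and print the counts on the driver. A minimal sketch, assuming some local text file input.txt (a hypothetical path, substitute your own):

import org.apache.spark.{SparkConf, SparkContext}

object WordCountLocal {
  def main(args: Array[String]): Unit = {
    // local[*] uses all available cores on the local machine
    val conf = new SparkConf().setAppName("WC-local").setMaster("local[*]")
    val sc = new SparkContext(conf)
    // Count words from a local file and print the result to the console
    sc.textFile("input.txt")
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .collect()
      .foreach(println)
    sc.stop()
  }
}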
The Maven pom.xml is as follows:
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>wordcount</groupId>
  <artifactId>wordcount</artifactId>
  <version>1.0-SNAPSHOT</version>
  <inceptionYear>2008</inceptionYear>

  <!-- Properties -->
  <properties>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
    <encoding>UTF-8</encoding>
    <scala.version>2.10.6</scala.version>
    <scala.compat.version>2.10</scala.compat.version>
  </properties>

  <!-- Dependencies -->
  <dependencies>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>1.6.3</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.10</artifactId>
      <version>1.6.3</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.6.5</version>
    </dependency>
  </dependencies>

  <!-- Build -->
  <build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
      <!-- scala-maven-plugin: compile Scala sources with Maven -->
      <plugin>
        <groupId>net.alchim31.maven</groupId>
        <artifactId>scala-maven-plugin</artifactId>
        <version>3.2.0</version>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
            <configuration>
              <args>
                <arg>-make:transitive</arg>
                <arg>-dependencyfile</arg>
                <arg>${project.build.directory}/.scala_dependencies</arg>
              </args>
            </configuration>
          </execution>
        </executions>
      </plugin>
      <!-- maven-surefire-plugin: run unit tests during the test phase of the build lifecycle -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-surefire-plugin</artifactId>
        <version>2.18.1</version>
        <configuration>
          <useFile>false</useFile>
          <disableXmlReport>true</disableXmlReport>
          <includes>
            <include>**/*Test.*</include>
            <include>**/*Suite.*</include>
          </includes>
        </configuration>
      </plugin>
      <!-- maven-shade-plugin: package an uber jar, stripping signature files -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>2.3</version>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <filters>
                <filter>
                  <artifact>*:*</artifact>
                  <excludes>
                    <exclude>META-INF/*.SF</exclude>
                    <exclude>META-INF/*.DSA</exclude>
                    <exclude>META-INF/*.RSA</exclude>
                  </excludes>
                </filter>
              </filters>
              <transformers>
                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                  <!-- Must match the fully qualified name of the main class (see --class in spark-submit below) -->
                  <mainClass>wordcount.WordCount</mainClass>
                </transformer>
              </transformers>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
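Build the shaded jar with:

mvn clean package

This produces target/wordcount-1.0-SNAPSHOT.jar; the submit command below assumes the jar has been copied to /home/hadoop/ on the cluster node.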
Then submit it to the Spark cluster to run:
bin/spark-submit --class wordcount.WordCount --master spark://m1:7077 --executor-memory 512M --total-executor-cores 2 /home/hadoop/wordcount-1.0-SNAPSHOT.jar
If you hit HDFS permission problems when running in local mode, you can work around them with the following configuration.
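A common workaround is to tell the HDFS client which user to act as, since the local OS username usually differs from the HDFS user that owns the paths. A minimal sketch, assuming the HDFS user is hadoop (substitute the user that owns words.txt and wcOutPut on your cluster):

// Set the HDFS user BEFORE the SparkContext is created
// ("hadoop" is an assumption -- use the owner of the target HDFS paths)
System.setProperty("HADOOP_USER_NAME", "hadoop")

Alternatively, pass -DHADOOP_USER_NAME=hadoop as a JVM option in the IDE run configuration, or relax the permissions on the target directory with hdfs dfs -chmod.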