zoukankan      html  css  js  c++  java
  • Spark SQL在Scala与Java中的代码实现

    在编写Spark SQL代码前,需要新建Maven工程,将Hadoop的配置文件core-site.xml和hdfs-site.xml,以及Hive的配置文件hive-site.xml拷贝到工程的src/main/resources目录下,并在pom.xml中配置所依赖的jar包信息。

    pom.xml

    <?xml version="1.0" encoding="UTF-8"?>
    <!-- Maven build descriptor for the Spark SQL demo project (Scala and Java examples). -->
    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
      <modelVersion>4.0.0</modelVersion>

      <groupId>begin</groupId>
      <artifactId>myspark</artifactId>
      <version>1.0-SNAPSHOT</version>

      <properties>
        <!-- Single place to change versions; the dependencies below reference these values. -->
        <spark.version>2.4.3</spark.version>
        <!-- The Scala major.minor version must match the _2.11 suffix of the Spark artifacts. -->
        <scala.version>2.11.12</scala.version>
      </properties>

      <dependencies>
        <!-- Spark core (RDD API, SparkContext). -->
        <dependency>
          <groupId>org.apache.spark</groupId>
          <artifactId>spark-core_2.11</artifactId>
          <version>${spark.version}</version>
        </dependency>

        <!-- Scala runtime, reflection and compiler, all pinned to the same Scala version. -->
        <dependency>
          <groupId>org.scala-lang</groupId>
          <artifactId>scala-library</artifactId>
          <version>${scala.version}</version>
        </dependency>

        <dependency>
          <groupId>org.scala-lang</groupId>
          <artifactId>scala-reflect</artifactId>
          <version>${scala.version}</version>
        </dependency>

        <dependency>
          <groupId>org.scala-lang</groupId>
          <artifactId>scala-compiler</artifactId>
          <version>${scala.version}</version>
        </dependency>

        <!-- JSON library. -->
        <dependency>
          <groupId>com.alibaba</groupId>
          <artifactId>fastjson</artifactId>
          <version>1.2.28</version>
        </dependency>

        <!-- MySQL JDBC driver. -->
        <dependency>
          <groupId>mysql</groupId>
          <artifactId>mysql-connector-java</artifactId>
          <version>5.1.47</version>
        </dependency>

        <!-- Unit-testing framework. -->
        <dependency>
          <groupId>junit</groupId>
          <artifactId>junit</artifactId>
          <version>4.12</version>
        </dependency>

        <!-- Spark SQL (SparkSession, Dataset/DataFrame API). -->
        <dependency>
          <groupId>org.apache.spark</groupId>
          <artifactId>spark-sql_2.11</artifactId>
          <version>${spark.version}</version>
        </dependency>

        <!-- Hive integration, required by enableHiveSupport() in the demo code. -->
        <dependency>
          <groupId>org.apache.spark</groupId>
          <artifactId>spark-hive_2.11</artifactId>
          <version>${spark.version}</version>
        </dependency>
      </dependencies>

    </project>
    Scala实现:

    import org.apache.spark.sql.SparkSession

    /**
      * Word-count demo that queries an RDD through Spark SQL from Scala.
      *
      * Reads a text file, splits every line on single spaces, registers the resulting
      * words as the temporary view `_doc`, and prints the per-word counts.
      */
    object SparkSqlDemoScala {

      /**
        * Program entry point.
        *
        * @param args command-line arguments (unused)
        */
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .appName("SparkSql")
          .master("local[*]")
          .enableHiveSupport()
          .getOrCreate()

        try {
          // NOTE(review): the path has no scheme, so it presumably resolves against the
          // default file system from core-site.xml - confirm for your environment.
          val lines = spark.sparkContext.textFile("/user/hadoop/data2/wc.txt")
          val words = lines.flatMap(_.split(" "))

          // Bring the session's implicit conversions into scope; toDF below relies on them.
          import spark.implicits._

          // Turn the RDD into a single-column DataFrame.
          val wordsDf = words.toDF("word")

          // Register the DataFrame as a temporary view so it can be queried with SQL.
          wordsDf.createOrReplaceTempView("_doc")

          // Print up to 1000 rows without truncating long cell values.
          spark.sql("select word,count(*) from _doc group by word").show(1000, truncate = false)
        } finally {
          // Always release the session (and its SparkContext), even if the job fails.
          spark.stop()
        }
      }
    }
    Java实现:

    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.FlatMapFunction;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.RowFactory;
    import org.apache.spark.sql.SparkSession;
    import org.apache.spark.sql.types.DataTypes;
    import org.apache.spark.sql.types.Metadata;
    import org.apache.spark.sql.types.StructField;
    import org.apache.spark.sql.types.StructType;

    import java.util.Arrays;
    import java.util.Iterator;

    /**
    * 使用java实现spark sql访问
    */
    public class SparkSQLDemoJava {
    public static void main(String[] args) {
    SparkSession spark= SparkSession.builder().appName("sparkSQL").master("local").enableHiveSupport().getOrCreate();
    //创建javaSpark上下文
    JavaSparkContext sc=new JavaSparkContext(spark.sparkContext());
    //加载文件
    JavaRDD<String> rdd1=sc.textFile("/user/hadoop/data2/wc.txt");
    JavaRDD<String> rdd2=rdd1.flatMap(new FlatMapFunction<String,String>(){
    public Iterator<String> call(String s) throws Exception{
    return Arrays.asList(s.split(" ")).iterator();
    }
    });
    //将string 变换成 row
    JavaRDD<Row> rdd3=rdd2.map(new Function<String,Row>(){
    public Row call(String word) throws Exception{
    return RowFactory.create(word);
    }
    });
    //构造表结构
    StructField[] fields=new StructField[1];
    fields[0]=new StructField("word", DataTypes.StringType,true, Metadata.empty());
    //表结构类型
    StructType type=new StructType(fields);
    //将RDD转换成DataFrame
    Dataset<Row> df=spark.createDataFrame(rdd3,type);
    //注册临时视图
    df.createOrReplaceTempView("_doc");

    spark.sql("select word,count(*) from _doc group by word").show(1000,false);

    }

    ————————————————
    版权声明:本文为CSDN博主「赵厚雄」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
    原文链接:https://blog.csdn.net/nengyu/article/details/95870479

  • 相关阅读:
    总结
    PHP的重载-使用魔术方法实现
    用PHP实现一些常见的排序算法
    MySQL分组聚合group_concat + substr_index
    各种链接地址
    在Linux服务器上使用rz命令上传文件时时老报:Segmentation Fault,上传失败
    新安装的windows 10无法更新报0x80240fff错误的解决方案
    通过SSH key获取GitHub上项目,导入到IDEA中
    解压.zip,.tar.gz文件到指定目录,重命名文件
    byte字节数组的压缩
  • 原文地址:https://www.cnblogs.com/javalinux/p/15069270.html
Copyright © 2011-2022 走看看