
    From Building a Big Data Environment to the Pitfalls of Running WordCount

    Background

    Recently (December 20, 2020) I have been getting up to speed on big data architectures and the surrounding technology stack.

    Strictly speaking, an overview would have been enough; I did not have to build an environment myself or run any jobs.

    But technology is learned by putting in the unglamorous work and accumulating it bit by bit, so the hands-on part could not be skipped.

    So I started from building the environment (Docker-based) and went all the way to successfully running a WordCount job scheduled by YARN.

    Along the way I hit quite a few pitfalls; filling them in one by one took roughly 10 hours.

    I hope sharing these hard-won lessons helps anyone who needs them get through the whole process in less time.

    Note: my local environment is macOS Big Sur.

    Building the big data environment with Docker Compose

    Following the guide docker-hadoop-spark-hive 快速构建你的大数据环境 (quickly build your big data environment with docker-hadoop-spark-hive), I set up a big data environment and adjusted a few parameters so that it works on macOS.

    The setup mainly consists of the following five files:

    .
    ├── copy-jar.sh # Spark-on-YARN support
    ├── docker-compose.yml # Docker Compose file
    ├── hadoop-hive.env # environment variable configuration
    ├── run.sh # start script
    └── stop.sh # stop script
    

    Note: one pitfall of Docker on macOS is that containers cannot be reached directly from the host. I solved it with "method four" from the article Docker for Mac 的网络问题及解决办法(新增方法四).

    Note: you also have to map the IPs of the corresponding Docker containers to their hostnames on the host. Only then will the job run successfully, and only then will redirects between the various web UIs work when accessed from the host. This pit is deep, tread carefully. My /etc/hosts entries look like the block below (a sketch for looking up the container IPs follows it).

    # switch_local
    
    172.21.0.3 namenode
    172.21.0.8 resourcemanager
    172.21.0.9 nodemanager
    172.21.0.10 historyserver
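
    The IP addresses above are simply the ones my containers were assigned; yours will differ. A minimal sketch for looking them up, assuming the container names from the docker-compose.yml below and that the containers are already running:

    #!/bin/bash
    # Print "<ip> <name>" lines suitable for /etc/hosts, one per container.
    for name in namenode resourcemanager nodemanager historyserver; do
        ip=$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "$name")
        echo "$ip $name"
    done

    Append the output to /etc/hosts (with sudo) and re-check it whenever the containers are recreated, since the addresses can change.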
    

    docker-compose.yml

    version: '2' 
    services:
      namenode:
        image: bde2020/hadoop-namenode:1.1.0-hadoop2.8-java8
        container_name: namenode
        volumes:
          - ~/data/namenode:/hadoop/dfs/name
        environment:
          - CLUSTER_NAME=test
        env_file:
          - ./hadoop-hive.env
        ports:
          - 50070:50070
          - 8020:8020
      resourcemanager:
        image: bde2020/hadoop-resourcemanager:1.1.0-hadoop2.8-java8
        container_name: resourcemanager
        environment:
          - CLUSTER_NAME=test
        env_file:
          - ./hadoop-hive.env
        ports:
          - 8088:8088
      historyserver:
        image: bde2020/hadoop-historyserver:1.1.0-hadoop2.8-java8
        container_name: historyserver
        environment:
          - CLUSTER_NAME=test
        env_file:
          - ./hadoop-hive.env
        ports:
          - 8188:8188
      datanode:
        image: bde2020/hadoop-datanode:1.1.0-hadoop2.8-java8
        depends_on: 
          - namenode
        volumes:
          - ~/data/datanode:/hadoop/dfs/data
        env_file:
          - ./hadoop-hive.env
        ports:
          - 50075:50075
      datanode2:
        image: bde2020/hadoop-datanode:1.1.0-hadoop2.8-java8
        depends_on: 
          - namenode
        volumes:
          - ~/data/datanode2:/hadoop/dfs/data
        env_file:
          - ./hadoop-hive.env
        ports:
          - 50076:50075
      datanode3:
        image: bde2020/hadoop-datanode:1.1.0-hadoop2.8-java8
        depends_on: 
          - namenode
        volumes:
          - ~/data/datanode3:/hadoop/dfs/data
        env_file:
          - ./hadoop-hive.env
        ports:
          - 50077:50075
      nodemanager:
        image: bde2020/hadoop-nodemanager:1.1.0-hadoop2.8-java8
        container_name: nodemanager
        hostname: nodemanager
        environment:
          - CLUSTER_NAME=test
        env_file:
          - ./hadoop-hive.env
        ports:
          - 8042:8042
      hive-server:
        image: bde2020/hive:2.1.0-postgresql-metastore
        container_name: hive-server
        env_file:
          - ./hadoop-hive.env
        environment:
          - "HIVE_CORE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore/metastore"
        ports:
          - "10000:10000"
      hive-metastore:
        image: bde2020/hive:2.1.0-postgresql-metastore
        container_name: hive-metastore
        env_file:
          - ./hadoop-hive.env
        command: /opt/hive/bin/hive --service metastore
        ports:
          - 9083:9083
      hive-metastore-postgresql:
        image: bde2020/hive-metastore-postgresql:2.1.0
        ports:
          - 5432:5432
        volumes:
          - ~/data/postgresql/:/var/lib/postgresql/data
      spark-master:
        image: bde2020/spark-master:2.1.0-hadoop2.8-hive-java8
        container_name: spark-master
        hostname: spark-master
        volumes:
          - ./copy-jar.sh:/copy-jar.sh
        ports:
          - 18080:8080
          - 7077:7077
        env_file:
          - ./hadoop-hive.env
      spark-worker:
        image: bde2020/spark-worker:2.1.0-hadoop2.8-hive-java8
        depends_on:
          - spark-master
        environment:
          - SPARK_MASTER=spark://spark-master:7077
        ports:
          - "18081:8081"
        env_file:
          - ./hadoop-hive.env
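
    Before the first start, it may help to pre-create the host directories that the volumes above are mounted from and to sanity-check the file; the paths are simply the ones listed in the volumes sections above:

    #!/bin/bash
    # Create the bind-mounted data directories and validate the compose file.
    mkdir -p ~/data/namenode ~/data/datanode ~/data/datanode2 ~/data/datanode3 ~/data/postgresql
    docker-compose -f docker-compose.yml config > /dev/null && echo "compose file OK"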
    

    hadoop-hive.env

    HIVE_SITE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql/metastore
    HIVE_SITE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
    HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive
    HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive
    HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
    HIVE_SITE_CONF_hive_metastore_uris=thrift://hive-metastore:9083
    HIVE_SITE_CONF_hive_metastore_warehouse_dir=hdfs://namenode:8020/user/hive/warehouse
    
    CORE_CONF_fs_defaultFS=hdfs://namenode:8020
    CORE_CONF_fs_default_name=hdfs://namenode:8020
    CORE_CONF_hadoop_http_staticuser_user=root
    CORE_CONF_hadoop_proxyuser_hue_hosts=*
    CORE_CONF_hadoop_proxyuser_hue_groups=*
    
    HDFS_CONF_dfs_webhdfs_enabled=true
    HDFS_CONF_dfs_permissions_enabled=false
    
    YARN_CONF_yarn_log___aggregation___enable=true
    YARN_CONF_yarn_resourcemanager_recovery_enabled=true
    YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
    YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
    YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
    YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
    YARN_CONF_yarn_timeline___service_enabled=true
    YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
    YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
    YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
    YARN_CONF_yarn_timeline___service_hostname=historyserver
    YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
    YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
    YARN_CONF_yarn_resourcemanager_resource__tracker_address=resourcemanager:8031
    YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle
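
    As an aside: if I read the bde2020 images' entrypoint correctly, these variables are rewritten into the usual Hadoop XML files on startup (CORE_CONF_ into core-site.xml, HDFS_CONF_ into hdfs-site.xml, YARN_CONF_ into yarn-site.xml, HIVE_SITE_CONF_ into hive-site.xml), with a single underscore becoming a dot, a double underscore a literal underscore, and a triple underscore a dash in the property name. A quick way to confirm what was actually generated; treating /etc/hadoop as the config directory is my assumption about these images:

    #!/bin/bash
    # Inspect the yarn-site.xml generated from the YARN_CONF_* variables.
    docker exec resourcemanager grep -A 1 'yarn.resourcemanager.hostname' /etc/hadoop/yarn-site.xml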
    

    run.sh

    #!/bin/bash
    
    # Start the containers
    docker-compose -f docker-compose.yml up -d namenode hive-metastore-postgresql
    docker-compose -f docker-compose.yml up -d datanode datanode2 datanode3 hive-metastore
    docker-compose -f docker-compose.yml up -d resourcemanager
    docker-compose -f docker-compose.yml up -d nodemanager
    docker-compose -f docker-compose.yml up -d historyserver
    sleep 5
    docker-compose -f docker-compose.yml up -d hive-server
    docker-compose -f docker-compose.yml up -d spark-master spark-worker
    
    # Get the host IP address and print the web UI URLs
    my_ip=`ifconfig | grep 'inet.*netmask.*broadcast' |  awk '{print $2;exit}'`
    echo "Namenode: http://${my_ip}:50070"
    echo "Datanode: http://${my_ip}:50075"
    echo "Spark-master: http://${my_ip}:18080"
    
    # Run copy-jar.sh inside spark-master for Spark-on-YARN support
    docker-compose exec spark-master bash -c "./copy-jar.sh && exit"
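
    After ./run.sh completes, a quick sanity check before submitting anything (a sketch, assuming the container names and port mappings from the docker-compose.yml above):

    #!/bin/bash
    # All services should show as "Up", and the datanodes should be registered.
    docker-compose ps
    docker exec namenode hdfs dfsadmin -report | head -n 20
    curl -s -o /dev/null -w "namenode UI: %{http_code}\n" http://localhost:50070
    curl -s -o /dev/null -w "resourcemanager UI: %{http_code}\n" http://localhost:8088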
    

    copy-jar.sh

    #!/bin/bash
    
    cd /opt/hadoop-2.8.0/share/hadoop/yarn/lib/ && cp jersey-core-1.9.jar jersey-client-1.9.jar /spark/jars/ && rm -rf /spark/jars/jersey-client-2.22.2.jar
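
    The point of this script is to replace Spark's bundled jersey-client 2.22.2 with the Jersey 1.9 jars that the YARN timeline client in Hadoop 2.8 still expects; with the timeline service enabled in hadoop-hive.env, spark-submit against YARN otherwise tends to fail with missing com.sun.jersey classes. A simple check that the swap took effect:

    #!/bin/bash
    # Expect jersey-core-1.9.jar and jersey-client-1.9.jar, and no jersey-client-2.22.2.jar.
    docker exec spark-master ls /spark/jars | grep jersey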
    

    stop.sh

    #!/bin/bash
    docker-compose stop
    

    Submitting a MapReduce job to YARN from IDEA

    References

    1. IDEA向hadoop集群提交MapReduce作业
    2. java操作hadoop hdfs,实现文件上传下载demo
    3. IDEA远程提交mapreduce任务至linux,遇到ClassNotFoundException: Mapper

    Note: when submitting to YARN, the code has to be packaged into a jar first, otherwise the job fails with a ClassNotFoundException. See reference 3, 《IDEA远程提交mapreduce任务至linux,遇到ClassNotFoundException: Mapper》, for details.

    pom.xml

    <?xml version="1.0" encoding="UTF-8"?>
    
    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
    
        <groupId>com.switchvov</groupId>
        <artifactId>hadoop-test</artifactId>
        <version>1.0.0</version>
    
        <name>hadoop-test</name>
    
        <properties>
            <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
            <maven.compiler.source>1.8</maven.compiler.source>
            <maven.compiler.target>1.8</maven.compiler.target>
        </properties>
    
        <dependencies>
            <dependency>
                <groupId>junit</groupId>
                <artifactId>junit</artifactId>
                <version>4.12</version>
                <scope>test</scope>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-client</artifactId>
                <version>2.8.0</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-common</artifactId>
                <version>2.8.0</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-hdfs</artifactId>
                <version>2.8.0</version>
            </dependency>
        </dependencies>
    </project>
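
    I packaged the code as an IDEA artifact (the out/artifacts/hadoop/hadoop.jar path referenced in WordCountRunner below). If you prefer Maven, a build along these lines should produce an equivalent jar; the target filename follows from the artifactId and version above, and mapred.jar would then point at it instead:

    #!/bin/bash
    # Build the job jar with Maven instead of an IDEA artifact.
    mvn -q clean package -DskipTests
    ls target/hadoop-test-1.0.0.jar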
    

    log4j.properties

    log4j.rootLogger=INFO, console
    log4j.appender.console=org.apache.log4j.ConsoleAppender
    log4j.appender.console.Target=System.out
    log4j.appender.console.layout=org.apache.log4j.PatternLayout
    log4j.appender.console.layout.ConversionPattern=[%p] %d{yyyy-MM-dd HH:mm:ss,SSS} method:%l%m%n
    

    words.txt

    this is a tests
    this is a tests
    this is a tests
    this is a tests
    this is a tests
    this is a tests
    this is a tests
    this is a tests
    this is a tests
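
    HdfsDemo below uploads this file through the Java API. For quick experiments, pushing it through the namenode container with the hdfs CLI works just as well (a sketch, assuming words.txt is in the current directory):

    #!/bin/bash
    # Copy the input file into the container, then put it into HDFS at /share/words.txt.
    docker cp words.txt namenode:/tmp/words.txt
    docker exec namenode hdfs dfs -mkdir -p /share
    docker exec namenode hdfs dfs -put -f /tmp/words.txt /share/words.txt
    docker exec namenode hdfs dfs -ls /share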
    

    HdfsDemo.java

    package com.switchvov.hadoop.hdfs;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IOUtils;
    
    import java.io.InputStream;
    
    /**
     * @author switch
     * @since 2020/12/18
     */
    public class HdfsDemo {
        /**
         * Hadoop FileSystem configuration
         */
        private static final Configuration CONFIGURATION = new Configuration();
    
        static {
            // Set the default HDFS address
            CONFIGURATION.set("fs.default.name", "hdfs://namenode:8020");
        }
    
        /**
         * Upload a local file (filePath) to the given destination path (dst) on HDFS
         */
        public static void uploadFileToHDFS(String filePath, String dst) throws Exception {
            // Obtain a FileSystem handle
            FileSystem fs = FileSystem.get(CONFIGURATION);
            Path srcPath = new Path(filePath);
            Path dstPath = new Path(dst);
            long start = System.currentTimeMillis();
            fs.copyFromLocalFile(false, srcPath, dstPath);
            System.out.println("Time:" + (System.currentTimeMillis() - start));
            System.out.println("________Uploaded file to " + CONFIGURATION.get("fs.default.name") + "____________");
            fs.close();
        }
    
        /**
         * Download a file from HDFS and print it to standard output
         */
        public static void downLoadFileFromHDFS(String src) throws Exception {
            FileSystem fs = FileSystem.get(CONFIGURATION);
            Path srcPath = new Path(src);
            InputStream in = fs.open(srcPath);
            try {
                // Copy the file contents to standard output (the console)
                IOUtils.copyBytes(in, System.out, 4096, false);
            } finally {
                IOUtils.closeStream(in);
                fs.close();
            }
        }
    
        public static void main(String[] args) throws Exception {
            String filename = "words.txt";
    //        uploadFileToHDFS(
    //                "/Users/switch/projects/OtherProjects/bigdata-enviroment/hadoop-test/data/" + filename,
    //                "/share/" + filename
    //        );
            downLoadFileFromHDFS("/share/output12/" + filename + "/part-r-00000");
        }
    }
    

    WordCountRunner.java

    package com.switchvov.hadoop.mapreduce.wordcount;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    import java.io.IOException;
    
    
    /**
     * @author switch
     * @since 2020/12/17
     */
    public class WordCountRunner {
    
        /**
         * LongWritable: input key type (the line offset)
         * Text: input value type (the line content)
         * Text: output key type
         * IntWritable: output value type
         *
         * @author switch
         * @since 2020/12/17
         */
        public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    
            /**
             * @param key     line offset
             * @param value   content of one line, e.g. "this is a tests"
             * @param context output collector
             * @throws IOException          on I/O error
             * @throws InterruptedException if interrupted
             */
            @Override
            protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                String line = value.toString();
                // Split the line on spaces into an array of words
                String[] words = line.split(" ");
                for (String word : words) {
                    context.write(new Text(word), new IntWritable(1));
                }
            }
        }
    
        /**
         * Text: input key type
         * IntWritable: input value type
         * Text: output key type
         * IntWritable: output value type
         *
         * @author switch
         * @since 2020/12/17
         */
        public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
            /**
             * @param key     key emitted by the mapper
             * @param values  values grouped under that key
             * @param context output collector
             * @throws IOException          on I/O error
             * @throws InterruptedException if interrupted
             */
            @Override
            protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
                int count = 0;
                for (IntWritable value : values) {
                    count += value.get();
                }
                context.write(key, new IntWritable(count));
            }
        }
    
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            // Cross-platform submission, so the MR job can also be submitted from a Windows/macOS client
            conf.set("mapreduce.app-submission.cross-platform", "true");
            // Schedule the job on YARN
            conf.set("mapreduce.framework.name", "yarn");
            // ResourceManager hostname
            conf.set("yarn.resourcemanager.hostname", "resourcemanager");
            // Default NameNode address (both the new and the legacy property)
            conf.set("fs.defaultFS", "hdfs://namenode:8020");
            conf.set("fs.default.name", "hdfs://namenode:8020");
            // Point at the job jar, otherwise a ClassNotFoundException occurs; see: https://blog.csdn.net/qq_19648191/article/details/56684268
            conf.set("mapred.jar", "/Users/switch/projects/OtherProjects/bigdata-enviroment/hadoop-test/out/artifacts/hadoop/hadoop.jar");
            // Job name
            Job job = Job.getInstance(conf, "word count");
            // Main class of the job
            job.setJarByClass(WordCountRunner.class);
            // Mapper class
            job.setMapperClass(WordCountMapper.class);
            // Combiner class; same logic as the reducer
            job.setCombinerClass(WordCountReducer.class);
            // Reducer class
            job.setReducerClass(WordCountReducer.class);
            // Output key type
            job.setOutputKeyClass(Text.class);
            // Output value type
            job.setOutputValueClass(IntWritable.class);
            // Number of reducers (default is 1)
            job.setNumReduceTasks(1);
            // The mapper's output types must match the last two type parameters declared on the Mapper class
            String filename = "words.txt";
            String args0 = "hdfs://namenode:8020/share/" + filename;
            String args1 = "hdfs://namenode:8020/share/output12/" + filename;
            // Input path
            FileInputFormat.addInputPath(job, new Path(args0));
            // Output path
            FileOutputFormat.setOutputPath(job, new Path(args1));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }
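
    Once the job finishes, the result can be read either with the download method in HdfsDemo above or directly from the namenode container (a sketch, assuming the /share/output12/words.txt output path used in the code):

    #!/bin/bash
    # List finished YARN applications and print the WordCount output.
    docker exec resourcemanager yarn application -list -appStates FINISHED
    docker exec namenode hdfs dfs -cat /share/output12/words.txt/part-r-00000

    For the words.txt above, the expected output is a, is, tests, and this, each with a count of 9.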
    

    Sharing and recording what I learn and see.

  • Original post: https://www.cnblogs.com/switchvov/p/14163329.html