zoukankan      html  css  js  c++  java
  • 配置RHadoop与运行WordCount例子

    1、安装R语言环境

    su -c 'rpm -Uvh http://download.fedoraproject.org/pub/epel/6/i386/epel-release-6-8.noarch.rpm'

    su -c 'yum install foo'

    yum list R-*

    yum install R

    2、安装RStudio Desktop和Server

    Desktop是rpm包,双击执行

    Server安装命令:

    yum install openssl098e # Required only for RedHat/CentOS 6 and 7

    wget http://download2.rstudio.org/rstudio-server-0.98.1091-x86_64.rpm

    yum install --nogpgcheck rstudio-server-0.98.1091-x86_64.rpm

    添加r-user用户

    3、安装gcc、git、pkg-config

    yum install gcc git pkg-config

    4、安装thrift0.9.0

    yum install automake libtool flex bison pkgconfig gcc-c++ boost-devel libevent-devel zlib-devel python-devel ruby-devel

    编译安装步骤:

    Update the System

        yum -y update

    Install the Platform Development Tools

        yum -y groupinstall "Development Tools"

    Upgrade autoconf/automake/bison

        yum install -y wget

    Upgrade autoconf

        wget http://ftp.gnu.org/gnu/autoconf/autoconf-2.69.tar.gz

        tar xvf autoconf-2.69.tar.gz

        cd autoconf-2.69

        ./configure --prefix=/usr

        make

        make install

    Upgrade automake

        wget http://ftp.gnu.org/gnu/automake/automake-1.14.tar.gz

        tar xvf automake-1.14.tar.gz

        cd automake-1.14

        ./configure --prefix=/usr

        make

        make install

    Upgrade bison

        wget http://ftp.gnu.org/gnu/bison/bison-2.5.1.tar.gz

        tar xvf bison-2.5.1.tar.gz

        cd bison-2.5.1

        ./configure --prefix=/usr

        make

        make install

    Install C++ Lib Dependencies

        yum -y install libevent-devel zlib-devel openssl-devel

    Upgrade Boost

        wget http://sourceforge.net/projects/boost/files/boost/1.55.0/boost_1_55_0.tar.gz

        tar xvf boost_1_55_0.tar.gz

        cd boost_1_55_0

        ./bootstrap.sh

        ./b2 install

    Build and Install the Apache Thrift IDL Compiler

        git clone https://git-wip-us.apache.org/repos/asf/thrift.git

        cd thrift

        ./bootstrap.sh

        ./configure --with-lua=no

        修改/thrift-0.9.1/lib/cpp/thrift.pc的includedir=${prefix}/include/thrift

        make

        make install

    Update PKG_CONFIG_PATH:

        export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:/usr/local/lib/pkgconfig/

    Verify pkg-config path is correct:

        pkg-config --cflags thrift

        returns:

        -I /usr/local/include/thrift

    拷贝文件到lib文件夹

        cp /usr/local/lib/libthrift-1.0.0-dev.so /usr/lib/

    5、设置Linux环境变量

    export HADOOP_PREFIX=/usr/lib/hadoop

    export HADOOP_CMD=/usr/lib/hadoop/bin/hadoop

    export HADOOP_STREAMING=/usr/lib/hadoop-mapreduce/hadoop-streaming.jar

    6、root用户下开启R环境安装依赖包

    install.packages(c("rJava", "Rcpp", "RJSONIO", "bitops", "digest",

                        "functional", "stringr", "plyr", "reshape2", "dplyr",

                        "R.methodsS3", "caTools", "Hmisc", "data.table", "memoise"))

    7、root用户下开启R环境安装RHadoop包

    install.packages("/root/RHadoop/rhdfs_1.0.8.tar.gz", repos=NULL, type="source")

    install.packages("/root/RHadoop/rmr2_3.3.0.tar.gz", repos=NULL, type="source")

    install.packages("/root/RHadoop/plyrmr_0.5.0.tar.gz", repos=NULL, type="source")

    install.packages("/root/RHadoop/rhbase_1.2.1.tar.gz", repos=NULL, type="source")

    8、配置ant 和 maven

    export MAVEN_HOME=/root/apache-maven-3.2.5

    export PATH=/root/apache-maven-3.2.5/bin:$PATH

    export ANT_HOME=/root/apache-ant-1.9.4

    export PATH=$ANT_HOME/bin:$PATH

    9、测试RHadoop

    ## Set the Hadoop-related environment variables for this R session
    ## before rmr2 is loaded (same values as the shell exports in step 5).
    Sys.setenv("HADOOP_PREFIX"="/usr/lib/hadoop")

    Sys.setenv("HADOOP_CMD"="/usr/lib/hadoop/bin/hadoop")

    Sys.setenv("HADOOP_STREAMING"="/usr/lib/hadoop-mapreduce/hadoop-streaming.jar")

        library(rmr2)

        ## Read the current backend parameters.
        ## NOTE(review): this value is never used; `bp` is replaced wholesale
        ## by the `bp <- list(hadoop=trans)` assignment below.
        bp = rmr.options("backend.parameters")

        ## Hadoop job properties; every element is named `D`
        ## (presumably each becomes a `-D key=value` option -- confirm in rmr2 docs).
        trans <- list(D="mapreduce.map.java.opts=-Xmx400M",

                     D="mapreduce.reduce.java.opts=-Xmx400M",

                     D="mapreduce.map.memory.mb=4096",

                     D="mapreduce.reduce.memory.mb=4096",

                     D="mapreduce.task.io.sort.mb=100")

        bp <- list(hadoop=trans)

        #### "unused code" section: begin #######################
        ## NOTE(review): although labelled unused, these assignments are NOT
        ## commented out. They execute and overwrite elements 1-5 of bp$hadoop;
        ## elements 3 and 4 replace the 4096 MB memory settings above with
        ## 1024 MB (map) and 2048 MB (reduce). Confirm which values are intended.

        bp$hadoop[1]="mapreduce.map.java.opts=-Xmx400M"

        bp$hadoop[2]="mapreduce.reduce.java.opts=-Xmx400M"

        bp$hadoop[3]="mapreduce.map.memory.mb=1024"

        bp$hadoop[4]="mapreduce.reduce.memory.mb=2048"

        bp$hadoop[5]="mapreduce.task.io.sort.mb=100"

        #### "unused code" section: end #######################

        ## Install the assembled parameters as rmr2's backend parameters.
        rmr.options(backend.parameters = bp)

        ## Read the option back so the effective settings can be inspected.
        rmr.options("backend.parameters")

        ## map function

        map <- function(k,lines) {

            words.list <- strsplit(lines, '\s')

            words <- unlist(words.list)

            return( keyval(words, 1) )

        }

        ## reduce function

        reduce <- function(word, counts) {

            keyval(word, sum(counts))

        }

        wordcount <- function (input, output=NULL) {

            mapreduce(input=input, output=output, input.format="text",

    map=map, reduce=reduce)

        }

        ## delete previous result if any

        system("/usr/lib/hadoop/bin/hadoop fs -rm -r /tmp/zhengcong/out")

        

        ## Submit job

        hdfs.root <- '/tmp/zhengcong'

        hdfs.data <- file.path(hdfs.root, 'hp')

        hdfs.out <- file.path(hdfs.root, 'out')

        out <- wordcount(hdfs.data, hdfs.out)

        ## Fetch results from HDFS

        results <- from.dfs(out)

        ## check top 30 frequent words

        results.df <- as.data.frame(results, stringsAsFactors=F)

        colnames(results.df) <- c('word', 'count')

        head(results.df[order(results.df$count, decreasing=T), ], 30)

    10、错误解决

        rJava无法加载,root用户下运行 R CMD javareconf -e

        添加 export LD_LIBRARY_PATH=$JAVA_HOME/lib/amd64:$JAVA_HOME/jre/lib/amd64/server

  • 相关阅读:
    Java实现 LeetCode 537 复数乘法(关于数学唯一的水题)
    Java实现 LeetCode 537 复数乘法(关于数学唯一的水题)
    Java实现 LeetCode 535 TinyURL 的加密与解密(位运算加密)
    Java实现 LeetCode 535 TinyURL 的加密与解密(位运算加密)
    如何在 Linux 中统计一个进程的线程数
    linux下查看线程数的几种方法
    深入理解linux系统下proc文件系统内容
    嵌入式 如何定位死循环或高CPU使用率(linux)
    Linux 下查看线程信息
    Linux netstat命令详解
  • 原文地址:https://www.cnblogs.com/zhengcong/p/4208506.html
Copyright © 2011-2022 走看看