zoukankan      html  css  js  c++  java
  • linux下nutch的增量抓取脚本

    先收下了。

    #!/bin/bash
    #
    # runbot script. Runs the Nutch bot for crawling and re-crawling.
    #
    # Usage: bin/runbot [safe]
    #        If executed in 'safe' mode, it doesn't delete the temporary
    #        directories generated during crawl. This might be helpful for
    #        analysis and recovery in case a crawl fails.
    #
    # The shebang must be the very first line of the script file: the script
    # relies on bash-only syntax such as `for ((...))`, so it must not be run
    # by a plain POSIX sh.
    #
    # Author: Susam Pal

    # Crawl settings.
    depth=2     # upper bound of the generate/fetch/update loop
    threads=5   # value passed to `nutch fetch -threads`
    adddays=5   # value passed to `nutch generate -adddays`
    topN=15     # Comment this statement if you don't want to set topN value

    # Arguments for rm and mv
    RMARGS="-rf"
    MVARGS="--verbose"
     
    # Parse arguments: the only recognised one is the literal word "safe",
    # which switches the cleanup steps from deleting to backing up.
    case "$1" in
      safe)
        safe=yes
        ;;
    esac
     
    # Check whether NUTCH_HOME is defined. If it is not (or is empty), fall
    # back to the current directory; either way report the value in use.
    # Note: nothing but the test may sit between `if` and `then` -- any extra
    # command there becomes part of the condition and decides the branch.
    if [ -z "$NUTCH_HOME" ]; then
      NUTCH_HOME=.
      echo "runbot: $0 could not find environment variable NUTCH_HOME"
      echo "runbot: NUTCH_HOME=$NUTCH_HOME has been set by the script."
    else
      echo "runbot: $0 found environment variable NUTCH_HOME=$NUTCH_HOME"
    fi
     
    # Check whether CATALINA_HOME is defined. If it is not (or is empty), fall
    # back to /opt/apache-tomcat-6.0.10; either way report the value in use.
    # Note: nothing but the test may sit between `if` and `then` -- any extra
    # command there becomes part of the condition and decides the branch.
    if [ -z "$CATALINA_HOME" ]; then
      CATALINA_HOME=/opt/apache-tomcat-6.0.10
      echo "runbot: $0 could not find environment variable CATALINA_HOME"
      echo "runbot: CATALINA_HOME=$CATALINA_HOME has been set by the script."
    else
      echo "runbot: $0 found environment variable CATALINA_HOME=$CATALINA_HOME"
    fi
     
    # Check whether topN is configured. If it is, rewrite it into the option
    # string "-topN <value>" for `nutch generate`; otherwise make it empty so
    # that it expands to nothing on the command line.
    # Note: nothing but the test may sit between `if` and `then` -- any extra
    # command there becomes part of the condition and decides the branch.
    if [ -n "$topN" ]; then
      topN="-topN $topN"
    else
      topN=""
    fi
     
    # Total number of steps, shown in every progress banner.
    steps=8

    # Inject the URLs into the crawl database. crawl/crawldb is the directory
    # where the data produced by the Nutch crawl is stored; using an absolute
    # path is strongly recommended. urls holds the addresses to crawl and
    # should also be given as an absolute path.
    printf '%s\n' "----- Inject (Step 1 of $steps) -----"
    $NUTCH_HOME/bin/nutch inject crawl/crawldb urls
    
    echo "----- Generate, Fetch, Parse, Update (Step 2 of $steps) -----"
    for ((i = 0; i < depth; i++)); do
      round=$((i + 1))   # 1-based depth used in the messages
      echo "--- Beginning crawl at depth $round of $depth ---"

      # Create the fetch list from crawldb. crawl/crawldb and crawl/segments
      # should be changed to absolute paths when the script is not started
      # from the crawl data directory.
      # $topN is deliberately left unquoted: it holds either "-topN <n>" or
      # nothing and has to split into separate arguments.
      if ! "$NUTCH_HOME"/bin/nutch generate crawl/crawldb crawl/segments $topN \
          -adddays "$adddays"
      then
        echo "runbot: Stopping at depth $round. No more URLs to fetch."
        break
      fi

      # Pick the last entry of crawl/segments in sorted glob order (change the
      # path to an absolute one if needed). Presumably this is the segment
      # that was just generated -- confirm against Nutch's segment naming.
      segments=(crawl/segments/*)
      segment=${segments[${#segments[@]} - 1]}

      if ! "$NUTCH_HOME"/bin/nutch fetch "$segment" -threads "$threads"; then
        echo "runbot: fetch $segment at depth $round failed."
        echo "runbot: Deleting segment $segment."
        # Quoted and preceded by `--` so an odd segment name can neither be
        # split into several paths nor be read as an option by rm.
        rm $RMARGS -- "$segment"
        continue
      fi

      # Update crawldb from the fetched segment (change crawl/crawldb to an
      # absolute path if needed).
      "$NUTCH_HOME"/bin/nutch updatedb crawl/crawldb "$segment"
    done
    
    echo "----- Merge Segments (Step 3 of $steps) -----"
    # Merge every segment into crawl/MERGEDsegments (change
    # crawl/MERGEDsegments and crawl/segments/* to absolute paths if needed).
    # Abort when the merge fails: the code below removes crawl/segments, so
    # carrying on without a merged copy would throw away all fetched data.
    if ! "$NUTCH_HOME"/bin/nutch mergesegs crawl/MERGEDsegments crawl/segments/*
    then
      echo "runbot: mergesegs failed; crawl/segments left untouched." >&2
      exit 1
    fi

    if [ "$safe" = "yes" ]; then
      # Safe mode: replace the previous backup with the unmerged segments.
      rm $RMARGS crawl/BACKUPsegments
      mv $MVARGS crawl/segments crawl/BACKUPsegments
    else
      rm $RMARGS crawl/segments
    fi

    # The merged result becomes the new crawl/segments.
    mv $MVARGS crawl/MERGEDsegments crawl/segments
     
    # Steps 4-7 all go through the same launcher.
    nutch=$NUTCH_HOME/bin/nutch

    # Step 4. Change crawl/linkdb and crawl/segments/* to absolute paths
    # if needed.
    printf '%s\n' "----- Invert Links (Step 4 of $steps) -----"
    $nutch invertlinks crawl/linkdb crawl/segments/*

    # Step 5. Change crawl/crawldb, crawl/linkdb and crawl/segments/* to
    # absolute paths if needed.
    printf '%s\n' "----- Index (Step 5 of $steps) -----"
    $nutch index crawl/NEWindexes \
        crawl/crawldb crawl/linkdb crawl/segments/*

    # Step 6. Change crawl/NEWindexes to an absolute path if needed.
    printf '%s\n' "----- Dedup (Step 6 of $steps) -----"
    $nutch dedup crawl/NEWindexes

    # Step 7. Change crawl/NEWindex and crawl/NEWindexes to absolute paths
    # if needed.
    printf '%s\n' "----- Merge Indexes (Step 7 of $steps) -----"
    $nutch merge crawl/NEWindex crawl/NEWindexes
    
     
    echo "----- Loading New Index (Step 8 of $steps) -----"

    # Refuse to touch the live index unless the replacement exists: the code
    # below removes (or moves away) crawl/index before moving crawl/NEWindex
    # into its place, so a missing crawl/NEWindex would leave no index at all.
    if [ ! -e crawl/NEWindex ]; then
      echo "runbot: crawl/NEWindex not found; keeping the current index." >&2
      exit 1
    fi

    # Tomcat is shut down before the index directories are swapped and
    # started again afterwards.
    "${CATALINA_HOME}"/bin/shutdown.sh

    # All crawl/... paths below should be absolute paths when the script is
    # not started from the crawl data directory.
    if [ "$safe" = "yes" ]; then
      # Safe mode: keep the intermediate indexes and the old index as backups,
      # replacing any previous backups.
      rm $RMARGS crawl/BACKUPindexes
      rm $RMARGS crawl/BACKUPindex
      mv $MVARGS crawl/NEWindexes crawl/BACKUPindexes
      mv $MVARGS crawl/index crawl/BACKUPindex
    else
      rm $RMARGS crawl/NEWindexes
      rm $RMARGS crawl/index
    fi

    # Put the new index in place (crawl/NEWindex and crawl/index should be
    # absolute paths as well).
    mv $MVARGS crawl/NEWindex crawl/index

    "${CATALINA_HOME}"/bin/startup.sh

    echo "runbot: FINISHED: Crawl completed!"
    echo ""

    此脚本来源于http://wiki.apache.org/nutch/Crawl,请大家去下载最原始的脚本(在复制粘贴时一定要注意脚本的开头,本人被他害得很厉害,他少复制了一些代码)。本文脚本由于写了一些注释,很有可能造成不能正常执行(由于文本编辑器的原因)。

    脚本使用方法:

    1.    在$NUTCH_HOME/bin目录中建立runbot文件,vi runbot,然后将该脚本复制到runbot文件中,赋予runbot文件可执行权限,chmod 744 runbot。

    2.    添加环境变量NUTCH_HOME和CATALINA_HOME

    在/etc/profile文件末尾加入如下代码

    export NUTCH_HOME=你的nutch安装根目录

    export CATALINA_HOME=你的tomcat安装根目录

    重新登录。

    3.    该脚本默认情况下,是将抓取的数据放在当前目录中。如果不想在上述脚本注释的地方更改绝对路径,请在nutch的抓取数据根目录执行此脚本(如果改了,可以在任意目录执行该脚本)。还有一个更加简单的方法,那就是在脚本的开头# Author: Susam Pal后面加入如下命令:

     

    cd /crawldb/crawldb(此处是你的nutch抓取的数据目录),也可以达到在任何目录执行该脚本的目的,此时urls文件也应该在nutch抓取数据的根目录。

    附加http://www.ncepuideal.com/space/viewspacepost.aspx?postid=246该篇文章介绍了如何配置nutch。

    http://www.ncepuideal.com/space/viewspacepost.aspx?postid=247该篇文章介绍了如何在tomcat中部署nutch。


    原文地址:http://ideal.ncepu.me/2012/03/25/1-6/

  • 相关阅读:
    ssd笔记
    深度学习 参数笔记
    NVIDIA驱动安装
    下载大文件笔记
    vue中使用echart笔记
    torch.no_grad
    暑期第二周总结
    暑期第一周总结
    第十六周学习进度
    期末总结
  • 原文地址:https://www.cnblogs.com/dgy5554/p/3973417.html
Copyright © 2011-2022 走看看