zoukankan      html  css  js  c++  java
  • share一段采集程序的代码

    <?php
    // Remove PHP's execution time limit; the script issues many sequential HTTP requests.
    set_time_limit(0);
     
    // Path of the file used as the cURL cookie jar (passed to every request() call below)
    $cookie_jar = '/tmp/cookie.tmp';
     
    /*函数------------------------------------------------------------------------------------------------------------*/
     
    //模拟请求数据
    /**
     * Perform an HTTP request with cURL, reading and writing cookies in one jar file.
     *
     * A POST is sent only when $postfields is non-empty; otherwise the request
     * is a plain GET. Redirects are followed and the body is returned.
     *
     * @param string       $url        Target URL.
     * @param string|array $postfields POST body; '' (or null / empty array) means GET.
     * @param string       $cookie_jar Path of the cookie file to load from and save to.
     * @param string       $referer    Value for the Referer header.
     * @return string|false Response body, or false when the transfer failed.
     */
    function request($url,$postfields,$cookie_jar,$referer){
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        // CURLOPT_PORT is intentionally not set: forcing port 80 breaks https
        // URLs and any redirect to a non-default port. cURL derives the port
        // from the URL itself.
        $options = array(
            CURLOPT_URL => $url,
            CURLOPT_HEADER => 0,
            CURLOPT_NOBODY => 0,
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_FOLLOWLOCATION => 1,
            CURLOPT_COOKIEJAR => $cookie_jar,
            CURLOPT_COOKIEFILE => $cookie_jar,
            CURLOPT_REFERER => $referer
        );

        // Only switch to POST when there is a body to send; page fetches that
        // pass '' must go out as GET rather than as an empty POST.
        $hasBody = $postfields !== '' && $postfields !== null && $postfields !== array();
        if ($hasBody) {
            $options[CURLOPT_POST] = 1;
            $options[CURLOPT_POSTFIELDS] = $postfields;
        }

        curl_setopt_array($ch, $options);
        $code = curl_exec($ch);
        curl_close($ch);
        return $code;
    }
     
    //获取帖子列表
    /**
     * Extract the thread ids linked from a forum listing page.
     *
     * Every `<a href="viewthread.php?tid=N` link is collected; ids that are
     * linked more than once are returned once, in order of first appearance.
     *
     * NOTE(review): the original pattern also required a preceding `<!--`
     * marker, but its escapes were lost and the intended markup cannot be
     * confirmed from this file, so the anchor was dropped in favour of
     * de-duplication. Verify against a real listing page.
     *
     * @param string $code HTML of the forum listing page.
     * @return array List of thread ids as numeric strings; empty when none are found.
     */
    function getThreadsList($code){
        if (!is_string($code) || $code === '') {
            return array();
        }

        // "~" is used as the delimiter so the pattern needs no slash escaping;
        // "." and "?" are escaped because they are literal characters in the URL.
        if (!preg_match_all('~<a href="viewthread\.php\?tid=(\d+)~', $code, $threads)) {
            return array();
        }

        return array_values(array_unique($threads[1]));
    }
     
    //判断该帖子是否存在
    /**
     * Tell whether a fetched thread page is a real thread rather than the
     * forum's "thread does not exist / deleted / under review" notice.
     *
     * The check is a plain substring search: a regular expression is not
     * needed for a fixed string, and the "/" inside "</p>" would otherwise
     * terminate a "/"-delimited pattern early.
     *
     * @param string $code HTML of the thread page.
     * @return bool true when the notice is absent; false when it is present
     *              or when $code is not a non-empty string (failed request).
     */
    function isExits($code){
        if (!is_string($code) || $code === '') {
            return false;
        }
        return strpos($code, '<p>指定的主题不存在或已被删除或正在被审核,请返回。</p>') === false;
    }
     
    //获取帖子标题
    /**
     * Extract the first <h1> element of a thread page.
     *
     * The returned string still contains the <h1> markup; callers are
     * expected to strip tags before display (printTitle() does).
     *
     * @param string $code HTML of the thread page.
     * @return string The first "<h1>...</h1>" element, or '' when there is none.
     */
    function getTitle($code){
        // Lazy ".*?" with the "s" flag captures the whole heading. A negated
        // character class such as [^</h1>] would stop at the first "h", "1"
        // or "/" inside the title text.
        if (!is_string($code) || !preg_match('~<h1>.*?</h1>~is', $code, $title_tmp)) {
            return '';
        }
        return $title_tmp[0];
    }
     
    //获取帖子作者:
    /**
     * Extract the name of the first poster on a thread page.
     *
     * Looks for the first profile link of the form
     * `<a href="space.php?uid=N" target="_blank" id="userinfoN" onmouseover="showMenu(this.id)">`
     * and returns its link text with any nested tags removed.
     *
     * @param string $code HTML of the thread page.
     * @return string Author name, or '' when no profile link is found.
     */
    function getAuthor($code){
        if (!is_string($code)) {
            return '';
        }

        // "?", "." and the parentheses of showMenu(this.id) are literal
        // characters in the markup and must be escaped; \d+ matches the ids.
        $pattern = '~<a href="space\.php\?uid=\d+" target="_blank" id="userinfo\d+" onmouseover="showMenu\(this\.id\)">(.*?)</a>~s';
        if (!preg_match($pattern, $code, $author_tmp)) {
            return '';
        }

        return trim(strip_tags($author_tmp[1]));
    }
     
    //获取楼主发表的内容
    /**
     * Extract the body of the first post (the thread starter's message).
     *
     * Returns the first `<div id="postmessage_N" class="t_msgfont">...</div>`
     * element with every "images/" path prefixed by the forum's base URL so
     * that images still load when the markup is shown on another host.
     *
     * NOTE(review): the match ends at the first "</div>", so a post that
     * itself contains nested <div> elements will be cut short.
     *
     * @param string $code HTML of the thread page.
     * @return string The post's HTML, or '' when no post container is found.
     */
    function getContents($code){
        if (!is_string($code)) {
            return '';
        }

        // The "s" flag lets "." span newlines; "~" delimiters avoid having to
        // escape the "/" in "</div>".
        if (!preg_match('~<div id="postmessage_\d+" class="t_msgfont">.*?</div>~s', $code, $contents_tmp)) {
            return '';
        }

        // Fixed-string replacement; no regular expression is needed here.
        return str_replace('images/', 'http://bbs.war3.cn/images/', $contents_tmp[0]);
    }
     
    //打印帖子标题
    /**
     * Echo the thread title under a "thread title" heading.
     *
     * @param string $title Title markup; tags are stripped before output.
     * @return void
     */
    function printTitle($title){
        $plainTitle = strip_tags($title);
        echo "<strong><h2>帖子标题:</h2></strong>" . $plainTitle . "<br/><br/>";
    }
     
    //输出帖子作者
    /**
     * Echo the thread author under a "thread author" heading.
     *
     * @param string $author Author text; tags are stripped before output.
     * @return void
     */
    function printAuthor($author){
        $parts = array(
            "<strong><h2>帖子作者:</h2></strong>",
            strip_tags($author),
            "<br/><br/>",
        );
        echo implode('', $parts);
    }
     
    //打印帖子内容
    /**
     * Echo the post body under a "content posted by the author" heading.
     *
     * The content is written as-is (no tag stripping), wrapped in <strong>.
     *
     * @param string $contents HTML of the post body.
     * @return void
     */
    function printContents($contents){
        $opening = "<strong><h2>作者发表的内容:</h2>";
        $closing = "</strong><br/>";
        print $opening . $contents . $closing;
    }
     
    //错误
    /**
     * Echo the "this thread does not exist" notice.
     *
     * @return void
     */
    function printError(){
        $notice = "<i>该帖子不存在!</i>";
        print $notice;
    }
     
    /*函数列表end---------------------------------------------------------------------------------------------------*/
     
     
    /*登录论坛 begin*/
    // Log in so that the session cookies end up in the cookie jar.
    // NOTE(review): the credentials are hard-coded here; consider moving them
    // to configuration or the environment before this is used for real.
    $url = 'http://bbs.war3.cn/logging.php?action=login';
    // http_build_query() URL-encodes every value (the referer URL and the
    // non-ASCII submit label), producing a valid form-encoded body.
    $postfields = http_build_query(array(
        'loginfield' => 'username',
        'username' => '1nject10n',
        'password' => 'xxxxxx',
        'questionid' => 0,
        'cookietime' => 315360000,
        'referer' => 'http://bbs.war3.cn/',
        'loginsubmit' => '提交',
    ), '', '&');
    request($url,$postfields,$cookie_jar,'');
    unset($postfields,$url);
     
     
    // Fetch the first page of the board and collect the thread ids on it.
    $url = 'http://bbs.war3.cn/forumdisplay.php?fid=57';
    $code = request($url,'',$cookie_jar,'');
    $threadsList = is_string($code) ? getThreadsList($code) : array();
    if (!is_array($threadsList)) {
        $threadsList = array();
    }
     
    // Number of threads printed so far (drives numbering and row colour).
    $rows = 0;
     
    // Fetch and print every thread found on the listing page.
    foreach($threadsList as $list){
        $url = "http://bbs.war3.cn/viewthread.php?tid=$list";
     
        // Fetch the thread FIRST, then test that very page; checking before
        // the fetch would inspect the listing page or the previous thread.
        $code = request($url,'',$cookie_jar,'');
     
        if(is_string($code) && isExits($code)){
            $color = $rows%2==0?'#00CCFF':'#FFFF33';
            echo "<div style='background-color:$color'>";
            echo "<h1>第",($rows+1),"贴:</h1><br/>";
            $author = getAuthor($code);
            printAuthor($author);
     
            $title = getTitle($code);
            printTitle($title);
     
            $contents = getContents($code);
            printContents($contents);
            echo "</div>";
            $rows++;
        }
        else {
            printError();
        }
     
        echo "-----------------------------------------------------------------------------------------<br/><br/>";
    }
    /*抓取源代码 end*/
    ?>
  • 相关阅读:
    如何:将控件锁定到 Windows 窗体
    Linux 设置字符集
    sql 批量处理
    解决表被锁了
    oracle 分页模板
    创建用户及表空间
    恢复数据库数据
    instr vs like 效率
    自定义参数转换器
    spring boot 整合MyBatis
  • 原文地址:https://www.cnblogs.com/shanyansheng/p/5474141.html
Copyright © 2011-2022 走看看