zoukankan      html  css  js  c++  java
  • 一个 PHP 爬虫,可以把笔趣阁上的书全部下载下来。

    数据库:`book`,数据表:`id`

    --
    -- 数据库: `book`
    --

    -- --------------------------------------------------------

    --
    -- 表的结构 `id`
    --

    -- One row per downloaded book: `name` receives the book title and `txt`
    -- the name of the text file its chapters are written to.
    CREATE TABLE IF NOT EXISTS `id` (
        `id`   INT(11)      NOT NULL AUTO_INCREMENT,
        `name` VARCHAR(126) NOT NULL,
        `txt`  VARCHAR(126) NOT NULL,
        PRIMARY KEY (`id`)
    )
        ENGINE = MyISAM
        DEFAULT CHARSET = utf8
        AUTO_INCREMENT = 3;

    PHP 脚本文件:

    <?php
    // Crawler bootstrap: open the database connection and initialise the
    // crawl state that gettitle() reads and updates through $GLOBALS.
    header("Content-type: text/html; charset=utf-8");

    $con = mysql_connect("localhost", "root", "root");
    if (!$con) {
        die('Could not connect: ' . mysql_error());
    }

    // Book titles are converted to UTF-8 before they are inserted, so the
    // connection has to use utf8 as well; otherwise the server would interpret
    // the bytes in its default connection charset and store garbled names.
    if (!mysql_set_charset("utf8", $con)) {
        die('Could not set connection charset: ' . mysql_error($con));
    }

    // Stop early instead of letting every later INSERT fail silently.
    if (!mysql_select_db("book", $con)) {
        die('Could not select database: ' . mysql_error($con));
    }

    $title = array();              // listing-page URLs that were already crawled
    $book = array();               // book paths that were already downloaded
    $key = 0;                      // number of listing pages processed so far
    $url = "http://www.biquge.la"; // site root; also the first page to crawl
    /**
     * Crawl one listing page: download every book it links to that has not
     * been handled yet, then recurse into every listing page it links to.
     *
     * Shared state is kept in $GLOBALS: "url" (site root), "book" (book paths
     * already handled), "title" (listing URLs already visited) and "key"
     * (count of crawled listing pages, echoed as progress output).
     *
     * @param string $value Absolute URL of the listing page to crawl.
     * @return void
     */
    function gettitle($value)
    {
        $html = curl_get_contents($value);
        if (!is_string($html)) {
            // A failed download is handled like an empty page: nothing matches,
            // but the progress counter below still advances.
            $html = '';
        }

        // '#' is used as the regex delimiter so the slashes of paths and closing
        // tags need no escaping.
        preg_match_all('#/book/[0-9]{1,7}/#i', $html, $match1);
        foreach ($match1[0] as $value1) {
            if (in_array($value1, $GLOBALS["book"], true)) {
                continue;
            }
            var_dump($value1);
            $GLOBALS["book"][] = $value1;
            _gettitle_download_book($value1);
        }

        preg_match_all('#http://www\.biquge\.la/[a-z]{8,20}/#i', $html, $match);
        echo $GLOBALS["key"];
        $GLOBALS["key"]++;

        // foreach replaces the each()/list() loop; each() no longer exists in PHP 8.
        foreach ($match[0] as $listing_url) {
            if (in_array($listing_url, $GLOBALS["title"], true)) {
                continue;
            }
            var_dump($listing_url);
            $GLOBALS["title"][] = $listing_url;
            file_put_contents("title.txt", $listing_url . PHP_EOL, FILE_APPEND);
            gettitle($listing_url);
        }
    }

    /**
     * Helper of gettitle(): log one book, record it in table `id` and append
     * all of its chapters to book/<number>.txt.
     *
     * @param string $book_path Site-relative book path such as "/book/123/".
     * @return void
     */
    function _gettitle_download_book($book_path)
    {
        $url_book = $GLOBALS["url"] . $book_path;
        file_put_contents("book.txt", $url_book . PHP_EOL, FILE_APPEND);

        $html_book = curl_get_contents($url_book);
        if (!is_string($html_book) || $html_book === '') {
            // Nothing was downloaded, so there is no title or chapter to store.
            return;
        }
        $html_book = mb_convert_encoding($html_book, "UTF-8", "GBK");

        // The URL ends in ".../<number>/", so the number is the second-to-last
        // piece when splitting on "/".
        $url_parts = explode("/", $url_book);
        $book_num = $url_parts[count($url_parts) - 2];

        // The book name is the part of <title> in front of the first "_".
        $book_title = '';
        if (preg_match('#<title>(.*)</title>#i', $html_book, $title_match)) {
            $title_parts = explode("_", $title_match[1]);
            $book_title = $title_parts[0];
        }
        var_dump($book_title);

        // The title comes from a remote page, so it must be escaped before it is
        // placed into the statement. Uses the most recently opened mysql_* link.
        $sql = sprintf(
            "INSERT INTO `id` (`id`, `name`, `txt`) VALUES (NULL, '%s', '%s')",
            mysql_real_escape_string($book_title),
            mysql_real_escape_string($book_num . ".txt")
        );
        if (!mysql_query($sql)) {
            // Report the failure but keep crawling; the text file is still written.
            error_log('INSERT for book ' . $book_num . ' failed: ' . mysql_error());
        }

        // file_put_contents() does not create missing directories.
        if (!is_dir("book")) {
            mkdir("book");
        }

        preg_match_all('#<dd>.*</dd>#i', $html_book, $match_book);
        foreach ($match_book[0] as $chapter_entry) {
            // Splitting on double quotes isolates attribute values; those that
            // look like "<digits>.html" are taken as chapter links relative to
            // the book URL.
            foreach (explode('"', $chapter_entry) as $piece) {
                if (preg_match('#[0-9]{1,9}\.html#', $piece)) {
                    _gettitle_save_chapter($url_book . $piece, $book_num);
                }
            }
        }
    }

    /**
     * Helper of gettitle(): download one chapter, reduce it to plain text and
     * append it to book/<number>.txt.
     *
     * @param string $chapter_url Absolute URL of the chapter page.
     * @param string $book_num    Book number used as the text file name.
     * @return void
     */
    function _gettitle_save_chapter($chapter_url, $book_num)
    {
        $html_chapter = curl_get_contents($chapter_url);
        if (!is_string($html_chapter) || $html_chapter === '') {
            return;
        }
        $html_chapter = mb_convert_encoding($html_chapter, "UTF-8", "GBK");

        preg_match_all('#<div id="content">.*</div>#i', $html_chapter, $match_chapter);
        preg_match_all('#<title>.*</title>#i', $html_chapter, $match_title);
        var_dump($match_title);

        // Either pattern may find nothing; fall back to an empty string rather
        // than reading a missing array offset.
        $chapter_title = isset($match_title[0][0]) ? $match_title[0][0] : '';
        $chapter_body = isset($match_chapter[0][0]) ? $match_chapter[0][0] : '';
        $value_content = $chapter_title . PHP_EOL . $chapter_body;

        $value_content = str_replace("<br />", PHP_EOL, $value_content);
        $value_content = str_replace("&nbsp;", " ", $value_content);
        $value_content = preg_replace('#<script>.*</script>#', "", $value_content);

        // The title tags are removed on their own first, because the greedy
        // pattern below would otherwise delete the title text along with them.
        $value_content = str_replace("<title>", "", $value_content);
        $value_content = str_replace("</title>", "", $value_content);
        // NOTE(review): greedy on purpose, as in the original; on a line with
        // several tags the text between them is removed as well.
        $value_content = preg_replace('#<.*>#', "", $value_content);

        file_put_contents("book/" . $book_num . ".txt", $value_content . PHP_EOL, FILE_APPEND);
    }
    /**
     * Download a URL with cURL and hand back whatever curl_exec() returns.
     *
     * Redirects are followed, the request identifies itself as a desktop
     * Chrome browser, IPv4 resolution is forced when the constants for it
     * exist, and TLS certificate checks are switched off.
     *
     * @param string $url Address to download.
     * @return string|bool Result of curl_exec(): the response body, or false
     *                     when the transfer fails.
     */
    function curl_get_contents($url) {
        $options = array(
            CURLOPT_URL            => $url,
            CURLOPT_TIMEOUT        => 1000,
            CURLOPT_USERAGENT      => "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36",
            CURLOPT_FOLLOWLOCATION => 1,
            CURLOPT_RETURNTRANSFER => true,
        );

        // Older cURL builds may not know the IP-resolve option at all.
        $can_force_ipv4 = defined('CURLOPT_IPRESOLVE') && defined('CURL_IPRESOLVE_V4');
        if ($can_force_ipv4) {
            $options[CURLOPT_IPRESOLVE] = CURL_IPRESOLVE_V4;
        }

        $options[CURLOPT_SSL_VERIFYPEER] = false;
        $options[CURLOPT_SSL_VERIFYHOST] = false;

        $handle = curl_init();
        curl_setopt_array($handle, $options);
        $response = curl_exec($handle);
        curl_close($handle);

        return $response;
    }
    // Start the crawl at the site root; gettitle() recurses into every listing
    // page it discovers from there.
    gettitle($url);
    // The crawl has returned, so the database connection can be released.
    mysql_close($con);
    ?>

  • 相关阅读:
    PHPStorm下XDebug配置
    HDU 4633 Who's Aunt Zhang (Polya定理+快速幂)
    VC++深入详解-第五章学习心得
    nginx access log logrotate配置
    判断变量是否存在(python)
    一步一步学android之布局管理器——LinearLayout
    向前辈致敬 strspn
    poj 1087 (最大流)
    cocos2d-x Touch 事件应用的一个例子
    [置顶] C#扩展方法 扩你所需
  • 原文地址:https://www.cnblogs.com/liuwenbohhh/p/5607113.html
Copyright © 2011-2022 走看看