zoukankan      html  css  js  c++  java
  • PHP抓取豆瓣读书爬虫代码

    <?php
    // Demo URL: http://asizu.sinaapp.com/reptile_douban.php
    //
    // Douban book-list scraper.
    // Fetches the page given in ?url=, extracts title, rating and subject id from
    // every <dl> entry, stores titles that are not yet in the `douban` table, and
    // renders a form pre-filled with the next page URL (start + 15). The script at
    // the bottom of the page auto-submits that form, so pages are crawled one
    // request at a time.
    // The chain stops (the input is left empty) when the fetch fails, when a page
    // yields no entries, or when the URL carries no "book?start=" marker.
    header("Content-Type:text/html;charset=utf-8");

    // Settings mirrored from the SAE platform constants. Note that the connection
    // itself is opened from the SAE_* constants and SAE_MYSQL_DB (see
    // getConnection()); MYSQL_NAME is defined here but is not what selects the
    // database.
    define("MYSQL_HOST", SAE_MYSQL_HOST_M);
    define("MYSQL_NAME", "douban");
    define("MYSQL_USER", SAE_MYSQL_USER);
    define("MYSQL_PASSWORD", SAE_MYSQL_PASS);
    define("MYSQL_PORT", SAE_MYSQL_PORT);

    // Always defined so the template below never reads an undefined variable.
    $new_url = '';
    $bookArray = array();

    // ?url[]=x would make $_GET['url'] an array, hence the is_string() check.
    $action = (isset($_GET['url']) && is_string($_GET['url'])) ? trim($_GET['url']) : '';

    // NOTE(review): the target URL is fully user-controlled, so this endpoint can
    // be used to make the server request arbitrary http(s) hosts (SSRF). Only the
    // scheme is restricted here; consider an explicit host allow-list.
    if ($action !== '' && preg_match('#^https?://#i', $action)) {
        $data = getLink($action);

        // curl_exec() returns false on failure; do not try to parse that.
        if (is_string($data) && $data !== '') {
            $data = str_substr('<div class="article">', '<div class="aside">', $data);
            $dataArray = explode('<dl>', $data);
            // The first chunk is whatever precedes the first <dl>, not a book.
            array_splice($dataArray, 0, 1);

            foreach ($dataArray as $key => $item) {
                $book = array(
                    'title'   => str_substr('class="title" target="_blank">', '</a>', $item),
                    'rating'  => str_substr('<span class="rating_nums">', '</span>', $item),
                    'book_id' => str_substr('/subject/', '/?from', $item),
                );

                // A chunk without a title is not a book entry; do not store blanks.
                if ($book['title'] === '') {
                    continue;
                }

                $bookArray[$key] = $book;
                if (!selectBookSaveed($book['title'])) {
                    insertMysql($book);
                }
            }

            // Only continue to the next page when this one produced entries and
            // the URL actually carries a page offset to advance.
            $page = explode('book?start=', $action);
            if (!empty($bookArray) && isset($page[1])) {
                $new_url = $page[0] . "book?start=" . ((int) $page[1] + 15);
            }
        }
    }

    /**
     * Fetch a page. Thin wrapper kept for callers that use this name.
     *
     * @param string $url Absolute http(s) URL.
     * @return string|false Response body, or false when the request failed.
     */
    function getLink($url)
    {
        return getData($url);
    }

    /**
     * Download a URL with cURL using a desktop browser user agent.
     *
     * @param string $url Absolute http(s) URL.
     * @return string|false Response body, or false when the request failed.
     */
    function getData($url)
    {
        $ch = curl_init();

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.47 Safari/536.11');
        // The URL comes from the query string: refuse file://, gopher:// and
        // friends, and never hang the request on an unresponsive host.
        curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);

        $output = curl_exec($ch);
        curl_close($ch);

        return $output;
    }

    /**
     * Insert one row into the `douban` table.
     *
     * Values are bound through a prepared statement because they are scraped
     * from a remote page. Keys become column names and must be plain
     * identifiers, since identifiers cannot be bound.
     *
     * @param array $dataArray Map of column name => value.
     * @return bool True when the row was inserted, false otherwise.
     */
    function insertMysql($dataArray)
    {
        if (!is_array($dataArray) || empty($dataArray)) {
            return false;
        }

        $columns = array();
        $values = array();
        foreach ($dataArray as $key => $item) {
            if (!preg_match('/^[A-Za-z_][A-Za-z0-9_]*$/', (string) $key)) {
                return false;
            }
            $columns[] = '`' . $key . '`';
            $values[] = (string) $item;
        }

        $mysqli = getConnection();
        if ($mysqli === false) {
            return false;
        }

        $placeholders = implode(',', array_fill(0, count($values), '?'));
        $sql = "INSERT INTO douban (" . implode(',', $columns) . ") VALUES(" . $placeholders . ")";

        $stmt = $mysqli->prepare($sql);
        if ($stmt === false) {
            return false;
        }

        // bind_param() takes its arguments by reference, so build a list of
        // references for the variable-length call.
        $params = array(str_repeat('s', count($values)));
        foreach (array_keys($values) as $index) {
            $params[] = &$values[$index];
        }
        call_user_func_array(array($stmt, 'bind_param'), $params);

        $result = $stmt->execute();
        $stmt->close();

        return $result;
    }

    /**
     * Check whether a book with this exact title is already stored.
     *
     * @param string $title Book title as scraped.
     * @return bool True when a matching row exists; false when none exists or
     *              the lookup could not be performed.
     */
    function selectBookSaveed($title)
    {
        $mysqli = getConnection();
        if ($mysqli === false) {
            return false;
        }

        $stmt = $mysqli->prepare("SELECT 1 FROM douban WHERE title = ? LIMIT 1");
        if ($stmt === false) {
            return false;
        }

        $title = (string) $title;
        $stmt->bind_param('s', $title);

        $exists = false;
        if ($stmt->execute()) {
            $stmt->store_result();
            $exists = $stmt->num_rows > 0;
        }
        $stmt->close();

        return $exists;
    }

    /**
     * Return the MySQL connection shared by this request, opening it on first use.
     *
     * A failed attempt is remembered, so a broken database is reported once
     * instead of once per query.
     * NOTE(review): no connection charset is set here; confirm the stored data
     * round-trips UTF-8 titles correctly before adding set_charset().
     *
     * @return mysqli|false The open connection, or false when connecting failed.
     */
    function getConnection()
    {
        static $mysqli = null;

        if ($mysqli === null) {
            // Report failures through return values on every PHP version
            // (PHP 8.1+ throws by default).
            mysqli_report(MYSQLI_REPORT_OFF);

            $candidate = new mysqli(
                SAE_MYSQL_HOST_M,
                SAE_MYSQL_USER,
                SAE_MYSQL_PASS,
                SAE_MYSQL_DB,
                SAE_MYSQL_PORT
            );

            if (mysqli_connect_errno()) {
                echo 'error';
                $mysqli = false;
            } else {
                $mysqli = $candidate;
            }
        }

        return $mysqli;
    }

    /**
     * Run a ready-made SQL string.
     *
     * This cannot bind parameters, so never pass SQL built from untrusted
     * values; insertMysql() and selectBookSaveed() use prepared statements
     * instead.
     *
     * @param string $sql    SQL statement to execute.
     * @param string $method "select" to get the first row, "query" to get the
     *                       raw result of mysqli::query().
     * @return mixed For "select": the first row as an associative array, or
     *               null when there is no row or the query failed. For "query":
     *               whatever mysqli::query() returns. False when the database
     *               connection could not be opened. Null for any other method.
     */
    function mysqlOperation($sql, $method = "query")
    {
        $mysqli = getConnection();
        if ($mysqli === false) {
            return false;
        }

        if ($method === "select") {
            $result = $mysqli->query($sql);
            if (!($result instanceof mysqli_result)) {
                return null;
            }
            $row = $result->fetch_assoc();
            $result->free();
            return $row;
        }

        if ($method === "query") {
            return $mysqli->query($sql);
        }

        return null;
    }

    /**
     * Return the text found between the first $start marker and the next $end
     * marker.
     *
     * @param string $start Opening marker.
     * @param string $end   Closing marker.
     * @param string $str   Text to search.
     * @return string The enclosed text; everything after $start when $end is
     *                missing; an empty string when $start is missing or a
     *                marker is empty.
     */
    function str_substr($start, $end, $str)
    {
        // explode() rejects an empty delimiter.
        if ((string) $start === '' || (string) $end === '') {
            return '';
        }

        $temp = explode($start, (string) $str, 2);
        if (!isset($temp[1])) {
            return '';
        }

        $content = explode($end, $temp[1], 2);
        return $content[0];
    }
    ?>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <script src="https://ajax.aspnetcdn.com/ajax/jQuery/jquery-1.4.4.min.js"></script>
    <script>
    $(document).ready(function(){
        // A non-empty input means there is a next page to crawl: submit it.
        if($("#input").val() != ""){
            $("#froms").submit();
        }
    });
    </script>
    <form action="?" method="get" id="froms">
        <input id="input" value="<?php echo htmlspecialchars($new_url, ENT_QUOTES, 'UTF-8'); ?>" type="text" name="url">
        <input type="submit">
    </form>
  • 相关阅读:
    修复PLSQL Developer 与 Office 2010的集成导出Excel 功能
    Using svn in CLI with Batch
    mysql 备份数据库 mysqldump
    Red Hat 5.8 CentOS 6.5 共用 输入法
    HP 4411s Install (Red Hat Enterprise Linux 5.8) Wireless Driver
    变更RHEL(Red Hat Enterprise Linux 5.8)更新源使之自动更新
    RedHat 5.6 问题简记
    Weblogic 9.2和10.3 改密码 一站完成
    ExtJS Tab里放Grid高度自适应问题,官方Perfect方案。
    文件和目录之utime函数
  • 原文地址:https://www.cnblogs.com/jh1994/p/5177403.html
Copyright © 2011-2022 走看看