zoukankan      html  css  js  c++  java
  • PHP爬虫百度图片


    用法示例(所有关键词用英文逗号分隔,作为一个命令行参数传入):php pacong_1.php "酒,电报,电表,电波,电厂,电场,电车,电池,电传,电磁"
    <?php
    // Report everything except notices, and lift the execution time limit
    // because the crawl issues many sequential HTTP requests.
    error_reporting(E_ALL ^ E_NOTICE);
    set_time_limit(0);

    // The keywords arrive as ONE comma-separated command line argument.
    if (!isset($argv[1]) || trim($argv[1]) === '') {
        echo "Usage: php pacong_1.php word1,word2,...\n";
        exit(1);
    }
    $word2 = explode(",", $argv[1]);
    // Upper bound (exclusive) of the page loop below.
    $pageNum = 35;
    // Per-run identifier; combined with the keyword index it names the download folder.
    $uuid = md5(uniqid(mt_rand(), true));

    // Fetch the result pages of every keyword and download each thumbnail.
    foreach ($word2 as $k => $v) {
        // A trailing or doubled comma yields an empty keyword; skip it but keep
        // the original index $k so folder names of the other keywords do not shift.
        if (trim($v) === '') {
            continue;
        }
        $encodeWord = urlencode(yang_gbk2utf8($v));
        for ($i = 1; $i < $pageNum; $i++) {
            // Result offset: 30 per page. $i starts at 1, so offset 0 is never requested.
            $page = $i * 30;
            // Baidu image search JSON API.
            $url = "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&fp=result&ie=utf-8&oe=utf-8&word=".$encodeWord."&pn=".$page."&rn=30&gsm=700001e&1457697756442=";
            echo $url." ";
            $imgJsonData = file_get_contents($url);
            if ($imgJsonData === false) {
                // Request failed; move on to the next page instead of decoding `false`.
                continue;
            }
            // Original author's note: random sampling, or additional image APIs, could
            // reduce the chance of fetching duplicate images. Neither is implemented here.
            $imgJsonData = json_decode($imgJsonData, true);
            if (!is_array($imgJsonData) || !isset($imgJsonData['data']) || !is_array($imgJsonData['data'])) {
                // Body was not decodable JSON or has no result list.
                continue;
            }
            foreach ($imgJsonData['data'] as $key => $dataArr) {
                // Entries without a usable thumbnail URL are ignored.
                if (is_array($dataArr) && !empty($dataArr['thumbURL'])) {
                    downloadImg($dataArr['thumbURL'], $uuid.$k);
                }
            }
        }
    }


    /**
     * Download one image with cURL and store it under ./userSysAvatorUrl/<$k>/.
     *
     * The file name is three random lowercase letters, an underscore, the
     * current time in milliseconds and a ".jpg" suffix. Nothing is written when
     * the target directory cannot be created or the transfer fails.
     *
     * @param string $url Image URL to fetch.
     * @param string $k   Sub-directory name below ./userSysAvatorUrl/.
     * @return void
     */
    function downloadImg($url,$k) {
        $path = './userSysAvatorUrl/'.$k."/";
        // The second is_dir() covers the case where the directory appeared
        // between the first check and the mkdir() call.
        if (!is_dir($path) && !mkdir($path, 0777, true) && !is_dir($path)) {
            return;
        }
        // Browser-like request headers. No explicit "Host" header is sent: cURL
        // derives it from $url, so it always matches the server being contacted.
        $headers = array(
            "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language: zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Connection: keep-alive",
            "Referer: http://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gbk&word=%CD%B7%CF%F1&fr=ala&oriquery=%E5%A4%B4%E5%83%8F&ala=1&alatpl=portait&pos=0",
            "User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0",
            "X-Requested-With: XMLHttpRequest"
        );
        $ch = curl_init();
        if ($ch === false) {
            return;
        }
        // Set the transfer options, including the URL.
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        // Treat HTTP status >= 400 as a failure so an error page is not saved as an image.
        curl_setopt($ch, CURLOPT_FAILONERROR, true);
        // Execute the request and fetch the response body.
        $imgBin = curl_exec($ch);
        // Release the cURL handle.
        curl_close($ch);
        if ($imgBin === false || $imgBin === '') {
            // Transfer failed or returned nothing; do not leave an empty file behind.
            return;
        }
        // Current time in milliseconds, used as the numeric part of the file name.
        list($msec, $sec) = explode(' ', microtime());
        $millis = sprintf('%.0f', (floatval($msec) + floatval($sec)) * 1000);
        // Three distinct random letters reduce name clashes within the same millisecond.
        $shullf = array('a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z');
        $rand_key = array_rand($shullf, 3);
        $file_name = $shullf[$rand_key[0]].$shullf[$rand_key[1]].$shullf[$rand_key[2]].'_'.$millis.'.jpg';
        file_put_contents($path.$file_name, $imgBin);
    }


    /**
     * Return $str as UTF-8, converting it from GBK/GB2312 when necessary.
     *
     * Input that is already valid UTF-8 is returned untouched. If the encoding
     * cannot be determined or the conversion fails, the original string is
     * returned instead of `false`.
     *
     * @param string $str Text in UTF-8, GBK or GB2312.
     * @return string The text in UTF-8, or the unchanged input when no conversion was possible.
     */
    function yang_gbk2utf8($str){
        // Deterministic fast path: valid UTF-8 never goes through encoding detection.
        if (mb_check_encoding($str, 'UTF-8')) {
            return $str;
        }
        $charset = mb_detect_encoding($str, array('UTF-8','GBK','GB2312'));
        if ($charset === false) {
            // Nothing matched; converting from an unknown charset would only fail.
            return $str;
        }
        $charset = strtolower($charset);
        if ('utf-8' == $charset) {
            return $str;
        }
        // mbstring reports its own canonical names; translate them to the names
        // handed to iconv.
        if ('cp936' == $charset) {
            $charset = 'GBK';
        } elseif ('euc-cn' == $charset) {
            $charset = 'GB2312';
        }
        $converted = iconv($charset, "UTF-8//IGNORE", $str);
        // iconv() returns false on failure; fall back to the input in that case.
        return $converted === false ? $str : $converted;
    }

    ?>

  • 相关阅读:
    3.1《想成为黑客,不知道这些命令行可不行》(Learn Enough Command Line to Be Dangerous)——下载文件
    rem实现手机页面自动缩放
    Git 常用命令
    使用 canvas+JS绘制钟表
    JS 操作数组的方法
    Node.js Request方法
    兼容浏览器的点击事件
    ES6知识点
    上传项目到github上
    JavaScript 编码风格
  • 原文地址:https://www.cnblogs.com/xiezhengcai/p/7158462.html
Copyright © 2011-2022 走看看