zoukankan      html  css  js  c++  java
  • php 扫描url死链接

    * 从Packagist上搜索需要的包

      https://packagist.org/

    * 通过composer下载依赖包

    composer require guzzlehttp/guzzle
    composer require league/csv
    

      

    * 使用composer自动加载器, 编写scan.php

    <?php

    // 1. Load Composer's autoloader so the Guzzle and league/csv classes resolve.
    require 'vendor/autoload.php';

    use GuzzleHttp\Client;
    use GuzzleHttp\RequestOptions;
    use League\Csv\Reader;

    // 2. Create the Guzzle HTTP client.
    $client = new Client();

    // Per-request options: 3 second timeout, no content decoding, and a
    // browser-like User-Agent header.
    $options = [
        RequestOptions::TIMEOUT => 3,
        RequestOptions::DECODE_CONTENT => false,
        RequestOptions::HEADERS => [
            'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        ]
    ];

    // 3. Open the CSV file and iterate over its rows.
    // The input file could instead be taken from the command line via $argv[1].
    // Alternative input: '../data/t_video.csv', whose URLs are in columns 4 and 5.
    $file = new SplFileObject('../data/urls.csv');
    $csv = Reader::createFromFileObject($file);
    foreach ($csv as $csvRow) {
        // The URL is expected in the first column; skip rows that have none.
        $url = isset($csvRow[0]) ? trim($csvRow[0]) : '';
        if ($url === '') {
            continue;
        }

        echo 'scanning ',$url,'... ';
        try {
            // 4. Send an HTTP GET request.
            $httpResponse = $client->request('GET', $url, $options);

            // 5. Check the HTTP response status code; only 200 counts as alive.
            $code = $httpResponse->getStatusCode();
            if ($code !== 200) {
                // Carry the status in the message so the error line is informative.
                throw new Exception('unexpected HTTP status ' . $code);
            }

            // "\033" is the ESC character; it needs a double-quoted string.
            echo "\033[32m[OK]\033[0m",PHP_EOL;
        } catch (Exception $e) {
            // 6. Report the dead link on standard output (red [ERROR] tag + reason).
            echo "\033[31m[ERROR]\033[0m ".$e->getMessage().PHP_EOL;
        }
    }
    

      

    * input csv:

    ../data/urls.csv

    https://www.baidu.com
    https://mail.qq.com/cgi-bin/frame_html?sid=CYcBjsDbOqznWhVO&r=375cccc57697ed7d00ae5d751663a71c
    https://pan.baidu.com/disk/home?errno=0&errmsg=Auth%20Login%20Sucess&&bduss=&ssnerror=0&traceid=#/all?vmode=list&path=%2F05.php%2F25K%20PHP%E9%9D%A2%E8%AF%95%E8%A7%86%E9%A2%91%E6%95%99%E7%A8%8B
    http://dict.youdao.com/w/eng/components/#keyfrom=dict2.index
    http://php.net/manual/en/splfileobject.fwrite.php
    https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=2&tn=baiduhome_pg&wd=ansi%20%E7%BB%88%E7%AB%AF%E9%A2%9C%E8%89%B2%20%5B%5C33&rsv_spt=1&oq=ansi%2520%25E7%25BB%2588%25E7%25AB%25AF%25E9%25A2%259C%25E8%2589%25B2&rsv_pq=8b17bd6e0027882b&rsv_t=fcf6oR2SbHi9Cpu2eThdv3AQvGwSDf7ecjv7QBvjXoZ3SMpBem3pdNzlNRNmuOW%2BEowe&rqlang=cn&rsv_enter=1&inputT=2640&rsv_sug3=68&rsv_sug2=0&rsv_sug4=3243
    https://blog.csdn.net/SLASH_24/article/details/54846392
    https://www.jb51.net/article/42358.htm
    https://www.cnblogs.com/xudong-bupt/p/3721210.html
    http://www.cnblogs.com/mingzhanghui/p/9314906.html
    https://packagist.org/packages/maatwebsite/excel
    https://www.phptherightway.com/#use_the_current_stable_version
    https://doc.phpspider.org/methods.html
    http://nosuchurl
    http://deadurl
    

      

    output:

    * 在Linux终端输出带颜色的文字的方法

      注意 echo "" 要用双引号, 单引号会原样输出 \033[32mxxx\033[0m

    一、shell下的实现方法

    只要设置输出属性,就可输出带颜色的文字 ,shell中的部分属性:

          \033[0m 关闭所有属性
          \033[1m 设置高亮度
          \033[4m 下划线
          \033[5m 闪烁
          \033[7m 反显
          \033[8m 消隐
          \033[30m 至 \033[37m 设置前景色
          \033[40m 至 \033[47m 设置背景色
          \033[nA 光标上移n行
          \033[nB 光标下移n行
          \033[nC 光标右移n列
          \033[nD 光标左移n列
          \033[y;xH 设置光标位置
          \033[2J 清屏
          \033[K 清除从光标到行尾的内容
          \033[s 保存光标位置
          \033[u 恢复光标位置
          \033[?25l 隐藏光标
          \033[?25h 显示光标
     
    --------------------------------------------------------------------------
          各数字所代表的颜色如下:
          字背景颜色范围:40----49
          40:黑
          41:深红
          42:绿
          43:黄色
          44:蓝色
          45:紫色
          46:深绿
          47:白色

          字颜色:30----39
          30:黑
          31:红
          32:绿
          33:黄
          34:蓝色
          35:紫色
          36:深绿 
          37:白色
     
          使用,如:echo -e "\033[34mHello, world!" (-e 的作用是让 echo 解释反斜杠转义序列),
          恢复属性为默认值:echo -e  "\033[0m",
          同类的多种设置项可以组合在一起,中间用分号(;)隔开。如下:
          echo -e "\033[20;1H\033[1;4;34mHello,world\033[0m"

    ===================================================================================

    php 命令行脚本

    http://php.net/manual/en/wrappers.php.php

    http://php.net/manual/en/reserved.variables.argv.php

    http://php.net/manual/en/reserved.variables.argc.php

    ====================================================================================

    scanner.php 

    不在终端打印 返回数组

     <?php
     /**
      * URL scanner that returns the invalid URLs as an array instead of
      * printing to the terminal.
      *
      * Created by PhpStorm.
      * User: Mch
      * Date: 7/17/18
      * Time: 21:34
      */
     namespace Tsinghuadtv\ModernPHP\Url;

     // composer require guzzlehttp/guzzle
     require 'vendor/autoload.php';

     use GuzzleHttp\Client;
     use GuzzleHttp\RequestOptions;

     /**
      * Checks a list of URLs and reports the ones that look dead.
      *
      * NOTE: the class name is spelled "Sanner" (sic); it is kept unchanged
      * because calling code instantiates the class by this name.
      */
     class Sanner {
         /** @var array URLs to check, as passed to the constructor. */
         protected $urls;

         /** @var Client Guzzle client used for every request. */
         protected $httpClient;

         /** @var array Request options applied to every request. */
         protected $options = [
             RequestOptions::VERSION => 1.1,
             RequestOptions::TIMEOUT => 3,
             RequestOptions::DECODE_CONTENT => false,
             RequestOptions::HEADERS => [
                 'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
             ]
         ];

         /**
          * @param array $urls URLs to check.
          */
         public function __construct(array $urls) {
             $this->urls = $urls;
             $this->httpClient = new Client();
         }

         /**
          * Requests every URL and collects those whose status is 400 or above.
          *
          * Any exception raised while requesting a URL is recorded as status 500.
          * NOTE(review): Guzzle's `http_errors` option is documented as enabled by
          * default, so 4xx/5xx responses presumably surface here as exceptions and
          * are therefore reported as 500 rather than their real code - confirm.
          *
          * @return array List of ['url' => string, 'status' => int] entries.
          */
         public function getInvalidUrls() {
             $invalidUrls = [];
             foreach ($this->urls as $url) {
                 try {
                     $statusCode = $this->getStatusCodeForUrl($url);
                 } catch (\Exception $e) {
                     // Must be the global \Exception: an unqualified name inside this
                     // namespace would refer to a class that does not exist and the
                     // catch would never match.
                     $statusCode = 500;
                 }
                 if ($statusCode >= 400) {
                     $invalidUrls[] = [
                         'url' => $url,
                         'status' => $statusCode
                     ];
                 }
             }
             return $invalidUrls;
         }

         /**
          * Sends a GET request to the URL and returns the response status code.
          *
          * @param string $url URL to request.
          * @return int HTTP status code of the response.
          */
         protected function getStatusCodeForUrl($url) {
             $httpResponse = $this->httpClient->request('GET', $url, $this->options);
             return $httpResponse->getStatusCode();
         }

     }
    scanner.php

    调用scanner.php测试

    假设这个包提交到 modernphp/scanner  https://packagist.org

    composer require modernphp/scanner

     <?php
     /**
      * Test driver: runs the scanner over a fixed list of URLs and prints the
      * invalid ones.
      *
      * Created by PhpStorm.
      * User: Mch
      * Date: 7/17/18
      * Time: 21:41
      */
     // Load the scanner class directly instead of via vendor/autoload.php.
     // `require` makes a missing file a fatal error at this line.
     require __DIR__ . '/scanner.php';

     $urls = [
         'http://www.apple.com',
         'http://nosuchurl',
         'https://www.cnblogs.com/mingzhanghui/p/9317179.html',
         'https://www.baidu.com',
         'http://jp2.php.net',
         'http://sdfssdwerw.org'
     ];

     // Fully-qualified class name; "Sanner" is the spelling the class declares.
     $scanner = new \Tsinghuadtv\ModernPHP\Url\Sanner($urls);
     print_r($scanner->getInvalidUrls());
    index.php

    output:

    Array (

        [0] => Array ([url] => http://nosuchurl   [status] => 500 )

        [1] => Array([url] => http://sdfssdwerw.org  [status] => 500 )

    )

  • 相关阅读:
    微信开发 之 开启开发模式
    微信公众号开发 之 编辑模式使用
    分析各种Android设备屏幕分辨率与适配
    【面向对象设计模式】 适配器模式 (二)
    重构 之 总结代码的坏味道 Bad Smell (一) 重复代码 过长函数 过大的类 过长参数列 发散式变化 霰弹式修改
    【Android 应用开发】Android资源文件
    java 创建并写入文件
    隐藏 HttpClient 在console的日志
    HOW TO CHANGE THE DEFAULT KEY-VALUE SEPARATOR OF A MAPREDUCE JOB
    java 时间戳转换
  • 原文地址:https://www.cnblogs.com/mingzhanghui/p/9315254.html
Copyright © 2011-2022 走看看