zoukankan      html  css  js  c++  java
  • 分享一个强大的采集类,还可以模拟php多进程

    做采集的时候,可以使用file_get_contents()去获取网页源代码,但是使用file_get_contents采集,速度慢,而且超时时间,不好控制。如果采集的页面不存在,需要等待的时间很长。一般来说,curl的速度最快,其次是socket,最后是file_get_contents。
    现在跟大家分享一个很强大的采集类,会根据你的服务器当前的配置,自动选择最快的方式。已经封装了curl和socket,file_get_contents

    用法很简单:
    1,采用get方法请求
    Http::doGet(网址);//超时时间可忽略,默认是5秒
    Http::doGet(网址,超时时间);
    如echo Http::doGet('http://www.baidu.com');

    2,采用post方法请求
    Http::doPost(网址,数据,超时时间);


    $url='http://www.canphp.com/test.php';
    $data['name']='单骑';
    $data['email']='admin@canphp.com';
    Http::doPost($url,$data,10);

    test.php页面接收数据
    $_POST['name'];
    $_POST['email'];

    这个http类不仅可以用来采集,还有一个很强大的作用,模拟php异步多进程。
    比如有index.php和a.php,  b.php,  c.php
    在index.php中
    Http::doGet('http://www.canphp.com/a.php',1);
    Http::doGet('http://www.canphp.com/b.php',1);
    Http::doGet('http://www.canphp.com/c.php',1);

    a.php,  b.php,  c.php程序分别在头部加上ignore_user_abort(true);
    那么就可以实现多进程了。

    原理:
    通过curl或socket发送请求给a.php,  b.php,  c.php,由于超时时间比较短,只是触发了a.php,  b.php,  c.php三个页面,不需要等待数据返回,连接已中断,但是a.php,  b.php,  c.php程序中加上了ignore_user_abort(true);忽略客户端连接,还会继续执行。

    具体案例可以观看很邪恶很强大的av程序(http://www.canphp.com/bbs/thread-295-1-1.html)
    1. <?php
    2. //数据采集,doGET,doPOST
    3. class Http
    4. {//类定义开始
    5. //通过get方式获取数据
    6. static public function doGet($url,$timeout=5)
    7. {
    8. $code=self::getSupport();
    9. switch($code)
    10. {
    11. case 1:return self::curl($url,'',$timeout);break;
    12. case 2:return self::socketGet($url,$timeout);break;
    13. case 3:return @file_get_contents($url);break;
    14. default:return false;
    15. }
    16. }
    17. //通过POST方式发送数据
    18. static public function doPost($url,$data,$timeout=5)
    19. {
    20. $code=self::getSupport();
    21. switch($code)
    22. {
    23. case 1:return self::curl($url,$data,$timeout);break;
    24. case 2:return self::socketPost($url,$data,$timeout);break;
    25. default:return false;
    26. }
    27. }
    28. //获取支持读取远程文件的方式
    29. static public function getSupport()
    30. {
    31. if(function_exists('curl_init'))//curl方式
    32. {
    33. return 1;
    34. }
    35. else if(function_exists('fsockopen'))//socket
    36. {
    37. return 2;
    38. }
    39. else if(function_exists('file_get_contents'))//php系统函数file_get_contents
    40. {
    41. return 3;
    42. }
    43. else if(ini_get('allow_url_fopen')&&function_exists('fopen'))//php系统函数fopen
    44. {
    45. return 4;
    46. }
    47. else
    48. {
    49. return 0;
    50. }
    51. }
    52. static public function GetHttpContent($fsock=null) {
    53. $out = null;
    54. while($buff = @fgets($fsock, 2048)){
    55. $out .= $buff;
    56. }
    57. fclose($fsock);
    58. $pos = strpos($out, " ");
    59. $head = substr($out, 0, $pos); //http head
    60. $status = substr($head, 0, strpos($head, " ")); //http status line
    61. $body = substr($out, $pos + 4, strlen($out) - ($pos + 4));//page body
    62. if(preg_match("/^HTTP/d.ds([d]+)s.*$/", $status, $matches)){
    63. if(intval($matches[1]) / 100 == 2){
    64. return $body;
    65. }else{
    66. return false;
    67. }
    68. }else{
    69. return false;
    70. }
    71. }
    72. static public function socketGet($url,$timeout=5){
    73. $url2 = parse_url($url);
    74. $url2["path"] = isset($url2["path"])? $url2["path"]: "/" ;
    75. $url2["port"] = isset($url2["port"])? $url2["port"] : 80;
    76. $url2["query"] = isset($url2["query"])? "?".$url2["query"] : "";
    77. $host_ip = @gethostbyname($url2["host"]);
    78. $fsock_timeout = $timeout; //超时时间
    79. if(($fsock = fsockopen($host_ip, $url2['port'], $errno, $errstr, $fsock_timeout)) < 0){
    80. return false;
    81. }
    82. $request = $url2["path"] .$url2["query"];
    83. $in = "GET " . $request . " HTTP/1.1 ";
    84. $in .= "Accept: */* ";
    85. // $in .= "User-Agent: Payb-Agent ";
    86. $in .= "Host: " . $url2["host"] . " ";
    87. $in .= "Connection: Close ";
    88. if(!@fwrite($fsock, $in, strlen($in))){
    89. @fclose($fsock);
    90. return false;
    91. }
    92. return self::GetHttpContent($fsock);
    93. }
    94. static public function socketPost($url,$post_data=array(),$timeout=5){
    95. $url2 = parse_url($url);
    96. $url2["path"] = ($url2["path"] == "" ? "/" : $url2["path"]);
    97. $url2["port"] = ($url2["port"] == "" ? 80 : $url2["port"]);
    98. $host_ip = @gethostbyname($url2["host"]);
    99. $fsock_timeout = $timeout; //超时时间
    100. if(($fsock = fsockopen($host_ip, $url2['port'], $errno, $errstr, $fsock_timeout)) < 0){
    101. return false;
    102. }
    103. $request = $url2["path"].($url2["query"] ? "?" . $url2["query"] : "");
    104. $post_data2 = http_build_query($post_data);
    105. $in = "POST " . $request . " HTTP/1.1 ";
    106. $in .= "Accept: */* ";
    107. $in .= "Host: " . $url2["host"] . " ";
    108. // $in .= "User-Agent: Lowell-Agent ";
    109. $in .= "Content-type: application/x-www-form-urlencoded ";
    110. $in .= "Content-Length: " . strlen($post_data2) . " ";
    111. $in .= "Connection: Close ";
    112. $in .= $post_data2 . " ";
    113. unset($post_data2);
    114. if(!@fwrite($fsock, $in, strlen($in))){
    115. @fclose($fsock);
    116. return false;
    117. }
    118. return self::GetHttpContent($fsock);
    119. }
    120. static public function curl($url, $data=array(), $timeout=5)
    121. {
    122. $ch = curl_init();
    123. if (is_array($data) && $data)
    124. {
    125. $formdata = http_build_query($data);
    126. curl_setopt($ch, CURLOPT_POST, true);
    127. curl_setopt($ch, CURLOPT_POSTFIELDS, $formdata);
    128. }
    129. curl_setopt($ch, CURLOPT_URL, $url);
    130. curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    131. curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    132. curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    133. $result = curl_exec($ch);
    134. curl_close($ch);
    135. return $result;
    136. }
    137. }//类定义结束
    138. ?>
    复制代码

    <?php
    /**
     * Small HTTP client for fetching remote pages with GET or POST.
     *
     * The transport is chosen by getSupport() in this order: cURL, a raw
     * socket (fsockopen), file_get_contents(), fopen(). Every public method
     * returns the response body as a string, or false on failure.
     */
    class Http
    {
        /**
         * Fetch a URL with GET.
         *
         * @param string    $url     Absolute http:// or https:// URL.
         * @param int|float $timeout Timeout in seconds.
         * @return string|false Response body, or false on failure.
         */
        public static function doGet($url, $timeout = 5)
        {
            switch (self::getSupport()) {
                case 1:
                    return self::curl($url, '', $timeout);
                case 2:
                    return self::socketGet($url, $timeout);
                case 3:
                    return self::streamRequest($url, null, $timeout, false);
                case 4:
                    return self::streamRequest($url, null, $timeout, true);
                default:
                    return false;
            }
        }

        /**
         * Send data to a URL with POST.
         *
         * @param string       $url     Absolute http:// or https:// URL.
         * @param array|string $data    Form fields (array) or an already encoded body (string).
         * @param int|float    $timeout Timeout in seconds.
         * @return string|false Response body, or false on failure.
         */
        public static function doPost($url, $data, $timeout = 5)
        {
            switch (self::getSupport()) {
                case 1:
                    return self::curl($url, $data, $timeout);
                case 2:
                    return self::socketPost($url, $data, $timeout);
                case 3:
                    return self::streamRequest($url, self::encodeBody($data), $timeout, false);
                case 4:
                    return self::streamRequest($url, self::encodeBody($data), $timeout, true);
                default:
                    return false;
            }
        }

        /**
         * Report which transport is available.
         *
         * @return int 1 = cURL, 2 = fsockopen, 3 = file_get_contents,
         *             4 = fopen, 0 = nothing usable.
         */
        public static function getSupport()
        {
            if (function_exists('curl_init')) {
                return 1;
            }
            if (function_exists('fsockopen')) {
                return 2;
            }
            // Both stream functions can only open URLs when allow_url_fopen is on.
            if (ini_get('allow_url_fopen')) {
                if (function_exists('file_get_contents')) {
                    return 3;
                }
                if (function_exists('fopen')) {
                    return 4;
                }
            }
            return 0;
        }

        /**
         * Read a complete HTTP response from a stream, close the stream, and
         * return the body when the status code is 2xx.
         *
         * @param resource|null $fsock Open stream positioned at the start of the response.
         * @return string|false Body for a 2xx response; false for any other
         *                      status, a malformed response, or a read timeout.
         */
        public static function GetHttpContent($fsock = null)
        {
            if (!is_resource($fsock)) {
                return false;
            }

            $out = stream_get_contents($fsock);
            $meta = stream_get_meta_data($fsock);
            fclose($fsock);

            // A read that timed out leaves a truncated response; do not report it as success.
            if ($out === false || !empty($meta['timed_out'])) {
                return false;
            }

            // The header block ends at the first empty line (CRLF CRLF).
            $pos = strpos($out, "\r\n\r\n");
            if ($pos === false) {
                return false;
            }
            $head = substr($out, 0, $pos);
            $body = (string) substr($out, $pos + 4);

            $lineEnd = strpos($head, "\r\n");
            $status = ($lineEnd === false) ? $head : substr($head, 0, $lineEnd);
            if (!preg_match('/^HTTP\/\d(?:\.\d)?\s+(\d{3})\b/', $status, $matches)) {
                return false;
            }

            // Accept the whole 2xx range, not just 200.
            $code = (int) $matches[1];
            if ($code < 200 || $code >= 300) {
                return false;
            }

            // HTTP/1.1 servers may frame the body in chunks; strip that framing.
            if (preg_match('/^Transfer-Encoding:[ \t]*[^\r\n]*\bchunked\b/mi', $head)) {
                $body = self::decodeChunked($body);
            }

            return $body;
        }

        /**
         * GET a URL over a raw socket.
         *
         * @param string    $url     Absolute http:// or https:// URL.
         * @param int|float $timeout Connect and read timeout in seconds.
         * @return string|false Response body, or false on failure.
         */
        public static function socketGet($url, $timeout = 5)
        {
            return self::socketRequest('GET', $url, null, $timeout);
        }

        /**
         * POST form data to a URL over a raw socket.
         *
         * @param string       $url       Absolute http:// or https:// URL.
         * @param array|string $post_data Form fields (array) or an already encoded body (string).
         * @param int|float    $timeout   Connect and read timeout in seconds.
         * @return string|false Response body, or false on failure.
         */
        public static function socketPost($url, $post_data = array(), $timeout = 5)
        {
            return self::socketRequest('POST', $url, self::encodeBody($post_data), $timeout);
        }

        /**
         * Perform a request with cURL. A non-empty array or string in $data
         * turns the request into a POST; otherwise it is a GET.
         *
         * @param string       $url     URL to request.
         * @param array|string $data    Form fields (array) or an already encoded body (string).
         * @param int|float    $timeout Connect and total timeout in seconds.
         * @return string|false Response body, or false on failure.
         */
        public static function curl($url, $data = array(), $timeout = 5)
        {
            $ch = curl_init();
            if ($ch === false) {
                return false;
            }

            $isPost = (is_array($data) && $data) || (is_string($data) && $data !== '');
            if ($isPost) {
                curl_setopt($ch, CURLOPT_POST, true);
                curl_setopt($ch, CURLOPT_POSTFIELDS, self::encodeBody($data));
            }
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
            curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);

            $result = curl_exec($ch);
            curl_close($ch);

            return $result;
        }

        /**
         * Turn a POST payload into a urlencoded request body.
         *
         * @param mixed $data Array/object of fields, or an already encoded string.
         * @return string
         */
        private static function encodeBody($data)
        {
            if (is_array($data) || is_object($data)) {
                return http_build_query($data);
            }
            return (string) $data;
        }

        /**
         * Remove chunked transfer-encoding framing from a response body.
         * Decoding stops at the terminating zero-size chunk or at the first
         * malformed chunk header; whatever was decoded so far is returned.
         *
         * @param string $body Raw chunked body.
         * @return string Decoded payload.
         */
        private static function decodeChunked($body)
        {
            $decoded = '';
            $offset = 0;
            $length = strlen($body);

            while ($offset < $length) {
                $lineEnd = strpos($body, "\r\n", $offset);
                if ($lineEnd === false) {
                    break;
                }
                // The size line may carry ";extension" parameters after the hex size.
                $sizeField = explode(';', substr($body, $offset, $lineEnd - $offset), 2);
                $hex = trim($sizeField[0]);
                if (!preg_match('/^[0-9a-fA-F]+$/', $hex)) {
                    break;
                }
                $size = (int) hexdec($hex);
                if ($size === 0) {
                    break;
                }
                $decoded .= substr($body, $lineEnd + 2, $size);
                // Skip the size line's CRLF, the chunk data, and the CRLF that follows the data.
                $offset = $lineEnd + 2 + $size + 2;
            }

            return $decoded;
        }

        /**
         * Send one HTTP/1.1 request over a socket and return the parsed response.
         *
         * @param string      $method  'GET' or 'POST'.
         * @param string      $url     Absolute http:// or https:// URL.
         * @param string|null $body    Urlencoded request body, or null for no body.
         * @param int|float   $timeout Connect and read timeout in seconds.
         * @return string|false Response body, or false on failure.
         */
        private static function socketRequest($method, $url, $body, $timeout)
        {
            $parts = parse_url($url);
            if (!is_array($parts) || empty($parts['host'])) {
                return false;
            }

            $secure = isset($parts['scheme']) && strtolower($parts['scheme']) === 'https';
            $defaultPort = $secure ? 443 : 80;
            $port = isset($parts['port']) ? (int) $parts['port'] : $defaultPort;
            $host = $parts['host'];
            $target = (isset($parts['path']) ? $parts['path'] : '/')
                . (isset($parts['query']) ? '?' . $parts['query'] : '');

            // fsockopen() signals failure with false, never with a negative number.
            $fsock = @fsockopen(($secure ? 'ssl://' : '') . $host, $port, $errno, $errstr, $timeout);
            if ($fsock === false) {
                return false;
            }

            // The fsockopen() timeout only covers connecting; reads need their own limit.
            $seconds = (int) $timeout;
            $micros = (int) round(($timeout - $seconds) * 1000000);
            stream_set_timeout($fsock, $seconds, $micros);

            $in = $method . ' ' . $target . " HTTP/1.1\r\n";
            $in .= "Accept: */*\r\n";
            $in .= 'Host: ' . ($port === $defaultPort ? $host : $host . ':' . $port) . "\r\n";
            if ($body !== null) {
                $in .= "Content-type: application/x-www-form-urlencoded\r\n";
                $in .= 'Content-Length: ' . strlen($body) . "\r\n";
            }
            // The empty line after the last header ends the header block.
            $in .= "Connection: Close\r\n\r\n";
            if ($body !== null) {
                $in .= $body;
            }

            // fwrite() on a socket may write only part of the buffer; keep going until done.
            $remaining = $in;
            while ($remaining !== '') {
                $written = @fwrite($fsock, $remaining);
                if (!$written) {
                    @fclose($fsock);
                    return false;
                }
                $remaining = (string) substr($remaining, $written);
            }

            return self::GetHttpContent($fsock);
        }

        /**
         * Perform a request through PHP's URL stream wrappers, honouring the timeout.
         *
         * @param string      $url      URL to request.
         * @param string|null $body     Urlencoded body for POST, or null for GET.
         * @param int|float   $timeout  Read timeout in seconds.
         * @param bool        $viaFopen Use fopen() instead of file_get_contents().
         * @return string|false Response body, or false on failure.
         */
        private static function streamRequest($url, $body, $timeout, $viaFopen)
        {
            $http = array('timeout' => $timeout);
            if ($body !== null) {
                $http['method'] = 'POST';
                $http['header'] = "Content-type: application/x-www-form-urlencoded\r\n";
                $http['content'] = $body;
            }
            $context = stream_context_create(array('http' => $http));

            if (!$viaFopen) {
                return @file_get_contents($url, false, $context);
            }

            $handle = @fopen($url, 'rb', false, $context);
            if ($handle === false) {
                return false;
            }
            $result = stream_get_contents($handle);
            fclose($handle);

            return $result;
        }
    }//end of class Http
    ?>

  • 相关阅读:
    BZOJ 1911: [Apio2010]特别行动队 斜率优化dp
    BZOJ 2751: [HAOI2012]容易题(easy) 数学
    Wunder Fund Round 2016 (Div. 1 + Div. 2 combined) B. Guess the Permutation 水题
    Wunder Fund Round 2016 (Div. 1 + Div. 2 combined) A. Slime Combining 水题
    BZOJ 2768: [JLOI2010]冠军调查 最小割
    BZOJ 1497: [NOI2006]最大获利 最小割
    Codeforces Round #140 (Div. 1) D. The table 构造
    ICPC-CAMP day1 D.Around the world
    Codeforces Round #340 (Div. 2) E. XOR and Favorite Number 莫队算法
    BZOJ 2038 [2009国家集训队]小Z的袜子 莫队
  • 原文地址:https://www.cnblogs.com/archoncap/p/4271591.html
Copyright © 2011-2022 走看看