zoukankan      html  css  js  c++  java
  • 采集器

    <meta http-equiv=Content-Type content="text/html;charset=gbk">
    <script src="./js/jquery.js" type="text/javascript"></script>
    <script src="./js/jquery.validate.js" type="text/javascript"></script>
    <script src="./js/jquery.metadata.js" type="text/javascript"></script>
    <script type="text/javascript">
    </script>
    <?php
    // Scraper entry point: fetch the pre-sale certificate listing for one project, pull each
    // certificate id out of the listing, then fetch that certificate's sales-statistics page
    // and dump its table rows.
    //
    // The "project" query value is already percent-encoded.
    // NOTE(review): the bytes look like GBK, matching the page's declared charset — confirm.
    $url="http://www.jy.com.cn/PreSellCert_List.do?project=%B3%A4%BD%AD%B9%FA%BC%CA";
    $str=file_get_contents($url);
    if($str===false){
        // Without the listing there is nothing to parse; stop instead of matching against "".
        exit("Failed to fetch ".$url);
    }
    // Flatten the markup so the patterns below do not have to account for line breaks/indentation.
    $str=compress_html($str);

    // Listing pattern. "#" is the delimiter because the pattern is full of "</tag>" sequences,
    // which would terminate a "/"-delimited pattern early ("Unknown modifier" error).
    // The "." and "?" of "Detail.do?pscid=" are escaped so they match literally.
    // "U" makes every quantifier lazy, so each (.*) stops at the first following literal.
    // Capture 1 is the certificate id (pscid); it is the only capture used below.
    // NOTE(review): the parentheses around the inner <span> are treated as optional literal
    // characters — the page presumably renders "(<span ...>...</span>)"; verify against live HTML.
    // NOTE(review): the Chinese literals must be stored in the same encoding as the fetched page.
    $regex='#<span class="font_bold font_blue font_14px"><a href="PreSellCert_Detail\.do\?pscid=(.*)">.*\(?<span class="font_12px">(.*)</span>\)?</a></span></td>'
    .'.*<span class="font_16px font_bold">(.*)</span>套</td>.*批准日期:(.*)</td></tr>.*<span class="font_16px font_bold">(.*)</span>套</td>#U';

    // Statistics-row pattern: twelve right-aligned cells followed by one centered cell.
    // Built once here because it does not change between certificates.
    $saleregex='#<tr><td align="right">(.{1,30})</td><td align="right">(.*)</td><td align="right">(.*)</td><td align="right">(.*)</td><td align="right">(.*)</td><td align="right">(.*)</td><td align="right">(.*)</td><td align="right">(.*)</td><td align="right">(.*)</td><td align="right">(.*)</td><td align="right">(.*)</td><td align="right">(.*)</td><td align="center">(.*)</td></tr>#U';

    if(preg_match_all($regex, $str, $matches, PREG_SET_ORDER)){
        foreach($matches as $val){
            $saleurl="http://www.jy.com.cn/ifrm_PreSellCert_SaleStat.do?pscid=".$val[1];
            $salestr=file_get_contents($saleurl);
            if($salestr===false){
                // One unreachable statistics page should not abort the remaining certificates.
                continue;
            }
            $salestr=compress_html($salestr);
            if(preg_match_all($saleregex, $salestr, $salematches, PREG_SET_ORDER)){
                // Debug output: dump the rows of the first certificate that has any, then stop.
                print_r($salematches);exit;
            }
        }
    }

    /**
     * Squeeze an HTML string onto one compact line so it can be matched with simple regexes.
     *
     * Steps, in order:
     *  1. delete CRLF, LF and tab characters;
     *  2. trim spaces between a ">" and the next "<" when the text between them has no inner space;
     *  3. collapse every remaining whitespace run to a single space;
     *  4. delete HTML comments that contain no "!" in their body;
     *  5. delete a space directly after or directly before a double quote;
     *  6. delete C-style block comments that contain no "*" in their body.
     *
     * @param string $string Raw HTML.
     * @return string|null Compacted HTML, or null if preg_replace() reports an error.
     */
    function compress_html($string) {
        // Line breaks and tabs are removed outright rather than replaced by a space.
        $string = str_replace(array("\r\n", "\n", "\t"), '', $string);

        // Single-quoted literals keep every backslash exactly as PCRE must see it.
        $pattern = array (
            '/> *([^ ]*) *</',      // spaces padding the text between two tags
            '/\s+/',                // any remaining whitespace run
            '/<!--[^!]*-->/',       // HTML comments
            '/" /',                 // space after a double quote
            '/ "/',                 // space before a double quote
            '#/\*[^*]*\*/#'         // block comments; "#" delimiter since the pattern contains "/"
        );
        $replace = array (
            '>$1<',                 // keep the captured text, drop the padding
            ' ',
            '',
            '"',
            '"',
            ''
        );
        return preg_replace($pattern, $replace, $string);
    }
    ?>

  • 相关阅读:
    zabbix短信网关调用问题总结
    zabbix短信接口调用
    Windows Open with Sublime Text
    [转载]windows下安装Python虚拟环境virtualenvwrapper-win
    Resilio-sync auto restart
    django-orm-standalone
    RabbitMQ笔记
    RabbitMQ启动出错:- unable to connect to epmd on xxxx: timeout (timed out)
    [Python笔记]第十六篇:web框架之Tornado
    [前端笔记]第三篇:JavaScript
  • 原文地址:https://www.cnblogs.com/hechunhua/p/3673702.html
Copyright © 2011-2022 走看看