zoukankan      html  css  js  c++  java
  • 利用协程框架,无界面浏览器爬取上海高院开庭数据

    # -*- coding: utf-8 -*-
    """
    @author: Dell Created on Thu Jan  2 11:16:08 2020
    """
    import gevent 
    from gevent import monkey
    
    monkey.patch_all()
    
    from lxml import etree
    from selenium import webdriver
    from selenium.webdriver import PhantomJS 
    from selenium.webdriver.chrome.options import Options
    
    
    # XPath (relative to one result <tr>) of every column, in output order.
    _ROW_FIELDS = (
        "./td[1]/font/text()",  # court
        "./td[2]/font/text()",  # courtroom
        "./td[3]/text()",       # hearing date
        "./td[4]/text()",       # case number
        "./td[5]/text()",       # cause of action
        "./td[6]/div/text()",   # undertaking department
        "./td[7]/div/text()",   # presiding judge
        "./td[8]/text()",       # plaintiff
        "./td[9]/text()",       # defendant
    )


    def _first_text(node, path):
        """Return the stripped first XPath result of ``path`` under ``node``.

        Returns an empty string when the XPath matches nothing, so that a
        single empty table cell does not abort the whole page range.
        """
        hits = node.xpath(path)
        return hits[0].strip() if hits else ""


    def download(url, start_idx, end_idx, file):
        """Scrape result pages ``start_idx`` .. ``end_idx - 1`` and append rows to ``file``.

        A headless Chrome instance loads ``url`` once, then calls the page's
        ``goPage`` JavaScript function for every page index in the range,
        waits, and parses the ``report`` table out of the rendered source.

        Args:
            url: Address of the search page to open.
            start_idx: First page index to fetch (inclusive).
            end_idx: Page index to stop at (exclusive).
            file: Writable binary file object; each row is written as the
                ``str()`` of a 9-tuple followed by a newline, UTF-8 encoded.

        Errors raised while scraping are reported on stdout and end this
        worker early (best effort); the browser is always shut down.
        """
        # PhantomJS is deprecated in recent selenium releases, so headless
        # Chrome is used for window-less scraping instead.
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(options=chrome_options)

        page = None  # last page attempted, reported if something fails
        try:
            driver.get(url)
            for page in range(start_idx, end_idx):
                script = "javascript:goPage('"+str(page)+"')"
                driver.execute_script(script)
                # Cooperative sleep: lets other greenlets run while the page loads.
                gevent.sleep(5)
                print("开始解析第", page, "页")

                html = etree.HTML(driver.page_source)
                # Skip the first row of the table (position()>1).
                trs = html.xpath("//table[@id='report']//tbody/tr[position()>1]")
                for tr in trs:
                    line = tuple(_first_text(tr, path) for path in _ROW_FIELDS)
                    print(*line)
                    file.write((str(line) + "\n").encode("utf-8", errors="ignore"))
                print("共有数据:", len(trs), "条")
        except Exception as exc:
            # Keep the best-effort behaviour (do not crash the other workers),
            # but say what failed and where instead of a bare "error".
            print("error", "page", page, "of range", start_idx, end_idx, repr(exc))
        finally:
            driver.quit()  # always release the browser process
        
    
    def main():
        """Scrape the court hearing announcements into ``court.txt``.

        Spawns one gevent greenlet per page range; every greenlet runs
        ``download`` with its own browser and writes to the shared output
        file. Returns once all greenlets have finished.
        """
        url = "http://www.hshfy.sh.cn/shfy/gweb2017/ktgg_search.jsp"
        # Half-open page ranges, one per greenlet. The original list jumped
        # from (200, 300) straight to (400, 500), so pages 300-399 were
        # never fetched; the missing range is included here.
        page_ranges = [
            (1, 100),
            (100, 200),
            (200, 300),
            (300, 400),
            (400, 500),
            (500, 600),
        ]
        # ``with`` guarantees the file is closed even if joinall is interrupted.
        with open("court.txt", "wb") as file:
            gevent.joinall([
                gevent.spawn(download, url, first, last, file)
                for first, last in page_ranges
            ])
         
    
    # Run the scraper only when executed as a script, not when imported.
    if __name__ == "__main__":
        main()
    
    
    
    
    
    
    
    
    
    
  • 相关阅读:
    JavaScript匿名函数的使用
    __construct __destory __call __get __set
    嵌入式学习
    动态加载script文件
    Android框架Volley使用:Post请求实现
    Android框架Volley使用:Get请求实现
    安卓开发笔记(三十五):Cardview的简单使用
    安卓开发笔记(三十四):Material Design框架实现优美的左侧侧滑栏
    Android APK反编译技巧全讲解
    Java数据结构(一):栈
  • 原文地址:https://www.cnblogs.com/zxfei/p/12132362.html
Copyright © 2011-2022 走看看