Extracting data from the Shanghai People's Court website with coroutines:
import gevent
import gevent.monkey
import selenium.webdriver
from bs4 import BeautifulSoup

gevent.monkey.patch_all()  # patch blocking calls so greenlets switch automatically

def download(url, start, end, file):
    # the executable_path lost its backslashes in the original; restored as a raw string
    driver = selenium.webdriver.PhantomJS(
        executable_path=r"D:\python爬虫视频\爬虫代码\seleniumText\phantomjs-2.1.1-windows\bin\phantomjs.exe")
    driver.get(url)
    gevent.sleep(10)  # let the first page render
    # each greenlet works its own page range, e.g. 0-100, 100-200, 200-300
    try:
        for i in range(start, end):
            js = "javascript:goPage('" + str(i) + "')"
            driver.execute_script(js)  # the site paginates via goPage(); 1175 pages in total
            print("js is run", i)
            gevent.sleep(10)  # wait for each page to render before scraping it
            # parse the rendered page source
            soup = BeautifulSoup(driver.page_source, "lxml")
            table = soup.find("table", attrs={"id": "report"})  # attrs filters by attribute
            trs = table.find("tr").find_next_siblings()  # skip the header row, keep the data rows
            for tr in trs:
                tds = tr.find_all("td")
                linestr = ""  # assemble one record
                for td in tds:
                    linestr += td.text
                    linestr += " # "  # field separator, makes later splitting easy
                linestr += "\n"
                print(linestr)
                file.write(linestr.encode("utf-8", errors="ignore"))  # append to the shared file
    except Exception as e:
        print("worker failed:", e)  # don't let one bad page kill the whole range
    driver.quit()

url = "http://www.hshfy.sh.cn/shfy/gweb2017/ktgg_search.jsp?zd=splc"
file = open("save.text", "wb")  # shared output file
# build a list of tasks; gevent.joinall runs them all as coroutines
gevent.joinall([
    gevent.spawn(download, url, 0, 235, file),    # each task gets its own slice of the 1175 pages
    gevent.spawn(download, url, 235, 470, file),  # all tasks share the same output file
    gevent.spawn(download, url, 470, 705, file),
    gevent.spawn(download, url, 705, 940, file),
    gevent.spawn(download, url, 940, 1175, file),
])
file.close()
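
The fan-out pattern above, splitting the total page count into equal slices and giving each slice to one greenlet, works independently of Selenium. A self-contained toy sketch of just that pattern (fetch_range and the ranges here are illustrative, not from the original):

import gevent
import gevent.monkey

gevent.monkey.patch_all()  # patch blocking I/O so greenlets yield to each other

def fetch_range(start, end, results):
    # stand-in for the real per-page work; gevent.sleep is a cooperative yield point
    for page in range(start, end):
        gevent.sleep(0)
        results.append(page)

results = []
step = 5
# split pages 0..19 into equal ranges, one greenlet per range
tasks = [gevent.spawn(fetch_range, s, s + step, results) for s in range(0, 20, step)]
gevent.joinall(tasks)  # block until every greenlet finishes
print(sorted(results) == list(range(20)))  # True: every page covered exactly once

Because the greenlets are cooperative rather than preemptive, appending to the shared list (or, in the real script, writing to the shared file) does not need a lock: control only switches at yield points such as gevent.sleep.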
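
The extraction step rests on three BeautifulSoup calls: find with an attrs filter to locate the table, find_next_siblings to drop the header row, and find_all("td") per row. A runnable sketch against a made-up fragment shaped like the site's id="report" table (the cell contents are invented for illustration):

from bs4 import BeautifulSoup

html = """
<table id="report">
  <tr><th>Court</th><th>Case</th><th>Date</th></tr>
  <tr><td>No.1 Court</td><td>(2017)123</td><td>2017-06-01</td></tr>
  <tr><td>No.2 Court</td><td>(2017)456</td><td>2017-06-02</td></tr>
</table>
"""

soup = BeautifulSoup(html, "lxml")
table = soup.find("table", attrs={"id": "report"})  # locate the results table by its id
rows = table.find("tr").find_next_siblings()        # first <tr> is the header; keep the rest
for tr in rows:
    fields = [td.text for td in tr.find_all("td")]  # one field per <td> cell
    print(" # ".join(fields))                       # same "#"-separated record format as above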
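
Note that selenium.webdriver.PhantomJS only exists on Selenium 3 and earlier: PhantomJS support was deprecated and then removed in Selenium 4, so the driver setup above will not run on current releases. A minimal sketch of the equivalent setup with headless Chrome, assuming Selenium 4 and a recent Chrome build (the --headless=new flag and automatic driver resolution are assumptions, not part of the original):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless=new")  # run without a visible window; plain "--headless" on older Chrome
driver = webdriver.Chrome(options=options)  # Selenium 4 locates chromedriver via Selenium Manager
driver.get("http://www.hshfy.sh.cn/shfy/gweb2017/ktgg_search.jsp?zd=splc")
print(driver.title)
driver.quit()

The rest of download stays unchanged, since execute_script and page_source behave the same on either driver.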