Retrieving proxies with Selenium:
import selenium.webdriver
from selenium.webdriver.common.by import By

url = "http://www.kuaidaili.com/free/inha/4/"
driver = selenium.webdriver.Chrome()
driver.get(url)
driver.implicitly_wait(10)  # wait up to 10 seconds for elements to appear
# find_elements_by_xpath was removed in Selenium 4; use find_elements(By.XPATH, ...)
elems = driver.find_elements(By.XPATH, "//tbody/tr")
for elem in elems:
    tds = elem.find_elements(By.XPATH, "./td")
    print(tds[0].text)  # IP address column
    print(tds[1].text)  # port column
driver.close()
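As the comment in the last script below notes, Selenium is slow for this page; here is a minimal sketch of the same scrape done with urllib + lxml instead, assuming the table rows are present in the initial HTML (not rendered by JavaScript) and keep the //tbody/tr structure:

# Lighter-weight alternative sketch: fetch the proxy table without a browser.
import urllib.request
import lxml.etree

url = "http://www.kuaidaili.com/free/inha/4/"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
request = urllib.request.Request(url, headers=headers)
data = urllib.request.urlopen(request).read()
tree = lxml.etree.HTML(data)
for row in tree.xpath("//tbody/tr"):
    tds = row.xpath("./td/text()")
    if len(tds) >= 2:
        print(tds[0], tds[1])  # IP and port columns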
Verifying a proxy:
import urllib.request

try:
    httpproxy = urllib.request.ProxyHandler({"http": "10.36.132.16:808"})  # proxy needs no credentials
    opener = urllib.request.build_opener(httpproxy)  # build an opener that routes through the proxy
    request = urllib.request.Request("http://www.baidu.com/")  # test against Baidu
    response = opener.open(request, timeout=10)  # open the page via the built-in proxy
    print(response.read())
    print("OK")
except Exception:
    print("NO")
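The same check can be wrapped in a reusable function; a minimal sketch with a hypothetical check_proxy helper (the name, test URL, and timeout default are assumptions, not from the original):

# Hypothetical helper: test a proxy string like "1.2.3.4:808" and return a bool.
import urllib.request

def check_proxy(proxy, test_url="http://www.baidu.com/", timeout=10):
    """Return True if an HTTP request through `proxy` succeeds."""
    try:
        handler = urllib.request.ProxyHandler({"http": proxy})
        opener = urllib.request.build_opener(handler)
        opener.open(urllib.request.Request(test_url), timeout=timeout)
        return True
    except Exception:
        return False

print(check_proxy("10.36.132.16:808"))  # True if the proxy is usable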
Collecting usable proxies and saving them to a local txt file:
import selenium.webdriver
from selenium.webdriver.common.by import By
import urllib.request
import lxml.etree

# Tested and working, but fairly slow; the second function (textlist) would be
# faster rewritten with urllib instead of Selenium.

def urllist(url):
    # Read the total page count from the pagination bar, then build one URL per page.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"}
    request = urllib.request.Request(url, headers=headers)
    request.add_header("Connection", "keep-alive")  # keep the connection alive
    response = urllib.request.urlopen(request)
    data = response.read()
    mytree = lxml.etree.HTML(data)
    numbers = mytree.xpath('//*[@id="listnav"]/ul/li[9]/a/text()')
    numbers1 = int(numbers[0])  # int() instead of eval() to parse the page count
    urls = []
    for i in range(1, numbers1 + 1):
        urls.append(url + str(i) + "/")
    return urls

def textlist(url):
    driver = selenium.webdriver.Chrome()
    driver.get(url)
    # Implicit wait: continue as soon as the elements appear, up to 10 seconds.
    driver.implicitly_wait(10)
    # find_elements (plural) extracts every matching table row.
    elems = driver.find_elements(By.XPATH, "//tbody/tr")
    dailist = []
    for elem in elems:
        # "./td" matches the td children of the current row;
        # "*" matches any element node, "//*" selects all elements in the document.
        ipnum = elem.find_elements(By.XPATH, "./td")[0].text
        kounum = elem.find_elements(By.XPATH, "./td")[1].text
        daili = ipnum + ":" + kounum
        try:
            httpproxy = urllib.request.ProxyHandler({"http": daili})  # proxy needs no credentials
            opener = urllib.request.build_opener(httpproxy)  # build an opener that routes through the proxy
            request = urllib.request.Request("http://www.baidu.com/")  # test against Baidu
            response = opener.open(request, timeout=10)  # open the page via the proxy
            print(daili)
            print("OK")
            dailist.append(daili)
        except Exception:
            print("NO")
    driver.close()
    return dailist  # return only after every row on the page has been checked

url = "https://www.kuaidaili.com/free/inha/"
savefilepath = "daili.txt"
savefile = open(savefilepath, "wb")
for urls in urllist(url):
    dailist = textlist(urls)  # scrape each page once, not once per use
    if len(dailist) != 0:
        dailistr = " ".join(dailist)
        savefile.write((dailistr + " ").encode("utf-8"))
savefile.close()
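A hypothetical usage sketch of the saved file (not part of the original script), assuming the space-separated format written above:

# Read daili.txt back and route a request through a randomly chosen proxy.
import random
import urllib.request

with open("daili.txt", "r", encoding="utf-8") as f:
    proxies = f.read().split()

if proxies:
    proxy = random.choice(proxies)
    opener = urllib.request.build_opener(urllib.request.ProxyHandler({"http": proxy}))
    response = opener.open("http://www.baidu.com/", timeout=10)
    print(proxy, response.getcode())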