zoukankan      html  css  js  c++  java
  • 爬虫扩展

    爬取西刺代理

    爬虫 + 网站 --> 代理解析

    from bs4 import BeautifulSoup
    import requests
    import http.client
    import threading
    
    # proxy.txt is opened for reading as soon as this module is loaded, which is
    # before anything in this script has had a chance to create it.  Touch it
    # first (append mode never truncates) so a first run does not die with
    # FileNotFoundError.
    open('proxy.txt', 'a').close()

    inFile = open('proxy.txt')     # every scraped proxy; the verifier threads read it line by line
    outFile = open('verified.txt', 'w')   # proxies that answered the verification request
    lock = threading.Lock()   # guards reads of inFile and writes to outFile across threads
    

    爬取

    def getProxyList(targeturl="http://www.xicidaili.com/nn/"):
        """
        Scrape pages 1-9 of a proxy listing and append the proxies to proxy.txt.

        For every page the function fetches ``targeturl + str(page)``, looks up
        the table with id ``ip_list`` and turns each data row into one line of
        the form ``nation|ip|port|locate|anony|protocol|speed|time``.

        :param targeturl: listing URL prefix; the page number is appended to it.
        :return: number of proxy records written during this call.

        Errors raised by ``requests.get`` are not caught here and propagate to
        the caller.
        """
        countNum = 0

        requestHeader = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"}

        # The context manager closes the file even when a request or a parse
        # step raises half-way through.
        with open('proxy.txt', 'a') as proxyFile:
            for page in range(1, 10):    # number of pages to scrape
                url = targeturl + str(page)
                req = requests.get(url, headers=requestHeader)

                soup = BeautifulSoup(req.text, "html.parser")
                table = soup.find('table', id='ip_list')
                if table is None:
                    # find() returns None when the page has no such table
                    # (e.g. an error page); skip it instead of crashing
                    # on None.find_all.
                    continue

                # The first row is skipped (presumably the header row).
                for tr in table.find_all('tr')[1:]:
                    tds = tr.find_all('td')
                    if len(tds) < 9:
                        # Row too short for the fixed column layout used below.
                        continue

                    # Country: taken from the flag image's alt text when present.
                    flag = tds[0].find('img')
                    if flag is None:
                        nation = '未知'
                        locate = '未知'
                    else:
                        nation = flag['alt'].strip()
                        locate = tds[3].text.strip()
                    ip = tds[1].text.strip()
                    port = tds[2].text.strip()
                    anony = tds[4].text.strip()
                    protocol = tds[5].text.strip()
                    speed = tds[6].find('div')['title'].strip()
                    # Text of column 8 (presumably the last-checked time --
                    # TODO confirm against the site's table layout).
                    timeText = tds[8].text.strip()

                    proxyFile.write('%s|%s|%s|%s|%s|%s|%s|%s\n' % (
                        nation, ip, port, locate, anony, protocol, speed, timeText))
                    print('%s=%s:%s' % (protocol, ip, port))
                    countNum += 1

        return countNum
    

    验证:

    def verifyProxyList():
        """
        Check the scraped proxies and record the ones that respond.

        Thread worker: repeatedly takes the next record from the shared
        ``inFile`` (under ``lock``), opens an HTTP connection to the record's
        ip/port with a 5 second timeout and issues a GET for an absolute URL
        through it.  When a response is received the original record is
        appended to ``outFile``; the response status is not inspected.  The
        function returns when ``inFile`` is exhausted.
        """
        requestHeader = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"}
        myurl = 'http://www.baidu.com/'

        while True:
            # The file object is shared by all workers; read under the lock.
            with lock:
                raw = inFile.readline()
            if not raw:
                # readline() returns '' only at end of file.
                break

            ll = raw.strip()
            line = ll.split('|')
            if len(line) < 3:
                # Blank or malformed record: skip it rather than stop the
                # worker or crash on a missing field.
                continue
            ip = line[1]
            port = line[2]

            conn = None
            try:
                conn = http.client.HTTPConnection(ip, port, timeout=5.0)
                conn.request(method='GET', url=myurl, headers=requestHeader)
                conn.getresponse()
            except Exception:
                # Best effort: any connection/protocol error just means the
                # proxy is unusable.
                print("---Failure:" + ip + ":" + port)
                continue
            finally:
                if conn is not None:
                    conn.close()

            # 'with' guarantees the lock is released even if the write fails,
            # so one failing worker cannot block all the others.
            with lock:
                print("+++Success:" + ip + ":" + port)
                outFile.write(ll + "\n")
    

    执行:

    if __name__ == '__main__':
        # Start every run from an empty proxy.txt; opening in 'w' mode truncates.
        with open('proxy.txt', 'w'):
            pass

        # (label printed with the count, listing URL) for each category scraped.
        sources = (
            (u"国内高匿:", "http://www.xicidaili.com/nn/"),
            (u"国内透明:", "http://www.xicidaili.com/nt/"),
            (u"国外高匿:", "http://www.xicidaili.com/wn/"),
            (u"国外透明:", "http://www.xicidaili.com/wt/"),
        )
        for label, url in sources:
            proxynum = getProxyList(url)
            print(label + str(proxynum))

        print(u"\n验证代理的有效性:")

        try:
            all_thread = []
            for i in range(30):   # number of verifier threads
                t = threading.Thread(target=verifyProxyList)
                all_thread.append(t)
                t.start()

            # Wait for every worker before the shared files are closed.
            for t in all_thread:
                t.join()
        finally:
            inFile.close()
            outFile.close()
        print("All Done.")
    
    
  • 相关阅读:
    使用静态全局对象自动做初始化与清理工作
    ThinkpadR617755BH1安装Mac Leopard10.5.2
    ubuntu常用快捷键
    linux常用命令
    c++对象内存模型【内存对齐】
    将ubuntu引导项加入windowsXP启动菜单中
    ISO C++委员会批准C++0x最终草案
    图片转eps格式
    Latex 点滴记录
    我是一个硬盘
  • 原文地址:https://www.cnblogs.com/shaozheng/p/12806951.html
Copyright © 2011-2022 走看看