zoukankan      html  css  js  c++  java
  • python爬虫挂代理

     以下是通过代理发送 GET 请求的方法(代码基于 Python 2),代理 IP 列表取自 http://www.xicidaili.com/nn/

    #-*- coding:utf-8 -*-
    from bs4 import BeautifulSoup
    import requests,chardet,urllib2
    
    # Module-level cache of proxy URLs: filled on demand by getip() and
    # trimmed from the front by deleteip().
    ip_list = []
    def get_ip_list(url, headers):
        """Scrape a proxy listing page and return its proxies as URL strings.

        The page at ``url`` is fetched with ``headers`` and parsed with the
        ``lxml`` parser. Every ``<tr>`` except the first is inspected
        (the first row is skipped -- presumably the table header; confirm
        against the page layout). For each row the text of the second and
        third ``<td>`` cells is combined into ``'http://<cell>:<cell>'``.

        :param url: address of the proxy listing page.
        :param headers: HTTP headers passed through to ``requests.get``.
        :return: list of proxy URL strings; empty if no usable row is found.
        :raises: whatever ``requests.get`` raises on network failure,
            including a timeout after 10 seconds.
        """
        # A timeout keeps a dead listing site from blocking the caller forever.
        web_data = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(web_data.text, 'lxml')
        rows = soup.find_all('tr')
        proxies = []
        for row in rows[1:]:
            cells = row.find_all('td')
            # Rows without enough <td> cells cannot yield a proxy; skip them
            # instead of failing with IndexError.
            if len(cells) < 3:
                continue
            host = cells[1].text.strip()
            port = cells[2].text.strip()
            proxies.append('http://' + host + ':' + port)
        return proxies
    
    def get_random_ip(ip_list):
        """Build a proxies mapping from the head of ``ip_list``.

        Despite the name, nothing is chosen at random: the first entry is
        always the one returned.

        :param ip_list: non-empty sequence of proxy URL strings.
        :return: dict with the single key ``'http'`` mapped to ``ip_list[0]``.
        :raises IndexError: if ``ip_list`` is empty.
        """
        first_proxy = ip_list[0]
        return dict(http=first_proxy)
    
    def getip():
        """Return the proxies mapping for the proxy currently at the head of the pool.

        When the module-level ``ip_list`` is empty it is first refilled by
        scraping the listing page via ``get_ip_list``. The whole pool is
        printed on every call, then ``get_random_ip`` turns it into the
        mapping that is returned.

        :return: dict as produced by ``get_random_ip``.
        """
        global ip_list
        if not ip_list:
            # Only hit the listing site when the cached pool has run dry.
            listing_url = 'http://www.xicidaili.com/nn/'
            request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
            ip_list = get_ip_list(listing_url, headers=request_headers)
        print(ip_list)
        return get_random_ip(ip_list)
    
    def deleteip():
        """Discard the proxy at the head of the module-level ``ip_list``.

        Does nothing when the pool is already empty, so it is safe to call
        from an error handler even if no proxy could be fetched.
        """
        global ip_list
        if ip_list:
            ip_list.pop(0)
    
    def urllink(link):  # fetch a page's HTML through a proxy and convert its encoding
        """Download ``link`` through a proxy and return the body as UTF-8 bytes.

        Up to 12 attempts are made. Each attempt takes the current proxy from
        ``getip()``, installs it as the global ``urllib2`` opener and reads the
        page with a 10 second timeout. On any exception the proxy at the head
        of the pool is dropped, the error is printed, and the next attempt is
        made.

        :param link: URL to download.
        :return: the page body encoded as UTF-8, or ``''`` if every attempt
            failed.
        """
        max_attempts = 12
        html_1 = None
        for i in range(max_attempts):
            try:
                ip = getip()
                print(ip)
                proxy_support = urllib2.ProxyHandler(ip)
                opener = urllib2.build_opener(proxy_support)
                urllib2.install_opener(opener)
                html_1 = urllib2.urlopen(link, timeout=10).read()
                break
            except Exception as e:
                # Drop the proxy that was just tried. The pool can be empty
                # here (e.g. the listing fetch itself failed), in which case
                # there is nothing to drop.
                if ip_list:
                    deleteip()
                print('错误 %s %s' % (i, e))
        else:
            # Reached only when no attempt hit ``break``; a success on the
            # final attempt must not be reported as a failure.
            return ''
        web_encoding = chardet.detect(html_1)['encoding']
        if web_encoding and web_encoding.lower() == 'utf-8':
            return html_1
        # Anything not detected as UTF-8 is treated as GBK; undecodable bytes
        # are dropped rather than raising.
        return html_1.decode('gbk', 'ignore').encode('utf-8')
    
    # Demo run: fetch one page through the proxy pool and print the result.
    print(urllink("http://ccdas.ipmph.com/pc/clinicalExam/getClinicalExamDetail?articleId=8165"))
    

      

  • 相关阅读:
    三类设计模式UML图
    有一种面试叫-----别人的面试
    reference to 'map' is ambiguous|
    多个if语句和else if区别
    n&m位运算
    边缘填充算法
    C# 换行
    优秀程序员必须知道的32个算法,提高你的开发效率
    作业四(不算寒假了吧)
    寒假作业三
  • 原文地址:https://www.cnblogs.com/lomooo/p/7278050.html
Copyright © 2011-2022 走看看