zoukankan      html  css  js  c++  java
  • python爬虫挂代理

     以下是GET的方法,使用的代理接口网站是 http://www.xicidaili.com/nn/

    #-*- coding:utf-8 -*-
    from bs4 import BeautifulSoup
    import requests,chardet,urllib2
    
    # Module-level pool of proxy URLs shared by the functions below:
    # getip() refills it when it is empty, deleteip() removes its first entry.
    ip_list=[]
    def get_ip_list(url, headers):
        """Scrape a proxy listing page and return its entries as proxy URLs.

        Downloads ``url`` using the supplied request ``headers``, parses the
        response as HTML and builds one ``'http://<ip>:<port>'`` string per
        table row, taking the address from the second ``<td>`` cell and the
        port from the third.

        The first ``<tr>`` is always skipped (it is treated as the header
        row). Rows that carry fewer than three ``<td>`` cells are skipped as
        well instead of raising ``IndexError``.

        Returns a list of proxy URL strings; the list is empty when no usable
        row is found. Network errors from ``requests`` propagate to the caller.
        """
        # A timeout keeps a dead listing site from blocking the caller forever.
        web_data = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(web_data.text, 'lxml')
        ip_list = []
        for row in soup.find_all('tr')[1:]:
            tds = row.find_all('td')
            if len(tds) < 3:
                # Not a data row (e.g. a spacer or a <th>-only row).
                continue
            host = tds[1].text.strip()
            port = tds[2].text.strip()
            ip_list.append('http://' + host + ':' + port)
        return ip_list
    
    def get_random_ip(ip_list):
        """Build a proxies mapping from the first entry of ``ip_list``.

        Despite the name, no random choice is made: the entry at index 0 is
        always used, which is the same entry deleteip() removes.

        Returns a dict of the form ``{'http': <proxy url>}``.
        Raises ``IndexError`` when ``ip_list`` is empty.
        """
        first_proxy = ip_list[0]
        return {'http': first_proxy}
    
    def getip():
        """Return a proxies dict for the proxy at the front of the shared pool.

        When the module-level ``ip_list`` is empty it is first refilled by
        scraping the xicidaili listing page with a browser-like User-Agent.
        The current pool is printed on every call.
        """
        global ip_list
        listing_url = 'http://www.xicidaili.com/nn/'
        request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
        # Only contact the listing site once the cached pool has run dry.
        if not ip_list:
            ip_list = get_ip_list(listing_url, headers=request_headers)
        print(ip_list)
        return get_random_ip(ip_list)
    
    def deleteip():
        """Discard the proxy at the front of the shared ``ip_list``.

        Does nothing when the pool is already empty, so it is safe to call
        from an error handler even if no proxy could be obtained in the
        first place.
        """
        global ip_list
        if ip_list:
            ip_list.pop(0)
    
    def urllink(link, max_attempts=12, timeout=10):
        """Fetch ``link`` through a proxy and return the page as UTF-8 bytes.

        Each attempt takes the proxy currently at the front of the pool (via
        getip()), installs it as the global urllib2 opener and downloads the
        page. On any failure the front proxy is dropped and the next attempt
        is made.

        Parameters:
            link: URL of the page to download.
            max_attempts: how many proxies to try before giving up.
            timeout: per-request timeout in seconds.

        Returns the page content encoded as UTF-8, or ``''`` when every
        attempt failed. Content detected as UTF-8 is returned unchanged;
        anything else is decoded as GBK (undecodable bytes ignored) and
        re-encoded as UTF-8.
        """
        for i in range(max_attempts):
            try:
                ip = getip()
                print(ip)
                proxy_support = urllib2.ProxyHandler(ip)
                opener = urllib2.build_opener(proxy_support)
                urllib2.install_opener(opener)
                html_1 = urllib2.urlopen(link, timeout=timeout).read()
                break
            except Exception as e:
                # The pool is empty when fetching the proxy list itself failed;
                # only drop a proxy if there is one to drop.
                if ip_list:
                    deleteip()
                print('错误 %s %s' % (i, e))
        else:
            # Loop ended without a break: every attempt failed. A success on
            # the final attempt leaves via break and is therefore kept.
            return ''
        encoding_dict = chardet.detect(html_1)
        web_encoding = encoding_dict['encoding']
        # chardet reports None when it cannot guess (e.g. an empty body).
        if web_encoding and web_encoding.lower() == 'utf-8':
            html = html_1
        else:
            html = html_1.decode('gbk', 'ignore').encode('utf-8')
        return html
    
    # Demo run: download one article page through the proxy pool and print it.
    print(urllink("http://ccdas.ipmph.com/pc/clinicalExam/getClinicalExamDetail?articleId=8165"))
    

      

  • 相关阅读:
    POJ 1795 DNA Laboratory
    CodeForces 303B Rectangle Puzzle II
    HDU 2197 本源串
    HDU 5965 扫雷
    POJ 3099 Go Go Gorelians
    CodeForces 762D Maximum path
    CodeForces 731C Socks
    HDU 1231 最大连续子序列
    HDU 5650 so easy
    大话接口隐私与安全 转载
  • 原文地址:https://www.cnblogs.com/lomooo/p/7278050.html
Copyright © 2011-2022 走看看