zoukankan      html  css  js  c++  java
  • python 爬取网页内的代理服务器列表(需调整优化)

     1 #!/usr/bin/env python
     2 # -*- coding: utf-8 -*-
     3 # @Date    : 2017-08-30 20:38:23
     4 # @Author  : EnderZhou (zptxwd@gmail.com)
     5 # @Link    : http://www.cnblogs.com/enderzhou/
     6 # @Version : $Id$
     7 
     8 import requests
     9 from bs4 import BeautifulSoup as bs
    10 
    11 # 这种爬取网页内容中的列表的方式复用性差,不同的网站需要针对性的修改。每次使用均需要填写更换header头。后续将编写适用性更强的版本。
    12 
    13 url = 'http://www.kuaidaili.com/free/inha/'
    14 
    15 headers = {
    16 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    17 'Accept-Encoding':'gzip, deflate',
    18 'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.6',
    19 'AlexaToolbar-ALX_NS_PH':'AlexaToolbar/alx-4.0.1',
    20 'Cache-Control':'max-age=0',
    21 'Connection':'keep-alive',
    22 'Cookie':'yd_cookie=a0d0f393-2812-44d0b1453fbf740f3ce870820ada37151e8c; _ydclearance=dd0b3de069ce8a768712e248-d97e-4bd9-8284-f2ef598da35b-1504104455; channelid=0; sid=1504099004948599; _ga=GA1.2.742898386.1504074603; _gid=GA1.2.583101265.1504074603; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1504074603,1504097260; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1504099719',
    23 'Host':'www.kuaidaili.com',
    24 'Upgrade-Insecure-Requests':'1',
    25 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    26 }
    27 
    28 def proxy_check(types,ip,port):
    29     headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
    30     url = 'http://www.whatismyip.com.tw/'
    31     proxy = {}
    32     proxy[types.lower()] = '%s:%s' % (ip,port)
    33     print proxy
    34     try:
    35         r = requests.get(url,headers=headers,proxies=proxy)
    36         soup = bs(r.content,'html.parser')
    37         chack_ip = soup.find_all(name='b')
    38         print chack_ip[0].string+':'+port
    39     except Exception,e:
    40         # print e
    41         pass
    42 
    43 # proxy_check('http','183.62.11.242','8088')#可用于测试代理验证模块
    44 
    45 def main():
    46     r = requests.get(url=url,headers=headers)
    47     tr_soup = bs(r.content,'html.parser')
    48     tr = tr_soup.find_all(name='tr')
    49     for i in tr:
    50         # print i
    51         td_soup = bs(str(i),'html.parser')
    52         td = td_soup.find_all(name='td')
    53         if len(td) != 0:
    54             ip = str(td[0].string)
    55             port = str(td[1].string)
    56             types = str(td[3].string)
    57             proxy_check(types,ip,port)
    58 
    59 if __name__ == '__main__':
    60     main()
  • 相关阅读:
    不可小视视图对效率的影响力
    Maximum Margin Planning
    Physics-Based Boiling Simulation

    Learning Behavior Styles with Inverse Reinforcement Learning
    Simulating Biped Behaviors from Human Motion Data
    Near-optimal Character Animation with Continuous Control
    Apprenticeship Learning via Inverse Reinforcement Learning
    回报函数学习的学徒学习综述
    Enabling Real-Time Physics Simulation in Future Interactive Entertainment
  • 原文地址:https://www.cnblogs.com/enderzhou/p/7455524.html
Copyright © 2011-2022 走看看