zoukankan      html  css  js  c++  java
  • python访问网站

    #!/usr/bin/env python  
    # encoding: utf-8  
    from functools import wraps
    import requests
    from lxml import html
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    import time
    import random
    
    # Random numeric components of the fake Chrome version string, drawn once at
    # import time in the order: first, third, fourth.
    first_num, third_num, fourth_num = (
        random.randint(low, high)
        for low, high in ((55, 62), (0, 3200), (0, 140))
    )
    
    
    class FakeChromeUA:
        """Build a Chrome-style User-Agent string with a randomly chosen platform."""

        # Platform tokens; get_ua() picks one at random on every call.
        os_type = [
            '(Windows NT 6.1; WOW64)',
            '(Windows NT 10.0; WOW64)',
            '(X11; Linux x86_64)',
            '(Macintosh; Intel Mac OS X 10_12_6)',
        ]

        # Evaluated once when the class body runs, so all calls share one version.
        chrome_version = 'Chrome/{}.0.{}.{}'.format(first_num, third_num, fourth_num)

        @classmethod
        def get_ua(cls):
            """Return the User-Agent: fixed tokens plus a random entry of os_type."""
            platform = random.choice(cls.os_type)
            pieces = (
                'Mozilla/5.0',
                platform,
                'AppleWebKit/537.36',
                '(KHTML, like Gecko)',
                cls.chrome_version,
                'Safari/537.36',
            )
            return ' '.join(pieces)
    
    
    # Request headers sent with every requests call; the User-Agent is generated
    # once, when this mapping is built.
    HEADERS = dict([
        ('User-Agent', FakeChromeUA.get_ua()),
        ('Accept-Encoding', 'gzip, deflate, sdch'),
        ('Accept-Language', 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'),
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        ('Connection', 'keep-alive'),
    ])
    
    # Page to fetch.
    URL = "https://www.taobao.com/"
    # Maximum number of attempts.
    MAX_RETRY = 3
    # XPath that is checked against the fetched page.
    XPATH = "//div[@class='cat-title']"
    def request(url, timeout=10):
        """Fetch ``url`` with requests and return the decoded response body.

        Args:
            url: Address to GET; the module-level HEADERS are sent with it.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout a stalled server could block the call indefinitely.

        Returns:
            The response text when the status code equals ``requests.codes.ok``,
            otherwise None.

        Raises:
            requests.exceptions.RequestException: on connection failure or
                timeout.
        """
        # The context manager closes the session (and its pooled connections)
        # even when the request raises.
        with requests.Session() as session:
            resp = session.get(url, headers=HEADERS, timeout=timeout)
            if resp.status_code != requests.codes.ok:
                return None
            # Decode with the encoding detected from the content rather than
            # the one guessed from the response headers.
            resp.encoding = resp.apparent_encoding
            return resp.text
    
    
    def getdriver(url):
        """Load ``url`` in headless Chrome and return the rendered page source.

        Args:
            url: Address to open in the browser.

        Returns:
            ``driver.page_source`` captured three seconds after navigation.

        Raises:
            Whatever selenium raises when Chrome cannot be started or the page
            cannot be loaded; the browser is shut down in either case.
        """
        co = Options()
        # Content-setting value 2 for 'images' — presumably "block", to avoid
        # downloading images that are not needed for parsing; TODO confirm.
        prefs = {
            'profile.default_content_setting_values': {
                'images': 2
            }
        }
        co.add_experimental_option('prefs', prefs)
        co.add_argument('lang=zh_CN.UTF-8')
        co.add_argument('--headless')
        # NOTE(review): '--nogpu' is kept as in the original; the usual Chrome
        # switch is '--disable-gpu' — confirm which one was intended.
        co.add_argument('--nogpu')
        # 'options' replaces the deprecated 'chrome_options' keyword.
        driver = webdriver.Chrome(options=co)
        try:
            driver.get(url)
            # Fixed pause so script-generated content has time to render.
            time.sleep(3)
            return driver.page_source
        finally:
            print("关闭chrome浏览器")
            # quit() ends the whole session (browser and driver process);
            # close() would only close the current window.
            driver.quit()
    def newdecorator(url, retry, check_xpath):
        """Decorator factory that fetches ``url`` and passes its HTML to the function.

        The page is first requested with ``request``. If the HTML contains nodes
        matching ``check_xpath`` it is used as is; otherwise the page is rendered
        with ``getdriver`` (tried twice) and that source is used instead.

        Args:
            url: Page to fetch.
            retry: Maximum number of ``request`` attempts.
            check_xpath: XPath that must match at least one node for the plain
                HTTP response to be accepted.

        Returns:
            A decorator. The wrapped function is called as
            ``func(source, *args, **kwargs)`` and its result is returned; None is
            returned when fetching or the wrapped function fails (the error is
            printed, not raised).
        """
        def decorator(func):
            @wraps(func)
            def log(*args, **kwargs):
                try:
                    source = None
                    # Use a local loop bounded by the ``retry`` argument instead
                    # of a module-level global counter with a hard-coded limit.
                    for _ in range(retry):
                        try:
                            source = request(url)
                        except requests.exceptions.RequestException as e:
                            # A network error counts as one failed attempt.
                            print(e.args)
                            source = None
                        if source:
                            break
                    if not source:
                        # Every attempt failed: do not call func with None.
                        print("请求失败,已达到最大尝试次数")
                        return None
                    print("开启requests模块")
                    print("=" * 50)
                    root = html.fromstring(source)
                    if not root.xpath(check_xpath):
                        print("该网站为ajax生成的网页,开始启用chrome模式")
                        try:
                            source = getdriver(url)
                        except Exception:
                            # One more browser attempt; a second failure is
                            # reported by the outer handler.
                            print("获取内容失败,再次启动谷歌浏览器")
                            source = getdriver(url)
                    return func(source, *args, **kwargs)
                except Exception as e:
                    # Best-effort script: report the error and return None.
                    print(e.args)
                    return None
            return log
        return decorator
    
    @newdecorator(url=URL, retry=MAX_RETRY, check_xpath=XPATH)
    def getitem(source):
        """Parse the page HTML and print one text fragment per matched node.

        Args:
            source: HTML text of the page; supplied by the decorator, so callers
                invoke ``getitem()`` without arguments.
        """
        root = html.fromstring(source)
        nodes = root.xpath(XPATH)
        print("=" * 50)
        print("开始解析网页")
        print("=" * 50)
        print("获取商品分类")
        for item in nodes:
            name = item.xpath(".//text()")
            # The second text fragment is printed — presumably the first one is
            # layout whitespace; verify against the page markup. Nodes with
            # fewer than two fragments are skipped instead of raising IndexError
            # and aborting the remaining categories.
            if len(name) > 1:
                print(name[1])
    if __name__ == "__main__":
        # Script entry point; getitem is called without arguments.
        getitem()
    

      

  • 相关阅读:
    解决IllegalStateException: Can not perform this action after onSaveInstanceState
    Android自定义控件实战——仿淘宝商品浏览界面
    实现类似于QQ空间相册的点击图片放大,再点后缩小回原来位置
    新浪通过API分享 实践
    Android中集成QQ登陆和QQ好友分享及QQ空间分享
    Android 微信分享,分享到朋友圈与分享到好友,以及微信登陆
    interface Impl
    Spring 4 官方文档学习(十一)Web MVC 框架之编码式Servlet容器初始化
    Spring 4 官方文档学习(十一)Web MVC 框架之HTTP caching support
    Spring 4 官方文档学习(十一)Web MVC 框架之约定优于配置
  • 原文地址:https://www.cnblogs.com/c-x-a/p/9106064.html
Copyright © 2011-2022 走看看