zoukankan      html  css  js  c++  java
  • 小爬爬2:中文乱码等问题处理

    1.解决中文乱码的问题

    (1)先判断页面数据是否为动态加载;

    (2)获取源码数据

    彼岸图网:

    第一页地址:http://pic.netbian.com/4kmeinv/

    第二页:http://pic.netbian.com/4kmeinv/index_2.html

    第三页:http://pic.netbian.com/4kmeinv/index_3.html

    #第一步:下面这段代码有bug——返回的中文会出现乱码

    import requests
    from lxml import etree
    # Step 1 of the tutorial: walk the gallery list pages and print each
    # thumbnail's alt text and src.
    # NOTE: this version reads response.text with whatever encoding requests
    # picks on its own, which the tutorial reports as yielding garbled Chinese
    # names. That is the point of this step, so it is left untouched here.
    headers={
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    start_page=int(input('start page num:'))
    end_page=int(input('end page num:'))
    # Generic URL template for page 2 onwards; it is only formatted, never
    # reassigned, inside the loop.
    url='http://pic.netbian.com/4kmeinv/index_%d.html'
    # The end page entered by the user is inclusive, hence the +1.
    for page in range(start_page,end_page+1):
        if page==1:
            # The first page has no index_N suffix.
            new_url='http://pic.netbian.com/4kmeinv/'
        else:
            new_url=url%page
        page_text=requests.get(url=new_url,headers=headers).text
        # Parse out the name (alt) and the src attribute of every thumbnail.
        tree=etree.HTML(page_text)
        li_list=tree.xpath('//div[@class="slist"]/ul/li')
        for li in li_list:
            img_name=li.xpath('./a/img/@alt')[0]
            img_src=li.xpath('./a/img/@src')[0]
            print(img_name,img_src)

    #第二步:修改,下面的结果会有变化,但是结果还是存在问题

    import requests
    from lxml import etree
    # Step 2 of the tutorial: same crawl as before, but the response is forced
    # to decode as UTF-8 before its text is read.
    # NOTE: per the tutorial text this changes the output but the names are
    # still wrong; the override is kept here because it is what this step shows.
    headers={
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    start_page=int(input('start page num:'))
    end_page=int(input('end page num:'))
    # Generic URL template for page 2 onwards; it is only formatted, never
    # reassigned, inside the loop.
    url='http://pic.netbian.com/4kmeinv/index_%d.html'
    # The end page entered by the user is inclusive, hence the +1.
    for page in range(start_page,end_page+1):
        if page==1:
            # The first page has no index_N suffix.
            new_url='http://pic.netbian.com/4kmeinv/'
        else:
            new_url=url%page
        response=requests.get(url=new_url,headers=headers)
        # Override the encoding requests would otherwise use for .text.
        response.encoding='utf-8'
        page_text=response.text
        # Parse out the name (alt) and the src attribute of every thumbnail.
        tree=etree.HTML(page_text)
        li_list=tree.xpath('//div[@class="slist"]/ul/li')
        for li in li_list:
            img_name=li.xpath('./a/img/@alt')[0]
            img_src=li.xpath('./a/img/@src')[0]
            print(img_name,img_src)

    第三步:我们进一步升级

    import requests
    from lxml import etree
    # Step 3 of the tutorial: leave the response encoding alone and repair each
    # image name individually instead.
    headers={
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    start_page=int(input('start page num:'))
    end_page=int(input('end page num:'))
    # Generic URL template for page 2 onwards; it is only formatted, never
    # reassigned, inside the loop.
    url='http://pic.netbian.com/4kmeinv/index_%d.html'
    # The end page entered by the user is inclusive, hence the +1.
    for page in range(start_page,end_page+1):
        if page==1:
            # The first page has no index_N suffix.
            new_url='http://pic.netbian.com/4kmeinv/'
        else:
            new_url=url%page
        response=requests.get(url=new_url,headers=headers)
        # No response.encoding override here: the text is read as requests
        # decodes it by default and fixed per name below.
        page_text=response.text
        # Parse out the name (alt) and the src attribute of every thumbnail.
        tree=etree.HTML(page_text)
        li_list=tree.xpath('//div[@class="slist"]/ul/li')
        for li in li_list:
            img_name=li.xpath('./a/img/@alt')[0]
            # Turn the text back into bytes via ISO-8859-1, then decode those
            # bytes as GBK.
            # NOTE(review): assumes requests decoded the page as ISO-8859-1 and
            # that the site serves GBK - confirm.
            img_name=img_name.encode('iso-8859-1').decode('gbk')
            img_src=li.xpath('./a/img/@src')[0]
            print(img_name,img_src)

    第四步,进一步升级

    import requests
    from urllib import request
    from lxml import etree
    import os
    # Step 4 of the tutorial: crawl the list pages from start_page to end_page
    # (inclusive) and save every thumbnail into ./meinvs, named after its alt
    # text.
    headers={
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    start_page=int(input('start page num:'))
    end_page=int(input('end page num:'))

    # Create the download folder; exist_ok avoids the check-then-create race of
    # an os.path.exists() test followed by os.mkdir().
    os.makedirs('./meinvs',exist_ok=True)
    # Generic URL template for page 2 onwards; it is only formatted, never
    # reassigned, inside the loop. Check it against how the site really numbers
    # its pages.
    url='http://pic.netbian.com/4kmeinv/index_%d.html'
    for page in range(start_page,end_page+1):
        if page==1:
            # The first page has no index_N suffix.
            new_url='http://pic.netbian.com/4kmeinv/'
        else:
            new_url=url%page
        response=requests.get(url=new_url,headers=headers)
        # No response.encoding override: names are repaired one by one below.
        page_text=response.text
        # Parse out the name (alt) and the src attribute of every thumbnail.
        tree=etree.HTML(page_text)
        li_list=tree.xpath('//div[@class="slist"]/ul/li')
        for li in li_list:
            img_name=li.xpath('./a/img/@alt')[0]
            # Turn the text back into bytes via ISO-8859-1, decode those bytes
            # as GBK, and append the file extension.
            # NOTE(review): assumes requests decoded the page as ISO-8859-1 and
            # that the site serves GBK - confirm.
            img_name=img_name.encode('iso-8859-1').decode('gbk')+'.jpg'
            # The src attribute is site-relative, so prefix the host.
            img_src='http://pic.netbian.com'+li.xpath('./a/img/@src')[0]
            # Build the target path with os.path.join instead of concatenation.
            img_path=os.path.join('./meinvs',img_name)
            request.urlretrieve(img_src,img_path)
            print(img_name,'下载成功!!!')

    2.XPATH的另一种用法

    爬取全国城市名称

    url = 'https://www.aqistudy.cn/historydata/'

    import requests
    from lxml import etree
    # Fetch the historical-data landing page and select every city name on it.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }
    url = 'https://www.aqistudy.cn/historydata/'
    page_text = requests.get(headers=headers, url=url).text

    tree = etree.HTML(page_text)
    # Two separate lists sit under div.bottom:
    #   hot cities -> //div[@class="bottom"]/ul/li/a/text()
    #   all cities -> //div[@class="bottom"]/ul/div[2]/li/a/text()
    # XPath's "|" is the union operator, so one query returns the nodes matched
    # by either path.
    # The result is not stored: this is a bare expression, presumably meant to
    # be displayed by an interactive/notebook session.
    tree.xpath('//div[@class="bottom"]/ul/div[2]/li/a/text() | //div[@class="bottom"]/ul/li/a/text()')

    站长素材里边的"简历模板"

    http://sc.chinaz.com/jianli/

    下载地址可以换着用,解析的时候用每个地址.

     

  • 相关阅读:
    爬虫之Selenium库
    爬虫之pyquery库
    爬虫之BeautifulSoup库
    爬虫之Requests库
    爬虫之Re库
    在Flask中使用Celery
    Celery-分布式任务队列
    MongoDB
    Redis Cluster
    如何使用mongo shell
  • 原文地址:https://www.cnblogs.com/studybrother/p/10941643.html
Copyright © 2011-2022 走看看