zoukankan      html  css  js  c++  java
  • 爬虫练习:使用xpath 下载站长之家简历封面模板

    # -*- coding: utf-8 -*-
    # @Time : 2020/9/21 11:13
    # @Author : aqiong
    # @Site : 
    # @File : 站长之家简历爬取.py
    # @Software: PyCharm
    import requests
    from lxml import etree
    import random
    import os
    
    ##
    #获得用户代理
    #
    def getheaders():
        """Return one randomly chosen browser User-Agent string.

        Rotating the UA per run makes the scraper look less like a bot
        to the target site.
        """
        # Pool of real desktop browser UA strings (Chrome / Firefox on
        # Windows, ChromeOS and Linux).
        ua_pool = (
            'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36',
            'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36',
            'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6',
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36',
            'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
        )
        return random.choice(ua_pool)
    
    if __name__ == '__main__':
        # Ensure the download directory exists (race-free, no prior
        # exists() check needed).
        os.makedirs('./jl', exist_ok=True)

        url = 'http://sc.chinaz.com/jianli/fengmian.html'
        headers = {
            'user-agent': getheaders()
        }
        # Fetch the listing page; force UTF-8 so the scraped text is not
        # mojibake (the server's declared charset is unreliable).
        page_resp = requests.get(url=url, headers=headers)
        page_resp.encoding = 'utf-8'

        page_html = etree.HTML(page_resp.text)

        # Absolute URL of each resume-cover detail page on the listing.
        a_href_list = page_html.xpath('//div[@class="main_list jl_main"]/div[@class="box col3 ws_block"]/a/@href')

        for a_url in a_href_list:
            # allow_redirects=False works around
            # requests.exceptions.TooManyRedirects ("Exceeded 30 redirects")
            # raised by this site for some detail pages.
            jl_page_text = requests.get(url=a_url, headers=headers, allow_redirects=False).text
            jl_html = etree.HTML(jl_page_text)

            # First download mirror link on the detail page.
            rar_list = jl_html.xpath('//div[@class="clearfix mt20 downlist"]//ul[@class="clearfix"]/li[1]/a/@href')
            if not rar_list:
                # Redirected/changed page yields no match; the original
                # rar_list[0] would raise IndexError here. Skip instead.
                print('no download link found, skipping: ' + a_url)
                continue
            rar_url = rar_list[0]

            # Download the archive and save it under ./jl using the
            # basename of the download URL.
            jl_rar = requests.get(url=rar_url, headers=headers).content
            fileName = './jl/' + rar_url.split('/')[-1]

            with open(fileName, 'wb') as fp:
                fp.write(jl_rar)
                print(fileName + '保存成功')
    
    

  • 相关阅读:
    部署phpmyadmin登录不进去
    无法获取快照信息:锁定文件失败
    nginx: [emerg] BIO_new_file("/etc/nginx/ssl_key/server.crt") failed (SSL: error:02001002:syste
    nginx重启失败
    An error occurred (500 Error)
    Failed to set session cookie. Maybe you are using HTTP instead of HTTPS to access phpMyAdmin.
    clnt_create: RPC: Program not registered
    [error] 2230#2230: *84 client intended to send too large body: 1711341 bytes
    lnmp部署知乎出现403
    easyui下拉框过滤优化
  • 原文地址:https://www.cnblogs.com/aqiong/p/13715334.html
Copyright © 2011-2022 走看看