zoukankan      html  css  js  c++  java
  • 爬取拉勾网上所有的 Python 职位

    # 2. Scrape all Python job postings from Lagou (拉勾网, lagou.com).
    
    from urllib import request,parse
    import json,random
    
    def user_agent(page):
        """Request one Lagou result page using a randomly chosen User-Agent.

        A browser identification string is drawn at random from a fixed pool
        so that successive calls can present themselves as different
        browsers, and is then handed to ``lagou`` together with ``page``.

        Args:
            page: Page number forwarded unchanged to ``lagou``.
        """
        # Pool of browser identification strings; some entries repeat, so
        # those browsers are proportionally more likely to be picked.
        browsers = (
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0",
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
        )
        # Pick one at random and run the actual request with it.
        lagou(page, random.choice(browsers))
    
    def lagou(page, user_agent):
        """Fetch one page of Python job listings from Lagou, print and save them.

        Sends a form-encoded POST to Lagou's ``positionAjax.json`` endpoint
        (the URL fixes the city to Beijing) with the keyword ``python``, then
        parses the JSON reply. For every job entry it prints the number of
        fields in the entry followed by the position name and the company's
        full name. Finally the ``str()`` form of the whole result list is
        appended to ``lagou.html`` in the current working directory.

        Args:
            page: Page number sent as the ``pn`` form field. Page ``1`` is
                flagged as the first request (``first=true``); any other
                value sends ``first=false``.
            user_agent: Value used for the ``User-Agent`` request header.

        Raises:
            urllib.error.URLError: If the request cannot be completed
                (``HTTPError`` for HTTP error statuses).
            ValueError: If the response body is not valid JSON.
            KeyError: If the JSON lacks ``content`` / ``positionResult`` /
                ``result``, or an entry lacks ``companyFullName`` /
                ``positionName``.
        """
        # Endpoint that serves the job list as JSON.
        base_url = "https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false&isSchoolJob=0"

        # The form differs between the first page and later pages.
        form = {
            'first': 'true' if page == 1 else 'false',
            'pn': page,
            'kd': 'python',
        }
        # Encode once; the same bytes are sent as the body and measured for
        # the Content-Length header below.
        payload = parse.urlencode(form).encode('utf-8')

        # These headers mimic the browser's own AJAX request. The original
        # author notes that Content-Length and User-Agent matter most for the
        # request to be accepted.
        # NOTE(review): the Cookie is a captured, hard-coded login session; it
        # has presumably expired and should come from configuration - confirm.
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            # Accept-Encoding is left out on purpose: the body is decoded
            # below as plain UTF-8 text, not gzip/deflate.
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'Content-Length': str(len(payload)),
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Cookie': 'user_trace_token=20180310205250-ccfd21f6-5b57-4e04-b90c-5e547e18d391; LGUID=20180310205255-f3afa6e4-2461-11e8-a8b5-525400f775ce; hideSliderBanner20180305WithTopBannerC=1; X_HTTP_TOKEN=673c8ae0b29d830c65e9812a6aeeb211; ab_test_random_num=0; JSESSIONID=ABAAABAAADEAAFI0BD8484557BF60A48BF2BDD6AA4C5D33; _putrc=318C0D90043747B6123F89F2B170EADC; login=true; unick=%E5%BC%A0%E6%B3%A2; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; gate_login_token=d46c3e3008cb0364e7b47d9d261956a39273c72d679a1b0eb644e03620c100fa; TG-TRACK-CODE=index_navigation; _gid=GA1.2.1883607132.1520686376; _ga=GA1.2.2068283511.1520686375; LGSID=20180310215122-1e408aca-246a-11e8-a8ed-525400f775ce; LGRID=20180310233852-22b0d3ee-2479-11e8-a921-525400f775ce; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1520686378,1520689884; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1520696337; SEARCH_ID=458b8d44186948ceb472c3d662f08528; index_location_city=%E5%8C%97%E4%BA%AC',
            'Host': 'www.lagou.com',
            'Origin': 'https://www.lagou.com',
            'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%8C%97%E4%BA%AC&cl=false&fromSearch=true&labelWords=&suginput=',
            "User-Agent": user_agent,
            'X-Anit-Forge-Code': '0',
            'X-Anit-Forge-Token': 'None',
            'X-Requested-With': 'XMLHttpRequest'
        }
        req = request.Request(url=base_url, data=payload, headers=headers)
        # The context manager closes the connection even if reading fails.
        with request.urlopen(req) as response:
            html = response.read().decode('utf-8')

        # Parse the JSON body and drill down to the list of job entries.
        json_data = json.loads(html)
        result_list = json_data['content']['positionResult']['result']

        for result in result_list:
            print(len(result))
            companyFullName = result['companyFullName']
            positionName = result['positionName']

            print(positionName, companyFullName)

        # Append mode: repeated calls accumulate pages in the same file.
        with open('lagou.html', 'a', encoding='utf-8') as f:
            f.write(str(result_list))
        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    
    if __name__ == "__main__":
        # Only the first result page is fetched here. To crawl more pages,
        # call user_agent(page) for each page in range(1, 31) instead.
        user_agent(1)
  • 相关阅读:
    python读取数据写入excel
    English Study!
    ODOO里视图开发案例---定义一个像tree、form一样的视图
    更改gradle中央仓库,加快访问速度
    hadoop解决集群启动时某个slave的datanode挂掉问题
    ssh免密登录
    大数据集群脚本xcall和xsync
    虚拟机启动后黑屏并无法关闭
    快照与克隆的区别(来自转载)
    VMware12 打不开Centos6.8系统
  • 原文地址:https://www.cnblogs.com/zhangboblogs/p/8545353.html
Copyright © 2011-2022 走看看