zoukankan      html  css  js  c++  java
  • 爬取51job招聘信息(一)

    目标:将网页上的内容爬取下来,实现翻页,并存储为 CSV 文件。

    import csv
    import os
    import time
    from concurrent.futures.thread import ThreadPoolExecutor
    from json import loads
    from multiprocessing import Queue
    from re import findall
    from threading import Thread

    import pandas as pd
    import pymysql
    import requests
    # 获取每页的内容,定义一个函数
    def get_one_page(page, city_code='000000'):
        """Fetch one page of 51job search results for the keyword "数据分析".

        Args:
            page: Page number substituted into the search URL (the calling
                script starts at 1).
            city_code: Area code placed in the first field of the URL path.
                The default '000000' is the value the URL used to hard-code,
                so existing callers get the same request as before.
                NOTE(review): assumes the first path field is the area filter
                -- confirm against the site.

        Returns:
            The value stored under 'engine_search_result' in the JSON object
            embedded in the page, or None when the response status is not 200
            or the embedded JSON cannot be located.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 90.0.4430.212 Safari / 537.36'
        }
        url = f'https://search.51job.com/list/{city_code},000000,0000,00,9,99,数据分析,2,{page}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
        # A timeout keeps a stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code != 200:
            print('请求失败!')
            return None

        # The result set is embedded in the HTML as
        # `window.__SEARCH_RESULT__ = {...}</script>`; `\s*` tolerates optional
        # whitespace around the `=` (the backslashes are required -- a bare
        # `s*` only matches literal letter "s").
        matches = findall(
            r'window\.__SEARCH_RESULT__\s*=\s*({.+?})</script>',
            response.text,
        )
        if not matches:
            # Report the failure the same way as a bad status code instead of
            # raising IndexError on `[0]`.
            print('未找到搜索结果数据!')
            return None
        return loads(matches[0])['engine_search_result']
    # 需要多少页!
    # Collect up to 10 consecutive pages, stopping early at the first page
    # that yields nothing (None or an empty result).
    ts = []
    start_page = 1
    while start_page <= 10:
        page_items = get_one_page(start_page)
        if not page_items:
            print('没有更多数据')
            break
        ts.append(page_items)
        start_page += 1
    #data_1 = get_one_page(1) #尝试保存一页的内容
    # Flatten the per-page result lists in `ts` into one list of job records.
    data_1 = []
    for page_jobs in ts:
        # Take every record the page actually contains rather than indexing a
        # fixed 0..49 range: a short page no longer raises IndexError and a
        # longer one is not truncated.
        data_1.extend(page_jobs)
    # 我需要存储的信息
    
    # Column headers for the CSV, in the same order as the fields of each row.
    # NOTE: 'workarea_tex' (sic) is kept unchanged so the header matches files
    # already produced by this script.
    name = ['job_name', 'providesalary_text', 'company_name', 'companytype_text', 'workarea_tex', 'attribute_text', 'jobwelf']

    # One row per job record, keeping only the fields of interest.
    jobs = [
        [
            job.get('job_name'),
            job.get('providesalary_text'),
            job.get('company_name'),
            job.get('companytype_text'),
            job.get('workarea_text'),
            # attribute_text is joined as a list of strings; a placeholder
            # list is used when the key is absent.
            '-'.join(job.get('attribute_text', ['-', '-', '-', '-', '-'])),
            job.get('jobwelf'),
        ]
        for job in data_1
    ]

    # `pd` is pandas, imported at the top of the file.
    test = pd.DataFrame(columns=name, data=jobs)
    test.to_csv("testcsv.csv")  # save as CSV
    test.info()
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 500 entries, 0 to 499
    Data columns (total 7 columns):
     #   Column              Non-Null Count  Dtype 
    ---  ------              --------------  ----- 
     0   job_name            500 non-null    object
     1   providesalary_text  500 non-null    object
     2   company_name        500 non-null    object
     3   companytype_text    500 non-null    object
     4   workarea_tex        500 non-null    object
     5   attribute_text      500 non-null    object
     6   jobwelf             500 non-null    object
    dtypes: object(7)
    memory usage: 27.5+ KB


    重要参考:https://gitee.com/wenhaha8/job51_analysis
  • 相关阅读:
    利用django form 模块处理post请求
    linux 下安装JDK
    java常用日期操作方法
    Git常见命令整理
    使用Java实现八种基本排序
    java验证类ValidUtils
    封装一个既能遍历数组又能遍历对象的forEach函数
    结合canvas做雨滴特效
    前端常用js脚本
    canvas 视频音乐播放器
  • 原文地址:https://www.cnblogs.com/Cookie-Jing/p/15149865.html
Copyright © 2011-2022 走看看