zoukankan      html  css  js  c++  java
  • 爬取拉勾网信息

    <p>---恢复内容结束---</p>

#!/usr/bin/env python3

# -*- coding:utf-8 -*-

    import requests
    import json
    from random import randint, choice
    import pymongo
    from time import sleep
    from multiprocessing import Process, JoinableQueue as Queue

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    采集公司信息

    def company(q):
    # br = get_chrome()
    br = login_lagou(20)
    br.set_window_rect(602, 0, 600, 800)
    db = get_mongodb()
    # 采集到的数据放到company表中
    company = db.company1

    while True:
        if q.empty():
            break
        try:
            company_id = q.get()
            url = 'https://www.lagou.com/gongsi/' + str(company_id) + '.html'
            br.get(url)
            company_info = {}
            company_info['name'] = br.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/h1/a').text
            company_info['job_num'] = br.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/ul/li[1]/strong').text
            company_info['efficiency'] = br.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/ul/li[2]/strong').text
            company_info['time_consuming'] = br.find_element_by_xpath(
                '/html/body/div[3]/div/div/div[2]/ul/li[3]/strong').text
            company_info['last_login'] = br.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/ul/li[5]/strong').text
            company_info['introduction'] = br.find_element_by_xpath(
                '/html/body/div[6]/div[1]/div/div[2]/div[2]/div[2]/span[1]').text
            company_info['inancing'] = br.find_element_by_xpath('/html/body/div[6]/div[2]/div[1]/div[2]/ul/li[2]/span').text
            company_info['scale'] = br.find_element_by_xpath('/html/body/div[6]/div[2]/div[1]/div[2]/ul/li[3]/span').text
            company.insert(company_info)
    
            # 采集公司的评价
            # company_comment(company_id, db, br)
            print('%d公司信息已采集入库' % company_id)
            q.task_done()
        except Exception as e:
            print('遇到异常', e)
            q.put(company_id)
            sleep(10)
        sleep(1)
    br.close()
    q.join()
    

    采集公司的评论信息

    def company_comment(id, db, br):

    """

    :param id: 公司id

    :param db: 数据库

    :param br: 浏览器驱动

    :return:

    """

    url = 'https://www.lagou.com/gongsi/interviewExperiences.html?companyId='+str(id)

    br.get(url)

    采集工作的具体信息

    def work(q):
    # br = get_chrome()
    br = login_lagou(20)
    br.set_window_rect(101, 0, 600, 600)
    db = get_mongodb()
    job_table = db.job1
    while True:
    if q.empty():
    break
    try:
    id = q.get()
    url = 'https://www.lagou.com/jobs/' + str(id) + '.html'
    br.get(url)
    job = {'id': id}
    content = br.find_element_by_xpath('/html/body/div[5]/div[1]/dl[1]/dd[2]').text
    job['content'] = content

            job_table.insert(job)
            print('%d招聘启事具体内容已入库' % id)
            q.task_done()
        except Exception as e:
            print('遇到异常', e)
            q.put(id)
            sleep(10)
        sleep(1)
    br.close()
    q.join()
    

    获得一个无界面浏览器驱动

    def get_chrome():
    options = Options()
    # options.add_argument('--headless')
    # options.add_argument('--disable-gpu')
    br = webdriver.Chrome(chrome_options=options)
    return br

    获取一个mongodb连接对象

    def get_mongodb():
    # 连接mongodb
    cli = pymongo.MongoClient(host='192.168.12.244', port=27017)
    db = cli.xxx
    db.authenticate('ss', '123456')
    return db

    获取cookie信息

    def get_cookie(br):
    # br = get_chrome()
    br.get('https://www.lagou.com/')
    tmp_cookies = br.get_cookies()
    # 动态获取到cookies
    return {i['name']: i['value'] for i in tmp_cookies}

    def login_lagou(sec):
    br = get_chrome()
    br.get('https://www.lagou.com/frontLogin.do')
    mobi = br.find_element_by_xpath('/html/body/section/div[2]/div[1]/div[2]/form/div[1]/input')
    pwd = br.find_element_by_xpath('/html/body/section/div[2]/div[1]/div[2]/form/div[2]/input')
    mobi.send_keys('15324818121')
    pwd.send_keys('123456')
    sleep(sec)
    # 打开连个选项卡备用
    # br.execute_script('window.open("https://www.lagou.com/")')
    # br.execute_script('window.open("https://www.lagou.com/")')
    # br.find_element_by_xpath('/html/body/section/div[2]/div[1]/div[2]/form/div[5]/input').click()

    return br
    

    if name == 'main':
    # 存放公司信息的队列
    companies = Queue()
    # 存放岗位信息的队列
    jobs = Queue()

    br = login_lagou(20)
    br.set_window_rect(0, 0, 200, 600)
    
    # 启动一个进程采集公司的信息
    c = Process(target=company, args=(companies,))
    c.start()
    sleep(20)
    # 启动一个进程采集岗位信息
    jo = Process(target=work, args=(jobs,))
    jo.start()
    
    # 准备请求头信息
    header = {
        'Referer': 'https://www.lagou.com/jobs/list_Python?px=default&city=%E5%85%A8%E5%9B%BD',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    }
    
    UAs = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:65.0) Gecko/20100101 Firefox/65.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/601.7.7 (KHTML, like Gecko) Version/9.1.2 Safari/601.7.7'
    ]
    
    cookies = get_cookie(br)
    
    
    
    # 获取一个mongodb连接对象
    db = get_mongodb()
    
    # 采集职位信息
    i = 1
    header['User-Agent'] = choice(UAs)
    
    while True:
        if i > 30:
            break
        data = {'first': 'false', 'pn': i, 'kd': 'Python'}
        re = requests.post('https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false',
                           data=data, headers=header, cookies=cookies)
        content = json.loads(re.text)
    
        try:
            if content['success']:
                # 将职位信息放到mongodb的work表中
                tab = db.work1
                resultData = content['content']['positionResult']['result']
                tab.insert(resultData)
    
                for j in resultData:
                    # 将该公司的id放到 公司队j列中
                    companies.put(j['companyId'])
                    # 把工作id放到队列中
                    jobs.put(j['positionId'])
        except Exception as e:
            # 采集遇到异常的话就抛出异常病退出循环
            print('遇到异常', e, content)
            # 获取最新的cookie
            cookies = get_cookie(br)
            header['User-Agent'] = choice(UAs)
            print('更换身份,正在重试')
            # 因为后面会进行加一操作,而我们这次并没有成功猜到,那么需要重新采集
            i -= 1
    
        sleep_time = randint(1, 3)
        print('列表第%d页已完成, 打算睡%d秒' % (i, sleep_time), )
        sleep(sleep_time)
        i += 1
    # companies.put(None)
    # jobs.put(None)
    br.close()
    
    c.join()
    jo.join()
    
  • 相关阅读:
    Documents
    gitlab 安装和配置
    git相关知识
    马俊龙ansible教程分享
    源码安装python 报错,openssl: error while loading shared libraries: libssl.so.1.1
    jumpserver 常见错误解决
    nginx 定义:响应头和请求头
    gcc入门(下)
    gcc入门(上)
    awk命令
  • 原文地址:https://www.cnblogs.com/imshun/p/10513049.html
Copyright © 2011-2022 走看看