数据挖掘的学习过程中一定绕不开的一个阶段性实践项目——前程无忧51job岗位招聘信息爬虫程序!
搞定这个之后可以尝试带有一定反爬机制的爬虫实践,比如需要登陆服务器才能进一步响应的网站,比如隔一段时间就弹出验证码的网站,有兴趣的可以去尝试一下哈!
# -*- coding:utf-8 -*-
import urllib
import re, codecs
import time, random
import requests
from lxml import html
from urllib import parse
# Search keyword for the 51job query.
key = 'python'
# 51job expects the keyword percent-encoded twice in the URL path,
# hence the nested quote() calls.
key = parse.quote(parse.quote(key))
# Request headers mimicking a desktop Chrome browser so the search
# host serves the normal HTML listing.
headers = {'Host': 'search.51job.com',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
def get_links(page):
    """Fetch one page of 51job search results and extract job links.

    page: 1-based page number of the search listing.
    Returns a list of (detail_url, salary_text) tuples scraped from the page.
    """
    url = ('http://search.51job.com/list/000000,000000,0000,00,9,99,'
           + key + ',2,' + str(page) + '.html')
    # BUGFIX: headers must be passed as a keyword argument; the original
    # positional call sent them as the `params` query string instead,
    # so no custom headers were ever transmitted.
    r = requests.get(url, headers=headers, timeout=10)
    # 51job serves GBK-encoded pages; requests would otherwise guess wrong.
    r.encoding = 'gbk'
    # (removed: a requests.session() that was created and immediately
    # discarded — setting keep_alive on it had no effect on `r`.)
    reg = re.compile(r'class="t1 ">.*? <a target="_blank" title=".*?" href="(.*?)".*? <span class="t2">.*?<span class="t4">(.*?)</span>', re.S)
    return re.findall(reg, r.text)
# 多页处理,下载到文件
def get_content(link, salary):
    """Download one job-detail page and append its parsed fields to 51job4.xls.

    link:   URL of the job posting detail page.
    salary: salary text already scraped from the listing page.
    Returns True on a successful write, None if the write failed, or the
    empty list when the page could not be parsed at all.
    """
    # BUGFIX: headers passed as keyword (was positional -> query params).
    r1 = requests.get(link, headers=headers, timeout=10)
    r1.encoding = 'gbk'  # detail pages are GBK-encoded
    t1 = html.fromstring(r1.text)
    l = []
    try:
        # All of these indexed xpath() calls raise IndexError when the
        # page layout differs (e.g. a removed posting).
        job = t1.xpath('//div[@class="tHeader tHjob"]//h1/text()')[0]
        company = t1.xpath('//p[@class="cname"]/a/text()')[0]
        label = t1.xpath('//div[@class="t1"]/span/text()')
        education = t1.xpath('//div[@class="cn"]/p[2]/text()')[2]
        area = t1.xpath('//div[@class="cn"]/p[2]/text()')[0]
        companytype = 'Null'  # not available on the detail page
        workyear = t1.xpath('//div[@class="cn"]/p[2]/text()')[1]
        describe = re.findall(re.compile(r'<div class="bmsg job_msg inbox">(.*?)任职要求', re.S), r1.text)
        require = re.findall(re.compile(r'<div class="bmsg job_msg inbox">.*?任职要求(.*?)<div class="mt10">', re.S),
                             r1.text)
        try:
            # BUGFIX: the original embedded a raw newline inside the string
            # literal (a syntax error); use '\n'. Also use `with` so the
            # file is closed even when write() raises.
            # NOTE(review): workyear appears twice in the row, matching the
            # original column layout — confirm before changing.
            item = (str(company) + ' ' + str(job) + ' ' + str(education) + ' '
                    + str(label) + ' ' + str(salary) + ' ' + str(companytype)
                    + ' ' + str(workyear) + ' ' + str(area) + ' '
                    + str(workyear) + str(describe) + ' ' + str(require) + '\n')
            with codecs.open('51job4.xls', 'a+', 'utf-8') as file:
                file.write(item)
            return True
        except Exception as e:
            print(e)
            return None
    except Exception:
        # Parsing failed (layout change / removed posting); fall through
        # and report the empty result. Was a bare `except:` before.
        print('None')
    print(l)
    return l
# Crawl up to 749 listing pages, sleeping randomly between requests to
# avoid hammering the server.
for i in range(1, 750):
    print('正在爬取第{}页信息'.format(i))
    try:
        # Random 1-6 s pause before each listing-page fetch.
        time.sleep(random.random() + random.randint(1, 5))
        links = get_links(i)
        for link in links:
            url = link[0]
            salary = link[1]
            get_content(url, salary)
            # Short random pause between detail-page fetches.
            time.sleep(random.random() + random.randint(0, 1))
    except Exception:
        # BUGFIX: the diagnostic print was placed after `continue` and was
        # therefore unreachable; report the problem before moving on.
        # (Narrowed the bare `except:` so Ctrl-C still stops the crawl.)
        print('有点问题')
        continue