1. Scraper: collecting job listings from Zhilian Zhaopin (智联招聘)
# -*- coding: utf-8 -*-
import re
import csv
import requests
from tqdm import tqdm
from urllib.parse import urlencode
from requests.exceptions import RequestException

def get_one_page(city, keyword, region, page):
    '''
    Fetch the HTML of one search-result page and return it.
    '''
    paras = {
        'jl': city,        # city to search in
        'kw': keyword,     # search keyword
        'isadv': 0,        # whether to open the advanced-search options
        'isfilter': 1,     # whether to filter the results
        'p': page,         # page number
        're': region       # short for "region": district code, 2005 = Haidian
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Host': 'sou.zhaopin.com',
        'Referer': 'https://www.zhaopin.com/',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9'
    }

    url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?' + urlencode(paras)
    try:
        # Fetch the page; a timeout keeps a dead connection from hanging the crawl
        response = requests.get(url, headers=headers, timeout=10)
        # Use the status code to decide whether the request succeeded
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_one_page(html):
    '''
    Parse the HTML, extract the useful fields, and yield them one job at a time.
    '''
    # Parse with a regular expression; re.S lets .*? span newlines
    pattern = re.compile('<a style=.*? target="_blank">(.*?)</a>.*?'                       # job title
                         '<td class="gsmc"><a href="(.*?)" target="_blank">(.*?)</a>.*?'   # company URL and company name
                         '<td class="zwyx">(.*?)</td>', re.S)                              # monthly salary

    # Find every listing on the page that matches the pattern
    items = re.findall(pattern, html)

    for item in items:
        job_name = item[0]
        # The search keyword inside the title is wrapped in <b> tags; strip them
        job_name = job_name.replace('<b>', '')
        job_name = job_name.replace('</b>', '')
        yield {
            'job': job_name,
            'website': item[1],
            'company': item[2],
            'salary': item[3]
        }

def write_csv_file(path, headers, rows):
    '''
    Write the header row and the data rows in one call.
    (Not used by main, which writes them separately so the header appears only once.)
    '''
    # The encoding prevents errors when writing Chinese text
    # newline='' prevents a blank line after every row on Windows
    with open(path, 'a', encoding='gb18030', newline='') as f:
        f_csv = csv.DictWriter(f, headers)
        f_csv.writeheader()
        f_csv.writerows(rows)

def write_csv_headers(path, headers):
    '''
    Write only the header row.
    '''
    with open(path, 'a', encoding='gb18030', newline='') as f:
        f_csv = csv.DictWriter(f, headers)
        f_csv.writeheader()

def write_csv_rows(path, headers, rows):
    '''
    Append only data rows.
    '''
    with open(path, 'a', encoding='gb18030', newline='') as f:
        f_csv = csv.DictWriter(f, headers)
        f_csv.writerows(rows)

def main(city, keyword, region, pages):
    '''
    Entry point: write the header once, then scrape and append page by page.
    '''
    filename = 'zl_' + city + '_' + keyword + '.csv'
    headers = ['job', 'website', 'company', 'salary']
    write_csv_headers(filename, headers)
    # The site's page numbers start at 1, so iterate 1..pages
    for i in tqdm(range(1, pages + 1)):
        # Collect every job on this page, then append the batch to the CSV
        jobs = []
        html = get_one_page(city, keyword, region, i)
        if html is None:
            continue  # skip pages that failed to download
        for item in parse_one_page(html):
            jobs.append(item)
        write_csv_rows(filename, headers, jobs)

if __name__ == '__main__':
    main('北京', 'python', 2005, 10)
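For reference, this is the kind of URL that get_one_page assembles: urlencode percent-encodes the Chinese city name, so the query string stays ASCII-safe. A quick sketch using the same parameters as the __main__ call above:

from urllib.parse import urlencode

paras = {'jl': '北京', 'kw': 'python', 'isadv': 0, 'isfilter': 1, 'p': 1, 're': 2005}
print('https://sou.zhaopin.com/jobs/searchresult.ashx?' + urlencode(paras))
# https://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=python&isadv=0&isfilter=1&p=1&re=2005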
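Because the markup of the results page changes over time, it helps to exercise parse_one_page against a hand-written fragment before running a full crawl. The snippet below is hypothetical test data that merely mimics the structure the regex expects (the <a style=...> title link, the gsmc cell, the zwyx cell); the real page will look different. It assumes parse_one_page from the listing above is in scope.

# Hand-written fragment mimicking the shape of one listing; not real site markup
sample_html = '''
<a style="x" target="_blank">senior <b>python</b> developer</a>
<td class="gsmc"><a href="https://company.example.com" target="_blank">Example Tech</a></td>
<td class="zwyx">10000-15000</td>
'''

for job in parse_one_page(sample_html):
    print(job)
# Expected output (the <b> tags around the keyword have been stripped):
# {'job': 'senior python developer', 'website': 'https://company.example.com', 'company': 'Example Tech', 'salary': '10000-15000'}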
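Once a crawl finishes, the output can be sanity-checked by reading it back with the same gb18030 encoding it was written with. A minimal sketch, assuming the file name that main('北京', 'python', 2005, 10) produces:

import csv

# Read the file back with the same encoding used when writing
with open('zl_北京_python.csv', encoding='gb18030', newline='') as f:
    rows = list(csv.DictReader(f))

print(len(rows), 'jobs collected')
# Peek at the first record to confirm the columns line up
if rows:
    print(rows[0]['job'], '|', rows[0]['salary'])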