zoukankan      html  css  js  c++  java
  • Python_爬虫项目

    1、爬虫——智联招聘信息搜集    原文链接

      1 #-*- coding: utf-8 -*-
      2 import re
      3 import csv
      4 import requests
      5 from tqdm import tqdm
      6 from urllib.parse import urlencode
      7 from requests.exceptions import RequestException
      8 
      9 def get_one_page(city, keyword, region, page):
     10     '''
     11     获取网页html内容并返回
     12     '''
     13     paras = {
     14         'jl': city,         # 搜索城市
     15         'kw': keyword,      # 搜索关键词
     16         'isadv': 0,         # 是否打开更详细搜索选项
     17         'isfilter': 1,      # 是否对结果过滤
     18         'p': page,          # 页数
     19         're': region        # region的缩写,地区,2005代表海淀
     20     }
     21 
     22     headers = {
     23         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
     24         'Host': 'sou.zhaopin.com',
     25         'Referer': 'https://www.zhaopin.com/',
     26         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
     27         'Accept-Encoding': 'gzip, deflate, br',
     28         'Accept-Language': 'zh-CN,zh;q=0.9'
     29     }
     30 
     31     url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?' + urlencode(paras)
     32     try:
     33         # 获取网页内容,返回html数据
     34         response = requests.get(url, headers=headers)
     35         # 通过状态码判断是否获取成功
     36         if response.status_code == 200:
     37             return response.text
     38         return None
     39     except RequestException as e:
     40         return None
     41 
     42 def parse_one_page(html):
     43     '''
     44     解析HTML代码,提取有用信息并返回
     45     '''
     46     # 正则表达式进行解析
     47     pattern = re.compile('<a style=.*? target="_blank">(.*?)</a>.*?'        # 匹配职位信息
     48         '<td class="gsmc"><a href="(.*?)" target="_blank">(.*?)</a>.*?'     # 匹配公司网址和公司名称
     49         '<td class="zwyx">(.*?)</td>', re.S)                                # 匹配月薪
     50 
     51     # 匹配所有符合条件的内容
     52     items = re.findall(pattern, html)
     53 
     54     for item in items:
     55         job_name = item[0]
     56         job_name = job_name.replace('<b>', '')
     57         job_name = job_name.replace('</b>', '')
     58         yield {
     59             'job': job_name,
     60             'website': item[1],
     61             'company': item[2],
     62             'salary': item[3]
     63         }
     64 
     65 def write_csv_file(path, headers, rows):
     66     '''
     67     将表头和行写入csv文件
     68     '''
     69     # 加入encoding防止中文写入报错
     70     # newline参数防止每写入一行都多一个空行
     71     with open(path, 'a', encoding='gb18030', newline='') as f:
     72         f_csv = csv.DictWriter(f, headers)
     73         f_csv.writeheader()
     74         f_csv.writerows(rows)
     75 
     76 def write_csv_headers(path, headers):
     77     '''
     78     写入表头
     79     '''
     80     with open(path, 'a', encoding='gb18030', newline='') as f:
     81         f_csv = csv.DictWriter(f, headers)
     82         f_csv.writeheader()
     83 
     84 def write_csv_rows(path, headers, rows):
     85     '''
     86     写入行
     87     '''
     88     with open(path, 'a', encoding='gb18030', newline='') as f:
     89         f_csv = csv.DictWriter(f, headers)
     90         f_csv.writerows(rows)
     91 
     92 def main(city, keyword, region, pages):
     93     '''
     94     主函数
     95     '''
     96     filename = 'zl_' + city + '_' + keyword + '.csv'
     97     headers = ['job', 'website', 'company', 'salary']
     98     write_csv_headers(filename, headers)
     99     for i in tqdm(range(pages)):
    100         '''
    101         获取该页中所有职位信息,写入csv文件
    102         '''
    103         jobs = []
    104         html = get_one_page(city, keyword, region, i)
    105         items = parse_one_page(html)
    106         for item in items:
    107             jobs.append(item)
    108         write_csv_rows(filename, headers, jobs)
    109 
    110 if __name__ == '__main__':
    111     main('北京', 'python',2005  , 10)
    智联招聘Python岗位信息搜集
  • 相关阅读:
    周4早上搜索引擎分析 crmim.com| MSCRM开发者之家
    Bat命令学习
    sqlserver日期函数
    ubuntu应用软件
    sql for xml
    win7x64 连接oracle 客户端 vs 2010调试 提示“ORA-12154: TNS: 无法解析指定的连接标识符 ”ORA-06413 问题(转)
    CentOS Rsync服务端与Windows cwRsync客户端实现数据同步
    怎么引导2岁孩子洗手问题
    Libnfcinstallation
    Asterisk资料
  • 原文地址:https://www.cnblogs.com/hellangels333/p/8325886.html
Copyright © 2011-2022 走看看