zoukankan      html  css  js  c++  java
  • 爬取前程无忧python职位信息并保存到mongo数据库

    1.re实现

     1 import re,os
     2 import requests
     3 from requests.exceptions import RequestException
     4 
     5 MAX_PAGE = 10 #最大页数
     6 KEYWORD = 'python'
     7 headers = {
     8     'User-Agent':
     9         'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    10 }
    11 file_name = 're_job51_python.txt'
    12 
    13 # 获取网页源码
    14 def getHtml(page):
    15     try:
    16         url = 'https://search.51job.com/list/040000,000000,0000,00,9,99,{0},2,{1}.html?'.format(KEYWORD,page)
    17         response = requests.get(url,headers=headers)
    18         response.encoding = response.apparent_encoding
    19         return response.text
    20     except RequestException:
    21         print('请求出错')
    22         return None
    23 
    24 # 解析网页源码,得到目标信息
    25 def getTarget(html):
    26     reg = re.compile(
    27         r'class="t1 ">.*? <a target="_blank" '
    28         'title="(.*?)".*? <span class="t2"><a target="_blank" '
    29         'title="(.*?)".*?<span '
    30         'class="t3">(.*?)</span>.*?<span '
    31         'class="t4">(.*?)</span>.*? <span '
    32         'class="t5">(.*?)</span>',
    33         re.S)  # 匹配换行符
    34     target = re.findall(reg,html)
    35     return target
    36 
    37 
    38 # 保存到文本中
    39 def save_to_txt(item):
    40     with open(file_name,'a',newline='') as f:  # newline参数防止两行之间有空行
    41         for i in range(len(item)):
    42             # 最后一个元素换行,非最后则以','隔开
    43             if i == len(item)-1:
    44                 f.write(item[i])
    45                 f.write('
    ')
    46             else:
    47                 f.write(item[i]+',')
    48 
    49 def main():
    50     # 每次执行前检查文件是否存在,存在则删除
    51     if os.path.exists(file_name):
    52         os.remove(file_name)
    53 
    54     # 分页爬取
    55     for page in range(MAX_PAGE+1):
    56         html = getHtml(page)
    57         content = getTarget(html)
    58         for item in content:
    59             save_to_txt(item)
    60 
    61 if __name__ == '__main__':
    62     main()
    View Code

     2.xpath实现

      1 import os
      2 import requests
      3 from requests.exceptions import RequestException
      4 from lxml import etree
      5 import pymongo
      6 from spiders.前程无忧.mongo_config import *
      7 
      8 # mongo数据库设置
      9 client = pymongo.MongoClient(MONGO_URL)
     10 db = client[MONGO_DB]
     11 
     12 MAX_PAGE = 5
     13 KEYWORD = 'python'
     14 headers = {
     15     'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
     16     'Chrome/63.0.3239.132 Safari/537.36'
     17 }
     18 file_name = 'xpath_job51_python.txt'
     19 
     20 # 获取网页
     21 def get_html(page):
     22     try:
     23         url = 'https://search.51job.com/list/040000,000000,0000,00,9,99,{},2,{}.html?'.format(KEYWORD,page)
     24         response = requests.get(url,headers=headers)
     25         response.encoding = response.apparent_encoding
     26         return response.text
     27     except RequestException:
     28         return None
     29 
     30 # 解析网页
     31 def parse_html(html):
     32     # 构造xpath解析对象,可自动修整HTML文本
     33     html = etree.HTML(html)
     34     # 获取文本 /text()
     35     # 获取属性 /@href
     36     # 获取第i个标签 /tar_name[i]  从1开始
     37     # normalize-space-->去空格换行符
     38     # position_name = html.xpath('normalize-space(//div[@class="el"]/p/span/a/text())')
     39 
     40     # 职位名称,
     41     position_names = []
     42     for name in html.xpath('//div[@class="el"]/p/span/a/text()'):
     43         position_name = name.strip()
     44         position_names.append(position_name)
     45 
     46     # 职位地址
     47     position_urls = html.xpath('//div[@class="el"]/p/span/a/@href')
     48 
     49     # 公司名称
     50     company_names = html.xpath('//div[@class="el"]/span[1]/a/text()')
     51 
     52     # 公司地址
     53     company_urls = html.xpath('//div[@class="el"]/span[1]/a/@href')
     54 
     55     # 位置
     56     locations = html.xpath('//div[@class="el"]/span[@class="t3"]/text()')
     57 
     58     # 薪资
     59     salarys = html.xpath('//div[@class="el"]/span[@class="t4"]/text()')
     60 
     61     # 发布时间
     62     release_dates = html.xpath('//div[@class="el"]/span[4]/text()')
     63 
     64     result = zip(position_names,position_urls,company_names,company_urls,locations,salarys,release_dates)
     65     return result
     66 
     67 
     68 def save_to_txt(element):
     69     with open(file_name,'a',newline='') as f:
     70         for i in range(len(element)):
     71             # data = ','.join(element[i])
     72             if i == len(element)-1:
     73                 f.write(element[i])
     74                 f.write('
    ')
     75             else:
     76                 f.write(element[i]+',')
     77 
     78 
     79 def save_to_mongo(element):
     80     keys = ['position_name','position_url','company_name',
     81             'company_url','location','salary','release_date']
     82     result = dict(zip(keys,list(element)))
     83     if db[MONGO_TABLE_XPATH].insert(result):
     84         print('数据成功存储到mongo数据库中')
     85         return True
     86     return False
     87 
     88     # 遍历字典元素
     89     # for k,v in result.items():
     90     #     print(k,':',v)
     91     for key in result:
     92         print(key,':',result[key])
     93 
     94 
     95 
     96 def main():
     97     if os.path.exists(file_name):
     98         os.remove(file_name)
     99     for page in range(1,MAX_PAGE+1):
    100         html = get_html(page)
    101         elements = parse_html(html)
    102         if elements:
    103             for element in elements:
    104                 save_to_txt(element)
    105                 save_to_mongo(element)
    106 
    107 if __name__ == '__main__':
    108     main()
    View Code


  • 相关阅读:
    【C++ 学习笔记】 Vector
    【AWS】 AWS Free Usage Tier
    【C++ 学习笔记】 MFC CEdit
    【MySql】MySql安装和ODBC设置
    【C++ 学习笔记】 变量转换
    【Perl学习笔记】列表和数组
    【C++ 学习笔记】 值传递
    【Java 学习笔记】 MyEclipse各种细节
    【NLP】 向量空间模型
    【Linux】 Cygwin操作总结
  • 原文地址:https://www.cnblogs.com/ray-mmss/p/9373742.html
Copyright © 2011-2022 走看看