zoukankan      html  css  js  c++  java
  • 并发爬取直聘网招聘信息

    # Concurrently crawl job postings from zhipin.com using a process pool.
    import json
    import re
    import urllib
    import urllib.parse
    import urllib.request
    from multiprocessing import Pool
    from urllib.request import urlopen

    # Browser-like headers sent with every request.
    _HEADERS = {'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}

    # One match per job card on a search-result page. The address, experience
    # ("jingyan") and education ("xueli") groups are captured but are not
    # written to the output by get_msg.
    _JOB_PATTERN = re.compile(
        r'ka="search_list_.*?<div class="job-title">(?P<job>.*?)</div>'
        r'.*?<span class="red">(?P<salary>.*?)</span>'
        r'.*?<p>(?P<adress>.*?)<em class="vline"></em>(?P<jingyan>.*?)<em class="vline"></em>(?P<xueli>.*?)</p>'
        r'.*?target="_blank">(?P<company>.*?)</a></h3>',
        re.S,
    )

    # File that get_msg appends its records to.
    _OUTPUT_PATH = "java.txt"


    def get_content(target_url):
        """Fetch *target_url* and return the response body decoded as UTF-8.

        The returned text is what the pool hands to the ``get_msg`` callback.
        Any exception raised by the request or the decode propagates to the
        caller (in the pool, to the ``error_callback``).
        """
        req = urllib.request.Request(url=target_url, headers=_HEADERS)
        # The context manager closes the HTTP response even if read/decode fails.
        with urlopen(req) as resp:
            return resp.read().decode("utf-8")


    def get_msg(content):
        """Extract job postings from *content* and append them to ``java.txt``.

        Every match of the job-card pattern is printed as a dict and appended
        to the output file as a JSON object (non-ASCII text kept as-is)
        followed by a single space. The file is created even when nothing
        matches.
        """
        # `with` guarantees the file is closed if printing or writing raises.
        with open(_OUTPUT_PATH, "a", encoding="utf-8") as f:
            for el in _JOB_PATTERN.finditer(content):
                dic = {'job': el.group("job"), "salary": el.group("salary"), 'company': el.group("company")}
                print(dic)
                s = json.dumps(dic, ensure_ascii=False)
                # NOTE(review): records are separated by a space, as in the
                # original; confirm whether a newline was intended.
                f.write(s + " ")


    def _report_failure(exc):
        """Print an exception raised by a worker so failed pages are visible."""
        print("page fetch failed: %r" % (exc,))


    if __name__ == '__main__':
        # Percent-encode the Chinese part of the query so it is valid in a URL.
        word = urllib.parse.quote("开发")

        # Search-result pages 1 through 10.
        url_lst = [
            "https://www.zhipin.com/c101280600/?query=Java%s&page=%s&ka=page-%s" % (word, i, i)
            for i in range(1, 11)
        ]

        p = Pool(4)  # at most 4 pages are fetched at the same time
        for url in url_lst:
            # Without error_callback a failed fetch would be dropped silently,
            # because the AsyncResult is never inspected.
            p.apply_async(get_content, args=(url,), callback=get_msg,
                          error_callback=_report_failure)
        p.close()  # no more tasks will be submitted
        p.join()   # wait for all workers to finish
  • 相关阅读:
    [MacOS]Sublime text3 安装(一)
    [RHEL8]开启BBR
    PAT Advanced 1136 A Delayed Palindrome (20分)
    PAT Advanced 1144 The Missing Number (20分)
    PAT Advanced 1041 Be Unique (20分)
    PAT Advanced 1025 PAT Ranking (25分)
    PAT Advanced 1022 Digital Library (30分)
    PAT Advanced 1019 General Palindromic Number (20分)
    PAT Advanced 1011 World Cup Betting (20分)
    PAT Advanced 1102 Invert a Binary Tree (25分)
  • 原文地址:https://www.cnblogs.com/knighterrant/p/10040604.html
Copyright © 2011-2022 走看看