zoukankan      html  css  js  c++  java
  • 并发爬取直聘网招聘信息

    # Concurrently scrape Java job listings from zhipin.com and append them
    # to a local text file as JSON objects.
    import re
    import json
    import urllib
    import urllib.parse
    import urllib.request
    from urllib.request import urlopen
    from multiprocessing import Pool

    # Request headers sent with every page fetch.
    HEADERS = {'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}

    # One match per job entry in a search-result page. Compiled once at import
    # time instead of on every callback invocation.
    JOB_PATTERN = re.compile(
        r'ka="search_list_.*?<div class="job-title">(?P<job>.*?)</div>'
        r'.*?<span class="red">(?P<salary>.*?)</span>'
        r'.*?<p>(?P<adress>.*?)<em class="vline"></em>(?P<jingyan>.*?)<em class="vline"></em>(?P<xueli>.*?)</p>'
        r'.*?target="_blank">(?P<company>.*?)</a></h3>',
        re.S,
    )


    def get_content(target_url, timeout=10):
        """Download *target_url* and return its body decoded as UTF-8.

        Args:
            target_url: URL of the page to fetch.
            timeout: Socket timeout in seconds, so a stalled connection cannot
                block a pool worker forever.

        Returns:
            The decoded page text; the pool hands it to ``get_msg``.

        Raises:
            urllib.error.URLError: If the request fails or times out.
            UnicodeDecodeError: If the response body is not valid UTF-8.
        """
        req = urllib.request.Request(url=target_url, headers=HEADERS)
        # The context manager closes the HTTP response even if read/decode fails.
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode("utf-8")


    def get_msg(content, out_path="java.txt"):
        """Extract job entries from *content*, print them and append them to a file.

        Args:
            content: Text of one search-result page.
            out_path: File to append to; one JSON object followed by a single
                space is written per matched entry.
        """
        # ``with`` guarantees the file is closed even if a write or dump fails.
        with open(out_path, "a", encoding="utf-8") as f:
            for el in JOB_PATTERN.finditer(content):
                dic = {'job': el.group("job"), "salary": el.group("salary"), 'company': el.group("company")}
                print(dic)
                f.write(json.dumps(dic, ensure_ascii=False) + " ")


    def report_error(exc):
        """Report an exception raised by a pool task instead of dropping it."""
        print("page task failed: %r" % (exc,))


    def build_urls(keyword="开发", pages=10):
        """Return the search-result URLs for pages 1..*pages*.

        The keyword is percent-encoded so non-ASCII text is valid in the URL.
        """
        quoted = urllib.parse.quote(keyword)
        return [
            "https://www.zhipin.com/c101280600/?query=Java%s&page=%s&ka=page-%s" % (quoted, i, i)
            for i in range(1, pages + 1)
        ]


    def main():
        """Fetch every result page through a pool of 4 worker processes."""
        p = Pool(4)
        for url in build_urls():
            # Pool callbacks run in the parent process, so ``get_msg`` does the
            # file appends there rather than in the workers. Without
            # ``error_callback`` a failed fetch would vanish without a trace.
            p.apply_async(get_content, args=(url,), callback=get_msg, error_callback=report_error)
        p.close()  # no more tasks will be submitted
        p.join()   # wait for all workers to finish


    if __name__ == '__main__':
        main()
  • 相关阅读:
    2020年“安洵杯”四川省大学生信息安全技术大赛 Misc WP
    整数划分问题
    二叉树根节点到叶子节点的所有路径和
    java正则表达式
    搜狗笔试
    跟谁学0923笔试
    360 笔试0926
    度小满0920
    TreeMap 常用函数
    达达0920
  • 原文地址:https://www.cnblogs.com/knighterrant/p/10040604.html
Copyright © 2011-2022 走看看