zoukankan html css js c++ java

python简单爬取前程无忧招聘信息

import sys
# NOTE(review): Python 2-only idiom. `reload` is a builtin only on Python 2;
# it is called here so that `sys.setdefaultencoding` can be invoked below.
reload(sys)
# Switch the interpreter-wide default encoding to UTF-8. This affects every
# implicit str/unicode conversion in the process, not just this script.
sys.setdefaultencoding('utf-8')
    
import requests
import csv
from BeautifulSoup import BeautifulSoup

def get_content(page, timeout=10):
    """Download one 51job search-result page and return it parsed.

    Args:
        page: Page number of the result list. It is converted with ``str()``
            and spliced into the URL path.
        timeout: Value passed to ``requests.get`` as its ``timeout``
            (seconds). Without a timeout a stalled connection would block
            the scraper indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built from the raw response body.
    """
    url = (
        'http://search.51job.com/list/200200,000000,0000,32,9,99,python,2,'
        + str(page)
        + '.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    )
    response = requests.get(url, timeout=timeout)
    # Hand the undecoded bytes to the parser, exactly as fetched.
    return BeautifulSoup(response.content)

def get(soup):
    """Extract job postings from a parsed result page and append them to imdb.csv.

    Looks for the ``div`` with class ``jblist res`` and, inside it, every
    ``a`` tag with class ``e e2 eck``. For each such tag the stripped text of
    its ``h3``, ``aside`` and ``em`` children is collected as one
    ``(title, company, money)`` row, and all rows are appended to
    ``imdb.csv``.

    Args:
        soup: Parsed page exposing ``find`` (the object returned by
            ``get_content``).

    Returns:
        None. If the page has no result container, nothing is written.
    """
    container = soup.find('div', attrs={'class': 'jblist res'})
    if container is None:
        # The page has no result list (e.g. an error page or a page past the
        # last one); there is nothing to extract, so skip it instead of
        # crashing on None.findAll.
        return

    inf_list = []
    for label in container.findAll('a', attrs={'class': 'e e2 eck'}):
        title = label.find('h3').text.strip()
        company = label.find('aside').text.strip()
        money = label.find('em').text.strip()
        inf_list.append((title, company, money))

    # Binary append mode: Python 2's csv module expects files opened with the
    # 'b' flag, otherwise extra blank lines appear on platforms that
    # translate line endings.
    with open("imdb.csv", "ab") as f:
        csv.writer(f).writerows(inf_list)

# Create/truncate the output file and write the header row (position,
# company, salary). The handle is closed BEFORE scraping starts: get()
# appends to the same file through its own handle, and a header still sitting
# in this handle's buffer would be flushed at offset 0 on close, overwriting
# the beginning of the rows that were appended in the meantime.
with open("imdb.csv","wb") as f:
    fw = csv.writer(f)
    fw.writerow(['职位','公司','薪资'])

# Scrape result pages 1 through 9; get() appends each page's rows to the CSV.
for j in range(1, 10):
    print("-----正在爬第"+str(j)+"页内容---------")
    soup = get_content(j)
    get(soup)

查看全文

相关阅读:
后端开发应该掌握的 Redis 基础
 Code Review有什么好处？
对不起，你那不叫努力，叫重复劳动
 老鸟程序员才知道的40个小技巧
 单例模式基础笔记
 最受IT公司欢迎的50款开源软件
 硬件：关于路由器、交换机、宽带猫的几个问题
 硬件：宽带猫（光猫）的基础知识
 python selenium模块使用出错解决，Message: ‘geckodriver’ executable needs to be in PATH
python+selenium如何定位页面的元素，的几种定位元素的方法。

原文地址：https://www.cnblogs.com/Kermit-Li/p/6848936.html