爬取拉勾网招聘的职位
比如我们要搜索python的职位
https://www.lagou.com/jobs/list_python/p-city_3?&cl=false&fromSearch=true&labelWords=&suginput=
https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false
import requests
import json
import time

# Crawl Lagou job listings for the keyword "python".
# The search page must be visited first: it sets anti-crawler cookies that the
# Ajax endpoint validates on every request.
previous_url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
# JSON Ajax endpoint that returns the actual job listings.
craw_url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"

header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    'referer': 'https://www.lagou.com/jobs/list_python/p-city_3?&cl=false&fromSearch=true&labelWords=&suginput=',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
}

# A Session carries the cookies obtained from the search page over to the
# subsequent Ajax POSTs automatically.
s = requests.Session()
# Visit the search page once to acquire the required cookies.
s.get(previous_url, headers=header, timeout=3)

# Fetch pages 1..15 of results.
for i in range(1, 16):
    payload = {
        'first': 'true',
        'pn': str(i),     # page number
        'kd': 'python',   # search keyword
    }
    res = s.post(craw_url, data=payload, headers=header, timeout=5).text
    recruit = json.loads(res)
    print(recruit)
    position_info = recruit.get('content').get('positionResult').get('result')
    # Append each page's result list as one JSON line.
    with open('position.txt', mode='ab+') as fw:
        fw.write(json.dumps(position_info, ensure_ascii=False).encode('utf-8'))
        # Fixed: the newline literal was broken across two source lines
        # (a syntax error); it must be the escape sequence '\n'.
        fw.write('\n'.encode('utf-8'))
    # Throttle requests to reduce the chance of being blocked.
    time.sleep(20)
在爬取职位信息的时候,需要携带搜索页的cookie
爬取红楼梦小说
红楼梦小说网址:http://www.shicimingju.com/book/hongloumeng.html
import requests
from bs4 import BeautifulSoup

# Scrape the full text of "Dream of the Red Chamber" from shicimingju.com,
# chapter by chapter, into hlm.txt.
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
}

response = requests.get('https://www.shicimingju.com/book/hongloumeng.html', headers=header)
soup = BeautifulSoup(response.text, 'lxml')
# Build absolute chapter URLs from the table-of-contents list items.
link_list = [
    'https://www.shicimingju.com' + li.find('a').get('href')
    for li in soup.select('.book-mulu li')
]

with open('hlm.txt', mode='ab+') as fw:
    for link in link_list:
        res = requests.get(link, headers=header)
        soup2 = BeautifulSoup(res.text, 'lxml')
        # Write the chapter title, then the chapter body, each followed by a
        # newline. Fixed: both newline literals were broken across source
        # lines (syntax errors); they must be the escape sequence '\n'.
        fw.write((soup2.select('.bookmark-list h1')[0].text).encode('utf-8'))
        fw.write('\n'.encode('utf-8'))
        fw.write((soup2.select('.bookmark-list p')[0].text).encode('utf-8'))
        fw.write('\n'.encode('utf-8'))
爬取肯德基门店信息
import requests

# Fetch the KFC store-finder page and save the raw HTML locally for inspection.
res = requests.get("http://www.kfc.com.cn/kfccda/storelist/index.aspx")
with open('text2.html', mode='wb') as fw:
    # The response body is already fully downloaded (no stream=True), so
    # write it in one call. The original iterated res.iter_content() with its
    # default chunk size of 1, issuing one write per byte.
    fw.write(res.content)
import requests
import json

# Query KFC's store-list API for all stores in Shanghai and print a trimmed
# summary (name, address, province) for each.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    'Referer': 'http://www.kfc.com.cn/kfccda/storelist/index.aspx',
}

# 'op=cname' selects search-by-city-name; the form data carries the filters.
query_params = {'op': 'cname'}
form_data = {
    'cname': '上海',
    'pid': '',
    'keyword': '',
    'pageIndex': 1,
    'pageSize': 500,
}
res = requests.post(
    "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx",
    params=query_params,
    data=form_data,
    headers=header,
)

# The payload's 'Table1' key holds the list of store records.
kfc_info = json.loads(res.text).get('Table1')

# Keep only the fields of interest for each store.
kfc_list = []
for store in kfc_info:
    kfc_list.append({
        "storeName": store.get('storeName') + '餐厅',
        "addressDetail": store.get("addressDetail"),
        "pro": store.get("pro"),
    })

print(kfc_list)
print(len(kfc_list))  # 455 at the time this was written
爬取糗事百科段子
糗事百科:https://www.qiushibaike.com/
import requests
from bs4 import BeautifulSoup

# Scrape joke summaries (title, link, likes, comments, author) from the first
# 13 pages of qiushibaike.com's recommendation feed.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    # NOTE(review): hard-coded session cookie — it will expire; refresh it
    # from a live browser session before running.
    'Cookie': '_xsrf=2|c757820a|8689eab698fb588fb9f2057ccf7d7ff7|1596541908; _qqq_uuid_="2|1:0|10:1596541909|10:_qqq_uuid_|56:N2E0ODM0MzQ0MzhhMmQ0ODhiN2VkOWEzZjZlNjgwZWIwYjFhYmUyOQ==|628d31f1d77ddca4ff48407bae2999366c0a036422afa9a71656a0f181373394"; gr_user_id=48d9d1c7-67fb-403b-8bec-b830ce07b762; ff2672c245bd193c6261e9ab2cd35865_gr_session_id=706bbad7-66f7-4880-8b06-7c39369518e2; Hm_lvt_2670efbdd59c7e3ed3749b458cafaa37=1596541910; _ga=GA1.2.2084709124.1596541910; _gid=GA1.2.298303643.1596541910; ff2672c245bd193c6261e9ab2cd35865_gr_session_id_706bbad7-66f7-4880-8b06-7c39369518e2=true; grwng_uid=62b3537d-3023-4060-a4fe-9a45f7e07d67; Hm_lpvt_2670efbdd59c7e3ed3749b458cafaa37=1596542096',
}

details_list = []
for page in range(1, 14):
    url = f'https://www.qiushibaike.com/8hr/page/{page}/'
    res = requests.get(url, headers=header)
    soup = BeautifulSoup(res.text, 'lxml')
    # Each '.recmd-right' div holds one item's title, stats, and author.
    for div in soup.select('.recmd-right'):
        try:
            comment = div.find_all('span')[3].text
        except IndexError:
            # Fewer than four spans means the item shows no comment count.
            # (Was a bare `except Exception as e` with the variable unused;
            # narrowed to the only exception that indexing can raise here.)
            comment = 0
        details_list.append({
            'subject': div.find('a').text,
            'link': 'http://www.qiushibaike.com' + div.find('a').get('href'),
            'support': div.find_all('span')[0].text,  # like count
            'comment': comment,
            'author': div.select('.recmd-name')[0].text,
        })

print(details_list)
print(len(details_list))  # 189 at the time of writing