爬虫大杂烩
"""
# 1. Fetch Python job listings for Shanghai from lagou.com.
import requests

# Request headers for the AJAX endpoint below.
# NOTE(review): the Cookie value embeds session/login tokens that look like
# they were captured from a real browser session. Hard-coding them leaks
# credentials and they will go stale; load them from configuration or the
# environment instead.
headers = {
    'Accept-Language': "zh-CN,zh;q=0.9",
    'Host': 'www.lagou.com',
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
    'Referer': "https://www.lagou.com/jobs/list_python?city=%E4%B8%8A%E6%B5%B7&cl=false&fromSearch=true&labelWords=&suginput=",
    'Cookie': "index_location_city=%E4%B8%8A%E6%B5%B7; user_trace_token=20200303202747-787f5b5e-8819-4d60-a8c0-3920aaf97b87; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22170a05dd252be-062a9d067fa6cc-366b420b-1049088-170a05dd25333f%22%2C%22%24device_id%22%3A%22170a05dd252be-062a9d067fa6cc-366b420b-1049088-170a05dd25333f%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24latest_referrer_host%22%3A%22www.baidu.com%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2279.0.3945.130%22%7D%7D; _ga=GA1.2.442852312.1586218701; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1586218701; LGUID=20200407081821-ea0bc584-cc75-43f0-9aa2-3c6fbe25bd10; JSESSIONID=ABAAAECAAFDAAEHA77B0A7162DFBDB833136F9E1BB7A309; WEBTJ-ID=20200407081848-1715200fade24c-0cb3e5dd9dd159-366b420b-1049088-1715200fadf446; _putrc=75D0A37619AD39A0123F89F2B170EADC; login=true; unick=%E5%8D%A0%E4%BA%9A%E5%B3%B0; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; privacyPolicyPopup=false; index_location_city=%E4%B8%8A%E6%B5%B7; TG-TRACK-CODE=search_code; X_HTTP_TOKEN=ed5749058ca1359c4174036851b1e35881c33e2f3e; gate_login_token=7c10fb5f4a047e902fb2a37fe1f50c11a9127b60c1b4a449e8fbaf21a885afc7; _gid=GA1.2.1134112076.1586353092; _gat=1; PRE_UTM=; PRE_HOST=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fshanghai; PRE_SITE=https%3A%2F%2Fwww.lagou.com; LGSID=20200408213812-42f9d697-f383-473d-bf8d-0b78af930d27; hasDeliver=24; LGRID=20200408213814-dd89ae8a-ec86-4878-a2fb-863cec451b35; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1586353093",
    'Accept': "application/json, text/javascript, */*; q=0.01",
    'X-Anit-Forge-Code': "0",
    # requests omits headers whose value is None, so this one is not sent.
    'X-Anit-Forge-Token': None,
    'X-Requested-With': 'XMLHttpRequest',
}

# Form fields posted to the endpoint.
# NOTE(review): 'pn' is presumably the page number and 'kd' the search
# keyword -- confirm against the site's API.
form_data = {
    'first': 'false',
    'pn': 1,
    'kd': 'python',
}

ret = requests.post(
    'https://www.lagou.com/jobs/positionAjax.json?city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false',
    headers=headers,
    data=form_data,
    timeout=10,  # without a timeout an unresponsive server blocks forever
)
print(ret.text)
# 2. Scrape the article list from page 3 of the cnblogs home feed.
import requests
from bs4 import BeautifulSoup

# A timeout keeps the script from hanging if the server never answers.
ret = requests.get('https://www.cnblogs.com/sitehome/p/3', timeout=10)
soup = BeautifulSoup(ret.text, 'lxml')

# Every element with class "post_item" is treated as one article entry.
article_list = soup.find_all(class_='post_item')
for article in article_list:
    # Look the title link up once: it supplies both the title and the URL.
    title_link = article.find(class_='titlelnk')
    title = title_link.text
    href = title_link['href']
    desc = article.find(class_='post_item_summary').text
    author = article.find(class_='lightblue').text
    # The template lines stay at column 0 so the printed text is unchanged.
    print('''
文章标题:%s
文章地址:%s
文章摘要:%s
文章作者:%s
''' % (title, href, desc, author))
# Scrape the novel "Dream of the Red Chamber" chapter by chapter into a
# local text file.
import requests
from bs4 import BeautifulSoup

# Fetch the table-of-contents page; a timeout avoids hanging forever.
ret = requests.get('http://www.shicimingju.com/book/hongloumeng.html', timeout=10)
soup = BeautifulSoup(ret.text, 'lxml')

# Each <li> inside the "book-mulu" element holds the link to one chapter.
li_list = soup.find(class_='book-mulu').find_all(name='li')

with open("红楼.txt", 'w', encoding='utf-8') as f:
    for li in li_list:
        # Find the anchor once; it provides both chapter title and href.
        link = li.find(name='a')
        title = link.text
        url = link['href']
        f.write(title + '\n')

        # The href is appended to the site root to build the chapter URL.
        ret_detail = requests.get('http://www.shicimingju.com' + url, timeout=10)
        soup2 = BeautifulSoup(ret_detail.text, 'lxml')
        content = soup2.find(class_='chapter_content').text
        f.write(content + '\n')
        print(title, "写入")
# WeChat bot: draw a pie chart of the gender split among the account's
# friends and open it in the browser.
from wxpy import *
from pyecharts import Pie
import webbrowser
from pathlib import Path

bot = Bot(cache_path=True)  # note: the login must be confirmed on the phone
friends = bot.friends()  # all friend objects

# attr holds the category labels; value holds the matching counters.
attr = ['男朋友', '女朋友', '未知性别']
value = [0, 0, 0]
for friend in friends:
    if friend.sex == 1:  # 1 means male
        value[0] += 1
    elif friend.sex == 2:  # 2 means female
        value[1] += 1
    else:  # anything else is counted as unknown
        value[2] += 1

pie = Pie("朋友男女比例")
# Arguments: series name (str), category names (list), the values for those
# categories (list); is_label_show controls whether labels are displayed.
pie.add("", attr, value, is_label_show=True)
pie.render('sex.html')  # generate the HTML page

# Open the chart in the browser. webbrowser.open() is only specified for
# URLs (passing a bare filename is documented as unsupported/non-portable),
# so hand it an absolute file:// URI for the file rendered above.
webbrowser.open(Path('sex.html').resolve().as_uri())
from wxpy import *

bot = Bot(cache_path=True)


@bot.register()
def recv_send_msg(recv_msg):
    # Message handler registered on the bot: prints the text of the
    # incoming message and returns a fixed acknowledgement string.
    # NOTE(review): wxpy presumably sends the returned string back as the
    # reply -- confirm against the wxpy documentation.
    print('收到的消息:', recv_msg.text)  # recv_msg.text holds the message text
    return '好的'


# Drop into an interactive Python shell so the program keeps running.
embed()
# Scrape the text posts on page 2 of qiushibaike.
import requests
from bs4 import BeautifulSoup

# A timeout keeps the script from hanging if the server never answers.
ret = requests.get('https://www.qiushibaike.com/text/page/2/', timeout=10)
soup = BeautifulSoup(ret.text, 'lxml')

# Every element with class "article" is treated as one post.
article_list = soup.find_all(class_='article')
for article in article_list:
    content = article.find(class_='content').text
    print(content)
    print('-------')  # visual separator between posts
# Query the KFC store-list endpoint by keyword and print the raw response.
import requests

# Only a browser-like User-Agent is sent with this request.
header = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
}

# Form fields for the search.
# NOTE(review): 'pid' is presumably a region id and pageIndex/pageSize the
# paging controls -- confirm against the endpoint's behavior.
data = {
    'cname': '',
    'pid': 20,
    'keyword': '浦东',
    'pageIndex': 1,
    'pageSize': 10,
}

ret = requests.post(
    'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword',
    data=data,
    headers=header,
    timeout=10,  # without a timeout an unresponsive server blocks forever
)
print(ret.text)
"""