常用模块 requests BeautifulSoup
1 汽车之家爬虫练习
# Autohome crawler exercise: save the thumbnail of every article in the news list.
import posixpath
from urllib.parse import urljoin, urlsplit

AUTOHOME_NEWS_URL = "https://www.autohome.com.cn/news/"
# Seconds to wait for each HTTP call; requests does not time out by default.
AUTOHOME_TIMEOUT = 10


def image_file_name(src):
    """Return the local file name to use for the image URL *src*.

    The name is the part of the URL's last path segment that follows the
    final ``__``; when the segment has no ``__`` (or nothing after it) the
    whole segment is used.  Query strings and directories are dropped so
    the result is always a plain file name.  Returns ``""`` when the URL
    path has no last segment.
    """
    base = posixpath.basename(urlsplit(src).path)
    _, sep, tail = base.rpartition("__")
    return tail if sep and tail else base


def download_news_images(page_url=AUTOHOME_NEWS_URL):
    """Download the thumbnail of each article listed on *page_url*.

    Each image is written to the current directory under the name given by
    ``image_file_name`` and that name is printed before the download.

    Raises:
        requests.HTTPError: the list page answered with an error status.
        RuntimeError: the article list container is not in the page.
    """
    # Third-party imports stay local so that image_file_name can be imported
    # (and tested) without requests / bs4 being installed.
    import requests
    from bs4 import BeautifulSoup

    ret = requests.get(url=page_url, timeout=AUTOHOME_TIMEOUT)
    ret.raise_for_status()
    # Decode with the encoding detected from the body, not the header guess.
    ret.encoding = ret.apparent_encoding

    soup = BeautifulSoup(ret.text, 'html.parser')
    div = soup.find(name='div', id='auto-channel-lazyload-article')
    if div is None:
        raise RuntimeError(
            "element #auto-channel-lazyload-article not found in %s" % page_url
        )

    for li in div.find_all(name='li'):
        # Only <li> elements that carry an <h3> are processed.
        if li.find(name='h3') is None:
            continue

        img = li.find('img')
        src = img.get('src') if img is not None else None
        if not src:
            continue

        file_name = image_file_name(src)
        if not file_name:
            continue
        print(file_name)

        # urljoin resolves protocol-relative ("//host/...") and relative
        # sources against the page URL and leaves absolute URLs untouched.
        ret_img = requests.get(url=urljoin(page_url, src), timeout=AUTOHOME_TIMEOUT)
        if not ret_img.ok:
            # Do not save an error page under an image file name.
            continue
        with open(file_name, 'wb') as f:
            f.write(ret_img.content)


if __name__ == "__main__":
    download_news_images()
2 抽屉登录点赞练习
# Chouti exercise: log in, then up-vote every link found on a range of pages.
import os

# One header set shared by every request of this exercise.
CHOUTI_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36)"
}
# Seconds to wait for each HTTP call; requests does not time out by default.
CHOUTI_TIMEOUT = 10


def login_form(env=None):
    """Build the login form data from environment variables.

    Credentials are read from ``CHOUTI_PHONE`` and ``CHOUTI_PASSWORD`` so
    that no account data has to be stored in the source file.

    Args:
        env: mapping to read the variables from; defaults to ``os.environ``.

    Returns:
        dict with the ``phone``, ``password`` and ``oneMonth`` form fields.

    Raises:
        RuntimeError: one of the two variables is missing or empty.
    """
    env = os.environ if env is None else env
    missing = [key for key in ("CHOUTI_PHONE", "CHOUTI_PASSWORD") if not env.get(key)]
    if missing:
        raise RuntimeError("missing environment variable(s): " + ", ".join(missing))
    return {
        "phone": env["CHOUTI_PHONE"],
        "password": env["CHOUTI_PASSWORD"],
        'oneMonth': '1',
    }


def vote_on_new_links(pages=range(3, 5)):
    """Log in and send a vote request for every link on the given pages.

    The login response body and each vote response body are printed.

    Args:
        pages: iterable of page numbers under ``/all/new/``.
    """
    # Third-party imports stay local so that login_form can be imported
    # (and tested) without requests / bs4 being installed.
    import requests
    from bs4 import BeautifulSoup

    # Fail on missing credentials before any request is sent.
    form = login_form()

    # Visit a page first to obtain the initial cookies.
    r1 = requests.get(
        url='https://dig.chouti.com/all/new/1',
        headers=CHOUTI_HEADERS,
        timeout=CHOUTI_TIMEOUT,
    )
    r1_cookie_dict = r1.cookies.get_dict()

    # Log in while sending those first-visit cookies along.
    response_login = requests.post(
        url='https://dig.chouti.com/login',
        data=form,
        headers=CHOUTI_HEADERS,
        cookies=r1_cookie_dict,
        timeout=CHOUTI_TIMEOUT,
    )
    print(response_login.text)

    for i in pages:
        response_index = requests.get(
            url='https://dig.chouti.com/all/new/%s' % i,
            headers=CHOUTI_HEADERS,
            timeout=CHOUTI_TIMEOUT,
        )
        soup = BeautifulSoup(response_index.text, 'html.parser')
        div = soup.find(attrs={'id': 'content-list'})
        if div is None:
            # Page without a link list: nothing to vote on.
            continue

        for item in div.find_all(attrs={'class': 'item'}):
            tag = item.find(attrs={'class': 'part2'})
            nid = tag.get('share-linkid') if tag is not None else None
            if not nid:
                continue
            # Votes reuse the first-visit cookies that were sent with the
            # login request; cookies from the login response are not used.
            # NOTE(review): presumably the site authorizes that first cookie
            # on login -- confirm before switching to requests.Session.
            ret = requests.post(
                url='https://dig.chouti.com/link/vote?linksId=%s' % nid,
                headers=CHOUTI_HEADERS,
                cookies=r1_cookie_dict,
                timeout=CHOUTI_TIMEOUT,
            )
            print(ret.text)


if __name__ == "__main__":
    vote_on_new_links()
requests 常用参数
import requests
# GET example ('x' stands in for a real URL).
requests.get(
    url='x',
    # params are encoded into the query string: x?nid=1&name=x
    params={'nid': 1, 'name': 'x'},
    # The keyword is `headers` (plural); `header=` raises TypeError.
    headers={},
    cookies={},
)
# POST example ('x' stands in for a real URL).
requests.post(
    url='x',
    # data is sent in the request body.
    data={
        'name': 'alex',
        'age': 18,
    },
    # The keywords are `headers` and `cookies` (plural); the singular
    # spellings `header=` / `cookie=` raise TypeError.
    headers={},
    cookies={},
)
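requests.request 参数
method: HTTP 请求方法,如 'GET'、'POST'
url: 请求地址
params: 拼接到 URL 查询字符串中的参数(字典)
data: 放在请求体中的数据(字典、字节或文件对象),字典按表单编码
json: 序列化为 JSON 后放入请求体的对象
headers: 请求头(字典)
cookies: 随请求发送的 cookie(字典或 CookieJar)
files: 上传文件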