第一个：静态页面类，爬取猫眼电影 TOP 100，使用 BeautifulSoup + requests
def getHtml(url):
    """Fetch *url* and return the response body as text.

    The response encoding is set from ``apparent_encoding`` before the text
    is decoded.

    Args:
        url: Address to request.

    Returns:
        The decoded body, or ``''`` when the request fails (connection
        error, timeout, or a non-success HTTP status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: only request-level failures are turned into an
        # empty result; unrelated errors (typos, Ctrl-C) still propagate.
        return ''
def getContent(html, info_list):
    """Parse one board page and append one dict per entry to *info_list*.

    Every tag child of the ``<dl class="board-wrapper">`` element yields a
    dict with the keys ``title``, ``star``, ``releasetime`` and ``score``.

    Args:
        html: Page markup to parse.
        info_list: List that receives the parsed entries; mutated in place.

    Returns:
        The same *info_list* object. It is returned unchanged when the page
        has no board wrapper (for example when *html* is empty).
    """
    soup = BeautifulSoup(html, 'html.parser')
    board = soup.find('dl', attrs={'class': 'board-wrapper'})
    if board is None:
        # find() returns None when nothing matches; there is nothing to add.
        return info_list

    for item in board.children:
        if not isinstance(item, bs4.element.Tag):
            # Skip the text nodes that sit between the entry tags.
            continue
        title = item.find('p', attrs={'class': 'name'}).string
        star = item.find('p', attrs={'class': 'star'}).string.strip()
        releasetime = item.find('p', attrs={'class': 'releasetime'}).string
        # The score is split across two <i> tags (integer and fraction part).
        integer_part = item.find('i', attrs={'class': 'integer'}).string
        fraction_part = item.find('i', attrs={'class': 'fraction'}).string
        info_list.append({
            'title': title,
            'star': star,
            'releasetime': releasetime,
            'score': integer_part + fraction_part,
        })
    return info_list
def saveFile(info_list,
             path='/Users/macmini-2/Desktop/GitDemo/DailyFresh/Day_fresh/Fresh/static/jsonFile.json'):
    """Write *info_list* to *path* as JSON, replacing any existing file.

    Args:
        info_list: JSON-serialisable data to store.
        path: Destination file. Defaults to the location this script has
            always written to, so existing callers are unaffected.
    """
    with open(path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(info_list))
def main(depth=1):
    """Crawl the first *depth* board pages and save the collected entries.

    Args:
        depth: Number of pages to fetch. The ``offset`` query parameter
            advances by 10 for each page. Defaults to 1, the value that was
            previously hard-coded.
    """
    info_list = []
    start_url = 'https://maoyan.com/board/4'
    for page in range(depth):
        url = start_url + '?offset=' + str(page * 10)
        html = getHtml(url)
        content_json = getContent(html, info_list)
        # The file is rewritten after every page with everything collected
        # so far, so results gathered before a later failure are kept.
        saveFile(content_json)
# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
第二个：今日头条（图集爬取）
import requests
import json
def getHtml(url):
    """Fetch *url* with a browser-like User-Agent and return the body text.

    The response encoding is set from ``apparent_encoding`` before the text
    is decoded.

    Args:
        url: Address to request.

    Returns:
        The decoded body, or ``''`` when the request fails (connection
        error, timeout, or a non-success HTTP status).
    """
    # Was 'MOzilla/5.0'; the conventional product token is 'Mozilla'.
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: only request-level failures are turned into an
        # empty result; unrelated errors still propagate.
        return ''
def next(max_behot_time, data_list):
    """Fetch the feed page for *max_behot_time* and append its items.

    NOTE: this name shadows the builtin ``next``; it is kept because callers
    in this file use it.

    Args:
        max_behot_time: Value for the ``max_behot_time`` query parameter.
        data_list: List that receives the page's ``data`` items; mutated in
            place.

    Returns:
        The same *data_list* object. It is returned unchanged when the fetch
        yields an empty body (how ``getHtml`` reports a failed request).
    """
    url = ('https://www.toutiao.com/api/pc/feed/?category=gallery_old_picture'
           '&utm_source=toutiao&max_behot_time=' + str(max_behot_time))
    text = getHtml(url)
    if not text:
        # json.loads('') would raise; a failed fetch simply adds nothing.
        return data_list
    json_dict = json.loads(text)
    data_list.extend(json_dict['data'])
    return data_list
# Script entry point: fetch the first feed page, follow its paging cursor
# once, and print the combined items.
if __name__ == '__main__':
    start_url = 'https://www.toutiao.com/api/pc/feed/?category=gallery_old_picture&utm_source=toutiao&max_behot_time=0'
    text = getHtml(start_url)
    if not text:
        # getHtml returns '' on failure; stop with a clear message instead
        # of letting json.loads fail on an empty string.
        raise SystemExit('failed to fetch ' + start_url)
    json_dict = json.loads(text)
    # The first response carries the cursor used to request the next page.
    max_behot_time = json_dict['next']['max_behot_time']
    data_list = json_dict['data']
    data_list2 = next(max_behot_time, data_list)
    print(data_list2)