# Scraping SoYoung (新氧) Data
```python
import json

import requests
import xlwt
from bs4 import BeautifulSoup

# Free proxies; SoYoung blocks them quickly, so swap these often (see the note below)
proxies = {"http": "http://49.70.64.155:9999", "https": "http://59.57.148.70:9999"}
# Current spreadsheet row, shared by all scraping passes
row = 0


def get_shuhouhuli(url_diclist):
    """Scrape the post-op care ("shuhouhuli") blocks for each item URL into a sheet."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
    }
    global row
    # Create the workbook once, before the loop, so earlier rows are not thrown away
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = workbook.add_sheet('doctorinfo', cell_overwrite_ok=True)
    for url_dic in url_diclist:
        for k, v in url_dic.items():
            response = requests.get(v, headers=headers, proxies=proxies)
            soup = BeautifulSoup(response.text, 'lxml')
            shuhouhulilist = soup.select("#surgery_after > div > div")
            cols = 0
            for shuhouhuli in shuhouhulilist:
                print(shuhouhuli.text)
                sheet.write(row, cols, shuhouhuli.text)
                cols = cols + 1
            row = row + 1
    workbook.save("xinyanginfo.xls")


def get_finalurl(preurl):
    """Turn one first-level (menu list) URL into a list of {item name: detail URL} dicts."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
    }
    finalurl = []
    try:
        response = json.loads(requests.get(preurl, headers=headers, proxies=proxies).text)
        for info in response:
            try:
                pinyin = info["seo"]["pinyin"]
                finalurl.append({info["name"]: "https://www.soyoung.com/itemk/" + pinyin + "/"})
            except Exception:
                # Entry without the expected seo.pinyin field; log it and move on
                print(info)
    except Exception:
        print(preurl + " is unavailable")
    return finalurl


def scra_data():
    global row
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = workbook.add_sheet('xinyanginfo', cell_overwrite_ok=True)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
    }
    url = ""
    try:
        for i in range(20155, 20244):
            # First-level URL: the JSON item list for one menu id
            url = "https://www.soyoung.com/items/itemList?_json=1&menu_id=" + str(i)
            # Scrape the first-level URL for a list of {name: second-level URL} dicts
            finalurldic = get_finalurl(url)
            # Scrape each second-level URL for the actual info
            for url_dic in finalurldic:
                for k, v in url_dic.items():
                    response = requests.get(v, headers=headers, proxies=proxies)
                    soup = BeautifulSoup(response.text, 'lxml')
                    shuhouhulilist = soup.select("#surgery_after > div > div")
                    cols = 2
                    sheet.write(row, 0, k)
                    sheet.write(row, 1, v)
                    for shuhouhuli in shuhouhulilist:
                        sheet.write(row, cols, shuhouhuli.text)
                        cols = cols + 1
                    row = row + 1
    except Exception:
        # Save whatever has been scraped so far and print the URL that failed
        workbook.save("xinyanginfo.xls")
        print(url)
    workbook.save("xinyanginfo.xls")


scra_data()
```
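The parsing in `get_finalurl` implies that the `itemList` endpoint returns a JSON array whose entries carry at least a `name` field and a nested `seo.pinyin` slug. The snippet below is a hypothetical illustration of that shape, inferred from the code rather than captured from the API:

```python
# Hypothetical entry shape inferred from get_finalurl's parsing; not real API output.
sample_entry = {"name": "双眼皮", "seo": {"pinyin": "shuangyanpi"}}
# get_finalurl would map it to a second-level URL dict:
# {"双眼皮": "https://www.soyoung.com/itemk/shuangyanpi/"}
```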
Recording the scraping code here for reference. Because of SoYoung's anti-scraping policy, the proxy has to be swapped out frequently; roughly four runs should be enough to collect the full data set. A rough sketch of automating that rotation follows.
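Since the free proxies get blocked quickly, one way to reduce the manual swapping is to cycle through a small pool and retry on failure. This is a minimal sketch, not part of the original script; `PROXY_POOL` and its addresses are placeholders you would refill before each run:

```python
import itertools

import requests

# Placeholder pool; refill with fresh proxies before each run.
PROXY_POOL = itertools.cycle([
    {"http": "http://49.70.64.155:9999", "https": "http://59.57.148.70:9999"},
    # ...add more proxies here...
])


def get_with_rotation(url, headers, retries=4):
    """Try the request through successive proxies, rotating past blocked ones."""
    for _ in range(retries):
        proxy = next(PROXY_POOL)
        try:
            resp = requests.get(url, headers=headers, proxies=proxy, timeout=10)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            continue  # dead or blocked proxy; move to the next one
    return None
```

Replacing the direct `requests.get(v, headers=headers, proxies=proxies)` calls with `get_with_rotation(v, headers)` would let a single run survive a few blocked proxies instead of needing a full restart.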