import requests
from bs4 import BeautifulSoup


def catchSoup(url):
    # Fetch a page and return its parsed BeautifulSoup tree.
    # Example: url = 'http://www.18ladys.com/post/buchong/'
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    return soup


def kindSearch(soup):
    # Collect every herb category as [name, link], skipping the "首页" (home) entry.
    herbKind = []
    for new in soup.select('li'):
        if new.text != '首页':
            perKind = [new.text, new.select('a')[0].attrs['href']]
            herbKind.append(perKind)
    return herbKind


def nameSearch(soup):
    # Extract herb names from the <h3> titles, trimming the boilerplate suffixes
    # "图片" and "的功效与作用" as well as stray non-breaking spaces.
    herbName = []
    for new in soup.select('h3'):
        pername = new.text.split('_')[0].rstrip('图片').lstrip('\xa0')
        pername = pername.rstrip('的功效与作用')
        herbName.append(pername)
    return herbName


def perPage(soup):
    # Collect a category's pagination links, dropping the first and last
    # entries (the previous/next-page controls).
    kindPage = []
    for new in soup.select('.post.pagebar'):
        for detail in new.select('a'):
            kindPage.append([detail.text, detail.attrs['href']])
    return kindPage[1:-1]


def herbDetail(kind):
    # Crawl every page of one category and return the herb names found on each page.
    soup = catchSoup('http://www.18ladys.com/post/buchong/')
    kindName, adds = kindSearch(soup)[kind]
    totalRecord = []
    print('Crawling ' + str(kind) + '.' + kindName)
    totalRecord.append(nameSearch(catchSoup(adds)))
    for add in perPage(catchSoup(adds)):
        pageAdd = add[1]
        totalRecord.append(nameSearch(catchSoup(pageAdd)))
    print(totalRecord)
    return totalRecord


if __name__ == '__main__':
    totalKind = kindSearch(catchSoup('http://www.18ladys.com/post/buchong/'))
    detailContent = ''
    kind = 0
    while kind < 20:
        totalRecord = herbDetail(kind)
        if kind == 0:
            # First write the table of contents: the numbered category list.
            detailContent += '目录: '
            for i, item in enumerate(totalKind, 1):
                detailContent += str(i) + '.' + item[0] + ' '
        else:
            # Then append the numbered herb names of this category.
            detailContent += ' ' + str(totalKind[kind][0]) + ': '
            names = [name for page in totalRecord for name in page]
            for i, name in enumerate(names, 1):
                detailContent += str(i) + '.' + name + ' '
        kind += 1
    with open('herbDetail.txt', 'a+', encoding='utf-8') as f:
        f.write(detailContent)
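

# Optional sketch, not part of the original script: catchSoup has no timeout or
# error handling, so a slow or flaky connection will raise mid-crawl. The helper
# below assumes the same requests/BeautifulSoup stack and simply retries with a
# timeout; the function name, retry count, and delay are illustrative choices.
import time


def catchSoupSafe(url, retries=3, delay=2):
    for attempt in range(retries):
        try:
            res = requests.get(url, timeout=10)
            res.raise_for_status()          # surface HTTP errors explicitly
            res.encoding = 'utf-8'
            return BeautifulSoup(res.text, 'html.parser')
        except requests.RequestException:
            time.sleep(delay)               # brief pause before retrying
    return None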