zoukankan      html  css  js  c++  java
  • 爬虫大作业

    import requests
    from bs4 import BeautifulSoup
    
    
    def catchSoup(url, timeout=10):
        """Download ``url`` and parse the response into a BeautifulSoup tree.

        Args:
            url: Address of the page to fetch, for example
                ``'http://www.18ladys.com/post/buchong/'``.
            timeout: Seconds to wait for the server before ``requests`` gives
                up. Without a timeout a stalled connection blocks forever.

        Returns:
            A ``BeautifulSoup`` object built with the ``html.parser`` backend.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                times out.
        """
        res = requests.get(url, timeout=timeout)
        # Decode the body as UTF-8 regardless of the encoding requests detected.
        res.encoding = 'utf-8'
        return BeautifulSoup(res.text, 'html.parser')
    
    def kindSearch(soup):
        """Collect the text and first link of every ``<li>`` element in ``soup``.

        The item whose text is exactly ``首页`` is left out. Items that contain
        no ``<a>`` element are skipped as well, so a link-less list item cannot
        raise ``IndexError``.

        Args:
            soup: Parsed page exposing ``select``.

        Returns:
            A list of ``[text, href]`` two-item lists in document order.
        """
        herbKind = []
        for item in soup.select('li'):
            if item.text == '首页':
                continue
            links = item.select('a')
            if not links:
                # Nothing to follow for this entry.
                continue
            herbKind.append([item.text, links[0].attrs['href']])
        return herbKind
    
    
    def _stripSuffix(text, suffix):
        """Return ``text`` without ``suffix`` if it ends with it, else unchanged."""
        if suffix and text.endswith(suffix):
            return text[:-len(suffix)]
        return text


    def nameSearch(soup):
        """Extract the herb name from every ``<h3>`` heading in ``soup``.

        For each heading the text before the first ``_`` is taken, a trailing
        ``图片`` is removed, leading non-breaking spaces are dropped, and a
        trailing ``的功效与作用`` is removed.

        The suffixes are removed as whole strings. ``str.rstrip`` would treat
        its argument as a set of characters and could eat the end of a name
        such as ``白附片``.

        Args:
            soup: Parsed page exposing ``select``.

        Returns:
            A list of cleaned names, one per ``<h3>``, in document order.
        """
        herbName = []
        for heading in soup.select('h3'):
            pername = heading.text.split('_')[0]
            pername = _stripSuffix(pername, '图片')
            pername = pername.lstrip('\xa0')
            pername = _stripSuffix(pername, '的功效与作用')
            herbName.append(pername)
        return herbName
    
    def perPage(soup):
        """List the pagination links of a category page, minus the outer two.

        Every ``<a>`` inside elements matching ``.post.pagebar`` is collected
        as ``[text, href]``. The first and last collected links are dropped
        (presumably the bar's navigation links; confirm against the site
        markup).

        Dropping is done by position, so a page with fewer than two links
        yields an empty list instead of raising, and duplicate entries cannot
        cause the wrong element to be removed.

        Args:
            soup: Parsed page exposing ``select``.

        Returns:
            A list of ``[text, href]`` two-item lists.
        """
        kindPage = []
        for bar in soup.select('.post.pagebar'):
            for link in bar.select('a'):
                kindPage.append([link.text, link.attrs['href']])
        return kindPage[1:-1]
    def herbDetail(kind):
        """Crawl every page of one category and gather the herb names on each.

        The category list is read from the index page, the category at
        position ``kind`` is opened, and ``nameSearch`` is run on its first
        page and on every page linked from its pagination bar.

        Args:
            kind: Zero-based position of the category in the list returned by
                ``kindSearch`` for the index page.

        Returns:
            A list with one entry per crawled page; each entry is the list of
            names ``nameSearch`` found on that page.

        Raises:
            IndexError: If ``kind`` is outside the range of categories found.
        """
        soup = catchSoup('http://www.18ladys.com/post/buchong/')
        # Parse the index once and unpack the [name, link] pair.
        kindName, adds = kindSearch(soup)[kind]
        print("正在爬取 " + str(kind) + '.' + kindName)

        # Fetch the first page once; it supplies both its names and the pagebar.
        firstPage = catchSoup(adds)
        totalRecord = [nameSearch(firstPage)]
        for add in perPage(firstPage):
            pageAdd = add[1]
            totalRecord.append(nameSearch(catchSoup(pageAdd)))
        print(totalRecord)
        return totalRecord
    
    
    if __name__=="__main__":
        # Crawl at most this many categories from the index page.
        maxKinds = 20

        totalKind = kindSearch(catchSoup('http://www.18ladys.com/post/buchong/'))

        # Directory line: "1.<name> 2.<name> ..." for every category found.
        # enumerate keeps the numbering right even if two entries are equal.
        parts = ['目录:\n']
        for number, entry in enumerate(totalKind, 1):
            parts.append(str(number) + '.' + entry[0] + ' ')

        # NOTE(review): category 0 appears only in the directory; its records
        # are neither crawled nor written. Confirm this is intended.
        # The upper bound is clamped so a shorter category list cannot overrun.
        for kind in range(1, min(maxKinds, len(totalKind))):
            totalRecord = herbDetail(kind)
            parts.append('\n' + str(totalKind[kind][0]) + ':\n')
            # NOTE(review): only the first name of each page is written;
            # confirm whether every name on the page was meant to be listed.
            for number, page in enumerate(totalRecord, 1):
                if page:  # a page without any heading contributes nothing
                    parts.append(str(number) + '.' + page[0] + ' ')

        detailContent = ''.join(parts)

        # Written inside the main guard so importing this module has no side
        # effects; the context manager closes the file even if write fails.
        with open('herbDetail.txt', 'a+', encoding='utf-8') as f:
            f.write(detailContent)





  • 相关阅读:
    Data Structure 之 KMP字符串匹配算法
    网站建站流程
    常用算法稳定性分析
    VSS错误:The Sourcesafe Web service cannot be accessed at the specified address
    Data Structure 之 指针
    Windows 之 CMD命令
    Data Structure 之 算法设计策略
    Data Structure 之 最优二叉树
    板级支持包(BSP)
    JPEG
  • 原文地址:https://www.cnblogs.com/SOLARLKS/p/8970848.html
Copyright © 2011-2022 走看看