zoukankan      html  css  js  c++  java
  • 爬虫大作业

    import requests
    from bs4 import BeautifulSoup
    
    
    def catchSoup(url, timeout=10):
        """Download *url* and return its HTML parsed with BeautifulSoup.

        Args:
            url: Address of the page to fetch, for example
                'http://www.18ladys.com/post/buchong/'.
            timeout: Seconds to wait for the server. Without a timeout,
                ``requests.get`` can block forever on a stalled connection,
                which would hang the whole crawl.

        Returns:
            The BeautifulSoup tree built by 'html.parser' from the response
            text, decoded as UTF-8.
        """
        res = requests.get(url, timeout=timeout)
        # Force UTF-8 instead of relying on the encoding guessed by requests.
        res.encoding = 'utf-8'
        return BeautifulSoup(res.text, 'html.parser')
    
    def kindSearch(soup):
        """Collect the category entries listed in the page's ``<li>`` elements.

        Args:
            soup: Parsed page; only ``soup.select('li')`` is used.

        Returns:
            A list of ``[text, href]`` pairs, one per ``<li>`` whose text is
            not '首页'. ``href`` is taken from the first ``<a>`` inside the
            item. Items without a link, or whose first link has no ``href``,
            are skipped instead of raising IndexError/KeyError.
        """
        herbKind = []
        for item in soup.select('li'):
            if item.text == '首页':
                continue
            links = item.select('a')
            if not links:
                # Plain list items carry no category URL to follow.
                continue
            href = links[0].attrs.get('href')
            if href is None:
                continue
            herbKind.append([item.text, href])
        return herbKind
    
    
    def nameSearch(soup):
        """Extract herb names from the ``<h3>`` headings of a listing page.

        For every heading the text before the first '_' is kept, a trailing
        '图片' is removed, leading non-breaking spaces are stripped, and a
        trailing '的功效与作用' is removed.

        The suffixes are removed as whole strings. ``str.rstrip`` treats its
        argument as a set of characters, so it would also eat name characters
        that happen to be in that set (e.g. '鹿茸片图片' -> '鹿茸').

        Args:
            soup: Parsed page; only ``soup.select('h3')`` is used.

        Returns:
            A list with one cleaned name per heading, in document order.
        """
        def _drop_suffix(text, suffix):
            # Remove `suffix` once, and only when it really ends the text.
            return text[:-len(suffix)] if text.endswith(suffix) else text

        herbName = []
        for heading in soup.select('h3'):
            title = heading.text.split('_')[0]
            title = _drop_suffix(title, '图片')
            # NOTE(review): the page source showed 'xa0' here, which looks like
            # '\xa0' with its backslash lost; confirm against the original script.
            title = title.lstrip('\xa0')
            herbName.append(_drop_suffix(title, '的功效与作用'))
        return herbName
    
    def perPage(soup):
        """Return the pager links of a category page, minus the first and last.

        Every ``<a>`` inside elements matching '.post.pagebar' is collected as
        a ``[text, href]`` pair; the first and the last collected pair are then
        dropped (presumably the current-page and "next/last" controls; verify
        against the site's markup).

        Slicing is used instead of ``list.remove`` so that a page with zero or
        one pager link yields ``[]`` rather than raising IndexError, and so
        that the real last entry is dropped even if an equal pair appears
        earlier in the list.

        Args:
            soup: Parsed category page.

        Returns:
            A list of ``[text, href]`` pairs; empty when there are fewer than
            three pager links.
        """
        kindPage = []
        for bar in soup.select('.post.pagebar'):
            kindPage.extend(
                [anchor.text, anchor.attrs['href']] for anchor in bar.select('a')
            )
        return kindPage[1:-1]
    def herbDetail(kind):
        """Crawl every listing page of one category and collect the herb names.

        The category list is read from
        'http://www.18ladys.com/post/buchong/'. The category's first page and
        each further page reported by ``perPage`` are fetched, and
        ``nameSearch`` is run on each of them.

        The index page and the first category page are each downloaded and
        parsed once and then reused, rather than being requested twice.

        Args:
            kind: Index into the list returned by ``kindSearch``.

        Returns:
            A list of name lists: element 0 holds the names of the first page,
            followed by one list per additional page.

        Raises:
            IndexError: If ``kind`` is outside the category list.
        """
        kinds = kindSearch(catchSoup('http://www.18ladys.com/post/buchong/'))
        kindName, adds = kinds[kind]
        print("正在爬取 "+str(kind)+'.'+kindName)

        firstPage = catchSoup(adds)
        totalRecord = [nameSearch(firstPage)]
        for add in perPage(firstPage):
            # add is a [text, href] pair; only the href is needed here.
            totalRecord.append(nameSearch(catchSoup(add[1])))
        print(totalRecord)
        return totalRecord
    
    
    if __name__=="__main__":
        # Script entry point: build a text report (a directory of all
        # categories followed by per-category name lists) and append it to
        # herbDetail.txt.

        # Upper bound on the category indices that get crawled.
        maxKinds = 20

        totalKind = kindSearch(catchSoup('http://www.18ladys.com/post/buchong/'))

        # Pieces are collected in a list and joined once at the end.
        parts = ['目录:\n']
        for number, entry in enumerate(totalKind, 1):
            parts.append(str(number)+'.'+entry[0]+' ')

        # Index 0 only contributes the directory above, so details start at 1.
        # Crawling category 0 would be wasted work because its result is never
        # written. The range is capped by the real number of categories so a
        # shorter list cannot raise IndexError.
        # NOTE(review): confirm that skipping the details of category 0 is intended.
        for kind in range(1, min(maxKinds, len(totalKind))):
            totalRecord = herbDetail(kind)
            parts.append('\n'+str(totalKind[kind][0])+':\n')
            for number, names in enumerate(totalRecord, 1):
                # NOTE(review): only the first name of each page is written;
                # confirm whether every name on the page was intended.
                if names:
                    parts.append(str(number)+'.'+names[0]+' ')

        detailContent = ''.join(parts)
        # The write stays inside the guard (importing the module must not touch
        # the file) and uses a context manager so the handle is always closed.
        with open('herbDetail.txt', 'a+', encoding='utf-8') as f:
            f.write(detailContent)





  • 相关阅读:
    silverlight重写TextBox PassWordBox
    android使用html开发
    SilverLight中DataGrid显示值转换
    STM32 GPIO 的配置与使用
    开始。。
    VCS使用指令
    【转】关闭 Windows 7中的 6to4 隧道
    DC中关于list、双引号和花括号的使用区别
    解决了microblaze在ISE中例化时自动插入IO buffer
    Recovery time 和 Removal time的概念
  • 原文地址:https://www.cnblogs.com/SOLARLKS/p/8970848.html
Copyright © 2011-2022 走看看