zoukankan      html  css  js  c++  java
  • 爬虫大作业

    import requests
    from bs4 import BeautifulSoup


    def catchSoup(url, timeout=10):
        """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

        The response body is always decoded as UTF-8, regardless of the
        charset the server advertises.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` waits indefinitely, so one stalled
                connection would hang the whole crawl.

        Returns:
            A ``BeautifulSoup`` object built with the ``html.parser`` backend.

        Raises:
            requests.exceptions.RequestException: On connection failure or
                timeout.
        """
        res = requests.get(url, timeout=timeout)
        res.encoding = 'utf-8'
        return BeautifulSoup(res.text, 'html.parser')

    def kindSearch(soup):
        """Collect the category links found in the ``<li>`` elements of a page.

        Args:
            soup: Parsed page; only its ``select`` method is used.

        Returns:
            A list of ``[text, href]`` pairs, one per ``<li>`` whose text is
            not '首页' and whose first ``<a>`` carries an ``href``. List items
            without a link are skipped instead of raising.
        """
        herbKind = []
        for new in soup.select('li'):
            if new.text == '首页':
                continue
            anchors = new.select('a')
            if not anchors:
                # Plain list item with no link: nothing to crawl.
                continue
            href = anchors[0].attrs.get('href')
            if href is None:
                continue
            herbKind.append([new.text, href])
        return herbKind


    def nameSearch(soup):
        """Extract the cleaned item names from the ``<h3>`` headings of a page.

        For each heading the text before the first '_' is kept, a trailing
        '图片' is removed, leading non-breaking spaces are stripped, and
        finally a trailing '读书' is removed.

        Args:
            soup: Parsed page; only its ``select`` method is used.

        Returns:
            A list with one cleaned name (``str``) per ``<h3>`` element.
        """
        def strip_suffix(text, suffix):
            # str.rstrip() treats its argument as a *set of characters*, so it
            # would also eat names that merely end in one of them; remove the
            # exact suffix instead.
            if text.endswith(suffix):
                return text[:-len(suffix)]
            return text

        herbName = []
        for new in soup.select('h3'):
            pername = new.text.split('_')[0]
            # '\xa0' is the non-breaking space that pads the heading text.
            pername = strip_suffix(pername, '图片').lstrip('\xa0')
            pername = strip_suffix(pername, '读书')
            herbName.append(pername)
        return herbName

    def perPage(soup):
        """List the pagination links of a page, minus the first and last one.

        Every ``<a>`` inside elements matching ``.post.pagebar`` is collected
        as ``[text, href]``; the first and last collected entries are then
        dropped.

        Args:
            soup: Parsed page; only its ``select`` method is used.

        Returns:
            A list of ``[text, href]`` pairs. Empty when the page has fewer
            than three pager links (including no pager at all).
        """
        kindPage = []
        for new in soup.select('.post.pagebar'):
            for detail in new.select('a'):
                kindPage.append([detail.text, detail.attrs['href']])
        # Slice by position: list.remove() deletes the first *equal* entry,
        # which is the wrong one when a link repeats, and indexing [0]/[-1]
        # raises on an empty list.
        return kindPage[1:-1]
    def herbDetail(kind):
        """Crawl every page of one category and return the names found.

        The category list is read from the fixed Sina article URL below; the
        entry at position ``kind`` supplies the category name and the address
        of its first page. Progress and the final result are printed.

        Args:
            kind: Zero-based index into the list returned by ``kindSearch``.

        Returns:
            A list of name lists: the first element holds the names from the
            category's first page, followed by one list per pagination link
            returned by ``perPage``.

        Raises:
            IndexError: If ``kind`` is outside the category list.
        """
        soup = catchSoup('http://cul.news.sina.com.cn/topline/2018-04-24/doc-ifzqvvsa2785251.shtml')
        kinds = kindSearch(soup)
        kindName = kinds[kind][0]
        adds = kinds[kind][1]
        print("正在爬取 " + str(kind) + '.' + kindName)

        # Download the first page once and reuse it for both the names and
        # the pagination links.
        firstPage = catchSoup(adds)
        totalRecord = [nameSearch(firstPage)]
        for add in perPage(firstPage):
            pageAdd = add[1]
            totalRecord.append(nameSearch(catchSoup(pageAdd)))
        print(totalRecord)
        return totalRecord


    if __name__ == "__main__":
        # Script entry point: crawl up to the first 20 categories and append
        # a one-line summary to herbDetail.txt.
        totalKind = kindSearch(catchSoup('http://cul.news.sina.com.cn/topline/2018-04-24/doc-ifzqvvsa2785251.shtml'))
        totalRecord = []
        pieces = []
        # Never index past the categories that were actually found.
        for kind in range(min(20, len(totalKind))):
            # NOTE(review): for kind 0 the result is unused; the call is kept
            # so the console progress output stays the same.
            totalRecord = herbDetail(kind)
            if kind == 0:
                # The first slot is used for the table of contents.
                pieces.append('目录: ')
                for number, item in enumerate(totalKind, start=1):
                    pieces.append(str(number) + '.' + item[0] + ' ')
                continue
            pieces.append(' ' + str(totalKind[kind][0]) + ': ')
            for number, page in enumerate(totalRecord, start=1):
                if not page:
                    # A page without any heading has no first name to report.
                    continue
                pieces.append(str(number) + '.' + page[0] + ' ')
        detailContent = ''.join(pieces)

        # 'a+' appends, so repeated runs accumulate in the same file.
        with open('herbDetail.txt', 'a+', encoding='utf-8') as f:
            f.write(detailContent)

  • 相关阅读:
    [shell]Shell经常使用特殊符号
    谈谈大三找暑假实习
    使用zTree控件制作的表格形式的树形+数据菜单
    Bestcoder #47 B Senior's Gun
    使用awrextr.sql导出awr原始数据
    linux/shell 文本文件删除/删掉空行
    python 统计文本文件的行数
    check if a linux process is done using bash 检查进程是否在运行
    umount移动硬盘遇到device is busy问题
    Python读写文件
  • 原文地址:https://www.cnblogs.com/yh5788lz/p/8970978.html
Copyright © 2011-2022 走看看