爬虫大作业 - 走看看

zoukankan html css js c++ java

爬虫大作业

import requests
from bs4 import BeautifulSoup

def catchSoup(url, timeout=10):
    """Download *url* and return the page parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection blocks the crawl forever.

    Returns:
        A ``BeautifulSoup`` document built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    res = requests.get(url, timeout=timeout)
    # Decode the body as UTF-8 regardless of what the response headers say.
    res.encoding = 'utf-8'
    return BeautifulSoup(res.text, 'html.parser')

def kindSearch(soup):
    """Collect the category links found in the page's ``<li>`` elements.

    Args:
        soup: Parsed document exposing ``select`` (e.g. a BeautifulSoup tree).

    Returns:
        A list of ``[text, href]`` pairs, one per ``<li>`` whose text is not
        ``'首页'``, where ``href`` comes from the first ``<a>`` inside the item.
        Items without an ``<a>``, or whose first ``<a>`` has no ``href``,
        are skipped.
    """
    herbKind = []
    for item in soup.select('li'):
        # The home-page entry is not a category.
        if item.text == '首页':
            continue
        links = item.select('a')
        # Plain list items (no link / no href) used to raise IndexError or
        # KeyError here; they carry no category address, so skip them.
        if not links or 'href' not in links[0].attrs:
            continue
        herbKind.append([item.text, links[0].attrs['href']])
    return herbKind

def _strip_suffix(text, suffix):
    """Return *text* without one trailing *suffix*; unchanged if absent."""
    if suffix and text.endswith(suffix):
        return text[:-len(suffix)]
    return text


def nameSearch(soup):
    """Extract the entry names from the ``<h3>`` headings of a page.

    For every heading, the text before the first ``'_'`` is kept, a trailing
    ``'图片'`` is removed, leading non-breaking spaces are stripped, and
    finally a trailing ``'读书'`` is removed.

    Args:
        soup: Parsed document exposing ``select`` (e.g. a BeautifulSoup tree).

    Returns:
        A list of cleaned names, one per ``<h3>``, in document order.
    """
    herbName = []
    for heading in soup.select('h3'):
        pername = heading.text.split('_')[0]
        # str.rstrip() takes a *set of characters*, so rstrip('图片') also ate
        # legitimate trailing '图'/'片' characters of the name itself.
        # Remove the exact suffix instead.
        pername = _strip_suffix(pername, '图片')
        # The original literal read 'xa0' (backslash lost), which stripped the
        # ASCII letters x/a/0 rather than the non-breaking space U+00A0.
        pername = pername.lstrip('\xa0')
        pername = _strip_suffix(pername, '读书')
        herbName.append(pername)
    return herbName

def perPage(soup):
    """Return the pagination links of a page, minus the first and last one.

    Every ``<a>`` inside elements matching ``.post.pagebar`` is collected as
    a ``[text, href]`` pair; the first and the last collected pair are then
    dropped.

    Args:
        soup: Parsed document exposing ``select`` (e.g. a BeautifulSoup tree).

    Returns:
        A list of ``[text, href]`` pairs. Empty when the page has fewer than
        three pagination links.
    """
    kindPage = [
        [link.text, link.attrs['href']]
        for bar in soup.select('.post.pagebar')
        for link in bar.select('a')
    ]
    # Slice by position: list.remove(kindPage[-1]) deletes the *first* equal
    # entry (wrong when a link is repeated) and indexing raised IndexError
    # on pages with fewer than two links.
    return kindPage[1:-1]
def herbDetail(kind):
    """Crawl every page of one category and return the names found.

    The index page is fetched, the category at position *kind* is looked up
    with ``kindSearch``, and ``nameSearch`` is run on the category's first
    page and on each page listed by ``perPage``.

    Args:
        kind: Zero-based position of the category in ``kindSearch``'s result.

    Returns:
        A list of name lists: the first element holds the names of the
        category's first page, followed by one list per pagination link.

    Raises:
        IndexError: If *kind* is outside the range of available categories.
    """
    soup = catchSoup('http://cul.news.sina.com.cn/topline/2018-04-24/doc-ifzqvvsa2785251.shtml')
    # Look the category up once instead of re-running kindSearch per field.
    entry = kindSearch(soup)[kind]
    kindName = entry[0]
    adds = entry[1]
    print("正在爬取 " + str(kind) + '.' + kindName)
    # Download the first page once and reuse it for both the names and the
    # pagination links (it used to be requested twice).
    firstPage = catchSoup(adds)
    totalRecord = [nameSearch(firstPage)]
    for add in perPage(firstPage):
        pageAdd = add[1]
        totalRecord.append(nameSearch(catchSoup(pageAdd)))
    print(totalRecord)
    return totalRecord

if __name__ == "__main__":
    # Script entry point: crawl up to 20 categories and append a one-line
    # summary (table of contents + per-category entries) to herbDetail.txt.
    totalKind = kindSearch(catchSoup('http://cul.news.sina.com.cn/topline/2018-04-24/doc-ifzqvvsa2785251.shtml'))
    parts = []
    # Never index past the categories that actually exist; the fixed bound of
    # 20 raised IndexError on pages listing fewer categories.
    for kind in range(min(20, len(totalKind))):
        totalRecord = herbDetail(kind)
        if kind == 0:
            # NOTE(review): the crawl result for category 0 is discarded and
            # only the table of contents is written for it — confirm intent.
            parts.append('目录： ')
            # enumerate() numbers by position; list.index() was quadratic and
            # gave repeated numbers for duplicate entries.
            for number, entry in enumerate(totalKind, 1):
                parts.append(str(number) + '.' + entry[0] + ' ')
            continue
        parts.append(' ' + str(totalKind[kind][0]) + ': ')
        for number, names in enumerate(totalRecord, 1):
            # Only the first name of each crawled page is recorded; pages
            # that yielded no names are skipped instead of raising IndexError.
            if names:
                parts.append(str(number) + '.' + names[0] + ' ')
    detailContent = ''.join(parts)

    # The context manager closes the file even if the write fails.
    with open('herbDetail.txt', 'a+', encoding='utf-8') as f:
        f.write(detailContent)

查看全文

相关阅读:
网页加载进度条
 【转载】通俗易懂，什么是.NET?什么是.NET Framework？什么是.NET Core?
前端知识复习：Html DIV 图文混排（文字放在图片下边）
NOPI导出Excel
C# 发送邮件
 DataSetToList 和 DataTableTolist 转换
 一个修改版的PHP ajax Tree树形菜单
 你的站为什么百度无动于衷——10年老站长的SEO肺腑之言 .
关于php使用phpqrcode生成二维码的完整源码下载
 php结合phpqrcode生成带图片LOGO的二维码

原文地址：https://www.cnblogs.com/yh5788lz/p/8970978.html