zoukankan      html  css  js  c++  java
  • 爬取知乎 关于 英文名

      主类:

    from Dao.Requests_dao import Rqst
    from Dao.Csv_dao import Csv
    
    def paqu(x):
        """Crawl answers of the Zhihu "English names" topic and store them.

        The function first checks that the topic page answers, then asks
        ``Rqst.find_all_question`` for ``x`` answer URLs, scrapes each URL with
        ``Rqst.get_content`` and appends every successfully scraped record to
        the CSV file through ``Csv.save``.

        Args:
            x: Number of answer URLs to request from ``find_all_question``.
        """
        print('爬取:知乎“英文取名”:')
        url = 'https://www.zhihu.com/topic/19561718/top-answers'
        scraper = Rqst()  # crawling helper
        storage = Csv()  # persistence helper

        # Reachability check of the topic page. A failure is only reported and
        # the crawl still goes on, exactly as before.
        print('首页url:' + url)
        try:
            docx = scraper._init_(url)
            docx.raise_for_status()
        except Exception as exc:
            # Top-level boundary of the script: report the cause, but unlike a
            # bare ``except`` let KeyboardInterrupt / SystemExit through.
            print('error:首页连接失败!', exc)
        else:
            print('首页连接成功!')
        print('------------------------------------------------------------------------------')

        # Visit every answer URL, printing a 1-based progress counter.
        for number, answer_url in enumerate(scraper.find_all_question(x), start=1):
            print(number)
            data = scraper.get_content(answer_url)
            if data is None:
                # get_content has already reported the failure and there is
                # nothing to write for this answer.
                continue
            storage.save(data)
    
    # Script entry point: when run directly, crawl with a request for one answer URL.
    if __name__ == '__main__':
    	paqu(1)
    

      爬取类:

    import requests
    from bs4 import BeautifulSoup
    import urllib
    import json
    
    import re
    import os
    
    # import io
    # import sys
    # sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')
    class Rqst:
        """Scraper for the answers listed under Zhihu topic 19561718.

        Answer URLs are collected from the topic's JSON "essence" feed; each
        answer page is then parsed with BeautifulSoup and its first page of
        root comments is read from the comments API.
        """

        # Browser-like User-Agent sent with every request.
        HEADERS = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
        }

        # Feed endpoint of the topic; the item offset is appended to this string.
        FEED_API = 'https://www.zhihu.com/api/v4/topics/19561718/feeds/essence?include=data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Dpeople%29%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Darticle%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dquestion%29%5D.target.annotation_detail%2Ccomment_count%3B&limit=10&offset='

        # Offset step between feed pages; matches ``limit=10`` in FEED_API.
        PAGE_SIZE = 10

        # One answer URL per line; remembers what earlier runs already returned.
        SEEN_URLS_FILE = 'saveurl.csv'

        def _init_(self, url, timeout=10):
            """Download ``url`` and return the ``requests`` response.

            Despite its name this is an ordinary method, not the constructor
            (``__init__``); callers invoke it explicitly.

            Args:
                url: Address to fetch.
                timeout: Seconds to wait for the server, so that a stalled
                    connection cannot block the crawl forever.

            Returns:
                The response object, with its text encoding forced to UTF-8.

            Raises:
                requests.RequestException: If the request cannot be completed.
            """
            docx = requests.get(url, headers=self.HEADERS, timeout=timeout)
            docx.encoding = 'utf-8'
            return docx

        def _load_seen_urls(self):
            """Return the set of answer URLs recorded in SEEN_URLS_FILE."""
            try:
                with open(self.SEEN_URLS_FILE, 'r', encoding='utf-8-sig') as f:
                    return {line.strip() for line in f if line.strip()}
            except FileNotFoundError:
                return set()

        @staticmethod
        def _answer_url(item):
            """Build the answer page URL for one feed item.

            Returns ``None`` when the item carries no question id or answer id
            (it is then not an answer to a question).
            """
            target = item.get('target') if isinstance(item, dict) else None
            if not isinstance(target, dict):
                return None
            question = target.get('question')
            if not isinstance(question, dict):
                return None
            question_id = question.get('id')
            answer_id = target.get('id')
            if question_id is None or answer_id is None:
                return None
            return ('https://www.zhihu.com/question/' + str(question_id)
                    + '/answer/' + str(answer_id))

        def find_all_question(self, number):
            """Collect up to ``number`` answer URLs that were not returned before.

            The topic page is rendered dynamically, so the URLs are read from
            the JSON feed, one page of PAGE_SIZE items at a time. Every URL
            handed out is appended to SEEN_URLS_FILE and is skipped by later
            calls and later runs.

            Args:
                number: How many new answer URLs are wanted.

            Returns:
                A list of answer URLs. It is shorter than ``number`` when the
                feed runs out of items first, and empty when ``number`` <= 0.

            Raises:
                requests.RequestException: If a feed page cannot be downloaded.
                ValueError: If a feed page is not valid JSON.
            """
            seen = self._load_seen_urls()  # read once instead of once per page
            rs = []
            offset = 0
            while len(rs) < number:
                docx = self._init_(self.FEED_API + str(offset))
                offset += self.PAGE_SIZE
                # Parse the page once; a missing "data" key counts as empty.
                items = json.loads(docx.text).get('data') or []
                if not items:
                    # End of the feed: stop instead of indexing past the data
                    # or requesting further pages forever.
                    break
                fresh = []
                for item in items:
                    answer_url = self._answer_url(item)
                    # Skip only this entry when it is unusable or already
                    # known; the remaining entries of the page are still
                    # examined.
                    if answer_url is None or answer_url in seen:
                        continue
                    seen.add(answer_url)
                    fresh.append(answer_url)
                    if len(rs) + len(fresh) >= number:
                        break
                if fresh:
                    with open(self.SEEN_URLS_FILE, 'a', encoding='utf-8-sig') as f:
                        f.writelines(answer_url + '\n' for answer_url in fresh)
                    rs.extend(fresh)
            return rs

        def get_content_question(self, bs):
            """Return the question title: the <h1> text inside ``QuestionHeader``.

            Raises:
                AttributeError: If the page does not contain that structure.
            """
            return bs.find('div', {'class': 'QuestionHeader'}).find('h1').text

        @staticmethod
        def _clean_answer_text(text):
            """Make answer text safe for a single comma-separated cell.

            '/' becomes 'or', line breaks are removed and ASCII commas become
            fullwidth commas (U+FF0C) so they cannot split a CSV row.
            """
            return text.replace('/', 'or').replace('\n', '').replace(',', '\uff0c')

        def get_content_answerContent(self, bs):
            """Return the answer body: the cleaned text of all its <p> elements.

            Raises:
                AttributeError: If ``QuestionAnswer-content`` is missing.
            """
            paragraphs = bs.find('div', {'class': 'QuestionAnswer-content'}).find_all('p')
            return self._clean_answer_text(''.join(p.text for p in paragraphs))

        @staticmethod
        def _meta_content(container, itemprop):
            """Return the ``content`` of the <meta itemprop=...> tag, or None."""
            tag = container.find('meta', {'itemprop': itemprop})
            if tag is None:
                return None
            return tag.get('content')

        def get_content_authorInfo(self, bs):
            """Return five labelled strings describing the answer and its author.

            The list holds, in order: author name, author description, upvote
            count, creation date and modification date. A field that is not on
            the page gets a placeholder instead of raising.

            Raises:
                AttributeError: If the answer item itself is missing.
            """
            b1 = bs.find('div', {'class': 'ContentItem AnswerItem'})

            # Author name; no profile link is reported as an anonymous user.
            info = b1.find('div', {'class': 'AuthorInfo-content'})
            name_link = None
            if info is not None:
                name_link = info.find('a', {'data-za-detail-view-element_name': 'User'})
            if name_link is not None:
                author = '作者名称:' + name_link.text
            else:
                author = '作者名称:匿名用户'

            # Author description; line breaks become fullwidth commas (U+FF0C)
            # so the text stays on one line without splitting a CSV row.
            badge = b1.find('div', {'class': 'ztext AuthorInfo-badgeText'})
            if badge is not None:
                author_describe = '作者描述:' + badge.text.replace('\n', '\uff0c')
            else:
                author_describe = '作者描述:无信息'

            # Number of upvotes of the answer.
            upvotes = self._meta_content(b1, 'upvoteCount')
            if upvotes is not None:
                author_agreeNumber = '赞同人数:' + upvotes
            else:
                author_agreeNumber = '赞同人数:无信息'

            # Creation date, e.g. 2019-01-02T03:04:05.000Z -> 2019-01-02*03:04:05
            created = self._meta_content(b1, 'dateCreated')
            if created is not None:
                author_dateCreated = '发布日期:' + created.replace('T', '*').replace('.000Z', '')
            else:
                author_dateCreated = '发布日期:无信息'

            # Last modification date, same format.
            modified = self._meta_content(b1, 'dateModified')
            if modified is not None:
                author_dateModified = '编辑日期:' + modified.replace('T', '*').replace('.000Z', '')
            else:
                author_dateModified = '编辑日期:无信息'

            return [
                author,
                author_describe,
                author_agreeNumber,
                author_dateCreated,
                author_dateModified,
            ]

        @staticmethod
        def _format_comment(user, content):
            """Return one comment as ``#user#:text`` with all HTML tags removed.

            Line breaks are dropped first so that no tag spans two lines, and
            ASCII commas in the comment become fullwidth commas (U+FF0C).
            """
            text = content.replace('\n', '').replace(',', '\uff0c')
            return re.sub(r'<.*?>', '', '#' + user + '#:' + text)

        def get_content_discuss(self, bs):
            """Return the formatted root comments of the answer shown on ``bs``.

            The answer id is taken from the ``name`` attribute of the answer
            item and only the first page of the comments API (limit=20,
            offset=0) is requested.

            Raises:
                AttributeError: If the answer card is missing from the page.
                KeyError: If a comment lacks the author name or the content.
                requests.RequestException: If the comments API request fails.
            """
            rs = bs.find('div', {'class': 'Card AnswerCard'}).find('div', {'class': 'ContentItem AnswerItem'})
            url = 'https://www.zhihu.com/api/v4/answers/' + rs.get('name') + '/root_comments?order=normal&limit=20&offset=0&status=open'
            docx = self._init_(url)
            # A payload without "data" is treated as "no comments".
            comments = json.loads(docx.text).get('data') or []
            return [
                self._format_comment(c['author']['member']['name'], c['content'])
                for c in comments
            ]

        def get_content(self, url):
            """Scrape one answer page into a record.

            Args:
                url: Address of the answer page.

            Returns:
                A dict with the keys ``question``, ``url``, ``content``,
                ``discusses`` and ``author_data``, or ``None`` when the page
                cannot be downloaded or parsed (the reason is printed).
            """
            print('链接:', url)
            try:
                docx = self._init_(url)
                docx.raise_for_status()
            except requests.RequestException as exc:
                # Nothing to parse without a successful response.
                print('error:连接失败!', exc)
                return None
            print('连接成功!')

            print('正在爬取数据。。。')
            try:
                bs = BeautifulSoup(docx.text, 'html.parser')
                question = self.get_content_question(bs)
                content = self.get_content_answerContent(bs)
                author_data = self.get_content_authorInfo(bs)
                discusses = self.get_content_discuss(bs)
            except (AttributeError, KeyError, TypeError, ValueError,
                    requests.RequestException) as exc:
                # Unexpected page layout, malformed JSON or a failed comments
                # request: report it and give up on this answer only.
                print('error:数据爬取失败!', exc)
                return None
            print('数据爬取成功!')
            return {
                'question': question,
                'url': url,
                'content': content,
                'discusses': discusses,
                'author_data': author_data,
            }
    		
    
    		
    

      保存类:

    import csv
    import os
    class Csv():
        """Appends scraped answer records to a CSV file."""

        # Column titles written once, when the file is new or still empty.
        HEAD = ['问题', '链接', '回答', '答主信息', '讨论']

        def save(self, data, path='save.csv'):
            """Append one scraped answer to the CSV file.

            The first row of a record holds the question, URL, answer text and
            the first author-info / comment entries. Further author-info and
            comment entries follow on rows whose first three cells are empty,
            and a blank line closes the record. Cells are written with the
            ``csv`` module, so commas and quotes inside a value are quoted
            instead of shifting the columns.

            Args:
                data: Dict with the keys ``question``, ``url``, ``content``,
                    ``author_data`` (list) and ``discusses`` (list, may be
                    empty).
                path: Destination file; defaults to ``save.csv``.

            Failures are reported on stdout and never raised.
            """
            print('正在保存数据。。。')
            if not data:
                # Nothing was scraped for this answer.
                print('error:保存失败!')
                return
            try:
                authors = list(data['author_data'])
                comments = list(data['discusses'])
                # Pad both columns to the same length (at least one row), so an
                # answer without comments is still saved.
                depth = max(len(authors), len(comments), 1)
                authors += [''] * (depth - len(authors))
                comments += [''] * (depth - len(comments))

                # Build every row before touching the file: malformed data then
                # cannot leave a header-only or half-written record behind.
                rows = [[data['question'], data['url'], data['content'],
                         authors[0], comments[0]]]
                rows.extend(
                    ['', '', '', author, comment]
                    for author, comment in zip(authors[1:], comments[1:])
                )

                need_header = not os.path.exists(path) or os.path.getsize(path) == 0
                with open(path, 'a', encoding='utf-8-sig', newline='') as f:
                    writer = csv.writer(f, lineterminator='\n')
                    if need_header:
                        writer.writerow(self.HEAD)
                    writer.writerows(rows)
                    f.write('\n')  # blank line separating records
            except (OSError, KeyError, TypeError, ValueError) as exc:
                print('error:保存失败!', exc)
                return
            print('Save successfully!')
    

      

  • 相关阅读:
    PHPExcel读取excel03/07版到数组
    firefox 自定义快捷键
    phpstorm 自定义函数配置
    解决FPDF报错:FPDF error: Not a JPEG file / FPDF error: Not a PNG file
    mysql学习笔记
    定时备份服务器数据库(借助windows任务计划以及mysqldump)
    discuz 注册用户用到的几个表
    phpstorm配置取消掉63342
    discuz random函数
    discuz X3.2邮箱非必填
  • 原文地址:https://www.cnblogs.com/yizhixiang/p/12077450.html
Copyright © 2011-2022 走看看