zoukankan      html  css  js  c++  java
  • 爬取贴吧的例子(摘录)

    import requests
    class Spider:
        """Download the first list pages of a Baidu Tieba forum and save them to disk.

        Attributes:
            name: Forum name exactly as supplied by the caller (used in file names).
            url_temp: List-page URL containing one ``{}`` placeholder for the
                ``pn`` offset query parameter.
            headers: HTTP headers sent with every request.
        """

        # Step of the ``pn`` offset between two consecutive list pages.
        PAGE_SIZE = 50
        # Number of list pages fetched by ``run``.
        PAGE_COUNT = 3
        # Seconds to wait on the server before a request is abandoned.
        REQUEST_TIMEOUT = 10

        def __init__(self, name):
            """Prepare the URL template and request headers for forum ``name``."""
            from urllib.parse import quote

            self.name = name
            # Percent-encode the name so characters such as ``&``, ``#`` or
            # ``{}`` cannot corrupt the query string or the later ``str.format``.
            self.url_temp = (
                "https://tieba.baidu.com/f?kw="
                + quote(name, safe="")
                + "&ie=utf-8&pn={}"
            )
            self.headers = {
                "User-Agent": (
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/75.0.3770.100 Safari/537.36"
                )
            }

        def get_url_list(self):
            """Return the list-page URLs, one per page, in page order."""
            return [
                self.url_temp.format(page_index * self.PAGE_SIZE)
                for page_index in range(self.PAGE_COUNT)
            ]

        def parse_url(self, url):
            """Send a GET request to ``url`` and return the ``requests`` response.

            A timeout is passed so that an unresponsive server cannot block
            the crawl forever; ``requests`` raises its own exception on expiry.
            """
            return requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )

        def save_html_str(self, html_str, page_num):
            """Write ``html_str`` as UTF-8 to the file for page ``page_num``.

            The target directory is created when it does not exist yet.
            """
            import os

            file_path = "/tmp/tieba/{}吧_第{}页".format(self.name, page_num)
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(html_str)

        def run(self):
            """Fetch every list page and save each one under its 1-based page number."""
            # enumerate yields the page number directly instead of searching
            # the list again for every URL.
            for page_num, url in enumerate(self.get_url_list(), start=1):
                html_str = self.parse_url(url).content.decode()
                self.save_html_str(html_str, page_num)
    
    def main():
        """Ask the user for a forum name, then crawl that forum."""
        forum_name = input("请输入要爬取的贴吧:")
        Spider(forum_name).run()
    
    
    # Run the crawler only when this file is executed as a script, so that
    # importing the module does not prompt for input.
    if __name__ == "__main__":
    	main()
    
  • 相关阅读:
    制作OSGB数据索引
    汤臣一品
    Python 库/模块的安装、查看
    ezdxf包下autocad的开发
    python3.7安装pylint
    航拍全景图补天
    电脑百科
    使用Excel批量提取文件名
    利用爬虫实现网上的图片自动下载
    MarkDown&思维导图
  • 原文地址:https://www.cnblogs.com/Haihong72H/p/13891657.html
Copyright © 2011-2022 走看看