zoukankan html css js c++ java

python3 爬虫内涵段子

import re
from urllib import request
class Sprder:
    """Interactive scraper that saves joke posts from neihan8.com to a file.

    Each call to ``loadPage`` downloads one list page (numbered by
    ``self.page``), extracts every joke container from the HTML, strips a
    small set of markup fragments and appends the text to ``段子.txt``.
    ``startWork`` drives this in a loop, asking the user after every page
    whether to continue.

    Attributes:
        page: Number of the list page that the next ``loadPage`` call fetches.
        switch: Loop flag for ``startWork``; set to ``False`` to stop.
    """

    # Captures the inner HTML of every joke container on a list page.
    # ``\s+`` matches the whitespace between the tag name and its attribute;
    # ``re.S`` lets ``.`` span the line breaks inside a joke.
    _CONTENT_PATTERN = re.compile(r'<div\s+class="f18 mb20">(.*?)</div>', re.S)

    # Markup fragments removed from each joke before it is written out.
    _JUNK_TOKENS = ("<p>", "</p>", "<br>", "<br />", "&ldquo;")

    def __init__(self):
        self.page = 1
        self.switch = True

    @staticmethod
    def _extractItems(html):
        """Return the inner HTML of every joke container found in *html*."""
        return Sprder._CONTENT_PATTERN.findall(html)

    @staticmethod
    def _cleanItem(item):
        """Return *item* with every fragment in ``_JUNK_TOKENS`` removed."""
        for token in Sprder._JUNK_TOKENS:
            item = item.replace(token, "")
        return item

    def loadPage(self):
        """Download the current list page and hand its jokes to ``dealPage``.

        Raises:
            urllib.error.URLError: If the page cannot be fetched.
            UnicodeDecodeError: If the response body is not valid GBK.
        """
        url = "http://www.neihan8.com/article/list_5_" + str(self.page) + ".html"
        user_agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
        headers = {'User-Agent': user_agent}
        request1 = request.Request(url, headers=headers)
        # Close the connection as soon as the body has been read.
        with request.urlopen(request1) as response:
            html = response.read().decode("gbk")

        self.dealPage(self._extractItems(html))

    def dealPage(self, content_list):
        """Clean every joke in *content_list* and write it to the output file.

        Args:
            content_list: Iterable of raw HTML snippets, one per joke.
        """
        for item in content_list:
            self.writePage(self._cleanItem(item))

    def writePage(self, item):
        """Append one joke to ``段子.txt`` in the current working directory.

        Args:
            item: Cleaned joke text.
        """
        # Explicit UTF-8 so the write does not depend on the locale's default
        # encoding, which cannot represent Chinese text on many systems.
        with open("段子.txt", "a", encoding="utf-8") as f:
            f.write(item)

    def startWork(self):
        """Run the crawl loop until the user types ``quit``.

        Fetches one page per iteration, then prompts; pressing Enter moves on
        to the next page.
        """
        while self.switch:
            self.loadPage()
            command = input("如果继续按回车（退出输入quit）")
            if command == "quit":
                self.switch = False

            self.page += 1
if __name__ == '__main__':
    # Script entry point: build the spider and start the interactive
    # page-by-page crawl loop.
    spider = Sprder()
    spider.startWork()

查看全文

相关阅读:
light oj 1105 规律
 light oj 1071 dp（吃金币升级版）
light oj 1084 线性dp
light oj 1079 01背包
 light oj 1068 数位dp
light oj 1219 树上贪心
 light oj 1057 状压dp TSP
light oj 1037 状压dp
矩阵快速幂3 k*n铺方格
 矩阵快速幂2 3*n铺方格

原文地址：https://www.cnblogs.com/Bighua123/p/8418968.html