zoukankan      html  css  js  c++  java
  • 内涵段子——脑筋急转弯——spider

    # python 3.7
    from urllib.request import Request,urlopen
    import re,time
    
    class Neihan(object):
        """Spider for the ``njjzw`` listing pages of www.neihan8.com.

        Each listing page is downloaded, (title, description) pairs are
        extracted with a regular expression, and the pairs are appended to
        ``11.txt`` in the current working directory.
        """

        def __init__(self):
            # Request headers sent with every page fetch. Header names must be
            # the real HTTP names (no spaces around the hyphens), otherwise the
            # server never sees the intended User-Agent.
            self.header = {
                'Host': 'www.neihan8.com',
                'Referer': 'https://www.neihan8.com/njjzw/',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
                "Cookie": 'UM_distinctid=1673e837ae7146-0363c5477e0b8a-424f0928-13c680-1673e837ae9355; CNZZDATA1274349754=965294396-1542939999-%7C1542939999; Hm_lvt_94f4eb93f17efa632a5c8a01b23da410=1542942067; npreuecookieclassrecord=%2C2%2C14%2C1%2C; CNZZDATA5804950=cnzz_eid%3D222162018-1542942068-https%253A%252F%252Fwww.neihan8.com%252F%26ntime%3D1542942068; Hm_lpvt_94f4eb93f17efa632a5c8a01b23da410=1542943190'
            }
            # Base URL of the listing; page 1 lives here, later pages are
            # addressed as index_<n>.html below it.
            self.static = 'https://www.neihan8.com/njjzw/'

        def getPage(self, url, refer=None):
            """Download ``url`` and hand the decoded HTML to ``parsePage``.

            :param url: absolute URL of a listing page.
            :param refer: optional text written in front of every description
                extracted from this page (``None`` means no prefix).
            """
            # The context manager closes the HTTP response even if read() fails.
            with urlopen(Request(url=url, headers=self.header)) as response:
                res = response.read()
            # decode() uses UTF-8; presumably the site serves UTF-8 pages.
            self.parsePage(res.decode(), refer)

        def parsePage(self, htmlres, *args):
            """Extract (title, description) pairs from ``htmlres`` and store them.

            :param htmlres: HTML text of one listing page.
            :param args: optional; the first positional value is the prefix to
                put in front of each description.
            """
            # Raw string so that \s reaches the regex engine as "whitespace".
            patten = r'class="title" title=".*?">(.*?)</a></h3>\s+<div class="desc">(.*?)</div>'
            p = re.findall(patten, htmlres)
            self.writePge(p, args)

        def writePge(self, p, *args):
            """Append the extracted pairs to ``11.txt``.

            :param p: iterable of ``(title, description)`` pairs.
            :param args: optional; ``args[0]`` is the argument tuple forwarded
                by ``parsePage`` and ``args[0][0]`` is the description prefix.
                A missing or ``None`` prefix writes the description unprefixed.
            """
            # Tolerate calls without a prefix instead of raising IndexError.
            prefix = args[0][0] if args and args[0] else None

            with open('11.txt', 'a+', encoding='utf8') as f:
                for i in p:
                    if prefix is not None:
                        f.write('问题:' + i[0] + '\n' + prefix + i[1].strip() + '\n')
                    else:
                        f.write('问题:' + i[0] + '\n' + i[1].strip() + '\n')
                    # Blank line between entries.
                    f.write('\n')

        def workon(self, pages=9):
            """Crawl the first ``pages`` listing pages, pausing 2 s after each.

            :param pages: number of listing pages to fetch. Defaults to 9,
                the number of pages the loop has always fetched.
            """
            # NOTE(review): an earlier comment here claimed 20 pages while the
            # loop covered 9; the default keeps 9 - confirm the intended count.
            for i in range(1, pages + 1):
                if i == 1:
                    # The first page is the bare listing URL and is the only
                    # one whose descriptions get the '答案:' prefix.
                    url = self.static
                    self.getPage(url, refer='答案:')
                else:
                    url = self.static + 'index_%s.html' % i
                    self.getPage(url)

                # Pause between requests.
                time.sleep(2)
    
    # Script entry point: build the spider and run the crawl when this file
    # is executed directly rather than imported.
    if __name__ == '__main__':
        Neihan().workon()
  • 相关阅读:
    JSP 和 Servlet 有什么关系?
    转发(forward)和重定向(redirect)的区别?
    get和post请求的区别?
    软件的三大类型-单机类型、BS类型、CS类型
    Redis集群搭建
    Tomcat网站上的core和deployer的区别
    spring 事务处理
    mybatis ${}与#{}的区别
    Quartz--Spring 定时任务
    @JsonSerialize @JsonIgnoreProperties @JsonIgnore @JsonFormat
  • 原文地址:https://www.cnblogs.com/Skyda/p/10006672.html
Copyright © 2011-2022 走看看