zoukankan      html  css  js  c++  java
  • 爬取糗事百科中的数据

    将爬取出来的数据存入 MongoDB 中

    import pymongo
    import requests
    from lxml import etree
    
    
    class QiushiSpider:
        """Scrape posts from a qiushibaike.com listing page into MongoDB.

        Every post found on a page is written as one document to the
        ``qiushiinfo`` collection of the ``Qiushi`` database, with the keys
        ``name``, ``content``, ``laughNum`` and ``pingNum``.
        """

        def __init__(self):
            # Browser-like User-Agent sent with every request.
            self.headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"}
            # Client connection to the MongoDB server on localhost:27017.
            self.conn = pymongo.MongoClient("localhost", 27017)
            # Database handle.
            self.db = self.conn["Qiushi"]
            # Collection handle that scraped posts are inserted into.
            self.myset = self.db["qiushiinfo"]

        def getPage(self, url, timeout=10):
            """Download ``url`` and pass the decoded HTML to ``parsePage``.

            Args:
                url: Address of the listing page to fetch.
                timeout: Seconds to wait for the server before giving up.
                    Without a timeout ``requests.get`` can block forever.
            """
            res = requests.get(url, headers=self.headers, timeout=timeout)
            # Force UTF-8 decoding instead of relying on header detection.
            res.encoding = "utf-8"
            html = res.text
            print(res.status_code)
            self.parsePage(html)

        def parsePage(self, html):
            """Extract every post from ``html`` and insert it into MongoDB.

            Args:
                html: Decoded HTML text of a listing page.
            """
            if not html or not html.strip():
                # Nothing to parse; avoid handing an empty document to lxml.
                return
            parseHtml = etree.HTML(html)
            # Each post lives in a <div> whose id contains "qiushi_tag_".
            base_list = parseHtml.xpath('//div[contains(@id,"qiushi_tag_")]')
            print(base_list)
            for base in base_list:
                # Fall back to "匿名用户" when the post has no author text.
                name = self._first_text(
                    base.xpath('./div/a/h2/text()'), "匿名用户")
                content = self._first_text(base.xpath('./a/div/span'), "")
                d = {
                    "name": name.strip(),
                    "content": content.strip("\n"),
                    # Counters are stored as scraped; None when not present.
                    "laughNum": self._first_text(
                        base.xpath("./div/span[1]/i"), None),
                    "pingNum": self._first_text(
                        base.xpath("./div/span[2]/a/i"), None),
                }
                # Collection.insert() was removed in PyMongo 4.
                self.myset.insert_one(d)
                print("成功")

        @staticmethod
        def _first_text(nodes, default):
            """Return the text of the first XPath match, or ``default``.

            Args:
                nodes: XPath result list holding either strings (from a
                    ``text()`` query) or objects exposing a ``text`` attribute.
                default: Value returned when the list is empty or the first
                    match carries no text.
            """
            if not nodes:
                return default
            first = nodes[0]
            text = first if isinstance(first, str) else first.text
            return default if text is None else text
    
    
    if __name__ == '__main__':
        # Script entry point: scrape page 1 of the "8hr" listing and store
        # every post found there.
        QiushiSpider().getPage("https://www.qiushibaike.com/8hr/page/1/")
  • 相关阅读:
    jquery validate --转载
    领域驱动设计之领域模型--转载
    为system对象添加扩展方法
    DDD开源框架
    浅谈命令查询职责分离(CQRS)模式---转载
    AutoMapper小结
    执行后台任务的利器——Hangfire
    单元测试框架
    内存中的堆和栈
    ++*p,(*p)++,*p++与*++p四者的区别
  • 原文地址:https://www.cnblogs.com/zengsf/p/10028665.html
Copyright © 2011-2022 走看看