  • Crawler

    
    '''
    Scrape the basic information of every article on Cui Qingcai's personal
    blog (41 pages in total), starting from https://cuiqingcai.com/page/1:
    title, link, view count, comment count, and number of likes.
    '''
    import re
    import requests
    import logging
    from lxml import etree
    import pymysql
    import urllib3

    # Silence the InsecureRequestWarning triggered by verify=False in parse_url().
    urllib3.disable_warnings()

    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    
    class DBconnect(object):
        def __init__(self):
            # Credentials are left blank here as in the original; utf8mb4 is
            # needed so the Chinese titles round-trip correctly.
            self.conn = pymysql.connect(host='localhost', port=3306, user='', password='',
                                        db='spider', charset='utf8mb4')
            self.cursor = self.conn.cursor()
    
        def save(self, table, data):
            '''Accept either a single dict or a list of dicts.'''
            try:
                if isinstance(data, dict):
                    data = [data]
                for d in data:
                    # Build a parameterized insert: column names come from the
                    # dict keys, values are passed separately so the driver
                    # handles quoting and escaping.
                    columns = ', '.join(d.keys())
                    placeholders = ', '.join(['%s'] * len(d))
                    sql = 'insert ignore into %s (%s) values (%s);' % (table, columns, placeholders)
                    self.cursor.execute(sql, tuple(d.values()))
                self.conn.commit()
            except Exception as e:
                logging.error(e)
                self.conn.rollback()
    
    class BlogSpider():
        def __init__(self):
            self.base_url = 'https://cuiqingcai.com/page/'
            self.total_page = 41
    
        def parse_url(self, url):
            # verify=False skips TLS certificate verification (as in the
            # original); the timeout keeps a dead page from hanging the run.
            res = requests.get(url, verify=False, timeout=10)
            return res.text
    
        def parse_content(self,html):
            tree = etree.HTML(html)
            articles = tree.xpath("//div[@class='content']/article")
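            # Rough structure of each <article> node that the XPaths below
            # assume (reconstructed from the selectors themselves, not
            # re-verified against the live site):
            #   <header><a>category</a><h2><a>title</a></h2></header>
            #   <span>synopsis</span>
            #   <div><a><img src="..."></a></div>
            #   <p><span>author</span><span>date</span><span>views</span>
            #      <span>comments</span><span><span>likes</span></span></p>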
            data_list = []
            for article in articles:
                category = article.xpath("./header/a/text()")
                category = category[0] if category else None
                title_nodes = article.xpath("./header/h2/a/text()")
                title = title_nodes[0] if title_nodes else None
                synopsis = article.xpath("./span/text()")[0]
                picture = article.xpath("./div/a/img/@src")[0]
                author = article.xpath('./p/span[1]/a/text()')[0]
                publish_time = article.xpath("./p/span[2]/text()")[0]
                page_view = article.xpath("./p/span[3]/text()")[0]
                page_view = int(re.findall(r'\d+', page_view)[0])
                comment = article.xpath("./p/span[4]/a/text()")[0]
                comment = int(re.findall(r'\d+', comment)[0])
                likes = article.xpath("./p/span[5]/a/span/text()")[0]
                # Only the fields persisted to MySQL; category, synopsis,
                # picture and likes are parsed above but not stored.
                data_dic = {'title': title, 'author': author, 'publish_time': publish_time,
                            'page_view': page_view, 'comment': comment}
                data_list.append(data_dic)
            return data_list
    
        def save_data(self, data_list):
            db = DBconnect()
            # Create the table on first use. publish_time carries a unique
            # key so that "insert ignore" silently skips rows already stored
            # by a previous run. (The original ran a separate "alter table
            # ... add unique key" on every call, which fails with a
            # duplicate-key-name error after the first run.)
            sql = """
            create table if not exists blogs(
            title varchar(100) not null,
            author varchar(30) not null,
            publish_time varchar(30) not null,
            page_view int not null,
            comment int not null,
            unique key(publish_time)
            );
            """
            db.cursor.execute(sql)
            # Save the parsed rows to the database.
            db.save(table='blogs', data=data_list)
    
        def run(self):
            for i in range(1, self.total_page + 1):
                url = self.base_url + str(i)
                # Fetch the listing page
                str_html = self.parse_url(url)
                # Parse the article metadata out of the HTML
                data_list = self.parse_content(str_html)
                print(data_list)
                # Store the rows in MySQL
                self.save_data(data_list)
            return {'status_code':'200'}
    
    if __name__ == '__main__':
        bs = BlogSpider()
        bs.run()
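
    To sanity-check a run, the stored rows can be read back from the same
    local spider database. This is a minimal sketch assuming the connection
    settings used above (blank credentials on localhost) and the blogs table
    created by save_data:

    import pymysql

    conn = pymysql.connect(host='localhost', port=3306, user='', password='',
                           db='spider', charset='utf8mb4')
    try:
        with conn.cursor() as cursor:
            # The five most-viewed posts scraped so far.
            cursor.execute("select title, page_view, comment from blogs "
                           "order by page_view desc limit 5")
            for title, page_view, comment in cursor.fetchall():
                print(title, page_view, comment)
    finally:
        conn.close()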
    