zoukankan      html  css  js  c++  java
  • 爬取百度贴吧帖子标题并保存到 MySQL 数据库

    import requests
    import re 
    import time
    import pymysql
    class bdspider:
        """Crawl thread titles from a Baidu Tieba forum and store them in MySQL.

        The spider builds one list-page URL per requested page, downloads each
        page, extracts the ``title`` attribute of every thread link with a
        regular expression, and inserts the titles into the ``title`` table of
        the local ``baidu`` database.

        Attributes:
            tiebaName: Forum name exactly as passed by the caller.
            pages_Num: Number of list pages to crawl.
            base_url: URL template with a single ``{}`` placeholder for ``pn``.
            headers: HTTP headers sent with every request.
        """

        # Seconds to wait for a response; without a timeout ``requests`` may
        # block forever on a stalled connection.
        REQUEST_TIMEOUT = 10

        # The ``pn`` query parameter is advanced by this amount for each page.
        PAGE_STEP = 50

        # Compiled once instead of on every page. re.S lets ``.`` span newlines
        # because one thread entry stretches over several lines of HTML.
        TITLE_PATTERN = re.compile(
            '<li class=" j_thread_list clearfix".*?<a rel="noreferrer".*?title="(.*?)".*?</a>',
            re.S,
        )

        # ``%s`` is a driver placeholder, not string formatting: the driver
        # escapes each value, so quotes inside a title cannot break the query.
        INSERT_SQL = "insert into title(title) values(%s)"

        def __init__(self,tiebaName,pages_Num):
            """Store the crawl settings and prepare the URL template.

            Args:
                tiebaName: Name of the forum to crawl (the ``kw`` parameter).
                pages_Num: How many list pages to fetch.
            """
            # Local import keeps this class self-contained with respect to the
            # file's existing import block.
            from urllib.parse import quote

            self.tiebaName = tiebaName
            self.pages_Num = pages_Num
            # Percent-encode the name: a raw '+' or '&' would change the query,
            # and raw '{' / '}' would break the later ``str.format`` call.
            self.base_url = (
                'https://tieba.baidu.com/f?kw='
                + quote(tiebaName, safe='')
                + '&ie=utf-8&pn={}'
            )
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
            }

        def getlink(self):
            """Return the list of page URLs, one per page to crawl."""
            return [
                self.base_url.format(page * self.PAGE_STEP)
                for page in range(self.pages_Num)
            ]

        def get_pagesinfo(self,url):
            """Download ``url`` and return the thread titles found on it.

            Args:
                url: Fully formatted list-page URL.

            Returns:
                List of title strings extracted by ``parse_pageInfo``.
            """
            response = requests.get(
                url=url,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            return self.parse_pageInfo(response.content.decode('utf-8'))

        def parse_pageInfo(self,html):
            """Extract thread titles from list-page HTML.

            Args:
                html: Decoded HTML text of one list page.

            Returns:
                List of the captured ``title`` attribute values, in page order;
                empty when nothing matches.
            """
            return self.TITLE_PATTERN.findall(html)

        def run(self):
            """Crawl every page in turn, printing and saving its titles."""
            for url in self.getlink():
                # Pause before each request to avoid hammering the server.
                time.sleep(1)
                page_Info = self.get_pagesinfo(url)
                print(page_Info)
                self.save_to_mysql(page_Info)

        def save_to_mysql(self,page_Info):
            """Insert the given titles into the ``title`` table.

            All titles of one page are written in a single transaction. The
            cursor and connection are closed even if an insert fails.

            Args:
                page_Info: List of title strings; an empty list is a no-op.
            """
            if not page_Info:
                # Nothing to write, so do not open a connection at all.
                return

            # NOTE(review): credentials are hard-coded here; consider moving
            # them to configuration or environment variables.
            conn = pymysql.connect(host='localhost',user='root',passwd='root123',db='baidu',port=3306)
            try:
                cursor = conn.cursor()
                try:
                    cursor.executemany(
                        self.INSERT_SQL,
                        [(title,) for title in page_Info],
                    )
                    conn.commit()
                finally:
                    cursor.close()
            finally:
                conn.close()
    if __name__ == "__main__":
        # Script entry point: crawl 5 list pages of the "lol" forum.
        spider = bdspider(tiebaName="lol", pages_Num=5)
        spider.run()
  • 相关阅读:
    Day 39 管道 、数据共享与地址池
    Day 38 Semaphore ,Event ,队列
    Day37 多进程
    Day 36 网络编程-计算机的发展
    Day 35 验证客户端的合法性+socketserver
    Day 34 黏包
    Day 33 Socket编程.
    Day 32 网络编程
    Day 31 面向对象考试题 第四次考试.
    Day 30 面向对象的考试题
  • 原文地址:https://www.cnblogs.com/luweilehei/p/11347563.html
Copyright © 2011-2022 走看看