zoukankan      html  css  js  c++  java
  • 爬取百度贴吧帖子标题并保存到MySQL数据库

    import requests
    import re 
    import time
    import pymysql
    class bdspider:
        """Scrape thread titles from a Baidu Tieba forum and store them in MySQL.

        The spider builds one listing URL per page, downloads each page,
        extracts the thread titles with a regular expression and inserts them
        into the ``title`` table of the local ``baidu`` database.
        """

        # Compiled once for the whole class instead of on every parsed page.
        # re.S lets ``.`` span newlines, since one thread entry covers many lines.
        _TITLE_PATTERN = re.compile(
            '<li class=" j_thread_list clearfix".*?<a rel="noreferrer".*?title="(.*?)".*?</a>',
            re.S,
        )

        # Seconds to wait for a response; without a timeout requests.get can
        # block forever on a stalled connection.
        _REQUEST_TIMEOUT = 10

        def __init__(self, tiebaName, pages_Num):
            """Remember the forum name and how many listing pages to crawl.

            Args:
                tiebaName: Forum name, placed verbatim in the ``kw`` query
                    parameter of the listing URL.
                pages_Num: Number of listing pages to fetch.
            """
            self.tiebaName = tiebaName
            self.pages_Num = pages_Num
            # ``{}`` is filled with the ``pn`` offset of each page by getlink().
            self.base_url = 'https://tieba.baidu.com/f?kw=' + tiebaName + '&ie=utf-8&pn={}'
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
            }

        # Build the URL of every listing page.
        def getlink(self):
            """Return the listing-page URLs, one per page, in page order.

            The ``pn`` offset advances by 50 per page (presumably the number of
            threads Tieba shows on one listing page).
            """
            return [self.base_url.format(page * 50) for page in range(self.pages_Num)]

        # Download one page.
        def get_pagesinfo(self, url):
            """Fetch ``url`` and return the thread titles found on that page.

            Returns:
                A list of title strings as produced by parse_pageInfo().
            """
            response = requests.get(
                url=url, headers=self.headers, timeout=self._REQUEST_TIMEOUT
            )
            return self.parse_pageInfo(response.content.decode('utf-8'))

        # Parse one page.
        def parse_pageInfo(self, html):
            """Extract thread titles from the HTML text of a listing page.

            Args:
                html: Decoded HTML of one listing page.

            Returns:
                A list with the ``title`` attribute value of each matched
                thread entry, in document order; empty when nothing matches.
            """
            return self._TITLE_PATTERN.findall(html)

        # Main flow.
        def run(self):
            """Crawl every listing page, print its titles and save them."""
            for url in self.getlink():
                # Sleep one second before each request (presumably to throttle
                # the crawl).
                time.sleep(1)
                page_Info = self.get_pagesinfo(url)
                print(page_Info)
                self.save_to_mysql(page_Info)

        # Persist the data.
        def save_to_mysql(self, page_Info):
            """Insert every title in ``page_Info`` into the ``title`` table.

            Titles come from scraped HTML, so they are passed as query
            parameters rather than formatted into the SQL text: a title
            containing a quote can neither break the statement nor inject SQL.
            All rows of one page are committed together, so a failure stores
            none of that page's titles.

            Args:
                page_Info: List of title strings; an empty list is a no-op.
            """
            if not page_Info:
                # Nothing to store; do not open a connection for an empty page.
                return

            # NOTE(review): credentials are hard-coded; consider moving them to
            # configuration.
            # utf8mb4 is requested explicitly so storing non-ASCII titles does
            # not depend on the driver's default connection charset.
            conn = pymysql.connect(host='localhost', user='root', passwd='root123',
                                   db='baidu', port=3306, charset='utf8mb4')
            try:
                cursor = conn.cursor()
                try:
                    cursor.executemany(
                        "insert into title(title) values(%s)",
                        [(title,) for title in page_Info],
                    )
                    conn.commit()
                finally:
                    cursor.close()
            finally:
                # Always release the connection, even when an insert fails.
                conn.close()
    if __name__ == "__main__":
        # Script entry point: crawl 5 listing pages of the "lol" forum and
        # store the titles found on them.
        spider = bdspider(tiebaName="lol", pages_Num=5)
        spider.run()
  • 相关阅读:
    表模块模式与事务脚本模式的代码编写
    解决方案下显示的网站名称被追加编号的问题解决方法
    应用层代码
    关于CodeReview(java)(转)
    关于事务的几个概念介绍(转)
    关于JVM的ClassLoader(转)
    svn相关
    .subversion
    linux用户与组的管理(命令加入、手动加入、加入组、用户之间的切换)
    回调函数
  • 原文地址:https://www.cnblogs.com/luweilehei/p/11347563.html
Copyright © 2011-2022 走看看