zoukankan      html  css  js  c++  java
  • xlwt sqlite3基础

    将爬取到的数据保存下来:存到Excel表格用xlwt包;存到数据库有多种选择,这里用轻量级的sqlite3,它已内置在python3标准库中。

    一个文件即可作为数据库,这里用test.db,下面直接贴代码。可以参考廖雪峰大神的教程。

    import re
    import sqlite3
    import urllib.request
    import urllib.parse
    import ssl
    import bs4
    import xlwt
    
    # Replace the private factory hook in the ssl module with the bare
    # ssl.SSLContext constructor.
    # NOTE(review): presumably done so HTTPS requests skip certificate
    # verification — confirm. This is a process-wide monkey-patch of a private
    # API; ssl._create_unverified_context is the more common spelling.
    ssl._create_default_https_context = ssl.SSLContext
    
    
    def getData():  # return the list of movies found on the Top 250 front page
        """Scrape the Douban Top 250 front page into a list of movie records.

        The page is downloaded with ``getHtml`` and every ``<div class="item">``
        element is turned into ``[link, chinese_name, original_name]``.
        ``original_name`` is a single space when the item does not carry exactly
        two ``<span class="title">`` elements.

        Returns:
            list[list[str]]: one three-element record per movie, in page order
            (the page is expected to list 25 movies).

        Raises:
            IndexError: if an item contains no link or no title span.
        """
        # Compile once, outside the loop, instead of once per movie.
        link_pattern = re.compile(r'<a href="(.*?)"')  # movie link
        title_pattern = re.compile(r'<span class="title">(.*)</span>')  # movie title

        html = getHtml('https://movie.douban.com/top250')
        bs = bs4.BeautifulSoup(html, 'html.parser')
        datalist = []  # one record per movie
        for item in bs.find_all('div', class_='item'):
            item = str(item)  # markup of a single movie
            link = link_pattern.findall(item)[0]
            name = title_pattern.findall(item)
            if len(name) == 2:
                # The second title is prefixed with "<nbsp>/<nbsp>". The previous
                # literal 'xa0/xa0' had lost its backslashes and never matched,
                # so the separator stayed in the stored name.
                original_name = name[1].replace('\xa0/\xa0', '')
            else:
                original_name = ' '
            datalist.append([link, name[0], original_name])
        return datalist
    
    
    def getHtml(baseurl, timeout=3):   # request a URL and return the HTML page
        """Fetch ``baseurl`` and return the response body decoded as UTF-8.

        Args:
            baseurl: URL to request.
            timeout: timeout in seconds passed to ``urlopen`` (default 3, the
                value that used to be hard-coded).

        Returns:
            str: the decoded response body.

        Raises:
            urllib.error.URLError: if the request fails.
            UnicodeDecodeError: if the body is not valid UTF-8.

        To route traffic through an HTTP proxy, install an opener built from
        ``urllib.request.ProxyHandler`` (via ``urllib.request.install_opener``)
        before calling this function. For a POST request, pass url-encoded
        bytes as ``data`` to ``urlopen``.
        """
        req = urllib.request.Request(baseurl)
        req.add_header('User-Agent',  # present the request as a browser, not as Python
                       'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_3) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36')

        # The context manager closes the response even when read/decode fails;
        # previously the response was never closed.
        with urllib.request.urlopen(req, timeout=timeout) as response:
            return response.read().decode('utf-8')
    
    
    def saveExcel(datalist, savepath):  # write the movie list to an .xls file
        """Write the movie records to an Excel workbook at ``savepath``.

        Row 0 holds the column titles. Each following row holds the 1-based
        rank and then the first three fields of one record (link, Chinese
        name, original name).

        Args:
            datalist: sequence of records as produced by ``getData``.
            savepath: destination path of the ``.xls`` file.

        The number of rows follows ``len(datalist)`` instead of a hard-coded 25,
        so a shorter list no longer raises IndexError and a longer one is not
        truncated.
        """
        movieExcel = xlwt.Workbook(encoding='utf-8', style_compression=0)
        sheet = movieExcel.add_sheet('Top1-25sheet', cell_overwrite_ok=True)
        col = ['Rank', 'Link', 'Chinese Name', 'Original Name']
        for column, title in enumerate(col):  # header row
            sheet.write(0, column, title)
        for rank, movie in enumerate(datalist, start=1):
            sheet.write(rank, 0, rank)  # rank column
            for column, value in enumerate(movie[:3], start=1):  # movie fields
                sheet.write(rank, column, value)
        movieExcel.save(savepath)
    
    
    def saveDB(datalist=None, dbpath='test.db'):   # store the movie list in a sqlite database
        """Insert the movie records into the ``movie`` table of a SQLite file.

        Args:
            datalist: sequence of ``[link, chinese_name, original_name]``
                records. When omitted, the records are fetched with
                ``getData()``, as before.
            dbpath: path of the SQLite database file (default ``'test.db'``).

        The ``movie`` table is created when missing, so a fresh database no
        longer fails with "no such table". Every record is inserted together
        with its 1-based rank; the full table content is then printed as a
        quick check. Each call appends rows, so running it twice on the same
        file stores the records twice.

        The connection is closed even on failure. Changes are committed only
        when every statement succeeded.
        """
        if datalist is None:
            datalist = getData()
        rows = [(rank, movie[0], movie[1], movie[2])
                for rank, movie in enumerate(datalist, start=1)]

        conn = sqlite3.connect(dbpath)
        try:
            cursor = conn.cursor()
            try:
                cursor.execute('create table if not exists movie ('
                               'Rank varchar(10), '
                               'Link varchar(100), '
                               'ChineseName varchar(50), '
                               'OriginalName varchar(50));')
                cursor.executemany("insert into movie values(?, ?, ?, ?);", rows)
                cursor.execute("select * from movie")
                movie = cursor.fetchall()
                print(movie)    # quick check of what was stored
            finally:
                cursor.close()
            conn.commit()
        finally:
            conn.close()
    
    
    def main():
        """Script entry point: run the save-to-database step via ``saveDB``."""
        saveDB()
    
    
    # Call main() only when this file is executed as a script, not on import.
    if __name__ == '__main__':
        main()

     

  • 相关阅读:
    sscanf 与 ssprintf 用法 (转载--https://www.cnblogs.com/Anker/p/3351168.html)
    PYTHON网络爬虫与信息提取[信息的组织与提取](单元五)
    PYTHON网络爬虫与信息提取[BeautifulSoup](单元四)
    PYTHON网络爬虫与信息提取[网络爬虫协议](单元二)
    Python网络爬虫与信息提取[request库的应用](单元一)
    scikit-learn实现ebay数据分析 的随笔
    machine learning 笔记 vectorization
    machine learning 笔记 normal equation
    machine learning 笔记 octave命令笔记
    UVa 1354 天平难题 Mobile Computing
  • 原文地址:https://www.cnblogs.com/faded828x/p/14660953.html
Copyright © 2011-2022 走看看