将爬取到的数据保存下来,存到Excel表格用xlwt包,存到数据库有多种选择,这里用轻量级的sqlite3,集成到python3包中。
一个文件即可作为数据库连接对象,这里用test.db,直接贴代码。 可以参考廖雪峰大神
import re import sqlite3 import urllib.request import urllib.parse import ssl import bs4 import xlwt ssl._create_default_https_context = ssl.SSLContext def getData(): # 返回25部电影数组 html = getHtml('https://movie.douban.com/top250') bs = bs4.BeautifulSoup(html, 'html.parser') datalist = [] # 存储25部电影 for item in bs.find_all('div', class_='item'): data = [] item = str(item) # 每部电影信息 movielink = re.compile(r'<a href="(.*?)"') # 电影链接正则 moviename = re.compile(r'<span class="title">(.*)</span>') # 电影名正则 link = re.findall(movielink, item)[0] name = re.findall(moviename, item) data.append(link) if len(name) == 2: data.append(name[0]) data.append(name[1].replace('xa0/xa0', '')) else: data.append(name[0]) data.append(' ') datalist.append(data) return datalist def getHtml(baseurl): # 请求url 并返回html页面 # HTTP代理 # proxy_handler = urllib.request.ProxyHandler({ # 'http': '127.0.0.1:1087', # 'https': '127.0.0.1:1087' # }) # opener = urllib.request.build_opener(proxy_handler) # urllib.request.install_opener(opener) req = urllib.request.Request(baseurl) # 请求url req.add_header('User-Agent', # 更改请求方从python到browser 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_3) AppleWebKit/537.36 ' '(KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36') # data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding='utf-8') # post需提交信息 response = urllib.request.urlopen(req, timeout=3) # 打开url post需提交data html = response.read().decode('utf-8') # 响应体 # print('Status:', response.status, response.reason) # 返回状态码 # print(html) # 返回响应体 # for k, v in response.getheaders(): # 返回响应头 # print('%s: %s' % (k, v)) return html def saveExcel(datalist, savepath): # 将数组写入到 .xls movieExcel = xlwt.Workbook(encoding='utf-8', style_compression=0) sheet = movieExcel.add_sheet('Top1-25sheet', cell_overwrite_ok=True) col = ['Rank', 'Link', 'Chinese Name', 'Original Name'] for i in range(0, 4): # 首行属性名 sheet.write(0, i, col[i]) for i in range(0, 25): sheet.write(i + 1, 0, i + 1) # 排名 for j in range(0, 3): # 电影信息 sheet.write(i + 1, j + 1, datalist[i][j]) movieExcel.save(savepath) def saveDB(): # 将数组存到sqlite数据库中 conn = sqlite3.connect('test.db') cursor = conn.cursor() # cursor.execute('create table movie (Rank varchar(10), ' # 'Link varchar(100), ' # 'ChineseName varchar(50), ' # 'OriginalName varchar(50));') datalist = getData() for i in range(0, 25): cursor.execute("insert into movie values(?, ?, ?, ?);", (i+1, datalist[i][0], datalist[i][1], datalist[i][2])) cursor.execute("select * from movie") movie = cursor.fetchall() print(movie) # 测试 cursor.close() conn.commit() conn.close() def main(): saveDB() if __name__ == '__main__': main()