Storing data to a CSV file
import urllib.request
import re
import csv
url = 'https://maoyan.com/board/4?offset=10'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
# Fetch the page
req = urllib.request.Request(url, headers=headers)
res = urllib.request.urlopen(req)
html = res.read().decode('utf-8')
# Parse the page
p = re.compile('<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>',re.S)
rList = p.findall(html)
# Write the rows to a CSV file
for r in rList:
    r = [r[0].strip(), r[1].strip(), r[2].strip()]
    with open('my1.csv', 'a', newline="") as f:
        # Create the writer object
        writer = csv.writer(f)
        writer.writerow(r)
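If you prefer to open the file once and include a header row, a minimal variant like the one below also works; the header names and the utf-8 encoding are choices of this sketch, not something the page dictates.

# Variant: one file handle, one header row, then all movie rows
# ('w' rewrites the file on every run, unlike the 'a' append mode above)
with open('my1.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'star', 'releasetime'])  # illustrative header names
    for r in rList:
        writer.writerow([field.strip() for field in r])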
Storing data to MySQL
import urllib.request
import re
import pymysql
import warnings
warnings.filterwarnings("ignore")
url = 'https://maoyan.com/board/4?offset=10'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
# Create the database connection (recent PyMySQL versions require keyword arguments)
db = pymysql.connect(
    host="localhost",
    user="root",
    password="123456",
    database="spiderdb",
    charset="utf8")
# Cursor object
cursor = db.cursor()
# Fetch the page
req = urllib.request.Request(url, headers=headers)
res = urllib.request.urlopen(req)
html = res.read().decode('utf-8')
# Parse the page
p = re.compile(
'<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>', re.S)
rList = p.findall(html)
# Store the rows in MySQL
ins = 'insert into film(name,star,releasetime) values(%s,%s,%s)'
for r in rList:
    L = [r[0].strip(),
         r[1].strip(),
         r[2].strip()[5:15]]  # keep only the 10-character date that follows the "上映时间：" prefix
    cursor.execute(ins, L)
db.commit()
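The insert statement assumes that a film table already exists in the spiderdb database. A minimal sketch of a matching table, created through the same cursor, could look like this; the column types and lengths are assumptions, adjust them to your data.

# Run once before the insert loop; VARCHAR sizes are assumed, not prescribed
cursor.execute("""
    CREATE TABLE IF NOT EXISTS film(
        name VARCHAR(100),
        star VARCHAR(200),
        releasetime VARCHAR(50)
    ) DEFAULT CHARSET=utf8
""")
db.commit()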
Storing data to MongoDB with pymongo
import urllib.request
import re
import pymongo
url = 'https://maoyan.com/board/4?offset=10'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
# Create the MongoDB connection
conn = pymongo.MongoClient("127.0.0.1", 27017)
db = conn["spiderdb"]
myset = db["film"]
# Fetch the page
req = urllib.request.Request(url, headers=headers)
res = urllib.request.urlopen(req)
html = res.read().decode('utf-8')
# Parse the page
p = re.compile(
'<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>', re.S)
rList = p.findall(html)
# Store the documents in MongoDB
for r in rList:
    d = {
        "name": r[0].strip(),
        "star": r[1].strip(),
        "releasetime": r[2].strip()
    }
    myset.insert_one(d)
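To confirm that the documents were written, they can be read back from the same collection; the projection that hides _id is only there to keep the output tidy.

# Read the inserted documents back, hiding MongoDB's _id field
for doc in myset.find({}, {"_id": 0}):
    print(doc["name"], doc["star"], doc["releasetime"])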