还是以猫眼电影为例,这次用 requests 抓取页面,再用 pyquery 库解析并提取数据。
1.简单demo,看看如何使用pyquery提取信息,并将提取到的数据进行组合
# coding: utf-8
# author: hmk
# Demo: download one Maoyan board page and pull the ranking, title, release
# time and score of every entry out of it with pyquery CSS selectors.
import requests
from pyquery import PyQuery as pq

url = 'http://maoyan.com/board/4'
# Browser-like request headers sent along with the page request.
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Host": "maoyan.com",
    "Referer": "http://maoyan.com/board",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36",
}

# Without a timeout a stalled server would block the script forever.
r = requests.get(url, headers=header, timeout=10)
r.encoding = r.apparent_encoding
html = r.text
print(type(html))

doc = pq(html)
# Exploratory selectors applied to the whole document (kept for reference):
# print((doc('dd').find('.board-index')))
# print(doc('.name').text())
# print(doc('.releasetime').text())
# print(doc('dd').find('.integer').text()+doc('.fraction').text())

# Named `movies` rather than `list` so the builtin is not shadowed.
movies = []
for t in doc('dd'):
    item = pq(t)  # wrap the raw element once and reuse it for every lookup
    index = item.find('.board-index').text()
    print(index)
    movie = item.find('.name').text()
    print(movie)
    release_time = item.find('.releasetime').text()
    print(release_time)
    # The score is rendered as two elements (integer part + fraction part);
    # concatenating their text gives the full score string.
    score = item.find('.integer').text() + item.find('.fraction').text()
    print(score)
    movies.append([index, movie, release_time, score])
print(movies)
2.正式代码
# coding: utf-8
# author: hmk
import requests
from pyquery import PyQuery as pq
import pymysql.cursors


def get_html(url, header):
    """Download a page and return its decoded text.

    Args:
        url: Address of the page to fetch.
        header: Dict of HTTP request headers to send.

    Returns:
        The page text, or None when the request fails (connection error,
        timeout, or an HTTP error status).
    """
    try:
        # The timeout prevents a stalled server from blocking the crawl.
        r = requests.get(url=url, headers=header, timeout=10)
        r.raise_for_status()
    except requests.RequestException as exc:
        print('请求失败', url, exc)
        return None
    r.encoding = r.apparent_encoding
    return r.text


def get_data(html, list_data):
    """Extract every movie entry from a board page into *list_data*.

    Args:
        html: Page text as returned by get_html(); None or an empty string
            is treated as "nothing to parse".
        list_data: List that receives one [index, movie, time, score] list
            per `dd` element found in the page. It is modified in place.
    """
    if not html:
        # A failed download must not be handed to the parser.
        return
    doc = pq(html)
    for t in doc('dd'):
        item = pq(t)  # wrap the raw element once and reuse it for every lookup
        index = item.find('.board-index').text()
        print(index)
        movie = item.find('.name').text()
        print(movie)
        release_time = item.find('.releasetime').text()
        print(release_time)
        # The score is split across two elements (integer part + fraction
        # part); concatenating their text gives the full score string.
        score = item.find('.integer').text() + item.find('.fraction').text()
        print(score)
        list_data.append([index, movie, release_time, score])


def write_sql(data):
    """Insert the extracted movies into the `maoyan_movie` table.

    Args:
        data: List of per-movie lists, each holding the four values for the
            ranking, movie, release_time and score columns, in that order.

    Each row is inserted and committed on its own, so one failing row does
    not stop the remaining rows from being written.
    """
    if not data:
        # Nothing to insert, so there is no need to open a connection.
        return
    # NOTE(review): connection settings are hard-coded for this demo.
    conn = pymysql.connect(host='localhost',
                           user='root',
                           password='123456',
                           db='test',
                           charset='utf8')
    # Parameterized insert statement; the driver fills in the placeholders.
    sql = "insert into maoyan_movie(ranking,movie,release_time,score) values(%s, %s, %s, %s)"
    try:
        cur = conn.cursor()
        try:
            for movie in data:  # one list of column values per movie
                try:
                    cur.execute(sql, movie)
                    conn.commit()  # commit right away so the row is persisted
                    print('导入成功')
                except pymysql.MySQLError as exc:
                    print('导入失败', exc)
        finally:
            cur.close()  # always release the cursor
    finally:
        conn.close()  # always release the connection, even on errors


def main():
    """Crawl the board page by page and store every page's movies."""
    start_url = 'http://maoyan.com/board/4'
    depth = 10  # number of pages to crawl
    header = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Host": "maoyan.com",
        "Referer": "http://maoyan.com/board",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36",
    }

    for i in range(depth):
        # The offset query parameter advances by 10 for each page
        # (presumably 10 movies per page -- confirm against the site).
        url = start_url + '?offset=' + str(10 * i)
        html = get_html(url, header)
        if html is None:
            # Skip pages that could not be downloaded and keep crawling.
            continue
        list_data = []
        get_data(html, list_data)
        write_sql(list_data)
        # print(list_data)


if __name__ == "__main__":
    main()
其实就这个例子来说,使用 pyquery 来提取信息是最简单省事的:直接使用 CSS 选择器就可以拿到想要的数据。